bio-bigwig 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,537 @@
1
+ #include "bigWig.h"
2
+ #include "bwCommon.h"
3
+ #include <errno.h>
4
+ #include <stdlib.h>
5
+ #include <zlib.h>
6
+ #include <math.h>
7
+ #include <string.h>
8
+
9
+ //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level.
10
+ //Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins
11
+ static int32_t determineZoomLevel(const bigWigFile_t *fp, int basesPerBin) {
12
+ int32_t out = -1;
13
+ int64_t diff;
14
+ uint32_t bestDiff = -1;
15
+ uint16_t i;
16
+
17
+ basesPerBin/=2;
18
+ for(i=0; i<fp->hdr->nLevels; i++) {
19
+ diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i];
20
+ if(diff >= 0 && diff < bestDiff) {
21
+ bestDiff = diff;
22
+ out = i;
23
+ }
24
+ }
25
+ return out;
26
+ }
27
+
28
/// @cond SKIP
//Summary statistics of one zoom-level record plus its weight for the current query
struct val_t {
    uint32_t nBases; //number of bases the record reports as covered
    float min;       //smallest value summarised by the record
    float max;       //largest value summarised by the record
    float sum;       //sum of the summarised values
    float sumsq;     //sum of the squared values
    double scalar;   //fraction of the record overlapping the queried interval, as computed by getScalar()
};

//List of heap-allocated records; release it with destroyVals_t()
struct vals_t {
    uint32_t n;          //number of entries in vals
    struct val_t **vals; //array of n owned record pointers
};
/// @endcond
40
+
41
+ void destroyVals_t(struct vals_t *v) {
42
+ uint32_t i;
43
+ if(!v) return;
44
+ for(i=0; i<v->n; i++) free(v->vals[i]);
45
+ if(v->vals) free(v->vals);
46
+ free(v);
47
+ }
48
+
49
/*
 * Fraction of a block that overlaps an interval.
 *
 * i_start/i_end describe the half-open interval of interest and
 * b_start/b_end the half-open block. The overlap is clipped to BOTH ends of
 * the interval, so a block that extends past either side of the interval only
 * contributes the part lying inside it.
 *
 * Returns a value in [0, 1]; 0.0 when the two do not overlap or when the
 * block is empty or inverted (which would otherwise divide by zero).
 */
double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) {
    const uint32_t lo = (b_start > i_start) ? b_start : i_start;
    const uint32_t hi = (b_end < i_end) ? b_end : i_end;

    if(b_end <= b_start) return 0.0; //degenerate block
    if(hi <= lo) return 0.0;         //no overlap

    return ((double)(hi - lo))/(b_end - b_start);
}
61
+
62
+ //Returns NULL on error
63
+ static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) {
64
+ void *buf = NULL, *compBuf = NULL;
65
+ uLongf sz = fp->hdr->bufSize;
66
+ int compressed = 0, rv;
67
+ uint32_t *p, vtid, vstart, vend;
68
+ struct vals_t *vals = NULL;
69
+ struct val_t *v = NULL;
70
+
71
+ if(sz) {
72
+ compressed = 1;
73
+ buf = malloc(sz);
74
+ }
75
+ sz = 0; //This is now the size of the compressed buffer
76
+
77
+ if(bwSetPos(fp, o->offset[i])) goto error;
78
+
79
+ vals = calloc(1,sizeof(struct vals_t));
80
+ if(!vals) goto error;
81
+
82
+ v = malloc(sizeof(struct val_t));
83
+ if(!v) goto error;
84
+
85
+ if(sz < o->size[i]) compBuf = malloc(o->size[i]);
86
+ if(!compBuf) goto error;
87
+
88
+ if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
89
+ if(compressed) {
90
+ sz = fp->hdr->bufSize;
91
+ rv = uncompress(buf, &sz, compBuf, o->size[i]);
92
+ if(rv != Z_OK) goto error;
93
+ } else {
94
+ buf = compBuf;
95
+ sz = o->size[i];
96
+ }
97
+
98
+ p = buf;
99
+ while(((uLongf) ((void*)p-buf)) < sz) {
100
+ vtid = p[0];
101
+ vstart = p[1];
102
+ vend = p[2];
103
+ v->nBases = p[3];
104
+ v->min = ((float*) p)[4];
105
+ v->max = ((float*) p)[5];
106
+ v->sum = ((float*) p)[6];
107
+ v->sumsq = ((float*) p)[7];
108
+ v->scalar = getScalar(start, end, vstart, vend);
109
+
110
+ if(tid == vtid) {
111
+ if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) {
112
+ vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1));
113
+ if(!vals->vals) goto error;
114
+ vals->vals[vals->n++] = v;
115
+ v = malloc(sizeof(struct val_t));
116
+ if(!v) goto error;
117
+ }
118
+ if(vstart > end) break;
119
+ } else if(vtid > tid) {
120
+ break;
121
+ }
122
+ p+=8;
123
+ }
124
+
125
+ free(v);
126
+ free(buf);
127
+ if(compressed) free(compBuf);
128
+ return vals;
129
+
130
+ error:
131
+ if(buf) free(buf);
132
+ if(compBuf && compressed) free(compBuf);
133
+ if(v) free(v);
134
+ destroyVals_t(vals);
135
+ return NULL;
136
+ }
137
+
138
+ //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally)
139
+ static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
140
+ uint32_t i, j;
141
+ double output = 0.0, coverage = 0.0;
142
+ struct vals_t *v = NULL;
143
+
144
+ if(!blocks->n) return strtod("NaN", NULL);
145
+
146
+ //Iterate over the blocks
147
+ for(i=0; i<blocks->n; i++) {
148
+ v = getVals(fp, blocks, i, tid, start, end);
149
+ if(!v) goto error;
150
+ for(j=0; j<v->n; j++) {
151
+ output += v->vals[j]->sum * v->vals[j]->scalar;
152
+ coverage += v->vals[j]->nBases * v->vals[j]->scalar;
153
+ }
154
+ destroyVals_t(v);
155
+ }
156
+
157
+
158
+ if(!coverage) return strtod("NaN", NULL);
159
+
160
+ return output/coverage;
161
+
162
+ error:
163
+ if(v) free(v);
164
+ errno = ENOMEM;
165
+ return strtod("NaN", NULL);
166
+ }
167
+
168
+ static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
169
+ double sum = 0.0;
170
+ uint32_t nBases = 0, i, start_use, end_use;
171
+
172
+ if(!ints->l) return strtod("NaN", NULL);
173
+
174
+ for(i=0; i<ints->l; i++) {
175
+ start_use = ints->start[i];
176
+ end_use = ints->end[i];
177
+ if(ints->start[i] < start) start_use = start;
178
+ if(ints->end[i] > end) end_use = end;
179
+ nBases += end_use-start_use;
180
+ sum += (end_use-start_use)*((double) ints->value[i]);
181
+ }
182
+
183
+ return sum/nBases;
184
+ }
185
+
186
+ //Does UCSC compensate for partial block/range overlap?
187
+ static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
188
+ uint32_t i, j;
189
+ double mean = 0.0, ssq = 0.0, coverage = 0.0, diff;
190
+ struct vals_t *v = NULL;
191
+
192
+ if(!blocks->n) return strtod("NaN", NULL);
193
+
194
+ //Iterate over the blocks
195
+ for(i=0; i<blocks->n; i++) {
196
+ v = getVals(fp, blocks, i, tid, start, end);
197
+ if(!v) goto error;
198
+ for(j=0; j<v->n; j++) {
199
+ coverage += v->vals[j]->nBases * v->vals[j]->scalar;
200
+ mean += v->vals[j]->sum * v->vals[j]->scalar;
201
+ ssq += v->vals[j]->sumsq * v->vals[j]->scalar;
202
+ }
203
+ destroyVals_t(v);
204
+ v = NULL;
205
+ }
206
+
207
+ if(coverage<=1.0) return strtod("NaN", NULL);
208
+ diff = ssq-mean*mean/coverage;
209
+ if(coverage > 1.0) diff /= coverage-1;
210
+ if(fabs(diff) > 1e-8) { //Ignore floating point differences
211
+ return sqrt(diff);
212
+ } else {
213
+ return 0.0;
214
+ }
215
+
216
+ error:
217
+ if(v) destroyVals_t(v);
218
+ errno = ENOMEM;
219
+ return strtod("NaN", NULL);
220
+ }
221
+
222
+ //This uses compensated summation to account for finite precision math
223
+ static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
224
+ double v1 = 0.0, mean, rv;
225
+ uint32_t nBases = 0, i, start_use, end_use;
226
+
227
+ if(!ints->l) return strtod("NaN", NULL);
228
+ mean = intMean(ints, start, end);
229
+
230
+ for(i=0; i<ints->l; i++) {
231
+ start_use = ints->start[i];
232
+ end_use = ints->end[i];
233
+ if(ints->start[i] < start) start_use = start;
234
+ if(ints->end[i] > end) end_use = end;
235
+ nBases += end_use-start_use;
236
+ v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference
237
+ }
238
+
239
+ if(nBases>=2) rv = sqrt(v1/(nBases-1));
240
+ else if(nBases==1) rv = sqrt(v1);
241
+ else rv = strtod("NaN", NULL);
242
+
243
+ return rv;
244
+ }
245
+
246
+ static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
247
+ uint32_t i, j, isNA = 1;
248
+ double o = strtod("NaN", NULL);
249
+ struct vals_t *v = NULL;
250
+
251
+ if(!blocks->n) return o;
252
+
253
+ //Iterate the blocks
254
+ for(i=0; i<blocks->n; i++) {
255
+ v = getVals(fp, blocks, i, tid, start, end);
256
+ if(!v) goto error;
257
+ for(j=0; j<v->n; j++) {
258
+ if(isNA) {
259
+ o = v->vals[j]->max;
260
+ isNA = 0;
261
+ } else if(v->vals[j]->max > o) {
262
+ o = v->vals[j]->max;
263
+ }
264
+ }
265
+ destroyVals_t(v);
266
+ }
267
+
268
+ return o;
269
+
270
+ error:
271
+ destroyVals_t(v);
272
+ errno = ENOMEM;
273
+ return strtod("NaN", NULL);
274
+ }
275
+
276
+ static double intMax(bwOverlappingIntervals_t* ints) {
277
+ uint32_t i;
278
+ double o;
279
+
280
+ if(ints->l < 1) return strtod("NaN", NULL);
281
+
282
+ o = ints->value[0];
283
+ for(i=1; i<ints->l; i++) {
284
+ if(ints->value[i] > o) o = ints->value[i];
285
+ }
286
+
287
+ return o;
288
+ }
289
+
290
+ static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
291
+ uint32_t i, j, isNA = 1;
292
+ double o = strtod("NaN", NULL);
293
+ struct vals_t *v = NULL;
294
+
295
+ if(!blocks->n) return o;
296
+
297
+ //Iterate the blocks
298
+ for(i=0; i<blocks->n; i++) {
299
+ v = getVals(fp, blocks, i, tid, start, end);
300
+ if(!v) goto error;
301
+ for(j=0; j<v->n; j++) {
302
+ if(isNA) {
303
+ o = v->vals[j]->min;
304
+ isNA = 0;
305
+ } else if(v->vals[j]->min < o) o = v->vals[j]->min;
306
+ }
307
+ destroyVals_t(v);
308
+ }
309
+
310
+ return o;
311
+
312
+ error:
313
+ destroyVals_t(v);
314
+ errno = ENOMEM;
315
+ return strtod("NaN", NULL);
316
+ }
317
+
318
+ static double intMin(bwOverlappingIntervals_t* ints) {
319
+ uint32_t i;
320
+ double o;
321
+
322
+ if(ints->l < 1) return strtod("NaN", NULL);
323
+
324
+ o = ints->value[0];
325
+ for(i=1; i<ints->l; i++) {
326
+ if(ints->value[i] < o) o = ints->value[i];
327
+ }
328
+
329
+ return o;
330
+ }
331
+
332
+ //Does UCSC compensate for only partial block/interval overlap?
333
+ static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
334
+ uint32_t i, j;
335
+ double o = 0.0;
336
+ struct vals_t *v = NULL;
337
+
338
+ if(!blocks->n) return strtod("NaN", NULL);
339
+
340
+ //Iterate over the blocks
341
+ for(i=0; i<blocks->n; i++) {
342
+ v = getVals(fp, blocks, i, tid, start, end);
343
+ if(!v) goto error;
344
+ for(j=0; j<v->n; j++) {
345
+ o+= v->vals[j]->nBases * v->vals[j]->scalar;
346
+ }
347
+ destroyVals_t(v);
348
+ }
349
+
350
+ if(o == 0.0) return strtod("NaN", NULL);
351
+ return o;
352
+
353
+ error:
354
+ destroyVals_t(v);
355
+ errno = ENOMEM;
356
+ return strtod("NaN", NULL);
357
+ }
358
+
359
+ static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
360
+ uint32_t i, start_use, end_use;
361
+ double o = 0.0;
362
+
363
+ if(!ints->l) return strtod("NaN", NULL);
364
+
365
+ for(i=0; i<ints->l; i++) {
366
+ start_use = ints->start[i];
367
+ end_use = ints->end[i];
368
+ if(start_use < start) start_use = start;
369
+ if(end_use > end) end_use = end;
370
+ o += end_use - start_use;
371
+ }
372
+
373
+ return o/(end-start);
374
+ }
375
+
376
+ static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
377
+ uint32_t i, j, sizeUse;
378
+ double o = 0.0;
379
+ struct vals_t *v = NULL;
380
+
381
+ if(!blocks->n) return strtod("NaN", NULL);
382
+
383
+ //Iterate over the blocks
384
+ for(i=0; i<blocks->n; i++) {
385
+ v = getVals(fp, blocks, i, tid, start, end);
386
+ if(!v) goto error;
387
+ for(j=0; j<v->n; j++) {
388
+ //Multiply the block average by min(bases covered, block overlap with interval)
389
+ sizeUse = v->vals[j]->scalar;
390
+ if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases;
391
+ o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases;
392
+ }
393
+ destroyVals_t(v);
394
+ }
395
+
396
+ if(o == 0.0) return strtod("NaN", NULL);
397
+ return o;
398
+
399
+ error:
400
+ destroyVals_t(v);
401
+ errno = ENOMEM;
402
+ return strtod("NaN", NULL);
403
+ }
404
+
405
+ static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
406
+ uint32_t i, start_use, end_use;
407
+ double o = 0.0;
408
+
409
+ if(!ints->l) return strtod("NaN", NULL);
410
+
411
+ for(i=0; i<ints->l; i++) {
412
+ start_use = ints->start[i];
413
+ end_use = ints->end[i];
414
+ if(start_use < start) start_use = start;
415
+ if(end_use > end) end_use = end;
416
+ o += (end_use - start_use) * ints->value[i];
417
+ }
418
+
419
+ return o;
420
+ }
421
+
422
+ //Returns NULL on error, otherwise a double* that needs to be free()d
423
+ static double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
424
+ bwOverlapBlock_t *blocks = NULL;
425
+ double *output = NULL;
426
+ uint32_t pos = start, i, end2;
427
+
428
+ if(!fp->hdr->zoomHdrs->idx[level]) {
429
+ fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]);
430
+ if(!fp->hdr->zoomHdrs->idx[level]) return NULL;
431
+ }
432
+ errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors
433
+
434
+ output = malloc(sizeof(double)*nBins);
435
+ if(!output) return NULL;
436
+
437
+ for(i=0, pos=start; i<nBins; i++) {
438
+ end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
439
+ blocks = walkRTreeNodes(fp, fp->hdr->zoomHdrs->idx[level]->root, tid, pos, end2);
440
+ if(!blocks) goto error;
441
+
442
+ switch(type) {
443
+ case 0:
444
+ //mean
445
+ output[i] = blockMean(fp, blocks, tid, pos, end2);
446
+ break;
447
+ case 1:
448
+ //stdev
449
+ output[i] = blockDev(fp, blocks, tid, pos, end2);
450
+ break;
451
+ case 2:
452
+ //max
453
+ output[i] = blockMax(fp, blocks, tid, pos, end2);
454
+ break;
455
+ case 3:
456
+ //min
457
+ output[i] = blockMin(fp, blocks, tid, pos, end2);
458
+ break;
459
+ case 4:
460
+ //cov
461
+ output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos);
462
+ break;
463
+ case 5:
464
+ //sum
465
+ output[i] = blockSum(fp, blocks, tid, pos, end2);
466
+ break;
467
+ default:
468
+ goto error;
469
+ break;
470
+ }
471
+ if(errno) goto error;
472
+ destroyBWOverlapBlock(blocks);
473
+ pos = end2;
474
+ }
475
+
476
+ return output;
477
+
478
+ error:
479
+ fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno));
480
+ if(blocks) destroyBWOverlapBlock(blocks);
481
+ if(output) free(output);
482
+ return NULL;
483
+ }
484
+
485
+ double *bwStatsFromFull(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
486
+ bwOverlappingIntervals_t *ints = NULL;
487
+ double *output = malloc(sizeof(double)*nBins);
488
+ uint32_t i, pos = start, end2;
489
+ if(!output) return NULL;
490
+
491
+ for(i=0; i<nBins; i++) {
492
+ end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
493
+ ints = bwGetOverlappingIntervals(fp, chrom, pos, end2);
494
+
495
+ if(!ints) {
496
+ output[i] = strtod("NaN", NULL);
497
+ continue;
498
+ }
499
+
500
+ switch(type) {
501
+ default :
502
+ case 0:
503
+ output[i] = intMean(ints, pos, end2);
504
+ break;
505
+ case 1:
506
+ output[i] = intDev(ints, pos, end2);
507
+ break;
508
+ case 2:
509
+ output[i] = intMax(ints);
510
+ break;
511
+ case 3:
512
+ output[i] = intMin(ints);
513
+ break;
514
+ case 4:
515
+ output[i] = intCoverage(ints, pos, end2);
516
+ break;
517
+ case 5:
518
+ output[i] = intSum(ints, pos, end2);
519
+ break;
520
+ }
521
+ bwDestroyOverlappingIntervals(ints);
522
+ pos = end2;
523
+ }
524
+
525
+ return output;
526
+ }
527
+
528
+ //Returns a list of floats of length nBins that must be free()d
529
+ //On error, NULL is returned
530
+ double *bwStats(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
531
+ int32_t level = determineZoomLevel(fp, ((double)(end-start))/((int) nBins));
532
+ uint32_t tid = bwGetTid(fp, chrom);
533
+ if(tid == (uint32_t) -1) return NULL;
534
+
535
+ if(level == -1) return bwStatsFromFull(fp, chrom, start, end, nBins, type);
536
+ return bwStatsFromZoom(fp, level, tid, start, end, nBins, type);
537
+ }