bio-bigwig 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,537 @@
1
+ #include "bigWig.h"
2
+ #include "bwCommon.h"
3
+ #include <errno.h>
4
+ #include <stdlib.h>
5
+ #include <zlib.h>
6
+ #include <math.h>
7
+ #include <string.h>
8
+
9
+ //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level.
10
+ //Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins
11
+ static int32_t determineZoomLevel(const bigWigFile_t *fp, int basesPerBin) {
12
+ int32_t out = -1;
13
+ int64_t diff;
14
+ uint32_t bestDiff = -1;
15
+ uint16_t i;
16
+
17
+ basesPerBin/=2;
18
+ for(i=0; i<fp->hdr->nLevels; i++) {
19
+ diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i];
20
+ if(diff >= 0 && diff < bestDiff) {
21
+ bestDiff = diff;
22
+ out = i;
23
+ }
24
+ }
25
+ return out;
26
+ }
27
+
28
/// @cond SKIP
//One summary record as filled in by getVals()
struct val_t {
    uint32_t nBases;            //Base count stored in the record
    float min, max, sum, sumsq; //Summary statistics stored in the record
    double scalar;              //Fraction of the record overlapping the query range (see getScalar)
};

//A growable list of heap-allocated records
struct vals_t {
    uint32_t n;          //Number of entries in vals
    struct val_t **vals; //Array of n owned pointers, or NULL
};
/// @endcond

//Free a vals_t, every record it owns and the pointer array itself.
//A NULL v is ignored. A container whose array is NULL is freed without
//touching the (non-existent) entries, whatever v->n says; getVals can leave
//such a container behind when growing the array fails.
void destroyVals_t(struct vals_t *v) {
    uint32_t i;
    if(!v) return;
    if(v->vals) {
        for(i=0; i<v->n; i++) free(v->vals[i]);
        free(v->vals);
    }
    free(v);
}
48
+
49
//Determine the fraction of a block that overlaps an interval.
//  i_start, i_end: the half-open query interval
//  b_start, b_end: the half-open block
//Returns (bases shared by block and interval) / (block length), a value in
//[0, 1]. Returns 0.0 when the two do not overlap or when the block is empty
//or reversed (b_end <= b_start), so the division can never be 0/0.
double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) {
    //The overlap runs from the later of the two starts to the earlier of the two ends
    uint32_t lo = (b_start > i_start) ? b_start : i_start;
    uint32_t hi = (b_end < i_end) ? b_end : i_end;

    if(b_end <= b_start || hi <= lo) return 0.0;

    return ((double)(hi - lo))/(b_end - b_start);
}
61
+
62
+ //Returns NULL on error
63
+ static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) {
64
+ void *buf = NULL, *compBuf = NULL;
65
+ uLongf sz = fp->hdr->bufSize;
66
+ int compressed = 0, rv;
67
+ uint32_t *p, vtid, vstart, vend;
68
+ struct vals_t *vals = NULL;
69
+ struct val_t *v = NULL;
70
+
71
+ if(sz) {
72
+ compressed = 1;
73
+ buf = malloc(sz);
74
+ }
75
+ sz = 0; //This is now the size of the compressed buffer
76
+
77
+ if(bwSetPos(fp, o->offset[i])) goto error;
78
+
79
+ vals = calloc(1,sizeof(struct vals_t));
80
+ if(!vals) goto error;
81
+
82
+ v = malloc(sizeof(struct val_t));
83
+ if(!v) goto error;
84
+
85
+ if(sz < o->size[i]) compBuf = malloc(o->size[i]);
86
+ if(!compBuf) goto error;
87
+
88
+ if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
89
+ if(compressed) {
90
+ sz = fp->hdr->bufSize;
91
+ rv = uncompress(buf, &sz, compBuf, o->size[i]);
92
+ if(rv != Z_OK) goto error;
93
+ } else {
94
+ buf = compBuf;
95
+ sz = o->size[i];
96
+ }
97
+
98
+ p = buf;
99
+ while(((uLongf) ((void*)p-buf)) < sz) {
100
+ vtid = p[0];
101
+ vstart = p[1];
102
+ vend = p[2];
103
+ v->nBases = p[3];
104
+ v->min = ((float*) p)[4];
105
+ v->max = ((float*) p)[5];
106
+ v->sum = ((float*) p)[6];
107
+ v->sumsq = ((float*) p)[7];
108
+ v->scalar = getScalar(start, end, vstart, vend);
109
+
110
+ if(tid == vtid) {
111
+ if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) {
112
+ vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1));
113
+ if(!vals->vals) goto error;
114
+ vals->vals[vals->n++] = v;
115
+ v = malloc(sizeof(struct val_t));
116
+ if(!v) goto error;
117
+ }
118
+ if(vstart > end) break;
119
+ } else if(vtid > tid) {
120
+ break;
121
+ }
122
+ p+=8;
123
+ }
124
+
125
+ free(v);
126
+ free(buf);
127
+ if(compressed) free(compBuf);
128
+ return vals;
129
+
130
+ error:
131
+ if(buf) free(buf);
132
+ if(compBuf && compressed) free(compBuf);
133
+ if(v) free(v);
134
+ destroyVals_t(vals);
135
+ return NULL;
136
+ }
137
+
138
+ //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally)
139
+ static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
140
+ uint32_t i, j;
141
+ double output = 0.0, coverage = 0.0;
142
+ struct vals_t *v = NULL;
143
+
144
+ if(!blocks->n) return strtod("NaN", NULL);
145
+
146
+ //Iterate over the blocks
147
+ for(i=0; i<blocks->n; i++) {
148
+ v = getVals(fp, blocks, i, tid, start, end);
149
+ if(!v) goto error;
150
+ for(j=0; j<v->n; j++) {
151
+ output += v->vals[j]->sum * v->vals[j]->scalar;
152
+ coverage += v->vals[j]->nBases * v->vals[j]->scalar;
153
+ }
154
+ destroyVals_t(v);
155
+ }
156
+
157
+
158
+ if(!coverage) return strtod("NaN", NULL);
159
+
160
+ return output/coverage;
161
+
162
+ error:
163
+ if(v) free(v);
164
+ errno = ENOMEM;
165
+ return strtod("NaN", NULL);
166
+ }
167
+
168
+ static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
169
+ double sum = 0.0;
170
+ uint32_t nBases = 0, i, start_use, end_use;
171
+
172
+ if(!ints->l) return strtod("NaN", NULL);
173
+
174
+ for(i=0; i<ints->l; i++) {
175
+ start_use = ints->start[i];
176
+ end_use = ints->end[i];
177
+ if(ints->start[i] < start) start_use = start;
178
+ if(ints->end[i] > end) end_use = end;
179
+ nBases += end_use-start_use;
180
+ sum += (end_use-start_use)*((double) ints->value[i]);
181
+ }
182
+
183
+ return sum/nBases;
184
+ }
185
+
186
+ //Does UCSC compensate for partial block/range overlap?
187
+ static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
188
+ uint32_t i, j;
189
+ double mean = 0.0, ssq = 0.0, coverage = 0.0, diff;
190
+ struct vals_t *v = NULL;
191
+
192
+ if(!blocks->n) return strtod("NaN", NULL);
193
+
194
+ //Iterate over the blocks
195
+ for(i=0; i<blocks->n; i++) {
196
+ v = getVals(fp, blocks, i, tid, start, end);
197
+ if(!v) goto error;
198
+ for(j=0; j<v->n; j++) {
199
+ coverage += v->vals[j]->nBases * v->vals[j]->scalar;
200
+ mean += v->vals[j]->sum * v->vals[j]->scalar;
201
+ ssq += v->vals[j]->sumsq * v->vals[j]->scalar;
202
+ }
203
+ destroyVals_t(v);
204
+ v = NULL;
205
+ }
206
+
207
+ if(coverage<=1.0) return strtod("NaN", NULL);
208
+ diff = ssq-mean*mean/coverage;
209
+ if(coverage > 1.0) diff /= coverage-1;
210
+ if(fabs(diff) > 1e-8) { //Ignore floating point differences
211
+ return sqrt(diff);
212
+ } else {
213
+ return 0.0;
214
+ }
215
+
216
+ error:
217
+ if(v) destroyVals_t(v);
218
+ errno = ENOMEM;
219
+ return strtod("NaN", NULL);
220
+ }
221
+
222
+ //This uses compensated summation to account for finite precision math
223
+ static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
224
+ double v1 = 0.0, mean, rv;
225
+ uint32_t nBases = 0, i, start_use, end_use;
226
+
227
+ if(!ints->l) return strtod("NaN", NULL);
228
+ mean = intMean(ints, start, end);
229
+
230
+ for(i=0; i<ints->l; i++) {
231
+ start_use = ints->start[i];
232
+ end_use = ints->end[i];
233
+ if(ints->start[i] < start) start_use = start;
234
+ if(ints->end[i] > end) end_use = end;
235
+ nBases += end_use-start_use;
236
+ v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference
237
+ }
238
+
239
+ if(nBases>=2) rv = sqrt(v1/(nBases-1));
240
+ else if(nBases==1) rv = sqrt(v1);
241
+ else rv = strtod("NaN", NULL);
242
+
243
+ return rv;
244
+ }
245
+
246
+ static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
247
+ uint32_t i, j, isNA = 1;
248
+ double o = strtod("NaN", NULL);
249
+ struct vals_t *v = NULL;
250
+
251
+ if(!blocks->n) return o;
252
+
253
+ //Iterate the blocks
254
+ for(i=0; i<blocks->n; i++) {
255
+ v = getVals(fp, blocks, i, tid, start, end);
256
+ if(!v) goto error;
257
+ for(j=0; j<v->n; j++) {
258
+ if(isNA) {
259
+ o = v->vals[j]->max;
260
+ isNA = 0;
261
+ } else if(v->vals[j]->max > o) {
262
+ o = v->vals[j]->max;
263
+ }
264
+ }
265
+ destroyVals_t(v);
266
+ }
267
+
268
+ return o;
269
+
270
+ error:
271
+ destroyVals_t(v);
272
+ errno = ENOMEM;
273
+ return strtod("NaN", NULL);
274
+ }
275
+
276
+ static double intMax(bwOverlappingIntervals_t* ints) {
277
+ uint32_t i;
278
+ double o;
279
+
280
+ if(ints->l < 1) return strtod("NaN", NULL);
281
+
282
+ o = ints->value[0];
283
+ for(i=1; i<ints->l; i++) {
284
+ if(ints->value[i] > o) o = ints->value[i];
285
+ }
286
+
287
+ return o;
288
+ }
289
+
290
+ static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
291
+ uint32_t i, j, isNA = 1;
292
+ double o = strtod("NaN", NULL);
293
+ struct vals_t *v = NULL;
294
+
295
+ if(!blocks->n) return o;
296
+
297
+ //Iterate the blocks
298
+ for(i=0; i<blocks->n; i++) {
299
+ v = getVals(fp, blocks, i, tid, start, end);
300
+ if(!v) goto error;
301
+ for(j=0; j<v->n; j++) {
302
+ if(isNA) {
303
+ o = v->vals[j]->min;
304
+ isNA = 0;
305
+ } else if(v->vals[j]->min < o) o = v->vals[j]->min;
306
+ }
307
+ destroyVals_t(v);
308
+ }
309
+
310
+ return o;
311
+
312
+ error:
313
+ destroyVals_t(v);
314
+ errno = ENOMEM;
315
+ return strtod("NaN", NULL);
316
+ }
317
+
318
+ static double intMin(bwOverlappingIntervals_t* ints) {
319
+ uint32_t i;
320
+ double o;
321
+
322
+ if(ints->l < 1) return strtod("NaN", NULL);
323
+
324
+ o = ints->value[0];
325
+ for(i=1; i<ints->l; i++) {
326
+ if(ints->value[i] < o) o = ints->value[i];
327
+ }
328
+
329
+ return o;
330
+ }
331
+
332
+ //Does UCSC compensate for only partial block/interval overlap?
333
+ static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
334
+ uint32_t i, j;
335
+ double o = 0.0;
336
+ struct vals_t *v = NULL;
337
+
338
+ if(!blocks->n) return strtod("NaN", NULL);
339
+
340
+ //Iterate over the blocks
341
+ for(i=0; i<blocks->n; i++) {
342
+ v = getVals(fp, blocks, i, tid, start, end);
343
+ if(!v) goto error;
344
+ for(j=0; j<v->n; j++) {
345
+ o+= v->vals[j]->nBases * v->vals[j]->scalar;
346
+ }
347
+ destroyVals_t(v);
348
+ }
349
+
350
+ if(o == 0.0) return strtod("NaN", NULL);
351
+ return o;
352
+
353
+ error:
354
+ destroyVals_t(v);
355
+ errno = ENOMEM;
356
+ return strtod("NaN", NULL);
357
+ }
358
+
359
+ static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
360
+ uint32_t i, start_use, end_use;
361
+ double o = 0.0;
362
+
363
+ if(!ints->l) return strtod("NaN", NULL);
364
+
365
+ for(i=0; i<ints->l; i++) {
366
+ start_use = ints->start[i];
367
+ end_use = ints->end[i];
368
+ if(start_use < start) start_use = start;
369
+ if(end_use > end) end_use = end;
370
+ o += end_use - start_use;
371
+ }
372
+
373
+ return o/(end-start);
374
+ }
375
+
376
+ static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
377
+ uint32_t i, j, sizeUse;
378
+ double o = 0.0;
379
+ struct vals_t *v = NULL;
380
+
381
+ if(!blocks->n) return strtod("NaN", NULL);
382
+
383
+ //Iterate over the blocks
384
+ for(i=0; i<blocks->n; i++) {
385
+ v = getVals(fp, blocks, i, tid, start, end);
386
+ if(!v) goto error;
387
+ for(j=0; j<v->n; j++) {
388
+ //Multiply the block average by min(bases covered, block overlap with interval)
389
+ sizeUse = v->vals[j]->scalar;
390
+ if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases;
391
+ o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases;
392
+ }
393
+ destroyVals_t(v);
394
+ }
395
+
396
+ if(o == 0.0) return strtod("NaN", NULL);
397
+ return o;
398
+
399
+ error:
400
+ destroyVals_t(v);
401
+ errno = ENOMEM;
402
+ return strtod("NaN", NULL);
403
+ }
404
+
405
+ static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
406
+ uint32_t i, start_use, end_use;
407
+ double o = 0.0;
408
+
409
+ if(!ints->l) return strtod("NaN", NULL);
410
+
411
+ for(i=0; i<ints->l; i++) {
412
+ start_use = ints->start[i];
413
+ end_use = ints->end[i];
414
+ if(start_use < start) start_use = start;
415
+ if(end_use > end) end_use = end;
416
+ o += (end_use - start_use) * ints->value[i];
417
+ }
418
+
419
+ return o;
420
+ }
421
+
422
+ //Returns NULL on error, otherwise a double* that needs to be free()d
423
+ static double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
424
+ bwOverlapBlock_t *blocks = NULL;
425
+ double *output = NULL;
426
+ uint32_t pos = start, i, end2;
427
+
428
+ if(!fp->hdr->zoomHdrs->idx[level]) {
429
+ fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]);
430
+ if(!fp->hdr->zoomHdrs->idx[level]) return NULL;
431
+ }
432
+ errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors
433
+
434
+ output = malloc(sizeof(double)*nBins);
435
+ if(!output) return NULL;
436
+
437
+ for(i=0, pos=start; i<nBins; i++) {
438
+ end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
439
+ blocks = walkRTreeNodes(fp, fp->hdr->zoomHdrs->idx[level]->root, tid, pos, end2);
440
+ if(!blocks) goto error;
441
+
442
+ switch(type) {
443
+ case 0:
444
+ //mean
445
+ output[i] = blockMean(fp, blocks, tid, pos, end2);
446
+ break;
447
+ case 1:
448
+ //stdev
449
+ output[i] = blockDev(fp, blocks, tid, pos, end2);
450
+ break;
451
+ case 2:
452
+ //max
453
+ output[i] = blockMax(fp, blocks, tid, pos, end2);
454
+ break;
455
+ case 3:
456
+ //min
457
+ output[i] = blockMin(fp, blocks, tid, pos, end2);
458
+ break;
459
+ case 4:
460
+ //cov
461
+ output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos);
462
+ break;
463
+ case 5:
464
+ //sum
465
+ output[i] = blockSum(fp, blocks, tid, pos, end2);
466
+ break;
467
+ default:
468
+ goto error;
469
+ break;
470
+ }
471
+ if(errno) goto error;
472
+ destroyBWOverlapBlock(blocks);
473
+ pos = end2;
474
+ }
475
+
476
+ return output;
477
+
478
+ error:
479
+ fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno));
480
+ if(blocks) destroyBWOverlapBlock(blocks);
481
+ if(output) free(output);
482
+ return NULL;
483
+ }
484
+
485
+ double *bwStatsFromFull(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
486
+ bwOverlappingIntervals_t *ints = NULL;
487
+ double *output = malloc(sizeof(double)*nBins);
488
+ uint32_t i, pos = start, end2;
489
+ if(!output) return NULL;
490
+
491
+ for(i=0; i<nBins; i++) {
492
+ end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
493
+ ints = bwGetOverlappingIntervals(fp, chrom, pos, end2);
494
+
495
+ if(!ints) {
496
+ output[i] = strtod("NaN", NULL);
497
+ continue;
498
+ }
499
+
500
+ switch(type) {
501
+ default :
502
+ case 0:
503
+ output[i] = intMean(ints, pos, end2);
504
+ break;
505
+ case 1:
506
+ output[i] = intDev(ints, pos, end2);
507
+ break;
508
+ case 2:
509
+ output[i] = intMax(ints);
510
+ break;
511
+ case 3:
512
+ output[i] = intMin(ints);
513
+ break;
514
+ case 4:
515
+ output[i] = intCoverage(ints, pos, end2);
516
+ break;
517
+ case 5:
518
+ output[i] = intSum(ints, pos, end2);
519
+ break;
520
+ }
521
+ bwDestroyOverlappingIntervals(ints);
522
+ pos = end2;
523
+ }
524
+
525
+ return output;
526
+ }
527
+
528
+ //Returns a list of floats of length nBins that must be free()d
529
+ //On error, NULL is returned
530
+ double *bwStats(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
531
+ int32_t level = determineZoomLevel(fp, ((double)(end-start))/((int) nBins));
532
+ uint32_t tid = bwGetTid(fp, chrom);
533
+ if(tid == (uint32_t) -1) return NULL;
534
+
535
+ if(level == -1) return bwStatsFromFull(fp, chrom, start, end, nBins, type);
536
+ return bwStatsFromZoom(fp, level, tid, start, end, nBins, type);
537
+ }