bio-bigwig 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,537 +0,0 @@
1
- #include "bigWig.h"
2
- #include "bwCommon.h"
3
- #include <errno.h>
4
- #include <stdlib.h>
5
- #include <zlib.h>
6
- #include <math.h>
7
- #include <string.h>
8
-
9
- //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level.
10
- //Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins
11
- static int32_t determineZoomLevel(bigWigFile_t *fp, int basesPerBin) {
12
- int32_t out = -1;
13
- int64_t diff;
14
- uint32_t bestDiff = -1;
15
- uint16_t i;
16
-
17
- basesPerBin/=2;
18
- for(i=0; i<fp->hdr->nLevels; i++) {
19
- diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i];
20
- if(diff >= 0 && diff < bestDiff) {
21
- bestDiff = diff;
22
- out = i;
23
- }
24
- }
25
- return out;
26
- }
27
-
28
- /// @cond SKIP
29
//Summary statistics decoded from a single zoom record, plus its weight for the current query
struct val_t {
    uint32_t nBases; //4th word of the record (used as the number of bases with data)
    float min;       //Smallest value in the record
    float max;       //Largest value in the record
    float sum;       //Sum of the values
    float sumsq;     //Sum of the squared values
    double scalar;   //Weight assigned by getScalar() for the queried range
};
34
-
35
//A growable list of heap-allocated val_t records
struct vals_t {
    uint32_t n;           //Number of entries in vals
    struct val_t **vals;  //Array of n pointers, each owning one record
};
39
- /// @endcond
40
-
41
- void destroyVals_t(struct vals_t *v) {
42
- uint32_t i;
43
- if(!v) return;
44
- for(i=0; i<v->n; i++) free(v->vals[i]);
45
- if(v->vals) free(v->vals);
46
- free(v);
47
- }
48
-
49
//Determine the fraction of a block that overlaps an interval.
//  i_start, i_end: the queried interval, half-open [i_start, i_end)
//  b_start, b_end: the block, half-open [b_start, b_end)
//Returns (bases shared by block and interval)/(block length), a value in [0, 1].
//0.0 is returned when the two do not overlap or when the block is empty or inverted.
double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) {
    //The overlap runs from the larger of the two starts to the smaller of the two ends.
    //Clamping to i_end matters when the block extends past the end of the interval.
    uint32_t lo = (b_start > i_start) ? b_start : i_start;
    uint32_t hi = (b_end < i_end) ? b_end : i_end;

    if(b_end <= b_start) return 0.0; //Avoid dividing by a zero (or wrapped) block length
    if(hi <= lo) return 0.0;         //No shared bases

    return ((double)(hi - lo))/(b_end - b_start);
}
61
-
62
- //Returns NULL on error
63
- static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) {
64
- void *buf = NULL, *compBuf = NULL;
65
- uLongf sz = fp->hdr->bufSize;
66
- int compressed = 0, rv;
67
- uint32_t *p, vtid, vstart, vend;
68
- struct vals_t *vals = NULL;
69
- struct val_t *v = NULL;
70
-
71
- if(sz) {
72
- compressed = 1;
73
- buf = malloc(sz);
74
- }
75
- sz = 0; //This is now the size of the compressed buffer
76
-
77
- if(bwSetPos(fp, o->offset[i])) goto error;
78
-
79
- vals = calloc(1,sizeof(struct vals_t));
80
- if(!vals) goto error;
81
-
82
- v = malloc(sizeof(struct val_t));
83
- if(!v) goto error;
84
-
85
- if(sz < o->size[i]) compBuf = malloc(o->size[i]);
86
- if(!compBuf) goto error;
87
-
88
- if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error;
89
- if(compressed) {
90
- sz = fp->hdr->bufSize;
91
- rv = uncompress(buf, &sz, compBuf, o->size[i]);
92
- if(rv != Z_OK) goto error;
93
- } else {
94
- buf = compBuf;
95
- sz = o->size[i];
96
- }
97
-
98
- p = buf;
99
- while(((uLongf) ((void*)p-buf)) < sz) {
100
- vtid = p[0];
101
- vstart = p[1];
102
- vend = p[2];
103
- v->nBases = p[3];
104
- v->min = ((float*) p)[4];
105
- v->max = ((float*) p)[5];
106
- v->sum = ((float*) p)[6];
107
- v->sumsq = ((float*) p)[7];
108
- v->scalar = getScalar(start, end, vstart, vend);
109
-
110
- if(tid == vtid) {
111
- if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) {
112
- vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1));
113
- if(!vals->vals) goto error;
114
- vals->vals[vals->n++] = v;
115
- v = malloc(sizeof(struct val_t));
116
- if(!v) goto error;
117
- }
118
- if(vstart > end) break;
119
- } else if(vtid > tid) {
120
- break;
121
- }
122
- p+=8;
123
- }
124
-
125
- free(v);
126
- free(buf);
127
- if(compressed) free(compBuf);
128
- return vals;
129
-
130
- error:
131
- if(buf) free(buf);
132
- if(compBuf && compressed) free(compBuf);
133
- if(v) free(v);
134
- destroyVals_t(vals);
135
- return NULL;
136
- }
137
-
138
- //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally)
139
- static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
140
- uint32_t i, j;
141
- double output = 0.0, coverage = 0.0;
142
- struct vals_t *v = NULL;
143
-
144
- if(!blocks->n) return strtod("NaN", NULL);
145
-
146
- //Iterate over the blocks
147
- for(i=0; i<blocks->n; i++) {
148
- v = getVals(fp, blocks, i, tid, start, end);
149
- if(!v) goto error;
150
- for(j=0; j<v->n; j++) {
151
- output += v->vals[j]->sum * v->vals[j]->scalar;
152
- coverage += v->vals[j]->nBases * v->vals[j]->scalar;
153
- }
154
- destroyVals_t(v);
155
- }
156
-
157
-
158
- if(!coverage) return strtod("NaN", NULL);
159
-
160
- return output/coverage;
161
-
162
- error:
163
- if(v) free(v);
164
- errno = ENOMEM;
165
- return strtod("NaN", NULL);
166
- }
167
-
168
- static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
169
- double sum = 0.0;
170
- uint32_t nBases = 0, i, start_use, end_use;
171
-
172
- if(!ints->l) return strtod("NaN", NULL);
173
-
174
- for(i=0; i<ints->l; i++) {
175
- start_use = ints->start[i];
176
- end_use = ints->end[i];
177
- if(ints->start[i] < start) start_use = start;
178
- if(ints->end[i] > end) end_use = end;
179
- nBases += end_use-start_use;
180
- sum += (end_use-start_use)*((double) ints->value[i]);
181
- }
182
-
183
- return sum/nBases;
184
- }
185
-
186
- //Does UCSC compensate for partial block/range overlap?
187
- static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
188
- uint32_t i, j;
189
- double mean = 0.0, ssq = 0.0, coverage = 0.0, diff;
190
- struct vals_t *v = NULL;
191
-
192
- if(!blocks->n) return strtod("NaN", NULL);
193
-
194
- //Iterate over the blocks
195
- for(i=0; i<blocks->n; i++) {
196
- v = getVals(fp, blocks, i, tid, start, end);
197
- if(!v) goto error;
198
- for(j=0; j<v->n; j++) {
199
- coverage += v->vals[j]->nBases * v->vals[j]->scalar;
200
- mean += v->vals[j]->sum * v->vals[j]->scalar;
201
- ssq += v->vals[j]->sumsq * v->vals[j]->scalar;
202
- }
203
- destroyVals_t(v);
204
- v = NULL;
205
- }
206
-
207
- if(coverage<=1.0) return strtod("NaN", NULL);
208
- diff = ssq-mean*mean/coverage;
209
- if(coverage > 1.0) diff /= coverage-1;
210
- if(fabs(diff) > 1e-8) { //Ignore floating point differences
211
- return sqrt(diff);
212
- } else {
213
- return 0.0;
214
- }
215
-
216
- error:
217
- if(v) destroyVals_t(v);
218
- errno = ENOMEM;
219
- return strtod("NaN", NULL);
220
- }
221
-
222
- //This uses compensated summation to account for finite precision math
223
- static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
224
- double v1 = 0.0, mean, rv;
225
- uint32_t nBases = 0, i, start_use, end_use;
226
-
227
- if(!ints->l) return strtod("NaN", NULL);
228
- mean = intMean(ints, start, end);
229
-
230
- for(i=0; i<ints->l; i++) {
231
- start_use = ints->start[i];
232
- end_use = ints->end[i];
233
- if(ints->start[i] < start) start_use = start;
234
- if(ints->end[i] > end) end_use = end;
235
- nBases += end_use-start_use;
236
- v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference
237
- }
238
-
239
- if(nBases>=2) rv = sqrt(v1/(nBases-1));
240
- else if(nBases==1) rv = sqrt(v1);
241
- else rv = strtod("NaN", NULL);
242
-
243
- return rv;
244
- }
245
-
246
- static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
247
- uint32_t i, j, isNA = 1;
248
- double o = strtod("NaN", NULL);
249
- struct vals_t *v = NULL;
250
-
251
- if(!blocks->n) return o;
252
-
253
- //Iterate the blocks
254
- for(i=0; i<blocks->n; i++) {
255
- v = getVals(fp, blocks, i, tid, start, end);
256
- if(!v) goto error;
257
- for(j=0; j<v->n; j++) {
258
- if(isNA) {
259
- o = v->vals[j]->max;
260
- isNA = 0;
261
- } else if(v->vals[j]->max > o) {
262
- o = v->vals[j]->max;
263
- }
264
- }
265
- destroyVals_t(v);
266
- }
267
-
268
- return o;
269
-
270
- error:
271
- destroyVals_t(v);
272
- errno = ENOMEM;
273
- return strtod("NaN", NULL);
274
- }
275
-
276
- static double intMax(bwOverlappingIntervals_t* ints) {
277
- uint32_t i;
278
- double o;
279
-
280
- if(ints->l < 1) return strtod("NaN", NULL);
281
-
282
- o = ints->value[0];
283
- for(i=1; i<ints->l; i++) {
284
- if(ints->value[i] > o) o = ints->value[i];
285
- }
286
-
287
- return o;
288
- }
289
-
290
- static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
291
- uint32_t i, j, isNA = 1;
292
- double o = strtod("NaN", NULL);
293
- struct vals_t *v = NULL;
294
-
295
- if(!blocks->n) return o;
296
-
297
- //Iterate the blocks
298
- for(i=0; i<blocks->n; i++) {
299
- v = getVals(fp, blocks, i, tid, start, end);
300
- if(!v) goto error;
301
- for(j=0; j<v->n; j++) {
302
- if(isNA) {
303
- o = v->vals[j]->min;
304
- isNA = 0;
305
- } else if(v->vals[j]->min < o) o = v->vals[j]->min;
306
- }
307
- destroyVals_t(v);
308
- }
309
-
310
- return o;
311
-
312
- error:
313
- destroyVals_t(v);
314
- errno = ENOMEM;
315
- return strtod("NaN", NULL);
316
- }
317
-
318
- static double intMin(bwOverlappingIntervals_t* ints) {
319
- uint32_t i;
320
- double o;
321
-
322
- if(ints->l < 1) return strtod("NaN", NULL);
323
-
324
- o = ints->value[0];
325
- for(i=1; i<ints->l; i++) {
326
- if(ints->value[i] < o) o = ints->value[i];
327
- }
328
-
329
- return o;
330
- }
331
-
332
- //Does UCSC compensate for only partial block/interval overlap?
333
- static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
334
- uint32_t i, j;
335
- double o = 0.0;
336
- struct vals_t *v = NULL;
337
-
338
- if(!blocks->n) return strtod("NaN", NULL);
339
-
340
- //Iterate over the blocks
341
- for(i=0; i<blocks->n; i++) {
342
- v = getVals(fp, blocks, i, tid, start, end);
343
- if(!v) goto error;
344
- for(j=0; j<v->n; j++) {
345
- o+= v->vals[j]->nBases * v->vals[j]->scalar;
346
- }
347
- destroyVals_t(v);
348
- }
349
-
350
- if(o == 0.0) return strtod("NaN", NULL);
351
- return o;
352
-
353
- error:
354
- destroyVals_t(v);
355
- errno = ENOMEM;
356
- return strtod("NaN", NULL);
357
- }
358
-
359
- static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
360
- uint32_t i, start_use, end_use;
361
- double o = 0.0;
362
-
363
- if(!ints->l) return strtod("NaN", NULL);
364
-
365
- for(i=0; i<ints->l; i++) {
366
- start_use = ints->start[i];
367
- end_use = ints->end[i];
368
- if(start_use < start) start_use = start;
369
- if(end_use > end) end_use = end;
370
- o += end_use - start_use;
371
- }
372
-
373
- return o/(end-start);
374
- }
375
-
376
- static double blockSum(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) {
377
- uint32_t i, j, sizeUse;
378
- double o = 0.0;
379
- struct vals_t *v = NULL;
380
-
381
- if(!blocks->n) return strtod("NaN", NULL);
382
-
383
- //Iterate over the blocks
384
- for(i=0; i<blocks->n; i++) {
385
- v = getVals(fp, blocks, i, tid, start, end);
386
- if(!v) goto error;
387
- for(j=0; j<v->n; j++) {
388
- //Multiply the block average by min(bases covered, block overlap with interval)
389
- sizeUse = v->vals[j]->scalar;
390
- if(sizeUse > v->vals[j]->nBases) sizeUse = v->vals[j]->nBases;
391
- o+= (v->vals[j]->sum * sizeUse) / v->vals[j]->nBases;
392
- }
393
- destroyVals_t(v);
394
- }
395
-
396
- if(o == 0.0) return strtod("NaN", NULL);
397
- return o;
398
-
399
- error:
400
- destroyVals_t(v);
401
- errno = ENOMEM;
402
- return strtod("NaN", NULL);
403
- }
404
-
405
- static double intSum(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) {
406
- uint32_t i, start_use, end_use;
407
- double o = 0.0;
408
-
409
- if(!ints->l) return strtod("NaN", NULL);
410
-
411
- for(i=0; i<ints->l; i++) {
412
- start_use = ints->start[i];
413
- end_use = ints->end[i];
414
- if(start_use < start) start_use = start;
415
- if(end_use > end) end_use = end;
416
- o += (end_use - start_use) * ints->value[i];
417
- }
418
-
419
- return o;
420
- }
421
-
422
- //Returns NULL on error, otherwise a double* that needs to be free()d
423
- double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
424
- bwOverlapBlock_t *blocks = NULL;
425
- double *output = NULL;
426
- uint32_t pos = start, i, end2;
427
-
428
- if(!fp->hdr->zoomHdrs->idx[level]) {
429
- fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]);
430
- if(!fp->hdr->zoomHdrs->idx[level]) return NULL;
431
- }
432
- errno = 0; //Sometimes libCurls sets and then doesn't unset errno on errors
433
-
434
- output = malloc(sizeof(double)*nBins);
435
- if(!output) return NULL;
436
-
437
- for(i=0, pos=start; i<nBins; i++) {
438
- end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
439
- blocks = walkRTreeNodes(fp, fp->hdr->zoomHdrs->idx[level]->root, tid, pos, end2);
440
- if(!blocks) goto error;
441
-
442
- switch(type) {
443
- case 0:
444
- //mean
445
- output[i] = blockMean(fp, blocks, tid, pos, end2);
446
- break;
447
- case 1:
448
- //stdev
449
- output[i] = blockDev(fp, blocks, tid, pos, end2);
450
- break;
451
- case 2:
452
- //max
453
- output[i] = blockMax(fp, blocks, tid, pos, end2);
454
- break;
455
- case 3:
456
- //min
457
- output[i] = blockMin(fp, blocks, tid, pos, end2);
458
- break;
459
- case 4:
460
- //cov
461
- output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos);
462
- break;
463
- case 5:
464
- //sum
465
- output[i] = blockSum(fp, blocks, tid, pos, end2);
466
- break;
467
- default:
468
- goto error;
469
- break;
470
- }
471
- if(errno) goto error;
472
- destroyBWOverlapBlock(blocks);
473
- pos = end2;
474
- }
475
-
476
- return output;
477
-
478
- error:
479
- fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno));
480
- if(blocks) destroyBWOverlapBlock(blocks);
481
- if(output) free(output);
482
- return NULL;
483
- }
484
-
485
- double *bwStatsFromFull(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
486
- bwOverlappingIntervals_t *ints = NULL;
487
- double *output = malloc(sizeof(double)*nBins);
488
- uint32_t i, pos = start, end2;
489
- if(!output) return NULL;
490
-
491
- for(i=0; i<nBins; i++) {
492
- end2 = start + ((double)(end-start)*(i+1))/((int) nBins);
493
- ints = bwGetOverlappingIntervals(fp, chrom, pos, end2);
494
-
495
- if(!ints) {
496
- output[i] = strtod("NaN", NULL);
497
- continue;
498
- }
499
-
500
- switch(type) {
501
- default :
502
- case 0:
503
- output[i] = intMean(ints, pos, end2);
504
- break;
505
- case 1:
506
- output[i] = intDev(ints, pos, end2);
507
- break;
508
- case 2:
509
- output[i] = intMax(ints);
510
- break;
511
- case 3:
512
- output[i] = intMin(ints);
513
- break;
514
- case 4:
515
- output[i] = intCoverage(ints, pos, end2);
516
- break;
517
- case 5:
518
- output[i] = intSum(ints, pos, end2);
519
- break;
520
- }
521
- bwDestroyOverlappingIntervals(ints);
522
- pos = end2;
523
- }
524
-
525
- return output;
526
- }
527
-
528
- //Returns a list of floats of length nBins that must be free()d
529
- //On error, NULL is returned
530
- double *bwStats(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) {
531
- int32_t level = determineZoomLevel(fp, ((double)(end-start))/((int) nBins));
532
- uint32_t tid = bwGetTid(fp, chrom);
533
- if(tid == (uint32_t) -1) return NULL;
534
-
535
- if(level == -1) return bwStatsFromFull(fp, chrom, start, end, nBins, type);
536
- return bwStatsFromZoom(fp, level, tid, start, end, nBins, type);
537
- }