bio-bigwig 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1333 @@
1
+ #include <limits.h>
2
+ #include <float.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include <math.h>
6
+ #include "bigWig.h"
7
+ #include "bwCommon.h"
8
+
9
+ /// @cond SKIP
10
+ struct val_t {
11
+ uint32_t tid;
12
+ uint32_t start;
13
+ uint32_t nBases;
14
+ float min, max, sum, sumsq;
15
+ double scalar;
16
+ struct val_t *next;
17
+ };
18
+ /// @endcond
19
+
20
+ //Create a chromList_t and attach it to a bigWigFile_t *. Returns NULL on error
21
+ //Note that chroms and lengths are duplicated, so you MUST free the input
22
+ chromList_t *bwCreateChromList(const char* const* chroms, const uint32_t *lengths, int64_t n) {
23
+ int64_t i = 0;
24
+ chromList_t *cl = calloc(1, sizeof(chromList_t));
25
+ if(!cl) return NULL;
26
+
27
+ cl->nKeys = n;
28
+ cl->chrom = malloc(sizeof(char*)*n);
29
+ cl->len = malloc(sizeof(uint32_t)*n);
30
+ if(!cl->chrom) goto error;
31
+ if(!cl->len) goto error;
32
+
33
+ for(i=0; i<n; i++) {
34
+ cl->len[i] = lengths[i];
35
+ cl->chrom[i] = bwStrdup(chroms[i]);
36
+ if(!cl->chrom[i]) goto error;
37
+ }
38
+
39
+ return cl;
40
+
41
+ error:
42
+ if(i) {
43
+ int64_t j;
44
+ for(j=0; j<i; j++) free(cl->chrom[j]);
45
+ }
46
+ if(cl) {
47
+ if(cl->chrom) free(cl->chrom);
48
+ if(cl->len) free(cl->len);
49
+ free(cl);
50
+ }
51
+ return NULL;
52
+ }
53
+
54
+ //If maxZooms == 0, then 0 is used (i.e., there are no zoom levels). If maxZooms < 0 or > 65535 then 10 is used.
55
+ //TODO allow changing bufSize and blockSize
56
+ int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms) {
57
+ if(!fp->isWrite) return 1;
58
+ bigWigHdr_t *hdr = calloc(1, sizeof(bigWigHdr_t));
59
+ if(!hdr) return 2;
60
+
61
+ hdr->version = 4;
62
+ if(maxZooms < 0 || maxZooms > 65535) {
63
+ hdr->nLevels = 10;
64
+ } else {
65
+ hdr->nLevels = maxZooms;
66
+ }
67
+
68
+ hdr->bufSize = 32768; //When the file is finalized this is reset if fp->writeBuffer->compressPsz is 0!
69
+ hdr->minVal = DBL_MAX;
70
+ hdr->maxVal = DBL_MIN;
71
+ fp->hdr = hdr;
72
+ fp->writeBuffer->blockSize = 64;
73
+
74
+ //Allocate the writeBuffer buffers
75
+ fp->writeBuffer->compressPsz = compressBound(hdr->bufSize);
76
+ fp->writeBuffer->compressP = malloc(fp->writeBuffer->compressPsz);
77
+ if(!fp->writeBuffer->compressP) return 3;
78
+ fp->writeBuffer->p = calloc(1,hdr->bufSize);
79
+ if(!fp->writeBuffer->p) return 4;
80
+
81
+ return 0;
82
+ }
83
+
84
+ //return 0 on success
85
+ static int writeAtPos(void *ptr, size_t sz, size_t nmemb, size_t pos, FILE *fp) {
86
+ size_t curpos = ftell(fp);
87
+ if(fseek(fp, pos, SEEK_SET)) return 1;
88
+ if(fwrite(ptr, sz, nmemb, fp) != nmemb) return 2;
89
+ if(fseek(fp, curpos, SEEK_SET)) return 3;
90
+ return 0;
91
+ }
92
+
93
+ //We lose keySize bytes on error
94
+ static int writeChromList(FILE *fp, chromList_t *cl) {
95
+ uint16_t k;
96
+ uint32_t j, magic = CIRTREE_MAGIC;
97
+ uint32_t nperblock = (cl->nKeys > 0x7FFF) ? 0x7FFF : cl->nKeys; //Items per leaf/non-leaf, there are no unsigned ints in java :(
98
+ uint32_t nblocks, keySize = 0, valSize = 8; //In theory valSize could be optimized, in practice that'd be annoying
99
+ uint64_t i, nonLeafEnd, leafSize, nextLeaf;
100
+ uint8_t eight;
101
+ int64_t i64;
102
+ char *chrom;
103
+ size_t l;
104
+
105
+ if(cl->nKeys > 1073676289) {
106
+ fprintf(stderr, "[writeChromList] Error: Currently only 1,073,676,289 contigs are supported. If you really need more then please post a request on github.\n");
107
+ return 1;
108
+ }
109
+ nblocks = cl->nKeys/nperblock;
110
+ nblocks += ((cl->nKeys % nperblock) > 0)?1:0;
111
+
112
+ for(i64=0; i64<cl->nKeys; i64++) {
113
+ l = strlen(cl->chrom[i64]);
114
+ if(l>keySize) keySize = l;
115
+ }
116
+ l--; //We don't null terminate strings, because schiess mich tot
117
+ chrom = calloc(keySize, sizeof(char));
118
+
119
+ //Write the root node of a largely pointless tree
120
+ if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 1;
121
+ if(fwrite(&nperblock, sizeof(uint32_t), 1, fp) != 1) return 2;
122
+ if(fwrite(&keySize, sizeof(uint32_t), 1, fp) != 1) return 3;
123
+ if(fwrite(&valSize, sizeof(uint32_t), 1, fp) != 1) return 4;
124
+ if(fwrite(&(cl->nKeys), sizeof(uint64_t), 1, fp) != 1) return 5;
125
+
126
+ //Padding?
127
+ i=0;
128
+ if(fwrite(&i, sizeof(uint64_t), 1, fp) != 1) return 6;
129
+
130
+ //Do we need a non-leaf node?
131
+ if(nblocks > 1) {
132
+ eight = 0;
133
+ if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 7;
134
+ if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 8; //padding
135
+ if(fwrite(&nblocks, sizeof(uint16_t), 1, fp) != 1) return 8;
136
+ nonLeafEnd = ftell(fp) + nperblock * (keySize + 8);
137
+ leafSize = nperblock * (keySize + 8) + 4;
138
+ for(i=0; i<nblocks; i++) { //Why yes, this is pointless
139
+ chrom = strncpy(chrom, cl->chrom[i * nperblock], keySize);
140
+ nextLeaf = nonLeafEnd + i * leafSize;
141
+ if(fwrite(chrom, keySize, 1, fp) != 1) return 9;
142
+ if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10;
143
+ }
144
+ for(i=0; i<keySize; i++) chrom[i] = '\0';
145
+ nextLeaf = 0;
146
+ for(i=nblocks; i<nperblock; i++) {
147
+ if(fwrite(chrom, keySize, 1, fp) != 1) return 9;
148
+ if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10;
149
+ }
150
+ }
151
+
152
+ //Write the leaves
153
+ nextLeaf = 0;
154
+ for(i=0, j=0; i<nblocks; i++) {
155
+ eight = 1;
156
+ if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 11;
157
+ eight = 0;
158
+ if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 12;
159
+ if(cl->nKeys - j < nperblock) {
160
+ k = cl->nKeys - j;
161
+ if(fwrite(&k, sizeof(uint16_t), 1, fp) != 1) return 13;
162
+ } else {
163
+ if(fwrite(&nperblock, sizeof(uint16_t), 1, fp) != 1) return 13;
164
+ }
165
+ for(k=0; k<nperblock; k++) {
166
+ if(j>=cl->nKeys) {
167
+ if(chrom[0]) {
168
+ for(l=0; l<keySize; l++) chrom[l] = '\0';
169
+ }
170
+ if(fwrite(chrom, keySize, 1, fp) != 1) return 15;
171
+ if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 16;
172
+ } else {
173
+ chrom = strncpy(chrom, cl->chrom[j], keySize);
174
+ if(fwrite(chrom, keySize, 1, fp) != 1) return 15;
175
+ if(fwrite(&j, sizeof(uint32_t), 1, fp) != 1) return 16;
176
+ if(fwrite(&(cl->len[j++]), sizeof(uint32_t), 1, fp) != 1) return 17;
177
+ }
178
+ }
179
+ }
180
+
181
+ free(chrom);
182
+ return 0;
183
+ }
184
+
185
+ //returns 0 on success
186
+ //Still need to fill in indexOffset
187
+ int bwWriteHdr(bigWigFile_t *bw) {
188
+ uint32_t magic = BIGWIG_MAGIC;
189
+ uint16_t two = 4;
190
+ FILE *fp;
191
+ const uint8_t pbuff[58] = {0}; // 58 bytes of nothing
192
+ const void *p = (const void *)&pbuff;
193
+ if(!bw->isWrite) return 1;
194
+
195
+ //The header itself, largely just reserving space...
196
+ fp = bw->URL->x.fp;
197
+ if(!fp) return 2;
198
+ if(fseek(fp, 0, SEEK_SET)) return 3;
199
+ if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 4;
200
+ if(fwrite(&two, sizeof(uint16_t), 1, fp) != 1) return 5;
201
+ if(fwrite(p, sizeof(uint8_t), 58, fp) != 58) return 6;
202
+
203
+ //Empty zoom headers
204
+ if(bw->hdr->nLevels) {
205
+ for(two=0; two<bw->hdr->nLevels; two++) {
206
+ if(fwrite(p, sizeof(uint8_t), 24, fp) != 24) return 9;
207
+ }
208
+ }
209
+
210
+ //Update summaryOffset and write an empty summary block
211
+ bw->hdr->summaryOffset = ftell(fp);
212
+ if(fwrite(p, sizeof(uint8_t), 40, fp) != 40) return 10;
213
+ if(writeAtPos(&(bw->hdr->summaryOffset), sizeof(uint64_t), 1, 0x2c, fp)) return 11;
214
+
215
+ //Write the chromosome list as a stupid freaking tree (because let's TREE ALL THE THINGS!!!)
216
+ bw->hdr->ctOffset = ftell(fp);
217
+ if(writeChromList(fp, bw->cl)) return 7;
218
+ if(writeAtPos(&(bw->hdr->ctOffset), sizeof(uint64_t), 1, 0x8, fp)) return 8;
219
+
220
+ //Update the dataOffset
221
+ bw->hdr->dataOffset = ftell(fp);
222
+ if(writeAtPos(&bw->hdr->dataOffset, sizeof(uint64_t), 1, 0x10, fp)) return 12;
223
+
224
+ //Save space for the number of blocks
225
+ if(fwrite(p, sizeof(uint8_t), 8, fp) != 8) return 13;
226
+
227
+ return 0;
228
+ }
229
+
230
+ static int insertIndexNode(bigWigFile_t *fp, bwRTreeNode_t *leaf) {
231
+ bwLL *l = malloc(sizeof(bwLL));
232
+ if(!l) return 1;
233
+ l->node = leaf;
234
+ l->next = NULL;
235
+
236
+ if(!fp->writeBuffer->firstIndexNode) {
237
+ fp->writeBuffer->firstIndexNode = l;
238
+ } else {
239
+ fp->writeBuffer->currentIndexNode->next = l;
240
+ }
241
+ fp->writeBuffer->currentIndexNode = l;
242
+ return 0;
243
+ }
244
+
245
+ //0 on success
246
+ static int appendIndexNodeEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) {
247
+ bwLL *n = fp->writeBuffer->currentIndexNode;
248
+ if(!n) return 1;
249
+ if(n->node->nChildren >= fp->writeBuffer->blockSize) return 2;
250
+
251
+ n->node->chrIdxStart[n->node->nChildren] = tid0;
252
+ n->node->baseStart[n->node->nChildren] = start;
253
+ n->node->chrIdxEnd[n->node->nChildren] = tid1;
254
+ n->node->baseEnd[n->node->nChildren] = end;
255
+ n->node->dataOffset[n->node->nChildren] = offset;
256
+ n->node->x.size[n->node->nChildren] = size;
257
+ n->node->nChildren++;
258
+ return 0;
259
+ }
260
+
261
+ //Returns 0 on success
262
+ static int addIndexEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) {
263
+ bwRTreeNode_t *node;
264
+
265
+ if(appendIndexNodeEntry(fp, tid0, tid1, start, end, offset, size)) {
266
+ //The last index node is full, we need to add a new one
267
+ node = calloc(1, sizeof(bwRTreeNode_t));
268
+ if(!node) return 1;
269
+
270
+ //Allocate and set the fields
271
+ node->isLeaf = 1;
272
+ node->nChildren = 1;
273
+ node->chrIdxStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
274
+ if(!node->chrIdxStart) goto error;
275
+ node->baseStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
276
+ if(!node->baseStart) goto error;
277
+ node->chrIdxEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
278
+ if(!node->chrIdxEnd) goto error;
279
+ node->baseEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
280
+ if(!node->baseEnd) goto error;
281
+ node->dataOffset = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize);
282
+ if(!node->dataOffset) goto error;
283
+ node->x.size = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize);
284
+ if(!node->x.size) goto error;
285
+
286
+ node->chrIdxStart[0] = tid0;
287
+ node->baseStart[0] = start;
288
+ node->chrIdxEnd[0] = tid1;
289
+ node->baseEnd[0] = end;
290
+ node->dataOffset[0] = offset;
291
+ node->x.size[0] = size;
292
+
293
+ if(insertIndexNode(fp, node)) goto error;
294
+ }
295
+
296
+ return 0;
297
+
298
+ error:
299
+ if(node->chrIdxStart) free(node->chrIdxStart);
300
+ if(node->baseStart) free(node->baseStart);
301
+ if(node->chrIdxEnd) free(node->chrIdxEnd);
302
+ if(node->baseEnd) free(node->baseEnd);
303
+ if(node->dataOffset) free(node->dataOffset);
304
+ if(node->x.size) free(node->x.size);
305
+ return 2;
306
+ }
307
+
308
+ /*
309
+ * TODO:
310
+ * The buffer size and compression sz need to be determined elsewhere (and p and compressP filled in!)
311
+ */
312
+ static int flushBuffer(bigWigFile_t *fp) {
313
+ bwWriteBuffer_t *wb = fp->writeBuffer;
314
+ uLongf sz = wb->compressPsz;
315
+ uint16_t nItems;
316
+ if(!fp->writeBuffer->l) return 0;
317
+ if(!wb->ltype) return 0;
318
+
319
+ //Fill in the header
320
+ if(!memcpy(wb->p, &(wb->tid), sizeof(uint32_t))) return 1;
321
+ if(!memcpy(wb->p+4, &(wb->start), sizeof(uint32_t))) return 2;
322
+ if(!memcpy(wb->p+8, &(wb->end), sizeof(uint32_t))) return 3;
323
+ if(!memcpy(wb->p+12, &(wb->step), sizeof(uint32_t))) return 4;
324
+ if(!memcpy(wb->p+16, &(wb->span), sizeof(uint32_t))) return 5;
325
+ if(!memcpy(wb->p+20, &(wb->ltype), sizeof(uint8_t))) return 6;
326
+ //1 byte padding
327
+ //Determine the number of items
328
+ switch(wb->ltype) {
329
+ case 1:
330
+ nItems = (wb->l-24)/12;
331
+ break;
332
+ case 2:
333
+ nItems = (wb->l-24)/8;
334
+ break;
335
+ case 3:
336
+ nItems = (wb->l-24)/4;
337
+ break;
338
+ default:
339
+ return 7;
340
+ }
341
+ if(!memcpy(wb->p+22, &nItems, sizeof(uint16_t))) return 8;
342
+
343
+ if(sz) {
344
+ //compress
345
+ if(compress(wb->compressP, &sz, wb->p, wb->l) != Z_OK) return 9;
346
+
347
+ //write the data to disk
348
+ if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 10;
349
+ } else {
350
+ sz = wb->l;
351
+ if(fwrite(wb->p, sizeof(uint8_t), wb->l, fp->URL->x.fp) != wb->l) return 10;
352
+ }
353
+
354
+ //Add an entry into the index
355
+ if(addIndexEntry(fp, wb->tid, wb->tid, wb->start, wb->end, bwTell(fp)-sz, sz)) return 11;
356
+
357
+ wb->nBlocks++;
358
+ wb->l = 24;
359
+ return 0;
360
+ }
361
+
362
+ static void updateStats(bigWigFile_t *fp, uint32_t span, float val) {
363
+ if(val < fp->hdr->minVal) fp->hdr->minVal = val;
364
+ else if(val > fp->hdr->maxVal) fp->hdr->maxVal = val;
365
+ fp->hdr->nBasesCovered += span;
366
+ fp->hdr->sumData += span*val;
367
+ fp->hdr->sumSquared += span*pow(val,2);
368
+
369
+ fp->writeBuffer->nEntries++;
370
+ fp->writeBuffer->runningWidthSum += span;
371
+ }
372
+
373
+ //12 bytes per entry
374
+ int bwAddIntervals(bigWigFile_t *fp, const char* const* chrom, const uint32_t *start, const uint32_t *end, const float *values, uint32_t n) {
375
+ uint32_t tid = 0, i;
376
+ const char *lastChrom = NULL;
377
+ bwWriteBuffer_t *wb = fp->writeBuffer;
378
+ if(!n) return 0; //Not an error per se
379
+ if(!fp->isWrite) return 1;
380
+ if(!wb) return 2;
381
+
382
+ //Flush if needed
383
+ if(wb->ltype != 1) if(flushBuffer(fp)) return 3;
384
+ if(wb->l+36 > fp->hdr->bufSize) if(flushBuffer(fp)) return 4;
385
+ lastChrom = chrom[0];
386
+ tid = bwGetTid(fp, chrom[0]);
387
+ if(tid == (uint32_t) -1) return 5;
388
+ if(tid != wb->tid) {
389
+ if(flushBuffer(fp)) return 6;
390
+ wb->tid = tid;
391
+ wb->start = start[0];
392
+ wb->end = end[0];
393
+ }
394
+
395
+ //Ensure that everything is set correctly
396
+ wb->ltype = 1;
397
+ if(wb->l <= 24) {
398
+ wb->start = start[0];
399
+ wb->span = 0;
400
+ wb->step = 0;
401
+ }
402
+ if(!memcpy(wb->p+wb->l, start, sizeof(uint32_t))) return 7;
403
+ if(!memcpy(wb->p+wb->l+4, end, sizeof(uint32_t))) return 8;
404
+ if(!memcpy(wb->p+wb->l+8, values, sizeof(float))) return 9;
405
+ updateStats(fp, end[0]-start[0], values[0]);
406
+ wb->l += 12;
407
+
408
+ for(i=1; i<n; i++) {
409
+ if(strcmp(chrom[i],lastChrom) != 0) {
410
+ wb->end = end[i-1];
411
+ flushBuffer(fp);
412
+ lastChrom = chrom[i];
413
+ tid = bwGetTid(fp, chrom[i]);
414
+ if(tid == (uint32_t) -1) return 10;
415
+ wb->tid = tid;
416
+ wb->start = start[i];
417
+ }
418
+ if(wb->l+12 > fp->hdr->bufSize) { //12 bytes/entry
419
+ wb->end = end[i-1];
420
+ flushBuffer(fp);
421
+ wb->start = start[i];
422
+ }
423
+ if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 11;
424
+ if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 12;
425
+ if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 13;
426
+ updateStats(fp, end[i]-start[i], values[i]);
427
+ wb->l += 12;
428
+ }
429
+ wb->end = end[i-1];
430
+
431
+ return 0;
432
+ }
433
+
434
+ int bwAppendIntervals(bigWigFile_t *fp, const uint32_t *start, const uint32_t *end, const float *values, uint32_t n) {
435
+ uint32_t i;
436
+ bwWriteBuffer_t *wb = fp->writeBuffer;
437
+ if(!n) return 0;
438
+ if(!fp->isWrite) return 1;
439
+ if(!wb) return 2;
440
+ if(wb->ltype != 1) return 3;
441
+
442
+ for(i=0; i<n; i++) {
443
+ if(wb->l+12 > fp->hdr->bufSize) {
444
+ if(i>0) { //otherwise it's already set
445
+ wb->end = end[i-1];
446
+ }
447
+ flushBuffer(fp);
448
+ wb->start = start[i];
449
+ }
450
+ if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4;
451
+ if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 5;
452
+ if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 6;
453
+ updateStats(fp, end[i]-start[i], values[i]);
454
+ wb->l += 12;
455
+ }
456
+ wb->end = end[i-1];
457
+
458
+ return 0;
459
+ }
460
+
461
+ //8 bytes per entry
462
+ int bwAddIntervalSpans(bigWigFile_t *fp, const char *chrom, const uint32_t *start, uint32_t span, const float *values, uint32_t n) {
463
+ uint32_t i, tid;
464
+ bwWriteBuffer_t *wb = fp->writeBuffer;
465
+ if(!n) return 0;
466
+ if(!fp->isWrite) return 1;
467
+ if(!wb) return 2;
468
+ if(wb->ltype != 2) if(flushBuffer(fp)) return 3;
469
+ if(flushBuffer(fp)) return 4;
470
+
471
+ tid = bwGetTid(fp, chrom);
472
+ if(tid == (uint32_t) -1) return 5;
473
+ wb->tid = tid;
474
+ wb->start = start[0];
475
+ wb->step = 0;
476
+ wb->span = span;
477
+ wb->ltype = 2;
478
+
479
+ for(i=0; i<n; i++) {
480
+ if(wb->l + 8 >= fp->hdr->bufSize) { //8 bytes/entry
481
+ if(i) wb->end = start[i-1]+span;
482
+ flushBuffer(fp);
483
+ wb->start = start[i];
484
+ }
485
+ if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 5;
486
+ if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 6;
487
+ updateStats(fp, span, values[i]);
488
+ wb->l += 8;
489
+ }
490
+ wb->end = start[n-1] + span;
491
+
492
+ return 0;
493
+ }
494
+
495
+ int bwAppendIntervalSpans(bigWigFile_t *fp, const uint32_t *start, const float *values, uint32_t n) {
496
+ uint32_t i;
497
+ bwWriteBuffer_t *wb = fp->writeBuffer;
498
+ if(!n) return 0;
499
+ if(!fp->isWrite) return 1;
500
+ if(!wb) return 2;
501
+ if(wb->ltype != 2) return 3;
502
+
503
+ for(i=0; i<n; i++) {
504
+ if(wb->l + 8 >= fp->hdr->bufSize) {
505
+ if(i) wb->end = start[i-1]+wb->span;
506
+ flushBuffer(fp);
507
+ wb->start = start[i];
508
+ }
509
+ if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4;
510
+ if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 5;
511
+ updateStats(fp, wb->span, values[i]);
512
+ wb->l += 8;
513
+ }
514
+ wb->end = start[n-1] + wb->span;
515
+
516
+ return 0;
517
+ }
518
+
519
+ //4 bytes per entry
520
+ int bwAddIntervalSpanSteps(bigWigFile_t *fp, const char *chrom, uint32_t start, uint32_t span, uint32_t step, const float *values, uint32_t n) {
521
+ uint32_t i, tid;
522
+ bwWriteBuffer_t *wb = fp->writeBuffer;
523
+ if(!n) return 0;
524
+ if(!fp->isWrite) return 1;
525
+ if(!wb) return 2;
526
+ if(wb->ltype != 3) flushBuffer(fp);
527
+ if(flushBuffer(fp)) return 3;
528
+
529
+ tid = bwGetTid(fp, chrom);
530
+ if(tid == (uint32_t) -1) return 4;
531
+ wb->tid = tid;
532
+ wb->start = start;
533
+ wb->step = step;
534
+ wb->span = span;
535
+ wb->ltype = 3;
536
+
537
+ for(i=0; i<n; i++) {
538
+ if(wb->l + 4 >= fp->hdr->bufSize) {
539
+ wb->end = wb->start + ((wb->l-24)>>2) * step;
540
+ flushBuffer(fp);
541
+ wb->start = wb->end;
542
+ }
543
+ if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 5;
544
+ updateStats(fp, wb->span, values[i]);
545
+ wb->l += 4;
546
+ }
547
+ wb->end = wb->start + (wb->l>>2) * step;
548
+
549
+ return 0;
550
+ }
551
+
552
+ int bwAppendIntervalSpanSteps(bigWigFile_t *fp, const float *values, uint32_t n) {
553
+ uint32_t i;
554
+ bwWriteBuffer_t *wb = fp->writeBuffer;
555
+ if(!n) return 0;
556
+ if(!fp->isWrite) return 1;
557
+ if(!wb) return 2;
558
+ if(wb->ltype != 3) return 3;
559
+
560
+ for(i=0; i<n; i++) {
561
+ if(wb->l + 4 >= fp->hdr->bufSize) {
562
+ wb->end = wb->start + ((wb->l-24)>>2) * wb->step;
563
+ flushBuffer(fp);
564
+ wb->start = wb->end;
565
+ }
566
+ if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 4;
567
+ updateStats(fp, wb->span, values[i]);
568
+ wb->l += 4;
569
+ }
570
+ wb->end = wb->start + (wb->l>>2) * wb->step;
571
+
572
+ return 0;
573
+ }
574
+
575
+ //0 on success
576
+ int writeSummary(bigWigFile_t *fp) {
577
+ if(writeAtPos(&(fp->hdr->nBasesCovered), sizeof(uint64_t), 1, fp->hdr->summaryOffset, fp->URL->x.fp)) return 1;
578
+ if(writeAtPos(&(fp->hdr->minVal), sizeof(double), 1, fp->hdr->summaryOffset+8, fp->URL->x.fp)) return 2;
579
+ if(writeAtPos(&(fp->hdr->maxVal), sizeof(double), 1, fp->hdr->summaryOffset+16, fp->URL->x.fp)) return 3;
580
+ if(writeAtPos(&(fp->hdr->sumData), sizeof(double), 1, fp->hdr->summaryOffset+24, fp->URL->x.fp)) return 4;
581
+ if(writeAtPos(&(fp->hdr->sumSquared), sizeof(double), 1, fp->hdr->summaryOffset+32, fp->URL->x.fp)) return 5;
582
+ return 0;
583
+ }
584
+
585
+ static bwRTreeNode_t *makeEmptyNode(uint32_t blockSize) {
586
+ bwRTreeNode_t *n = calloc(1, sizeof(bwRTreeNode_t));
587
+ if(!n) return NULL;
588
+
589
+ n->chrIdxStart = malloc(blockSize*sizeof(uint32_t));
590
+ if(!n->chrIdxStart) goto error;
591
+ n->baseStart = malloc(blockSize*sizeof(uint32_t));
592
+ if(!n->baseStart) goto error;
593
+ n->chrIdxEnd = malloc(blockSize*sizeof(uint32_t));
594
+ if(!n->chrIdxEnd) goto error;
595
+ n->baseEnd = malloc(blockSize*sizeof(uint32_t));
596
+ if(!n->baseEnd) goto error;
597
+ n->dataOffset = calloc(blockSize,sizeof(uint64_t)); //This MUST be 0 for node writing!
598
+ if(!n->dataOffset) goto error;
599
+ n->x.child = malloc(blockSize*sizeof(uint64_t));
600
+ if(!n->x.child) goto error;
601
+
602
+ return n;
603
+
604
+ error:
605
+ if(n->chrIdxStart) free(n->chrIdxStart);
606
+ if(n->baseStart) free(n->baseStart);
607
+ if(n->chrIdxEnd) free(n->chrIdxEnd);
608
+ if(n->baseEnd) free(n->baseEnd);
609
+ if(n->dataOffset) free(n->dataOffset);
610
+ if(n->x.child) free(n->x.child);
611
+ free(n);
612
+ return NULL;
613
+ }
614
+
615
+ //Returns 0 on success. This doesn't attempt to clean up!
616
+ static bwRTreeNode_t *addLeaves(bwLL **ll, uint64_t *sz, uint64_t toProcess, uint32_t blockSize) {
617
+ uint32_t i;
618
+ uint64_t foo;
619
+ bwRTreeNode_t *n = makeEmptyNode(blockSize);
620
+ if(!n) return NULL;
621
+
622
+ if(toProcess <= blockSize) {
623
+ for(i=0; i<toProcess; i++) {
624
+ n->chrIdxStart[i] = (*ll)->node->chrIdxStart[0];
625
+ n->baseStart[i] = (*ll)->node->baseStart[0];
626
+ n->chrIdxEnd[i] = (*ll)->node->chrIdxEnd[(*ll)->node->nChildren-1];
627
+ n->baseEnd[i] = (*ll)->node->baseEnd[(*ll)->node->nChildren-1];
628
+ n->x.child[i] = (*ll)->node;
629
+ *sz += 4 + 32*(*ll)->node->nChildren;
630
+ *ll = (*ll)->next;
631
+ n->nChildren++;
632
+ }
633
+ } else {
634
+ for(i=0; i<blockSize; i++) {
635
+ foo = ceil(((double) toProcess)/((double) blockSize-i));
636
+ if(!ll) break;
637
+ n->x.child[i] = addLeaves(ll, sz, foo, blockSize);
638
+ if(!n->x.child[i]) goto error;
639
+ n->chrIdxStart[i] = n->x.child[i]->chrIdxStart[0];
640
+ n->baseStart[i] = n->x.child[i]->baseStart[0];
641
+ n->chrIdxEnd[i] = n->x.child[i]->chrIdxEnd[n->x.child[i]->nChildren-1];
642
+ n->baseEnd[i] = n->x.child[i]->baseEnd[n->x.child[i]->nChildren-1];
643
+ n->nChildren++;
644
+ toProcess -= foo;
645
+ }
646
+ }
647
+
648
+ *sz += 4 + 24*n->nChildren;
649
+ return n;
650
+
651
+ error:
652
+ bwDestroyIndexNode(n);
653
+ return NULL;
654
+ }
655
+
656
+ //Returns 1 on error
657
+ int writeIndexTreeNode(FILE *fp, bwRTreeNode_t *n, uint8_t *wrote, int level) {
658
+ uint8_t one = 0;
659
+ uint32_t i, j, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0
660
+
661
+ if(n->isLeaf) return 0;
662
+
663
+ for(i=0; i<n->nChildren; i++) {
664
+ if(n->dataOffset[i]) { //traverse into child
665
+ if(n->isLeaf) return 0; //Only write leaves once!
666
+ if(writeIndexTreeNode(fp, n->x.child[i], wrote, level+1)) return 1;
667
+ } else {
668
+ n->dataOffset[i] = ftell(fp);
669
+ if(fwrite(&(n->x.child[i]->isLeaf), sizeof(uint8_t), 1, fp) != 1) return 1;
670
+ if(fwrite(&one, sizeof(uint8_t), 1, fp) != 1) return 1; //one byte of padding
671
+ if(fwrite(&(n->x.child[i]->nChildren), sizeof(uint16_t), 1, fp) != 1) return 1;
672
+ for(j=0; j<n->x.child[i]->nChildren; j++) {
673
+ vector[0] = n->x.child[i]->chrIdxStart[j];
674
+ vector[1] = n->x.child[i]->baseStart[j];
675
+ vector[2] = n->x.child[i]->chrIdxEnd[j];
676
+ vector[3] = n->x.child[i]->baseEnd[j];
677
+ if(n->x.child[i]->isLeaf) {
678
+ //Include the offset and size
679
+ if(fwrite(vector, sizeof(uint32_t), 4, fp) != 4) return 1;
680
+ if(fwrite(&(n->x.child[i]->dataOffset[j]), sizeof(uint64_t), 1, fp) != 1) return 1;
681
+ if(fwrite(&(n->x.child[i]->x.size[j]), sizeof(uint64_t), 1, fp) != 1) return 1;
682
+ } else {
683
+ if(fwrite(vector, sizeof(uint32_t), 6, fp) != 6) return 1;
684
+ }
685
+ }
686
+ *wrote = 1;
687
+ }
688
+ }
689
+
690
+ return 0;
691
+ }
692
+
693
+ //returns 1 on success
694
+ int writeIndexOffsets(FILE *fp, bwRTreeNode_t *n, uint64_t offset) {
695
+ uint32_t i;
696
+
697
+ if(n->isLeaf) return 0;
698
+ for(i=0; i<n->nChildren; i++) {
699
+ if(writeIndexOffsets(fp, n->x.child[i], n->dataOffset[i])) return 1;
700
+ if(writeAtPos(&(n->dataOffset[i]), sizeof(uint64_t), 1, offset+20+24*i, fp)) return 2;
701
+ }
702
+ return 0;
703
+ }
704
+
705
+ //Returns 0 on success
706
+ int writeIndexTree(bigWigFile_t *fp) {
707
+ uint64_t offset;
708
+ uint8_t wrote = 0;
709
+ int rv;
710
+
711
+ while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->idx->root, &wrote, 0)) == 0) {
712
+ if(!wrote) break;
713
+ wrote = 0;
714
+ }
715
+
716
+ if(rv || wrote) return 1;
717
+
718
+ //Save the file position
719
+ offset = bwTell(fp);
720
+
721
+ //Write the offsets
722
+ if(writeIndexOffsets(fp->URL->x.fp, fp->idx->root, fp->idx->rootOffset)) return 2;
723
+
724
+ //Move the file pointer back to the end
725
+ bwSetPos(fp, offset);
726
+
727
+ return 0;
728
+ }
729
+
730
+ //Returns 0 on success. The original state SHOULD be preserved on error
731
+ int writeIndex(bigWigFile_t *fp) {
732
+ uint32_t four = IDX_MAGIC;
733
+ uint64_t idxSize = 0, foo;
734
+ uint8_t one = 0;
735
+ uint32_t i, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0
736
+ bwLL *ll = fp->writeBuffer->firstIndexNode, *p;
737
+ bwRTreeNode_t *root = NULL;
738
+
739
+ if(!fp->writeBuffer->nBlocks) return 0;
740
+ fp->idx = malloc(sizeof(bwRTree_t));
741
+ if(!fp->idx) return 2;
742
+ fp->idx->root = root;
743
+
744
+ //Update the file header to indicate the proper index position
745
+ foo = bwTell(fp);
746
+ if(writeAtPos(&foo, sizeof(uint64_t), 1, 0x18, fp->URL->x.fp)) return 3;
747
+
748
+ //Make the tree
749
+ if(ll == fp->writeBuffer->currentIndexNode) {
750
+ root = ll->node;
751
+ idxSize = 4 + 24*root->nChildren;
752
+ } else {
753
+ root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize);
754
+ }
755
+ if(!root) return 4;
756
+ fp->idx->root = root;
757
+
758
+ ll = fp->writeBuffer->firstIndexNode;
759
+ while(ll) {
760
+ p = ll->next;
761
+ free(ll);
762
+ ll=p;
763
+ }
764
+
765
+ //write the header
766
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 5;
767
+ if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6;
768
+ if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7;
769
+ if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
770
+ if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
771
+ if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10;
772
+ if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11;
773
+ if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12;
774
+ four = 1;
775
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13;
776
+ four = 0;
777
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding
778
+ fp->idx->rootOffset = bwTell(fp);
779
+
780
+ //Write the root node, since writeIndexTree writes the children and fills in the offset
781
+ if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16;
782
+ if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding
783
+ if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18;
784
+ for(i=0; i<root->nChildren; i++) {
785
+ vector[0] = root->chrIdxStart[i];
786
+ vector[1] = root->baseStart[i];
787
+ vector[2] = root->chrIdxEnd[i];
788
+ vector[3] = root->baseEnd[i];
789
+ if(root->isLeaf) {
790
+ //Include the offset and size
791
+ if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19;
792
+ if(fwrite(&(root->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20;
793
+ if(fwrite(&(root->x.size[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21;
794
+ } else {
795
+ root->dataOffset[i] = 0; //FIXME: Something upstream is setting this to impossible values (e.g., 0x21?!?!?)
796
+ if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22;
797
+ }
798
+ }
799
+
800
+ //Write each level
801
+ if(writeIndexTree(fp)) return 23;
802
+
803
+ return 0;
804
+ }
805
+
806
+ //The first zoom level has a resolution of 4x mean entry size
807
+ //This may or may not produce the requested number of zoom levels
808
+ int makeZoomLevels(bigWigFile_t *fp) {
809
+ uint32_t meanBinSize, i;
810
+ uint32_t multiplier = 4, zoom = 10, maxZoom = 0;
811
+ uint16_t nLevels = 0;
812
+
813
+ meanBinSize = ((double) fp->writeBuffer->runningWidthSum)/(fp->writeBuffer->nEntries);
814
+ //In reality, one level is skipped
815
+ meanBinSize *= 4;
816
+ //N.B., we must ALWAYS check that the zoom doesn't overflow a uint32_t!
817
+ if(((uint32_t)-1)>>2 < meanBinSize) return 0; //No zoom levels!
818
+ if(meanBinSize*4 > zoom) zoom = multiplier*meanBinSize;
819
+
820
+ fp->hdr->zoomHdrs = calloc(1, sizeof(bwZoomHdr_t));
821
+ if(!fp->hdr->zoomHdrs) return 1;
822
+ fp->hdr->zoomHdrs->level = malloc(fp->hdr->nLevels * sizeof(uint32_t));
823
+ fp->hdr->zoomHdrs->dataOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t));
824
+ fp->hdr->zoomHdrs->indexOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t));
825
+ fp->hdr->zoomHdrs->idx = calloc(fp->hdr->nLevels, sizeof(bwRTree_t*));
826
+ if(!fp->hdr->zoomHdrs->level) return 2;
827
+ if(!fp->hdr->zoomHdrs->dataOffset) return 3;
828
+ if(!fp->hdr->zoomHdrs->indexOffset) return 4;
829
+ if(!fp->hdr->zoomHdrs->idx) return 5;
830
+
831
+ //There's no point in having a zoom level larger than the largest chromosome
832
+ //This will none the less allow at least one zoom level, which is generally needed for IGV et al.
833
+ for(i=0; i<fp->cl->nKeys; i++) {
834
+ if(fp->cl->len[i] > maxZoom) maxZoom = fp->cl->len[i];
835
+ }
836
+ if(zoom > maxZoom) zoom = maxZoom;
837
+
838
+ for(i=0; i<fp->hdr->nLevels; i++) {
839
+ if(zoom > maxZoom) break; //prevent absurdly large zoom levels
840
+ fp->hdr->zoomHdrs->level[i] = zoom;
841
+ nLevels++;
842
+ if(((uint32_t)-1)/multiplier < zoom) break;
843
+ zoom *= multiplier;
844
+ }
845
+ fp->hdr->nLevels = nLevels;
846
+
847
+ fp->writeBuffer->firstZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*));
848
+ if(!fp->writeBuffer->firstZoomBuffer) goto error;
849
+ fp->writeBuffer->lastZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*));
850
+ if(!fp->writeBuffer->lastZoomBuffer) goto error;
851
+ fp->writeBuffer->nNodes = calloc(nLevels, sizeof(uint64_t));
852
+
853
+ for(i=0; i<fp->hdr->nLevels; i++) {
854
+ fp->writeBuffer->firstZoomBuffer[i] = calloc(1, sizeof(bwZoomBuffer_t));
855
+ if(!fp->writeBuffer->firstZoomBuffer[i]) goto error;
856
+ fp->writeBuffer->firstZoomBuffer[i]->p = calloc(fp->hdr->bufSize/32, 32);
857
+ if(!fp->writeBuffer->firstZoomBuffer[i]->p) goto error;
858
+ fp->writeBuffer->firstZoomBuffer[i]->m = fp->hdr->bufSize;
859
+ ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[0] = 0;
860
+ ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[1] = 0;
861
+ ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->hdr->zoomHdrs->level[i];
862
+ if(fp->hdr->zoomHdrs->level[i] > fp->cl->len[0]) ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->cl->len[0];
863
+ fp->writeBuffer->lastZoomBuffer[i] = fp->writeBuffer->firstZoomBuffer[i];
864
+ }
865
+
866
+ return 0;
867
+
868
+ error:
869
+ if(fp->writeBuffer->firstZoomBuffer) {
870
+ for(i=0; i<fp->hdr->nLevels; i++) {
871
+ if(fp->writeBuffer->firstZoomBuffer[i]) {
872
+ if(fp->writeBuffer->firstZoomBuffer[i]->p) free(fp->writeBuffer->firstZoomBuffer[i]->p);
873
+ free(fp->writeBuffer->firstZoomBuffer[i]);
874
+ }
875
+ }
876
+ free(fp->writeBuffer->firstZoomBuffer);
877
+ }
878
+ if(fp->writeBuffer->lastZoomBuffer) free(fp->writeBuffer->lastZoomBuffer);
879
+ if(fp->writeBuffer->nNodes) free(fp->writeBuffer->lastZoomBuffer);
880
+ return 6;
881
+ }
882
+
883
+ //Given an interval start, calculate the next one at a zoom level
884
+ void nextPos(bigWigFile_t *fp, uint32_t size, uint32_t *pos, uint32_t desiredTid) {
885
+ uint32_t *tid = pos;
886
+ uint32_t *start = pos+1;
887
+ uint32_t *end = pos+2;
888
+ *start += size;
889
+ if(*start >= fp->cl->len[*tid]) {
890
+ (*start) = 0;
891
+ (*tid)++;
892
+ }
893
+
894
+ //prevent needless iteration when changing chromosomes
895
+ if(*tid < desiredTid) {
896
+ *tid = desiredTid;
897
+ *start = 0;
898
+ }
899
+
900
+ (*end) = *start+size;
901
+ if(*end > fp->cl->len[*tid]) (*end) = fp->cl->len[*tid];
902
+ }
903
+
904
+ //Return the number of bases two intervals overlap
905
+ uint32_t overlapsInterval(uint32_t tid0, uint32_t start0, uint32_t end0, uint32_t tid1, uint32_t start1, uint32_t end1) {
906
+ if(tid0 != tid1) return 0;
907
+ if(end0 <= start1) return 0;
908
+ if(end1 <= start0) return 0;
909
+ if(end0 <= end1) {
910
+ if(start1 > start0) return end0-start1;
911
+ return end0-start0;
912
+ } else {
913
+ if(start1 > start0) return end1-start1;
914
+ return end1-start0;
915
+ }
916
+ }
917
+
918
+ //Returns the number of bases of the interval written
919
+ uint32_t updateInterval(bigWigFile_t *fp, bwZoomBuffer_t *buffer, double *sum, double *sumsq, uint32_t size, uint32_t tid, uint32_t start, uint32_t end, float value) {
920
+ uint32_t *p2 = (uint32_t*) buffer->p;
921
+ float *fp2 = (float*) p2;
922
+ uint32_t rv = 0, offset = 0;
923
+ if(!buffer) return 0;
924
+ if(buffer->l+32 >= buffer->m) return 0;
925
+
926
+ //Make sure that we don't overflow a uint32_t by adding some huge value to start
927
+ if(start + size < start) size = ((uint32_t) -1) - start;
928
+
929
+ if(buffer->l) {
930
+ offset = buffer->l/32;
931
+ } else {
932
+ p2[0] = tid;
933
+ p2[1] = start;
934
+ if(start+size < end) p2[2] = start+size;
935
+ else p2[2] = end;
936
+ }
937
+
938
+ //Do we have any overlap with the previously added interval?
939
+ if(offset) {
940
+ rv = overlapsInterval(p2[8*(offset-1)], p2[8*(offset-1)+1], p2[8*(offset-1)+1] + size, tid, start, end);
941
+ if(rv) {
942
+ p2[8*(offset-1)+2] = start + rv;
943
+ p2[8*(offset-1)+3] += rv;
944
+ if(fp2[8*(offset-1)+4] > value) fp2[8*(offset-1)+4] = value;
945
+ if(fp2[8*(offset-1)+5] < value) fp2[8*(offset-1)+5] = value;
946
+ *sum += rv*value;
947
+ *sumsq += rv*pow(value, 2.0);
948
+ return rv;
949
+ } else {
950
+ fp2[8*(offset-1)+6] = *sum;
951
+ fp2[8*(offset-1)+7] = *sumsq;
952
+ *sum = 0.0;
953
+ *sumsq = 0.0;
954
+ }
955
+ }
956
+
957
+ //If we move to a new interval then skip iterating over a bunch of obviously non-overlapping intervals
958
+ if(offset && p2[8*offset+2] == 0) {
959
+ p2[8*offset] = tid;
960
+ p2[8*offset+1] = start;
961
+ if(start+size < end) p2[8*offset+2] = start+size;
962
+ else p2[8*offset+2] = end;
963
+ //nextPos(fp, size, p2+8*offset, tid); //We can actually skip uncovered intervals
964
+ }
965
+
966
+ //Add a new entry
967
+ while(!(rv = overlapsInterval(p2[8*offset], p2[8*offset+1], p2[8*offset+1] + size, tid, start, end))) {
968
+ p2[8*offset] = tid;
969
+ p2[8*offset+1] = start;
970
+ if(start+size < end) p2[8*offset+2] = start+size;
971
+ else p2[8*offset+2] = end;
972
+ //nextPos(fp, size, p2+8*offset, tid);
973
+ }
974
+ p2[8*offset+3] = rv;
975
+ fp2[8*offset+4] = value; //min
976
+ fp2[8*offset+5] = value; //max
977
+ *sum += rv * value;
978
+ *sumsq += rv * pow(value,2.0);
979
+ buffer->l += 32;
980
+ return rv;
981
+ }
982
+
983
+ //Returns 0 on success
984
+ int addIntervalValue(bigWigFile_t *fp, uint64_t *nEntries, double *sum, double *sumsq, bwZoomBuffer_t *buffer, uint32_t itemsPerSlot, uint32_t zoom, uint32_t tid, uint32_t start, uint32_t end, float value) {
985
+ bwZoomBuffer_t *newBuffer = NULL;
986
+ uint32_t rv;
987
+
988
+ while(start < end) {
989
+ rv = updateInterval(fp, buffer, sum, sumsq, zoom, tid, start, end, value);
990
+ if(!rv) {
991
+ //Allocate a new buffer
992
+ newBuffer = calloc(1, sizeof(bwZoomBuffer_t));
993
+ if(!newBuffer) return 1;
994
+ newBuffer->p = calloc(itemsPerSlot, 32);
995
+ if(!newBuffer->p) goto error;
996
+ newBuffer->m = itemsPerSlot*32;
997
+ memcpy(newBuffer->p, buffer->p+buffer->l-32, 4);
998
+ memcpy(newBuffer->p+4, buffer->p+buffer->l-28, 4);
999
+ ((uint32_t*) newBuffer->p)[2] = ((uint32_t*) newBuffer->p)[1] + zoom;
1000
+ *sum = *sumsq = 0.0;
1001
+ rv = updateInterval(fp, newBuffer, sum, sumsq, zoom, tid, start, end, value);
1002
+ if(!rv) goto error;
1003
+ buffer->next = newBuffer;
1004
+ buffer = buffer->next;
1005
+ *nEntries += 1;
1006
+ }
1007
+ start += rv;
1008
+ }
1009
+
1010
+ return 0;
1011
+
1012
+ error:
1013
+ if(newBuffer) {
1014
+ if(newBuffer->m) free(newBuffer->p);
1015
+ free(newBuffer);
1016
+ }
1017
+ return 2;
1018
+ }
1019
+
1020
+ //Get all of the intervals and add them to the appropriate zoomBuffer
1021
+ int constructZoomLevels(bigWigFile_t *fp) {
1022
+ bwOverlapIterator_t *it = NULL;
1023
+ double *sum = NULL, *sumsq = NULL;
1024
+ uint32_t i, j, k;
1025
+
1026
+ sum = calloc(fp->hdr->nLevels, sizeof(double));
1027
+ sumsq = calloc(fp->hdr->nLevels, sizeof(double));
1028
+ if(!sum || !sumsq) goto error;
1029
+
1030
+ for(i=0; i<fp->cl->nKeys; i++) {
1031
+ it = bwOverlappingIntervalsIterator(fp, fp->cl->chrom[i], 0, fp->cl->len[i], 100000);
1032
+ if(!it) goto error;
1033
+ while(it->data != NULL){
1034
+ for(j=0;j<it->intervals->l;j++){
1035
+ for(k=0;k<fp->hdr->nLevels;k++){
1036
+ if(addIntervalValue(fp, &(fp->writeBuffer->nNodes[k]), sum+k, sumsq+k, fp->writeBuffer->lastZoomBuffer[k], fp->hdr->bufSize/32, fp->hdr->zoomHdrs->level[k], i, it->intervals->start[j], it->intervals->end[j], it->intervals->value[j])) goto error;
1037
+ while(fp->writeBuffer->lastZoomBuffer[k]->next) fp->writeBuffer->lastZoomBuffer[k] = fp->writeBuffer->lastZoomBuffer[k]->next;
1038
+ }
1039
+ }
1040
+ it = bwIteratorNext(it);
1041
+ }
1042
+ bwIteratorDestroy(it);
1043
+
1044
+ }
1045
+
1046
+ //Make an index for each zoom level
1047
+ for(i=0; i<fp->hdr->nLevels; i++) {
1048
+ fp->hdr->zoomHdrs->idx[i] = calloc(1, sizeof(bwRTree_t));
1049
+ if(!fp->hdr->zoomHdrs->idx[i]) return 1;
1050
+ fp->hdr->zoomHdrs->idx[i]->blockSize = fp->writeBuffer->blockSize;
1051
+ }
1052
+
1053
+
1054
+ free(sum);
1055
+ free(sumsq);
1056
+
1057
+ return 0;
1058
+
1059
+ error:
1060
+ if(it) bwIteratorDestroy(it);
1061
+ if(sum) free(sum);
1062
+ if(sumsq) free(sumsq);
1063
+ return 1;
1064
+ }
1065
+
1066
+ int writeZoomLevels(bigWigFile_t *fp) {
1067
+ uint64_t offset1, offset2, idxSize = 0;
1068
+ uint32_t i, j, four = 0, last, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0;
1069
+ uint8_t wrote, one = 0;
1070
+ uint16_t actualNLevels = 0;
1071
+ int rv;
1072
+ bwLL *ll, *p;
1073
+ bwRTreeNode_t *root;
1074
+ bwZoomBuffer_t *zb, *zb2;
1075
+ bwWriteBuffer_t *wb = fp->writeBuffer;
1076
+ uLongf sz;
1077
+
1078
+ for(i=0; i<fp->hdr->nLevels; i++) {
1079
+ if(i) {
1080
+ //Is this a duplicate level?
1081
+ if(fp->writeBuffer->nNodes[i] == fp->writeBuffer->nNodes[i-1]) break;
1082
+ }
1083
+ actualNLevels++;
1084
+
1085
+ //reserve a uint32_t for the number of blocks
1086
+ fp->hdr->zoomHdrs->dataOffset[i] = bwTell(fp);
1087
+ fp->writeBuffer->nBlocks = 0;
1088
+ fp->writeBuffer->l = 24;
1089
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1;
1090
+ zb = fp->writeBuffer->firstZoomBuffer[i];
1091
+ fp->writeBuffer->firstIndexNode = NULL;
1092
+ fp->writeBuffer->currentIndexNode = NULL;
1093
+ while(zb) {
1094
+ sz = fp->hdr->bufSize;
1095
+ if(compress(wb->compressP, &sz, zb->p, zb->l) != Z_OK) return 2;
1096
+
1097
+ //write the data to disk
1098
+ if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 3;
1099
+
1100
+ //Add an entry into the index
1101
+ last = (zb->l - 32)>>2;
1102
+ if(addIndexEntry(fp, ((uint32_t*)zb->p)[0], ((uint32_t*)zb->p)[last], ((uint32_t*)zb->p)[1], ((uint32_t*)zb->p)[last+2], bwTell(fp)-sz, sz)) return 4;
1103
+
1104
+ wb->nBlocks++;
1105
+ wb->l = 24;
1106
+ zb = zb->next;
1107
+ }
1108
+ if(writeAtPos(&(wb->nBlocks), sizeof(uint32_t), 1, fp->hdr->zoomHdrs->dataOffset[i], fp->URL->x.fp)) return 5;
1109
+
1110
+ //Make the tree
1111
+ ll = fp->writeBuffer->firstIndexNode;
1112
+ if(ll == fp->writeBuffer->currentIndexNode) {
1113
+ root = ll->node;
1114
+ idxSize = 4 + 24*root->nChildren;
1115
+ } else {
1116
+ root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize);
1117
+ }
1118
+ if(!root) return 4;
1119
+ fp->hdr->zoomHdrs->idx[i]->root = root;
1120
+
1121
+ ll = fp->writeBuffer->firstIndexNode;
1122
+ while(ll) {
1123
+ p = ll->next;
1124
+ free(ll);
1125
+ ll=p;
1126
+ }
1127
+
1128
+
1129
+ //write the index
1130
+ wrote = 0;
1131
+ fp->hdr->zoomHdrs->indexOffset[i] = bwTell(fp);
1132
+ four = IDX_MAGIC;
1133
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1;
1134
+ root = fp->hdr->zoomHdrs->idx[i]->root;
1135
+ if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6;
1136
+ if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7;
1137
+ if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
1138
+ if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1139
+ if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10;
1140
+ if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11;
1141
+ if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12;
1142
+ four = fp->hdr->bufSize/32;
1143
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13;
1144
+ four = 0;
1145
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding
1146
+ fp->hdr->zoomHdrs->idx[i]->rootOffset = bwTell(fp);
1147
+
1148
+ //Write the root node, since writeIndexTree writes the children and fills in the offset
1149
+ offset1 = bwTell(fp);
1150
+ if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16;
1151
+ if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding
1152
+ if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18;
1153
+ for(j=0; j<root->nChildren; j++) {
1154
+ vector[0] = root->chrIdxStart[j];
1155
+ vector[1] = root->baseStart[j];
1156
+ vector[2] = root->chrIdxEnd[j];
1157
+ vector[3] = root->baseEnd[j];
1158
+ if(root->isLeaf) {
1159
+ //Include the offset and size
1160
+ if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19;
1161
+ if(fwrite(&(root->dataOffset[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20;
1162
+ if(fwrite(&(root->x.size[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21;
1163
+ } else {
1164
+ if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22;
1165
+ }
1166
+ }
1167
+
1168
+ while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->hdr->zoomHdrs->idx[i]->root, &wrote, 0)) == 0) {
1169
+ if(!wrote) break;
1170
+ wrote = 0;
1171
+ }
1172
+
1173
+ if(rv || wrote) return 6;
1174
+
1175
+ //Save the file position
1176
+ offset2 = bwTell(fp);
1177
+
1178
+ //Write the offsets
1179
+ if(writeIndexOffsets(fp->URL->x.fp, root, offset1)) return 2;
1180
+
1181
+ //Move the file pointer back to the end
1182
+ bwSetPos(fp, offset2);
1183
+
1184
+
1185
+ //Free the linked list
1186
+ zb = fp->writeBuffer->firstZoomBuffer[i];
1187
+ while(zb) {
1188
+ if(zb->p) free(zb->p);
1189
+ zb2 = zb->next;
1190
+ free(zb);
1191
+ zb = zb2;
1192
+ }
1193
+ fp->writeBuffer->firstZoomBuffer[i] = NULL;
1194
+ }
1195
+
1196
+ //Free unused zoom levels
1197
+ for(i=actualNLevels; i<fp->hdr->nLevels; i++) {
1198
+ zb = fp->writeBuffer->firstZoomBuffer[i];
1199
+ while(zb) {
1200
+ if(zb->p) free(zb->p);
1201
+ zb2 = zb->next;
1202
+ free(zb);
1203
+ zb = zb2;
1204
+ }
1205
+ fp->writeBuffer->firstZoomBuffer[i] = NULL;
1206
+ }
1207
+
1208
+ //Write the zoom headers to disk
1209
+ offset1 = bwTell(fp);
1210
+ if(bwSetPos(fp, 0x40)) return 7;
1211
+ four = 0;
1212
+ for(i=0; i<actualNLevels; i++) {
1213
+ if(fwrite(&(fp->hdr->zoomHdrs->level[i]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
1214
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1215
+ if(fwrite(&(fp->hdr->zoomHdrs->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 10;
1216
+ if(fwrite(&(fp->hdr->zoomHdrs->indexOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 11;
1217
+ }
1218
+
1219
+ //Write the number of levels if needed
1220
+ if(bwSetPos(fp, 0x6)) return 12;
1221
+ if(fwrite(&actualNLevels, sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 13;
1222
+
1223
+ if(bwSetPos(fp, offset1)) return 14;
1224
+
1225
+ return 0;
1226
+ }
1227
+
1228
+ //0 on success
1229
+ int bwFinalize(bigWigFile_t *fp) {
1230
+ uint32_t four;
1231
+ uint64_t offset;
1232
+ if(!fp->isWrite) return 0;
1233
+
1234
+ //Flush the buffer
1235
+ if(flushBuffer(fp)) return 1; //Valgrind reports a problem here!
1236
+
1237
+ //Update the data section with the number of blocks written
1238
+ if(fp->hdr) {
1239
+ if(writeAtPos(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->hdr->dataOffset, fp->URL->x.fp)) return 2;
1240
+ } else {
1241
+ //The header wasn't written!
1242
+ return 1;
1243
+ }
1244
+
1245
+ //write the bufferSize
1246
+ if(fp->hdr->bufSize) {
1247
+ if(writeAtPos(&(fp->hdr->bufSize), sizeof(uint32_t), 1, 0x34, fp->URL->x.fp)) return 2;
1248
+ }
1249
+
1250
+ //write the summary information
1251
+ if(writeSummary(fp)) return 3;
1252
+
1253
+ //Convert the linked-list to a tree and write to disk
1254
+ if(writeIndex(fp)) return 4;
1255
+
1256
+ //Zoom level stuff here?
1257
+ if(fp->hdr->nLevels && fp->writeBuffer->nBlocks) {
1258
+ offset = bwTell(fp);
1259
+ if(makeZoomLevels(fp)) return 5;
1260
+ if(constructZoomLevels(fp)) return 6;
1261
+ bwSetPos(fp, offset);
1262
+ if(writeZoomLevels(fp)) return 7; //This write nLevels as well
1263
+ }
1264
+
1265
+ //write magic at the end of the file
1266
+ four = BIGWIG_MAGIC;
1267
+ if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1268
+
1269
+ return 0;
1270
+ }
1271
+
1272
+ /*
1273
+ data chunk:
1274
+ uint64_t number of blocks (2 / 110851)
1275
+ some blocks
1276
+
1277
+ an uncompressed data block (24 byte header)
1278
+ uint32_t Tid 0-4
1279
+ uint32_t start 4-8
1280
+ uint32_t end 8-12
1281
+ uint32_t step 12-16
1282
+ uint32_t span 16-20
1283
+ uint8_t type 20
1284
+ uint8_t padding
1285
+ uint16_t nItems 22
1286
+ nItems of:
1287
+ type 1: //12 bytes
1288
+ uint32_t start
1289
+ uint32_t end
1290
+ float value
1291
+ type 2: //8 bytes
1292
+ uint32_t start
1293
+ float value
1294
+ type 3: //4 bytes
1295
+ float value
1296
+
1297
+ data block index header
1298
+ uint32_t magic
1299
+ uint32_t blockSize (256 in the example) maximum number of children
1300
+ uint64_t number of blocks (2 / 110851)
1301
+ uint32_t startTid
1302
+ uint32_t startPos
1303
+ uint32_t endTid
1304
+ uint32_t endPos
1305
+ uint64_t index size? (0x1E7 / 0x1AF0401F) index address?
1306
+ uint32_t itemsPerBlock (1 / 1) 1024 for zoom headers 1024 for zoom headers
1307
+ uint32_t padding
1308
+
1309
+ data block index node non-leaf (4 bytes + 24*nChildren)
1310
+ uint8_t isLeaf
1311
+ uint8_t padding
1312
+ uint16_t nChildren (2, 256)
1313
+ uint32_t startTid
1314
+ uint32_t startPos
1315
+ uint32_t endTid
1316
+ uint32_t endPos
1317
+ uint64_t dataOffset (0x1AF05853, 0x1AF07057)
1318
+
1319
+ data block index node leaf (4 bytes + 32*nChildren)
1320
+ uint8_t isLeaf
1321
+ uint8_t padding
1322
+ uint16_t nChildren (2)
1323
+ uint32_t startTid
1324
+ uint32_t startPos
1325
+ uint32_t endTid
1326
+ uint32_t endPos
1327
+ uint64_t dataOffset (0x198, 0x1CF)
1328
+ uint64_t dataSize (55, 24)
1329
+
1330
+ zoom data block
1331
+ uint32_t number of blocks (10519766)
1332
+ some data blocks
1333
+ */