@gmod/bam 1.1.13 → 1.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/bamFile.ts ADDED
@@ -0,0 +1,546 @@
+ import crc32 from 'buffer-crc32'
+ import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
+ import entries from 'object.entries-ponyfill'
+ import { LocalFile, RemoteFile, GenericFilehandle } from 'generic-filehandle'
+ import AbortablePromiseCache from 'abortable-promise-cache'
+ import QuickLRU from 'quick-lru'
+ // locals
+ import BAI from './bai'
+ import CSI from './csi'
+ import Chunk from './chunk'
+ import BAMFeature from './record'
+ import IndexFile from './indexFile'
+ import { parseHeaderText } from './sam'
+ import {
+   abortBreakPoint,
+   checkAbortSignal,
+   timeout,
+   makeOpts,
+   BamOpts,
+   BaseOpts,
+ } from './util'
+
+ export const BAM_MAGIC = 21840194
+
+ const blockLen = 1 << 16
+
+ function flat<T>(arr: T[][]) {
+   return ([] as T[]).concat(...arr)
+ }
+
+ async function gen2array<T>(gen: AsyncIterable<T>): Promise<T[]> {
+   const out: T[] = []
+   for await (const x of gen) {
+     out.push(x)
+   }
+   return out
+ }
+
+ export default class BamFile {
+   private renameRefSeq: (a: string) => string
+   private bam: GenericFilehandle
+   private index: IndexFile
+   private chunkSizeLimit: number
+   private fetchSizeLimit: number
+   private header: any
+   protected chrToIndex: any
+   protected indexToChr: any
+   private yieldThreadTime: number
+
+   private featureCache = new AbortablePromiseCache({
+     //@ts-ignore
+     cache: new QuickLRU({
+       maxSize: 50,
+     }),
+     //@ts-ignore
+     fill: async ({ chunk, opts }, signal) => {
+       const { data, cpositions, dpositions } = await this._readChunk({
+         chunk,
+         opts: { ...opts, signal },
+       })
+       const feats = await this.readBamFeatures(
+         data,
+         cpositions,
+         dpositions,
+         chunk,
+       )
+       return feats
+     },
+   })
+
+   /**
+    * @param {object} args
+    * @param {string} [args.bamPath]
+    * @param {FileHandle} [args.bamFilehandle]
+    * @param {string} [args.baiPath]
+    * @param {FileHandle} [args.baiFilehandle]
+    */
+   constructor({
+     bamFilehandle,
+     bamPath,
+     bamUrl,
+     baiPath,
+     baiFilehandle,
+     baiUrl,
+     csiPath,
+     csiFilehandle,
+     csiUrl,
+     fetchSizeLimit,
+     chunkSizeLimit,
+     yieldThreadTime = 100,
+     renameRefSeqs = n => n,
+   }: {
+     bamFilehandle?: GenericFilehandle
+     bamPath?: string
+     bamUrl?: string
+     baiPath?: string
+     baiFilehandle?: GenericFilehandle
+     baiUrl?: string
+     csiPath?: string
+     csiFilehandle?: GenericFilehandle
+     csiUrl?: string
+     fetchSizeLimit?: number
+     chunkSizeLimit?: number
+     renameRefSeqs?: (a: string) => string
+     yieldThreadTime?: number
+   }) {
+     this.renameRefSeq = renameRefSeqs
+
+     if (bamFilehandle) {
+       this.bam = bamFilehandle
+     } else if (bamPath) {
+       this.bam = new LocalFile(bamPath)
+     } else if (bamUrl) {
+       this.bam = new RemoteFile(bamUrl)
+     } else {
+       throw new Error('unable to initialize bam')
+     }
+     if (csiFilehandle) {
+       this.index = new CSI({ filehandle: csiFilehandle })
+     } else if (csiPath) {
+       this.index = new CSI({ filehandle: new LocalFile(csiPath) })
+     } else if (csiUrl) {
+       this.index = new CSI({ filehandle: new RemoteFile(csiUrl) })
+     } else if (baiFilehandle) {
+       this.index = new BAI({ filehandle: baiFilehandle })
+     } else if (baiPath) {
+       this.index = new BAI({ filehandle: new LocalFile(baiPath) })
+     } else if (baiUrl) {
+       this.index = new BAI({ filehandle: new RemoteFile(baiUrl) })
+     } else if (bamPath) {
+       this.index = new BAI({ filehandle: new LocalFile(`${bamPath}.bai`) })
+     } else if (bamUrl) {
+       this.index = new BAI({ filehandle: new RemoteFile(`${bamUrl}.bai`) })
+     } else {
+       throw new Error('unable to infer index format')
+     }
+     this.fetchSizeLimit = fetchSizeLimit || 500000000 // 500MB
+     this.chunkSizeLimit = chunkSizeLimit || 300000000 // 300MB
+     this.yieldThreadTime = yieldThreadTime
+   }
+
+   async getHeader(origOpts: AbortSignal | BaseOpts = {}) {
+     const opts = makeOpts(origOpts)
+     const indexData = await this.index.parse(opts)
+     const ret = indexData.firstDataLine
+       ? indexData.firstDataLine.blockPosition + 65535
+       : undefined
+     let buffer
+     if (ret) {
+       const res = await this.bam.read(
+         Buffer.alloc(ret + blockLen),
+         0,
+         ret + blockLen,
+         0,
+         opts,
+       )
+
+       const { bytesRead } = res
+       ;({ buffer } = res)
+       if (!bytesRead) {
+         throw new Error('Error reading header')
+       }
+       if (bytesRead < ret) {
+         buffer = buffer.subarray(0, bytesRead)
+       } else {
+         buffer = buffer.subarray(0, ret)
+       }
+     } else {
+       buffer = (await this.bam.readFile(opts)) as Buffer
+     }
+
+     const uncba = await unzip(buffer)
+
+     if (uncba.readInt32LE(0) !== BAM_MAGIC) {
+       throw new Error('Not a BAM file')
+     }
+     const headLen = uncba.readInt32LE(4)
+
+     this.header = uncba.toString('utf8', 8, 8 + headLen)
+     const { chrToIndex, indexToChr } = await this._readRefSeqs(
+       headLen + 8,
+       65535,
+       opts,
+     )
+     this.chrToIndex = chrToIndex
+     this.indexToChr = indexToChr
+
+     return parseHeaderText(this.header)
+   }
+
+   async getHeaderText(opts: BaseOpts = {}) {
+     await this.getHeader(opts)
+     return this.header
+   }
+
+   // the full length of the refseq block is not given in advance so this grabs
+   // a chunk and doubles it if all refseqs haven't been processed
+   async _readRefSeqs(
+     start: number,
+     refSeqBytes: number,
+     opts: BaseOpts = {},
+   ): Promise<{
+     chrToIndex: { [key: string]: number }
+     indexToChr: { refName: string; length: number }[]
+   }> {
+     if (start > refSeqBytes) {
+       return this._readRefSeqs(start, refSeqBytes * 2, opts)
+     }
+     const size = refSeqBytes + blockLen
+     const { bytesRead, buffer } = await this.bam.read(
+       Buffer.alloc(size),
+       0,
+       refSeqBytes,
+       0,
+       opts,
+     )
+     if (!bytesRead) {
+       throw new Error('Error reading refseqs from header')
+     }
+     const uncba = await unzip(
+       buffer.subarray(0, Math.min(bytesRead, refSeqBytes)),
+     )
+     const nRef = uncba.readInt32LE(start)
+     let p = start + 4
+     const chrToIndex: { [key: string]: number } = {}
+     const indexToChr: { refName: string; length: number }[] = []
+     for (let i = 0; i < nRef; i += 1) {
+       const lName = uncba.readInt32LE(p)
+       const refName = this.renameRefSeq(
+         uncba.toString('utf8', p + 4, p + 4 + lName - 1),
+       )
+       const lRef = uncba.readInt32LE(p + lName + 4)
+
+       chrToIndex[refName] = i
+       indexToChr.push({ refName, length: lRef })
+
+       p = p + 8 + lName
+       if (p > uncba.length) {
+         console.warn(
+           `BAM header is very big. Re-fetching ${refSeqBytes} bytes.`,
+         )
+         return this._readRefSeqs(start, refSeqBytes * 2, opts)
+       }
+     }
+     return { chrToIndex, indexToChr }
+   }
+
+   async getRecordsForRange(
+     chr: string,
+     min: number,
+     max: number,
+     opts: BamOpts = {
+       viewAsPairs: false,
+       pairAcrossChr: false,
+       maxInsertSize: 200000,
+     },
+   ) {
+     return flat(
+       await gen2array(this.streamRecordsForRange(chr, min, max, opts)),
+     )
+   }
+
+   async *streamRecordsForRange(
+     chr: string,
+     min: number,
+     max: number,
+     opts: BamOpts = {},
+   ) {
+     const { signal } = opts
+     const chrId = this.chrToIndex && this.chrToIndex[chr]
+     let chunks: Chunk[]
+     if (!(chrId >= 0)) {
+       chunks = []
+     } else {
+       chunks = await this.index.blocksForRange(chrId, min - 1, max, opts)
+
+       if (!chunks) {
+         throw new Error('Error in index fetch')
+       }
+     }
+
+     for (let i = 0; i < chunks.length; i += 1) {
+       await abortBreakPoint(signal)
+       const size = chunks[i].fetchedSize()
+       if (size > this.chunkSizeLimit) {
+         throw new Error(
+           `Too many BAM features. BAM chunk size ${size} bytes exceeds chunkSizeLimit of ${this.chunkSizeLimit}`,
+         )
+       }
+     }
+
+     const totalSize = chunks
+       .map(s => s.fetchedSize())
+       .reduce((a, b) => a + b, 0)
+     if (totalSize > this.fetchSizeLimit) {
+       throw new Error(
+         `data size of ${totalSize.toLocaleString()} bytes exceeded fetch size limit of ${this.fetchSizeLimit.toLocaleString()} bytes`,
+       )
+     }
+     yield* this._fetchChunkFeatures(chunks, chrId, min, max, opts)
+   }
+
+   async *_fetchChunkFeatures(
+     chunks: Chunk[],
+     chrId: number,
+     min: number,
+     max: number,
+     opts: BamOpts,
+   ) {
+     const { viewAsPairs = false } = opts
+     const feats = []
+     let done = false
+
+     for (let i = 0; i < chunks.length; i++) {
+       const c = chunks[i]
+       const records = (await this.featureCache.get(
+         c.toString(),
+         {
+           chunk: c,
+           opts,
+         },
+         opts.signal,
+       )) as BAMFeature[]
+
+       const recs = []
+       for (let i = 0; i < records.length; i += 1) {
+         const feature = records[i]
+         if (feature.seq_id() === chrId) {
+           if (feature.get('start') >= max) {
+             // past end of range, can stop iterating
+             done = true
+             break
+           } else if (feature.get('end') >= min) {
+             // must be in range
+             recs.push(feature)
+           }
+         }
+       }
+       feats.push(recs)
+       yield recs
+       if (done) {
+         break
+       }
+     }
+
+     checkAbortSignal(opts.signal)
+     if (viewAsPairs) {
+       yield this.fetchPairs(chrId, feats, opts)
+     }
+   }
+
+   async fetchPairs(chrId: number, feats: BAMFeature[][], opts: BamOpts) {
+     const { pairAcrossChr = false, maxInsertSize = 200000 } = opts
+     const unmatedPairs: { [key: string]: boolean } = {}
+     const readIds: { [key: string]: number } = {}
+     feats.map(ret => {
+       const readNames: { [key: string]: number } = {}
+       for (let i = 0; i < ret.length; i++) {
+         const name = ret[i].name()
+         const id = ret[i].id()
+         if (!readNames[name]) {
+           readNames[name] = 0
+         }
+         readNames[name]++
+         readIds[id] = 1
+       }
+       entries(readNames).forEach(([k, v]: [string, number]) => {
+         if (v === 1) {
+           unmatedPairs[k] = true
+         }
+       })
+     })
+
+     const matePromises: Promise<Chunk[]>[] = []
+     feats.map(ret => {
+       for (let i = 0; i < ret.length; i++) {
+         const f = ret[i]
+         const name = f.name()
+         const start = f.get('start')
+         const pnext = f._next_pos()
+         const rnext = f._next_refid()
+         if (
+           unmatedPairs[name] &&
+           (pairAcrossChr ||
+             (rnext === chrId && Math.abs(start - pnext) < maxInsertSize))
+         ) {
+           matePromises.push(
+             this.index.blocksForRange(rnext, pnext, pnext + 1, opts),
+           )
+         }
+       }
+     })
+
+     // filter out duplicate chunks (the blocks are lists of chunks, blocks are
+     // concatenated, then filter dup chunks)
+     const mateChunks = flat(await Promise.all(matePromises))
+       .sort()
+       .filter(
+         (item, pos, ary) => !pos || item.toString() !== ary[pos - 1].toString(),
+       )
+
+     const mateTotalSize = mateChunks
+       .map(s => s.fetchedSize())
+       .reduce((a, b) => a + b, 0)
+     if (mateTotalSize > this.fetchSizeLimit) {
+       throw new Error(
+         `data size of ${mateTotalSize.toLocaleString()} bytes exceeded fetch size limit of ${this.fetchSizeLimit.toLocaleString()} bytes`,
+       )
+     }
+     const mateFeatPromises = mateChunks.map(async c => {
+       const { data, cpositions, dpositions, chunk } = await this._readChunk({
+         chunk: c,
+         opts,
+       })
+       const feats = await this.readBamFeatures(
+         data,
+         cpositions,
+         dpositions,
+         chunk,
+       )
+       const mateRecs = []
+       for (let i = 0; i < feats.length; i += 1) {
+         const feature = feats[i]
+         if (unmatedPairs[feature.get('name')] && !readIds[feature.id()]) {
+           mateRecs.push(feature)
+         }
+       }
+       return mateRecs
+     })
+     return flat(await Promise.all(mateFeatPromises))
+   }
+
+   async _readChunk({ chunk, opts }: { chunk: Chunk; opts: BaseOpts }) {
+     const size = chunk.fetchedSize()
+     const { buffer, bytesRead } = await this.bam.read(
+       Buffer.alloc(size),
+       0,
+       size,
+       chunk.minv.blockPosition,
+       opts,
+     )
+
+     const {
+       buffer: data,
+       cpositions,
+       dpositions,
+     } = await unzipChunkSlice(
+       buffer.subarray(0, Math.min(bytesRead, size)),
+       chunk,
+     )
+     return { data, cpositions, dpositions, chunk }
+   }
+
+   async readBamFeatures(
+     ba: Buffer,
+     cpositions: number[],
+     dpositions: number[],
+     chunk: Chunk,
+   ) {
+     let blockStart = 0
+     const sink = []
+     let pos = 0
+     let last = +Date.now()
+
+     while (blockStart + 4 < ba.length) {
+       const blockSize = ba.readInt32LE(blockStart)
+       const blockEnd = blockStart + 4 + blockSize - 1
+
+       // advance pos to the dpositions entry for the current decompressed offset
+       if (dpositions) {
+         while (blockStart + chunk.minv.dataPosition >= dpositions[pos++]) {}
+         pos--
+       }
+
+       // only try to read the feature if we have all the bytes for it
+       if (blockEnd < ba.length) {
+         const feature = new BAMFeature({
+           bytes: {
+             byteArray: ba,
+             start: blockStart,
+             end: blockEnd,
+           },
+           // the below results in an automatically calculated file-offset based
+           // ID if the info for that is available, otherwise a crc32 of the
+           // feature's bytes
+           //
+           // cpositions[pos] refers to the actual file offset of a bgzip block
+           // boundary
+           //
+           // we multiply by (1 << 8) in order to make sure each block has a
+           // "unique" address space so that data in that block could never
+           // overlap
+           //
+           // then blockStart - dpositions[pos] is an uncompressed file offset
+           // from that bgzip block boundary, and since the cpositions are
+           // multiplied by (1 << 8) these uncompressed offsets get a unique
+           // space
+           //
+           // chunk.minv.dataPosition is added on because blockStart starts at 0
+           // instead of chunk.minv.dataPosition
+           //
+           // the +1 is just to avoid any possible uniqueId of 0, though this
+           // does not realistically happen
+           fileOffset: cpositions
+             ? cpositions[pos] * (1 << 8) +
+               (blockStart - dpositions[pos]) +
+               chunk.minv.dataPosition +
+               1
+             : // must be slice, not subarray for buffer polyfill on web
+               crc32.signed(ba.slice(blockStart, blockEnd)),
+         })
+
+         sink.push(feature)
+         if (this.yieldThreadTime && +Date.now() - last > this.yieldThreadTime) {
+           await timeout(1)
+           last = +Date.now()
+         }
+       }
+
+       blockStart = blockEnd + 1
+     }
+     return sink
+   }
+
+   async hasRefSeq(seqName: string) {
+     const refId = this.chrToIndex && this.chrToIndex[seqName]
+     return this.index.hasRefSeq(refId)
+   }
+
+   async lineCount(seqName: string) {
+     const refId = this.chrToIndex && this.chrToIndex[seqName]
+     return this.index.lineCount(refId)
+   }
+
+   async indexCov(seqName: string, start?: number, end?: number) {
+     await this.index.parse()
+     const seqId = this.chrToIndex && this.chrToIndex[seqName]
+     return this.index.indexCov(seqId, start, end)
+   }
+
+   async blocksForRange(
+     seqName: string,
+     start: number,
+     end: number,
+     opts?: BaseOpts,
+   ) {
+     await this.index.parse()
+     const seqId = this.chrToIndex && this.chrToIndex[seqName]
+     return this.index.blocksForRange(seqId, start, end, opts)
+   }
+ }
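A minimal usage sketch of the class added above (illustrative only, not part of the published diff; the file name and region are made up). BamFile is the default export of src/bamFile.ts; getHeader() parses the index and BAM header and fills the refName-to-refId mapping, after which getRecordsForRange() returns the records overlapping the requested region. When no index option is given, the .bai path is inferred from bamPath.

import BamFile from './bamFile'

async function dumpRegion() {
  // hypothetical local file; a matching volvox-sorted.bam.bai is assumed to exist
  const bam = new BamFile({ bamPath: 'volvox-sorted.bam' })

  // must run first: populates chrToIndex used by getRecordsForRange()
  await bam.getHeader()

  // fetch all records overlapping the region on reference 'ctgA'
  const records = await bam.getRecordsForRange('ctgA', 0, 50000)
  for (const r of records) {
    console.log(r.get('name'), r.get('start'), r.get('end'))
  }
}

dumpRegion().catch(console.error)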
package/src/chunk.ts ADDED
@@ -0,0 +1,52 @@
+ import VirtualOffset from './virtualOffset'
+
+ // little class representing a chunk in the index
+ export default class Chunk {
+   public minv: VirtualOffset
+   public maxv: VirtualOffset
+   public bin: number
+   public _fetchedSize?: number
+
+   /**
+    * @param {VirtualOffset} minv
+    * @param {VirtualOffset} maxv
+    * @param {number} bin
+    * @param {number} [fetchedSize]
+    */
+   constructor(
+     minv: VirtualOffset,
+     maxv: VirtualOffset,
+     bin: number,
+     fetchedSize = undefined,
+   ) {
+     this.minv = minv
+     this.maxv = maxv
+     this.bin = bin
+     this._fetchedSize = fetchedSize
+   }
+
+   toUniqueString() {
+     return `${this.minv}..${this.maxv} (bin ${
+       this.bin
+     }, fetchedSize ${this.fetchedSize()})`
+   }
+
+   toString() {
+     return this.toUniqueString()
+   }
+
+   compareTo(b: Chunk) {
+     return (
+       this.minv.compareTo(b.minv) ||
+       this.maxv.compareTo(b.maxv) ||
+       this.bin - b.bin
+     )
+   }
+
+   fetchedSize() {
+     if (this._fetchedSize !== undefined) {
+       return this._fetchedSize
+     }
+     return this.maxv.blockPosition + (1 << 16) - this.minv.blockPosition
+   }
+ }
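A short worked example of the fetchedSize() fallback above (illustrative only, not part of the diff): when no explicit _fetchedSize was recorded, the estimate spans from the chunk's starting compressed block position to its ending block position, padded by one maximum-size (64 KiB) bgzf block.

// hypothetical numbers: a chunk whose minv.blockPosition is 1000 and whose
// maxv.blockPosition is 5000 reports 5000 + (1 << 16) - 1000 bytes to fetch
const estimatedBytes = 5000 + (1 << 16) - 1000
console.log(estimatedBytes) // 69536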
@@ -0,0 +1,26 @@
+ export default {
+   // the read is paired in sequencing, no matter whether it is mapped in a pair
+   BAM_FPAIRED: 1,
+   // the read is mapped in a proper pair
+   BAM_FPROPER_PAIR: 2,
+   // the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
+   BAM_FUNMAP: 4,
+   // the mate is unmapped
+   BAM_FMUNMAP: 8,
+   // the read is mapped to the reverse strand
+   BAM_FREVERSE: 16,
+   // the mate is mapped to the reverse strand
+   BAM_FMREVERSE: 32,
+   // this is read1
+   BAM_FREAD1: 64,
+   // this is read2
+   BAM_FREAD2: 128,
+   // not primary alignment
+   BAM_FSECONDARY: 256,
+   // QC failure
+   BAM_FQCFAIL: 512,
+   // optical or PCR duplicate
+   BAM_FDUP: 1024,
+   // supplementary alignment
+   BAM_FSUPPLEMENTARY: 2048,
+ }
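A minimal sketch of how these masks are typically used (illustrative only, not part of the diff): a BAM record's FLAG field is a bitwise OR of the constants above, so individual properties are tested with a bitwise AND. The flag value 99 below is a made-up example.

// values mirror the flag table above (standard SAM flag bits)
const BAM_FPAIRED = 1
const BAM_FREVERSE = 16
const BAM_FREAD1 = 64

const flag = 99 // 1 + 2 + 32 + 64: paired, proper pair, mate reversed, read1

console.log(Boolean(flag & BAM_FPAIRED)) // true: read is paired
console.log(Boolean(flag & BAM_FREVERSE)) // false: read itself is not reverse-strand
console.log(Boolean(flag & BAM_FREAD1)) // true: this is read1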