@gmod/bam 1.1.13 → 1.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/csi.ts ADDED
@@ -0,0 +1,246 @@
1
+ import Long from 'long'
2
+ import { unzip } from '@gmod/bgzf-filehandle'
3
+ import VirtualOffset, { fromBytes } from './virtualOffset'
4
+ import Chunk from './chunk'
5
+ import { longToNumber, abortBreakPoint, optimizeChunks, BaseOpts } from './util'
6
+
7
+ import IndexFile from './indexFile'
8
+
9
// Magic numbers identifying CSI index files: the little-endian uint32
// reading of the leading bytes "CSI\1" (0x01495343) and "CSI\2"
// (0x02495343) respectively.
const CSI1_MAGIC = 21582659 // CSI\1
const CSI2_MAGIC = 38359875 // CSI\2
11
+
12
+ function lshift(num: number, bits: number) {
13
+ return num * 2 ** bits
14
+ }
15
+ function rshift(num: number, bits: number) {
16
+ return Math.floor(num / 2 ** bits)
17
+ }
18
+
19
/**
 * Reader for CSI (".csi") index files, versions 1 and 2 — the
 * variable-depth binning index format for coordinate-sorted BAM/tabular
 * data. Parses the index and answers queries for the file chunks that
 * may contain features overlapping a genomic range.
 */
export default class CSI extends IndexFile {
  // Binning-scheme parameters; populated from the index header by _parse()
  private maxBinNumber: number
  private depth: number
  private minShift: number
  constructor(args: any) {
    super(args)
    this.maxBinNumber = 0
    this.depth = 0
    this.minShift = 0
  }
  /**
   * Get the number of data lines recorded for a reference sequence.
   *
   * @param refId - numeric reference sequence ID
   * @returns the line count from the index's pseudo-bin stats, or -1 if
   *   the index has no entry (or no stats) for this reference
   */
  async lineCount(refId: number): Promise<number> {
    const indexData = await this.parse()
    if (!indexData) {
      return -1
    }
    const idx = indexData.indices[refId]
    if (!idx) {
      return -1
    }
    const { stats } = indexData.indices[refId]
    if (stats) {
      return stats.lineCount
    }
    return -1
  }

  /** Coverage estimation is not supported for CSI; always returns []. */
  async indexCov() {
    return []
  }

  /**
   * Parse the tabix-style auxiliary data section of the index header.
   *
   * @param bytes - uncompressed index bytes
   * @param offset - byte offset where the aux section begins
   * @param auxLength - length in bytes of the aux section
   * @returns tabix preset settings plus refName<->refId maps, or an
   *   empty object if the section is too short to contain them
   */
  parseAuxData(bytes: Buffer, offset: number, auxLength: number) {
    if (auxLength < 30) {
      return {}
    }

    const data: { [key: string]: any } = {}
    data.formatFlags = bytes.readInt32LE(offset)
    data.coordinateType =
      data.formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
    data.format = (
      { 0: 'generic', 1: 'SAM', 2: 'VCF' } as {
        [key: number]: string
      }
    )[data.formatFlags & 0xf]
    if (!data.format) {
      throw new Error(`invalid Tabix preset format flags ${data.formatFlags}`)
    }
    // column numbers of the reference-name, start, and end fields
    data.columnNumbers = {
      ref: bytes.readInt32LE(offset + 4),
      start: bytes.readInt32LE(offset + 8),
      end: bytes.readInt32LE(offset + 12),
    }
    data.metaValue = bytes.readInt32LE(offset + 16)
    // character introducing comment/meta lines (e.g. '#'), if any
    data.metaChar = data.metaValue ? String.fromCharCode(data.metaValue) : ''
    data.skipLines = bytes.readInt32LE(offset + 20)
    const nameSectionLength = bytes.readInt32LE(offset + 24)

    Object.assign(
      data,
      this._parseNameBytes(
        bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
      ),
    )
    return data
  }

  /**
   * Split a block of NUL-terminated reference sequence names into
   * id->name and name->id lookup maps, applying the renameRefSeq hook to
   * each name.
   */
  _parseNameBytes(namesBytes: Buffer) {
    let currRefId = 0
    let currNameStart = 0
    const refIdToName = []
    const refNameToId: { [key: string]: number } = {}
    for (let i = 0; i < namesBytes.length; i += 1) {
      // a NUL byte terminates the current name
      if (!namesBytes[i]) {
        if (currNameStart < i) {
          let refName = namesBytes.toString('utf8', currNameStart, i)
          refName = this.renameRefSeq(refName)
          refIdToName[currRefId] = refName
          refNameToId[refName] = currRefId
        }
        currNameStart = i + 1
        currRefId += 1
      }
    }
    return { refNameToId, refIdToName }
  }

  // fetch and parse the index
  async _parse(opts: { signal?: AbortSignal }) {
    const data: { [key: string]: any } = { csi: true, maxBlockSize: 1 << 16 }
    const buffer = (await this.filehandle.readFile(opts)) as Buffer
    const bytes = await unzip(buffer)

    // check CSI magic numbers ("CSI\1" or "CSI\2")
    if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
      data.csiVersion = 1
    } else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
      data.csiVersion = 2
    } else {
      throw new Error('Not a CSI file')
      // TODO: do we need to support big-endian CSI files?
    }

    this.minShift = bytes.readInt32LE(4)
    this.depth = bytes.readInt32LE(8)
    // total bins in a scheme of this depth: (8^(depth+1) - 1) / 7
    this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
    const auxLength = bytes.readInt32LE(12)
    if (auxLength) {
      Object.assign(data, this.parseAuxData(bytes, 16, auxLength))
    }
    data.refCount = bytes.readInt32LE(16 + auxLength)

    // read the indexes for each reference sequence
    data.indices = new Array(data.refCount)
    let currOffset = 16 + auxLength + 4
    for (let i = 0; i < data.refCount; i += 1) {
      await abortBreakPoint(opts.signal)
      // the binning index
      const binCount = bytes.readInt32LE(currOffset)
      currOffset += 4
      const binIndex: { [key: string]: Chunk[] } = {}
      let stats // < provided by parsing a pseudo-bin, if present
      for (let j = 0; j < binCount; j += 1) {
        const bin = bytes.readUInt32LE(currOffset)
        if (bin > this.maxBinNumber) {
          // this is a fake bin that actually has stats information
          // about the reference sequence in it
          stats = this.parsePseudoBin(bytes, currOffset + 4)
          // advance past bin number, loffset, n_chunk, and two stats chunks
          currOffset += 4 + 8 + 4 + 16 + 16
        } else {
          const loffset = fromBytes(bytes, currOffset + 4)
          this._findFirstData(data, loffset)
          const chunkCount = bytes.readInt32LE(currOffset + 12)
          currOffset += 16
          const chunks = new Array(chunkCount)
          for (let k = 0; k < chunkCount; k += 1) {
            // each chunk is a [start, end] pair of virtual offsets
            const u = fromBytes(bytes, currOffset)
            const v = fromBytes(bytes, currOffset + 8)
            currOffset += 16
            // this._findFirstData(data, u)
            chunks[k] = new Chunk(u, v, bin)
          }
          binIndex[bin] = chunks
        }
      }

      data.indices[i] = { binIndex, stats }
    }

    return data
  }

  /**
   * Parse a pseudo-bin's stats block. Reads the 64-bit value 28 bytes
   * past `offset` (the second stats chunk — presumably the mapped-record
   * count, by BAI pseudo-bin convention; verify against spec) and
   * reports it as lineCount.
   */
  parsePseudoBin(bytes: Buffer, offset: number) {
    const lineCount = longToNumber(
      Long.fromBytesLE(
        Array.prototype.slice.call(bytes, offset + 28, offset + 36),
        true,
      ),
    )
    return { lineCount }
  }

  /**
   * Get the list of file chunks that may contain features overlapping
   * [min, max) on the given reference.
   *
   * @param refId - numeric reference sequence ID
   * @param min - query start (clamped to 0 if negative)
   * @param max - query end
   * @param opts - abort signal etc.
   * @returns merged/optimized list of Chunks, or [] if the index has no
   *   entry for refId
   */
  async blocksForRange(
    refId: number,
    min: number,
    max: number,
    opts: BaseOpts = {},
  ) {
    if (min < 0) {
      min = 0
    }

    const indexData = await this.parse(opts)
    if (!indexData) {
      return []
    }
    const ba = indexData.indices[refId]
    if (!ba) {
      return []
    }

    const overlappingBins = this.reg2bins(min, max) // List of bin #s that overlap min, max
    const chunks: Chunk[] = []

    // Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
    for (const [start, end] of overlappingBins) {
      for (let bin = start; bin <= end; bin++) {
        if (ba.binIndex[bin]) {
          const binChunks = ba.binIndex[bin]
          for (let c = 0; c < binChunks.length; ++c) {
            chunks.push(new Chunk(binChunks[c].minv, binChunks[c].maxv, bin))
          }
        }
      }
    }

    return optimizeChunks(chunks, new VirtualOffset(0, 0))
  }

  /**
   * Calculate the list of bins that may overlap with region [beg,end)
   * (zero-based half-open).
   *
   * @returns array of [start, end] inclusive bin-number ranges, one
   *   entry per level of the binning scheme
   * @throws Error if the query would span more bins than the scheme allows
   */
  reg2bins(beg: number, end: number) {
    beg -= 1 // < convert to 1-based closed
    if (beg < 1) {
      beg = 1
    }
    if (end > 2 ** 50) {
      end = 2 ** 34
    } // 17 GiB ought to be enough for anybody
    end -= 1
    let l = 0
    let t = 0
    let s = this.minShift + this.depth * 3
    const bins = []
    // walk from the coarsest level (whole-axis bins) down to the leaves,
    // shrinking the shift by 3 and offsetting t past each level's bins
    for (; l <= this.depth; s -= 3, t += lshift(1, l * 3), l += 1) {
      const b = t + rshift(beg, s)
      const e = t + rshift(end, s)
      if (e - b + bins.length > this.maxBinNumber) {
        throw new Error(
          `query ${beg}-${end} is too large for current binning scheme (shift ${this.minShift}, depth ${this.depth}), try a smaller query or a coarser index binning scheme`,
        )
      }
      bins.push([b, e])
    }
    return bins
  }
}
@@ -0,0 +1,2 @@
1
// Ambient module declarations: these dependencies ship without type
// definitions, so declare them (with an implicit `any` shape) to allow
// importing them under strict compiler settings.
declare module 'object.entries-ponyfill'
declare module '@gmod/bgzf-filehandle'
package/src/errors.ts ADDED
@@ -0,0 +1,22 @@
1
+ export class BamError extends Error {}
2
+
3
+ /** Error caused by encountering a part of the BAM spec that has not yet been implemented */
4
+ export class BamUnimplementedError extends Error {}
5
+
6
+ /** An error caused by malformed data. */
7
+ export class BamMalformedError extends BamError {}
8
+
9
+ /**
10
+ * An error caused by attempting to read beyond the end of the defined data.
11
+ */
12
+ export class BamBufferOverrunError extends BamMalformedError {}
13
+
14
+ /**
15
+ * An error caused by data being too big, exceeding a size limit.
16
+ */
17
+ export class BamSizeLimitError extends BamError {}
18
+
19
+ /**
20
+ * An invalid argument was supplied to a bam-js method or object.
21
+ */
22
+ export class BamArgumentError extends BamError {}
package/src/htsget.ts ADDED
@@ -0,0 +1,138 @@
1
+ import { BaseOpts, BamOpts } from './util'
2
+ import BamFile, { BAM_MAGIC } from './bamFile'
3
+ import 'cross-fetch/polyfill'
4
+ import Chunk from './chunk'
5
+ import { unzip } from '@gmod/bgzf-filehandle'
6
+ import { parseHeaderText } from './sam'
7
+
8
/** One tag/value pair from a parsed SAM header line (e.g. SN:chr1). */
interface HeaderLine {
  tag: string
  value: string
}

/**
 * One entry of an htsget response's `urls` array: either an inline
 * `data:` URI or a fetchable URL with optional per-request headers.
 */
interface HtsgetChunk {
  url: string
  headers?: Record<string, string>
}
17
+ async function concat(arr: { url: string }[], opts: Record<string, any>) {
18
+ const res = await Promise.all(
19
+ arr.map(async (chunk: HtsgetChunk) => {
20
+ const { url, headers } = chunk
21
+ if (url.startsWith('data:')) {
22
+ return Buffer.from(url.split(',')[1], 'base64')
23
+ } else {
24
+ //remove referer header, it is not even allowed to be specified
25
+ //@ts-ignore
26
+ //eslint-disable-next-line @typescript-eslint/no-unused-vars
27
+ const { referer, ...rest } = headers
28
+ const res = await fetch(url, {
29
+ ...opts,
30
+ headers: { ...opts.headers, ...rest },
31
+ })
32
+ if (!res.ok) {
33
+ throw new Error(`Failed to fetch ${res.statusText}`)
34
+ }
35
+ return Buffer.from(await res.arrayBuffer())
36
+ }
37
+ }),
38
+ )
39
+
40
+ return Buffer.concat(await Promise.all(res.map(elt => unzip(elt))))
41
+ }
42
+
43
/**
 * BamFile subclass that retrieves data via the htsget protocol instead
 * of byte-range reads on .bam/.bai files. Asks the htsget server for
 * BAM-format blocks for a region and feeds them through the normal
 * BamFile record-parsing machinery.
 */
export default class HtsgetFile extends BamFile {
  private baseUrl: string

  private trackId: string

  /**
   * @param args.baseUrl - base URL of the htsget endpoint
   * @param args.trackId - dataset/track identifier appended to baseUrl
   */
  constructor(args: { trackId: string; baseUrl: string }) {
    // @ts-ignore override bam defaults
    super({ bamFilehandle: '?', baiFilehandle: '?' })
    this.baseUrl = args.baseUrl
    this.trackId = args.trackId
  }

  /**
   * Stream records overlapping [min, max) on `chr`. Fetches the htsget
   * url list for the region, downloads and concatenates the data blocks
   * (the first url is skipped — presumably the header block; verify
   * against the htsget spec), and yields parsed records via
   * _fetchChunkFeatures.
   */
  async *streamRecordsForRange(
    chr: string,
    min: number,
    max: number,
    opts: BamOpts = {
      viewAsPairs: false,
      pairAcrossChr: false,
      maxInsertSize: 200000,
    },
  ) {
    const base = `${this.baseUrl}/${this.trackId}`
    const url = `${base}?referenceName=${chr}&start=${min}&end=${max}&format=BAM`
    const chrId = this.chrToIndex && this.chrToIndex[chr]
    const result = await fetch(url, { ...opts })
    if (!result.ok) {
      throw new Error(result.statusText)
    }
    const data = await result.json()
    const uncba = await concat(data.htsget.urls.slice(1), opts)
    // wrap the in-memory data in a pseudo-chunk so the inherited
    // _fetchChunkFeatures/_readChunk pipeline can consume it
    const chunk = {
      buffer: uncba,
      chunk: { minv: { dataPosition: 0 } },
      toString() {
        return `${chr}_${min}_${max}`
      },
    }

    yield* this._fetchChunkFeatures(
      // @ts-ignore
      [chunk],
      chrId,
      min,
      max,
      opts,
    )
  }

  /**
   * Overridden: the chunk's data is already in memory (fetched by
   * streamRecordsForRange), so just unwrap it instead of reading from a
   * file handle.
   */
  //@ts-ignore
  async _readChunk(params: {
    chunk: { buffer: Buffer; chunk: Chunk }
    opts: BaseOpts
  }) {
    const { chunk } = params
    const { buffer, chunk: c2 } = chunk
    return { data: buffer, cpositions: null, dpositions: null, chunk: c2 }
  }

  /**
   * Fetch and parse the SAM header via the htsget `class=header` query,
   * populating chrToIndex/indexToChr from the @SQ lines.
   *
   * @returns the parsed SAM header
   * @throws Error if the fetch fails or the payload is not BAM
   */
  async getHeader(opts: BaseOpts = {}) {
    const url = `${this.baseUrl}/${this.trackId}?referenceName=na&class=header`
    const result = await fetch(url, opts)
    if (!result.ok) {
      throw new Error(`Failed to fetch ${result.statusText}`)
    }
    const data = await result.json()
    const uncba = await concat(data.htsget.urls, opts)

    if (uncba.readInt32LE(0) !== BAM_MAGIC) {
      throw new Error('Not a BAM file')
    }
    const headLen = uncba.readInt32LE(4)

    const headerText = uncba.toString('utf8', 8, 8 + headLen)
    const samHeader = parseHeaderText(headerText)

    // use the @SQ lines in the header to figure out the
    // mapping between ref ID numbers and names
    const idToName: string[] = []
    const nameToId: Record<string, number> = {}
    const sqLines = samHeader.filter((l: { tag: string }) => l.tag === 'SQ')
    sqLines.forEach((sqLine: { data: HeaderLine[] }, refId: number) => {
      sqLine.data.forEach((item: HeaderLine) => {
        if (item.tag === 'SN') {
          // this is the ref name
          const refName = item.value
          nameToId[refName] = refId
          idToName[refId] = refName
        }
      })
    })
    this.chrToIndex = nameToId
    this.indexToChr = idToName
    return samHeader
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,7 @@
1
+ import BAI from './bai'
2
+ import CSI from './csi'
3
+ import BamFile from './bamFile'
4
+ import HtsgetFile from './htsget'
5
+ import BamRecord from './record'
6
+
7
+ export { BAI, CSI, BamFile, BamRecord, HtsgetFile }
@@ -0,0 +1,63 @@
1
+ import { GenericFilehandle } from 'generic-filehandle'
2
+ import VirtualOffset from './virtualOffset'
3
+ import Chunk from './chunk'
4
+ import { BaseOpts } from './util'
5
+
6
/**
 * Abstract base class for index files (e.g. BAI, CSI). Handles one-time
 * parsing and caching of the index; subclasses implement the
 * format-specific parsing and range queries.
 */
export default abstract class IndexFile {
  public filehandle: GenericFilehandle
  public renameRefSeq: (s: string) => string
  // cached promise for the parsed index; cleared again if parsing fails
  public setupP?: Promise<any>

  /**
   * @param filehandle - handle used to read the index file's bytes
   * @param renameRefSeq - optional transform applied to reference
   *   sequence names as they are read from the index (default: identity)
   */
  constructor({
    filehandle,
    renameRefSeq = (n: string) => n,
  }: {
    filehandle: GenericFilehandle
    renameRefSeq?: (a: string) => string
  }) {
    this.filehandle = filehandle
    this.renameRefSeq = renameRefSeq
  }
  /** Number of data lines indexed for the given reference, or -1 if unknown. */
  public abstract lineCount(refId: number): Promise<number>
  /** Format-specific parse of the raw index bytes. */
  protected abstract _parse(opts?: BaseOpts): Promise<any>
  /** Per-window coverage estimate derived from the index, if the format supports it. */
  public abstract indexCov(
    refId: number,
    start?: number,
    end?: number,
  ): Promise<{ start: number; end: number; score: number }[]>

  /** File chunks that may contain features overlapping the given range. */
  public abstract blocksForRange(
    chrId: number,
    start: number,
    end: number,
    opts?: BaseOpts,
  ): Promise<Chunk[]>

  // Track the smallest virtual offset seen so far in data.firstDataLine
  // (i.e. where the first data record begins).
  _findFirstData(data: any, virtualOffset: VirtualOffset) {
    const currentFdl = data.firstDataLine
    if (currentFdl) {
      data.firstDataLine =
        currentFdl.compareTo(virtualOffset) > 0 ? virtualOffset : currentFdl
    } else {
      data.firstDataLine = virtualOffset
    }
  }

  /**
   * Parse the index if it has not been parsed already, caching the
   * in-flight promise so concurrent callers share one parse. A failed
   * parse clears the cache so a later call can retry.
   */
  async parse(opts: BaseOpts = {}) {
    if (!this.setupP) {
      this.setupP = this._parse(opts).catch(e => {
        this.setupP = undefined
        throw e
      })
    }
    return this.setupP
  }

  /** Whether the index has a binning-index entry for the given refseq ID. */
  async hasRefSeq(seqId: number, opts: BaseOpts = {}) {
    return !!((await this.parse(opts)).indices[seqId] || {}).binIndex
  }
}