@gmod/cram 1.6.1 → 1.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/dist/craiIndex.js +1 -1
- package/dist/craiIndex.js.map +1 -1
- package/dist/cram-bundle.js +3 -3
- package/dist/cramFile/codecs/byteArrayLength.d.ts +1 -1
- package/dist/cramFile/codecs/byteArrayLength.js +1 -1
- package/dist/cramFile/codecs/byteArrayLength.js.map +1 -1
- package/dist/cramFile/codecs/byteArrayStop.js +1 -2
- package/dist/cramFile/codecs/byteArrayStop.js.map +1 -1
- package/dist/cramFile/codecs/external.js +1 -3
- package/dist/cramFile/codecs/external.js.map +1 -1
- package/dist/cramFile/container/compressionScheme.d.ts +1 -1
- package/dist/cramFile/container/compressionScheme.js +6 -4
- package/dist/cramFile/container/compressionScheme.js.map +1 -1
- package/dist/cramFile/file.js +17 -0
- package/dist/cramFile/file.js.map +1 -1
- package/dist/cramFile/slice/decodeRecord.js +62 -58
- package/dist/cramFile/slice/decodeRecord.js.map +1 -1
- package/dist/cramFile/slice/index.js +8 -5
- package/dist/cramFile/slice/index.js.map +1 -1
- package/esm/craiIndex.js +1 -1
- package/esm/craiIndex.js.map +1 -1
- package/esm/cramFile/codecs/byteArrayLength.d.ts +1 -1
- package/esm/cramFile/codecs/byteArrayLength.js +1 -1
- package/esm/cramFile/codecs/byteArrayLength.js.map +1 -1
- package/esm/cramFile/codecs/byteArrayStop.js +1 -2
- package/esm/cramFile/codecs/byteArrayStop.js.map +1 -1
- package/esm/cramFile/codecs/external.js +1 -3
- package/esm/cramFile/codecs/external.js.map +1 -1
- package/esm/cramFile/container/compressionScheme.d.ts +1 -1
- package/esm/cramFile/container/compressionScheme.js +6 -4
- package/esm/cramFile/container/compressionScheme.js.map +1 -1
- package/esm/cramFile/file.js +17 -0
- package/esm/cramFile/file.js.map +1 -1
- package/esm/cramFile/slice/decodeRecord.js +63 -59
- package/esm/cramFile/slice/decodeRecord.js.map +1 -1
- package/esm/cramFile/slice/index.js +11 -7
- package/esm/cramFile/slice/index.js.map +1 -1
- package/package.json +2 -1
- package/src/craiIndex.js +180 -0
- package/src/cramFile/codecs/_base.js +49 -0
- package/src/cramFile/codecs/beta.js +23 -0
- package/src/cramFile/codecs/byteArrayLength.js +55 -0
- package/src/cramFile/codecs/byteArrayStop.js +49 -0
- package/src/cramFile/codecs/external.js +52 -0
- package/src/cramFile/codecs/gamma.js +30 -0
- package/src/cramFile/codecs/huffman.js +137 -0
- package/src/cramFile/codecs/index.js +38 -0
- package/src/cramFile/codecs/subexp.js +32 -0
- package/src/cramFile/constants.js +55 -0
- package/src/cramFile/container/compressionScheme.js +143 -0
- package/src/cramFile/container/index.js +119 -0
- package/src/cramFile/file.js +363 -0
- package/src/cramFile/index.js +3 -0
- package/src/cramFile/record.js +337 -0
- package/src/cramFile/sectionParsers.js +379 -0
- package/src/cramFile/slice/decodeRecord.js +359 -0
- package/src/cramFile/slice/index.js +501 -0
- package/src/cramFile/util.js +169 -0
- package/src/errors.js +22 -0
- package/src/index.js +5 -0
- package/src/indexedCramFile.js +191 -0
- package/src/io/bufferCache.js +66 -0
- package/src/io/index.js +26 -0
- package/src/io/localFile.js +35 -0
- package/src/io/remoteFile.js +71 -0
- package/src/rans/README.md +1 -0
- package/src/rans/constants.js +5 -0
- package/src/rans/d04.js +83 -0
- package/src/rans/d14.js +59 -0
- package/src/rans/decoding.js +141 -0
- package/src/rans/frequencies.js +121 -0
- package/src/rans/index.js +249 -0
- package/src/sam.js +15 -0
- package/src/unzip-pako.ts +5 -0
- package/src/unzip.ts +2 -0
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
import Long from 'long'
|
|
2
|
+
import { CramMalformedError } from '../../errors'
|
|
3
|
+
import CramRecord from '../record'
|
|
4
|
+
import Constants from '../constants'
|
|
5
|
+
/**
 * Decode the leading bytes of a byte sequence as a string, stopping at the
 * first zero byte or at the end of the sequence, whichever comes first.
 * Each byte is mapped to a character with String.fromCharCode.
 *
 * @private
 * @param {Uint8Array|number[]} buffer - indexable sequence of byte values
 * @returns {string} the characters found before the terminator
 */
function readNullTerminatedString(buffer) {
  const chars = []
  let idx = 0
  while (idx < buffer.length && buffer[idx] !== 0) {
    chars.push(String.fromCharCode(buffer[idx]))
    idx += 1
  }
  return chars.join('')
}
|
|
16
|
+
|
|
17
|
+
/**
 * Parse a BAM tag's array ('B' type) value from a binary buffer.
 *
 * Layout as read here: byte 0 is the element type character, bytes 1-4 hold
 * the element count as a little-endian 32-bit integer, and the packed
 * little-endian elements start at byte 5.
 *
 * All reads go through a DataView anchored at the view's own byteOffset.
 * `buffer.buffer` is the whole backing ArrayBuffer, which for a subarray or a
 * pooled Node Buffer starts before the tag data, so it must not be wrapped
 * directly in a typed array. A DataView also has no alignment requirement.
 *
 * @private
 * @param {Uint8Array} buffer - raw tag bytes (a Node Buffer is a Uint8Array)
 * @returns {number[]} the decoded array elements
 * @throws {Error} if the element type is unknown, or the buffer holds fewer
 *   bytes than the declared element count requires
 * @throws {RangeError} if the buffer is too short to contain the 5-byte header
 */
function parseTagValueArray(buffer) {
  const arrayType = String.fromCharCode(buffer[0])

  // element type character -> [element size in bytes, little-endian reader]
  const elementReaders = {
    c: [1, (view, offset) => view.getInt8(offset)],
    C: [1, (view, offset) => view.getUint8(offset)],
    s: [2, (view, offset) => view.getInt16(offset, true)],
    S: [2, (view, offset) => view.getUint16(offset, true)],
    i: [4, (view, offset) => view.getInt32(offset, true)],
    I: [4, (view, offset) => view.getUint32(offset, true)],
    f: [4, (view, offset) => view.getFloat32(offset, true)],
  }
  const elementReader = elementReaders[arrayType]
  if (!elementReader) {
    throw new Error(`unknown type: ${arrayType}`)
  }
  const [elementSize, readElement] = elementReader

  const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)

  // the count is a full 32-bit integer, not just its first byte
  const length = view.getUint32(1, true)
  const dataStart = 5

  // validate before allocating so a corrupt count cannot request a huge array
  if (dataStart + length * elementSize > view.byteLength) {
    throw new Error(
      `array tag of type ${arrayType} declares ${length} elements but only ${
        view.byteLength - dataStart
      } data bytes are available`,
    )
  }

  const array = new Array(length)
  for (let i = 0; i < length; i += 1) {
    array[i] = readElement(view, dataStart + i * elementSize)
  }

  return array
}
|
|
69
|
+
/**
 * Decode one auxiliary tag value from its raw bytes, according to the tag's
 * single-character type code.
 *
 * Handled type codes: 'Z' (null-terminated string), 'A' (single character),
 * 'I', 'i', 's', 'S', 'c', 'C' (integers), 'f' (32-bit float), 'H' (hex
 * string, parsed as one base-16 number) and 'B' (typed array).
 *
 * @private
 * @param {string} tagType - single-character type code
 * @param {Uint8Array} buffer - raw bytes of the tag value
 * @returns {string|number|number[]} the decoded value
 * @throws {CramMalformedError} if the type code is not recognized
 * @throws {RangeError} if a fixed-width numeric value is requested from a
 *   buffer that is too short to contain it
 */
function parseTagData(tagType, buffer) {
  // Fixed-width numbers are read through a DataView anchored at the view's own
  // byteOffset. `buffer.buffer` is the entire backing ArrayBuffer, which for a
  // subarray or a pooled Node Buffer begins before this tag's bytes, so
  // wrapping it directly in a typed array reads unrelated data. The view is
  // built lazily so the string-like branches never touch `buffer.buffer`.
  const numericView = () =>
    new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)

  if (tagType === 'Z') {
    return readNullTerminatedString(buffer)
  }
  if (tagType === 'A') {
    return String.fromCharCode(buffer[0])
  }
  if (tagType === 'I') {
    // indexes the bytes directly, so it already honours the view's offset
    return Long.fromBytesLE(buffer).toNumber()
  }
  if (tagType === 'i') {
    return numericView().getInt32(0, true)
  }
  if (tagType === 's') {
    return numericView().getInt16(0, true)
  }
  if (tagType === 'S') {
    return numericView().getUint16(0, true)
  }
  if (tagType === 'c') {
    return numericView().getInt8(0)
  }
  if (tagType === 'C') {
    return buffer[0]
  }
  if (tagType === 'f') {
    return numericView().getFloat32(0, true)
  }
  if (tagType === 'H') {
    return Number.parseInt(
      readNullTerminatedString(buffer).replace(/^0x/, ''),
      16,
    )
  }
  if (tagType === 'B') {
    return parseTagValueArray(buffer)
  }

  throw new CramMalformedError(`Unrecognized tag type ${tagType}`)
}
|
|
109
|
+
|
|
110
|
+
/**
 * Decode the read features of one record.
 *
 * For each feature this reads, in order: the feature code ('FC'), the
 * position delta ('FP'), the feature's data item, and for 'B' features a
 * second data item from 'QS'. Running read and reference positions are kept
 * so every feature carries both `pos` and `refPos`.
 *
 * @private
 * @param {object} cramRecord - record being decoded; only `alignmentStart` is read
 * @param {number} readFeatureCount - number of features to decode
 * @param {function(string): *} decodeDataSeries - returns the next value of the named data series
 * @param {object} compressionScheme - not used by this function
 * @param {number} majorVersion - CRAM major version; selects the data series used for 'S' features
 * @returns {Array<{code: string, data: *, pos: number, refPos: number}>}
 * @throws {CramMalformedError} if a decoded feature code has no known schema
 */
function decodeReadFeatures(
  cramRecord,
  readFeatureCount,
  decodeDataSeries,
  compressionScheme,
  majorVersion,
) {
  // feature code -> [value kind, data series name] for the first data item
  const primarySchemaByCode = {
    B: ['character', 'BA'],
    S: ['string', majorVersion > 1 ? 'SC' : 'IN'], // IN if cram v1, SC otherwise
    X: ['number', 'BS'],
    D: ['number', 'DL'],
    I: ['string', 'IN'],
    i: ['character', 'BA'],
    b: ['string', 'BB'],
    q: ['numArray', 'QQ'],
    Q: ['number', 'QS'],
    H: ['number', 'HC'],
    P: ['number', 'PD'],
    N: ['number', 'RS'],
  }
  // feature codes that carry a second data item
  const secondarySchemaByCode = { B: ['number', 'QS'] }

  // pull the next value of a series and convert it according to its kind
  const decodeValue = ([kind, seriesName]) => {
    const raw = decodeDataSeries(seriesName)
    switch (kind) {
      case 'character':
        return String.fromCharCode(raw)
      case 'string':
        return raw.toString('utf8')
      case 'numArray':
        return raw.toArray()
      default:
        return raw
    }
  }

  const features = new Array(readFeatureCount)
  let readPos = 0
  let refPos = cramRecord.alignmentStart - 1

  for (let idx = 0; idx < readFeatureCount; idx += 1) {
    const code = String.fromCharCode(decodeDataSeries('FC'))
    const posDelta = decodeDataSeries('FP')

    const primarySchema = primarySchemaByCode[code]
    if (!primarySchema) {
      throw new CramMalformedError(`invalid read feature code "${code}"`)
    }

    let data = decodeValue(primarySchema)

    // features with two data items expose them as a two-element array
    const secondarySchema = secondarySchemaByCode[code]
    if (secondarySchema) {
      data = [data, decodeValue(secondarySchema)]
    }

    readPos += posDelta
    refPos += posDelta
    features[idx] = { code, data, pos: readPos, refPos }

    // gapping features shift the reference position of the features that follow
    switch (code) {
      case 'D':
      case 'N':
        refPos += data
        break
      case 'I':
      case 'S':
        refPos -= data.length
        break
      case 'i':
        refPos -= 1
        break
      default:
        break
    }
  }

  return features
}
|
|
191
|
+
|
|
192
|
+
/**
 * Decode a single CRAM record from the data series of a slice.
 *
 * Values are pulled from the data series in a fixed order (flags, reference
 * id, read length, alignment start, read group, read name, mate information,
 * auxiliary tags, then read features / bases / quality scores), so the
 * statement order in this function must not be changed.
 *
 * @param {object} slice - slice being decoded; passed through to the tag codecs
 * @param {function(string): *} decodeDataSeries - returns the next value of the named data series
 * @param {object} compressionScheme - supplies `APdelta`, `readNamesIncluded`,
 *   `getTagNames(index)` and `getCodecForTag(tagId)`
 * @param {object} sliceHeader - slice header; `content.refSeqId` is read
 * @param {object} coreDataBlock - passed through to the tag codecs
 * @param {object} blocksByContentId - passed through to the tag codecs
 * @param {object} cursors - decoding cursors; `lastAlignmentStart` is read and
 *   updated here, and the object is passed through to the tag codecs
 * @param {number} majorVersion - CRAM major version of the file
 * @param {number} recordNumber - number of this record; used to compute
 *   `mateRecordNumber` for records whose mate is downstream
 * @returns {CramRecord} the decoded record
 * @throws {CramMalformedError} if the tag list index is negative, a tag has no
 *   codec, a tag type is unrecognized, or a read feature code is invalid
 */
export default function decodeRecord(
  slice,
  decodeDataSeries,
  compressionScheme,
  sliceHeader,
  coreDataBlock,
  blocksByContentId,
  cursors,
  majorVersion,
  recordNumber,
) {
  const cramRecord = new CramRecord()

  cramRecord.flags = decodeDataSeries('BF')

  // note: the C data type of compressionFlags is byte in cram v1
  // and int32 in cram v2+, but that does not matter for us here
  // in javascript land.
  cramRecord.cramFlags = decodeDataSeries('CF')

  // a slice refSeqId of -2 means each record carries its own reference id
  if (majorVersion > 1 && sliceHeader.content.refSeqId === -2) {
    cramRecord.sequenceId = decodeDataSeries('RI')
  } else {
    cramRecord.sequenceId = sliceHeader.content.refSeqId
  }

  cramRecord.readLength = decodeDataSeries('RL')
  // with APdelta the stored value is relative to the previous record's start
  cramRecord.alignmentStart = decodeDataSeries('AP')
  if (compressionScheme.APdelta) {
    cramRecord.alignmentStart += cursors.lastAlignmentStart
  }
  cursors.lastAlignmentStart = cramRecord.alignmentStart
  cramRecord.readGroupId = decodeDataSeries('RG')

  if (compressionScheme.readNamesIncluded) {
    cramRecord.readName = readNullTerminatedString(decodeDataSeries('RN'))
  }

  // mate record
  if (cramRecord.isDetached()) {
    // note: the MF is a byte in 1.0, int32 in 2+, but once again this doesn't matter for javascript
    const mate = {}
    mate.flags = decodeDataSeries('MF')
    if (!compressionScheme.readNamesIncluded) {
      mate.readName = readNullTerminatedString(decodeDataSeries('RN'))
      cramRecord.readName = mate.readName
    }
    mate.sequenceId = decodeDataSeries('NS')
    mate.alignmentStart = decodeDataSeries('NP')
    if (mate.flags || mate.sequenceId > -1) {
      cramRecord.mate = mate
    }
    cramRecord.templateSize = decodeDataSeries('TS')

    // set mate unmapped if needed
    if (mate.flags & Constants.CRAM_M_UNMAP) {
      cramRecord.flags |= Constants.BAM_FMUNMAP
    }
    // set mate reversed if needed
    if (mate.flags & Constants.CRAM_M_REVERSE) {
      cramRecord.flags |= Constants.BAM_FMREVERSE
    }
  } else if (cramRecord.hasMateDownStream()) {
    // NF is the distance to the mate, counted in records after this one
    cramRecord.mateRecordNumber = decodeDataSeries('NF') + recordNumber + 1
  }

  // TODO: the aux tag parsing will have to be refactored if we want to support
  // cram v1
  const TLindex = decodeDataSeries('TL')
  if (TLindex < 0) {
    /* TODO: check nTL: TLindex >= compressionHeader.tagEncoding.size */
    throw new CramMalformedError('invalid TL index')
  }

  // TN = tag names
  const TN = compressionScheme.getTagNames(TLindex)
  const ntags = TN.length

  for (let i = 0; i < ntags; i += 1) {
    // a tag id is the two-character tag name followed by its type character
    const tagId = TN[i]
    const tagName = tagId.slice(0, 2)
    const tagType = tagId.slice(2, 3)

    const tagCodec = compressionScheme.getCodecForTag(tagId)
    if (!tagCodec) {
      throw new CramMalformedError(
        `no codec defined for auxiliary tag ${tagId}`,
      )
    }
    const tagData = tagCodec.decode(
      slice,
      coreDataBlock,
      blocksByContentId,
      cursors,
    )
    cramRecord.tags[tagName] = parseTagData(tagType, tagData)
  }

  if (!cramRecord.isSegmentUnmapped()) {
    // reading read features
    const /* int */ readFeatureCount = decodeDataSeries('FN')
    if (readFeatureCount) {
      cramRecord.readFeatures = decodeReadFeatures(
        cramRecord,
        readFeatureCount,
        decodeDataSeries,
        compressionScheme,
        majorVersion,
      )
    }

    // compute the read's true span on the reference sequence, and the end coordinate of the alignment on the reference
    let lengthOnRef = cramRecord.readLength
    if (cramRecord.readFeatures) {
      cramRecord.readFeatures.forEach(({ code, data }) => {
        if (code === 'D' || code === 'N') {
          lengthOnRef += data
        } else if (code === 'I' || code === 'S') {
          lengthOnRef -= data.length
        } else if (code === 'i') {
          lengthOnRef -= 1
        }
      })
    }
    // fall back to the read length rather than propagating NaN
    if (Number.isNaN(lengthOnRef)) {
      console.warn(
        `${
          cramRecord.readName ||
          `${cramRecord.sequenceId}:${cramRecord.alignmentStart}`
        } record has invalid read features`,
      )
      lengthOnRef = cramRecord.readLength
    }
    cramRecord.lengthOnRef = lengthOnRef

    // mapping quality
    cramRecord.mappingQuality = decodeDataSeries('MQ')
    if (cramRecord.isPreservingQualityScores()) {
      const bases = new Array(cramRecord.readLength)
      for (let i = 0; i < bases.length; i += 1) {
        bases[i] = decodeDataSeries('QS')
      }
      cramRecord.qualityScores = bases
    }
  } else if (cramRecord.isUnknownBases()) {
    cramRecord.readBases = null
    cramRecord.qualityScores = null
  } else {
    // unmapped read: bases are stored verbatim in the BA series
    const bases = new Array(cramRecord.readLength)
    for (let i = 0; i < bases.length; i += 1) {
      bases[i] = decodeDataSeries('BA')
    }

    // Convert in bounded chunks: spreading every base of a very long read
    // into a single String.fromCharCode call can exceed the engine's
    // argument-count limit and throw a RangeError.
    const chunkSize = 8192
    let readBases = ''
    for (let start = 0; start < bases.length; start += chunkSize) {
      readBases += String.fromCharCode(
        ...bases.slice(start, start + chunkSize),
      )
    }
    cramRecord.readBases = readBases

    if (cramRecord.isPreservingQualityScores()) {
      // the array is reused: base codes are overwritten by quality scores
      for (let i = 0; i < bases.length; i += 1) {
        bases[i] = decodeDataSeries('QS')
      }

      cramRecord.qualityScores = bases
    }
  }

  return cramRecord
}
|