@gmod/cram 1.6.1 → 1.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. package/CHANGELOG.md +13 -0
  2. package/dist/craiIndex.js +1 -1
  3. package/dist/craiIndex.js.map +1 -1
  4. package/dist/cram-bundle.js +3 -3
  5. package/dist/cramFile/codecs/byteArrayLength.d.ts +1 -1
  6. package/dist/cramFile/codecs/byteArrayLength.js +1 -1
  7. package/dist/cramFile/codecs/byteArrayLength.js.map +1 -1
  8. package/dist/cramFile/codecs/byteArrayStop.js +1 -2
  9. package/dist/cramFile/codecs/byteArrayStop.js.map +1 -1
  10. package/dist/cramFile/codecs/external.js +1 -3
  11. package/dist/cramFile/codecs/external.js.map +1 -1
  12. package/dist/cramFile/container/compressionScheme.d.ts +1 -1
  13. package/dist/cramFile/container/compressionScheme.js +6 -4
  14. package/dist/cramFile/container/compressionScheme.js.map +1 -1
  15. package/dist/cramFile/file.js +17 -0
  16. package/dist/cramFile/file.js.map +1 -1
  17. package/dist/cramFile/slice/decodeRecord.js +62 -58
  18. package/dist/cramFile/slice/decodeRecord.js.map +1 -1
  19. package/dist/cramFile/slice/index.js +8 -5
  20. package/dist/cramFile/slice/index.js.map +1 -1
  21. package/esm/craiIndex.js +1 -1
  22. package/esm/craiIndex.js.map +1 -1
  23. package/esm/cramFile/codecs/byteArrayLength.d.ts +1 -1
  24. package/esm/cramFile/codecs/byteArrayLength.js +1 -1
  25. package/esm/cramFile/codecs/byteArrayLength.js.map +1 -1
  26. package/esm/cramFile/codecs/byteArrayStop.js +1 -2
  27. package/esm/cramFile/codecs/byteArrayStop.js.map +1 -1
  28. package/esm/cramFile/codecs/external.js +1 -3
  29. package/esm/cramFile/codecs/external.js.map +1 -1
  30. package/esm/cramFile/container/compressionScheme.d.ts +1 -1
  31. package/esm/cramFile/container/compressionScheme.js +6 -4
  32. package/esm/cramFile/container/compressionScheme.js.map +1 -1
  33. package/esm/cramFile/file.js +17 -0
  34. package/esm/cramFile/file.js.map +1 -1
  35. package/esm/cramFile/slice/decodeRecord.js +63 -59
  36. package/esm/cramFile/slice/decodeRecord.js.map +1 -1
  37. package/esm/cramFile/slice/index.js +11 -7
  38. package/esm/cramFile/slice/index.js.map +1 -1
  39. package/package.json +2 -1
  40. package/src/craiIndex.js +180 -0
  41. package/src/cramFile/codecs/_base.js +49 -0
  42. package/src/cramFile/codecs/beta.js +23 -0
  43. package/src/cramFile/codecs/byteArrayLength.js +55 -0
  44. package/src/cramFile/codecs/byteArrayStop.js +49 -0
  45. package/src/cramFile/codecs/external.js +52 -0
  46. package/src/cramFile/codecs/gamma.js +30 -0
  47. package/src/cramFile/codecs/huffman.js +137 -0
  48. package/src/cramFile/codecs/index.js +38 -0
  49. package/src/cramFile/codecs/subexp.js +32 -0
  50. package/src/cramFile/constants.js +55 -0
  51. package/src/cramFile/container/compressionScheme.js +143 -0
  52. package/src/cramFile/container/index.js +119 -0
  53. package/src/cramFile/file.js +363 -0
  54. package/src/cramFile/index.js +3 -0
  55. package/src/cramFile/record.js +337 -0
  56. package/src/cramFile/sectionParsers.js +379 -0
  57. package/src/cramFile/slice/decodeRecord.js +359 -0
  58. package/src/cramFile/slice/index.js +501 -0
  59. package/src/cramFile/util.js +169 -0
  60. package/src/errors.js +22 -0
  61. package/src/index.js +5 -0
  62. package/src/indexedCramFile.js +191 -0
  63. package/src/io/bufferCache.js +66 -0
  64. package/src/io/index.js +26 -0
  65. package/src/io/localFile.js +35 -0
  66. package/src/io/remoteFile.js +71 -0
  67. package/src/rans/README.md +1 -0
  68. package/src/rans/constants.js +5 -0
  69. package/src/rans/d04.js +83 -0
  70. package/src/rans/d14.js +59 -0
  71. package/src/rans/decoding.js +141 -0
  72. package/src/rans/frequencies.js +121 -0
  73. package/src/rans/index.js +249 -0
  74. package/src/sam.js +15 -0
  75. package/src/unzip-pako.ts +5 -0
  76. package/src/unzip.ts +2 -0
@@ -0,0 +1,359 @@
1
+ import Long from 'long'
2
+ import { CramMalformedError } from '../../errors'
3
+ import CramRecord from '../record'
4
+ import Constants from '../constants'
5
/**
 * Decode the leading bytes of a buffer as a string, stopping at the first
 * zero byte or at the end of the buffer, whichever comes first. Each byte
 * becomes one character via String.fromCharCode.
 *
 * @private
 * @param {ArrayLike<number>} buffer bytes to decode
 * @returns {string} the characters that precede the first zero byte
 */
function readNullTerminatedString(buffer) {
  const chars = []
  let idx = 0
  while (idx < buffer.length) {
    const byte = buffer[idx]
    if (byte === 0) {
      break
    }
    chars.push(String.fromCharCode(byte))
    idx += 1
  }
  return chars.join('')
}
16
+
17
/**
 * parse a BAM tag's array value ('B' type) from a binary buffer.
 *
 * Layout read by this function: byte 0 is the element-type character,
 * bytes 1-4 hold the element count as a little-endian 32-bit unsigned
 * integer, and the packed little-endian elements start at byte 5.
 *
 * All reads go through a DataView restricted to the view's own
 * byteOffset/byteLength window, so a buffer that is a slice of a larger
 * backing ArrayBuffer (e.g. a pooled Node Buffer) is decoded correctly and
 * unaligned offsets are not a problem.
 *
 * @private
 * @param {Uint8Array} buffer typed-array view (Buffer or Uint8Array) holding the tag value
 * @returns {Array<number>} the decoded elements
 * @throws {CramMalformedError} if the buffer is too short for its header or
 *   for the number of elements it declares
 * @throws {Error} if the element type character is not one of c, C, s, S, i, I, f
 */
function parseTagValueArray(buffer) {
  const HEADER_BYTES = 5 // 1 type byte + 4 count bytes
  const littleEndian = true

  const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  if (view.byteLength < HEADER_BYTES) {
    throw new CramMalformedError(
      'truncated array tag: missing element type or count',
    )
  }

  const arrayType = String.fromCharCode(buffer[0])
  const length = view.getUint32(1, littleEndian)

  let elementSize
  let readElement
  switch (arrayType) {
    case 'c':
      elementSize = 1
      readElement = (offset) => view.getInt8(offset)
      break
    case 'C':
      elementSize = 1
      readElement = (offset) => view.getUint8(offset)
      break
    case 's':
      elementSize = 2
      readElement = (offset) => view.getInt16(offset, littleEndian)
      break
    case 'S':
      elementSize = 2
      readElement = (offset) => view.getUint16(offset, littleEndian)
      break
    case 'i':
      elementSize = 4
      readElement = (offset) => view.getInt32(offset, littleEndian)
      break
    case 'I':
      elementSize = 4
      readElement = (offset) => view.getUint32(offset, littleEndian)
      break
    case 'f':
      elementSize = 4
      readElement = (offset) => view.getFloat32(offset, littleEndian)
      break
    default:
      throw new Error('unknown type: ' + arrayType)
  }

  // validate before allocating so a corrupt count cannot trigger a huge allocation
  if (HEADER_BYTES + length * elementSize > view.byteLength) {
    throw new CramMalformedError(
      `truncated array tag: ${length} elements of type ${arrayType} declared, but only ${
        view.byteLength - HEADER_BYTES
      } data bytes present`,
    )
  }

  const array = new Array(length)
  for (let i = 0; i < length; i += 1) {
    array[i] = readElement(HEADER_BYTES + i * elementSize)
  }
  return array
}
69
/**
 * Convert the raw bytes of one auxiliary tag value into a JavaScript value,
 * according to its single-character type code.
 *
 * Numeric types are read little-endian through a DataView limited to the
 * buffer's own byteOffset/byteLength window, so views into a larger backing
 * ArrayBuffer decode correctly regardless of alignment.
 *
 * Type codes handled:
 *   Z  null-terminated string
 *   A  single character
 *   I  unsigned 32-bit integer      i  signed 32-bit integer
 *   S  unsigned 16-bit integer      s  signed 16-bit integer
 *   C  unsigned 8-bit integer       c  signed 8-bit integer
 *   f  32-bit float
 *   H  null-terminated hex string (optional "0x" prefix), parsed to a number
 *   B  typed array, delegated to parseTagValueArray
 *
 * @private
 * @param {string} tagType single-character type code
 * @param {Uint8Array} buffer raw tag value bytes (Buffer or Uint8Array)
 * @returns {string|number|Array<number>} the decoded value
 * @throws {CramMalformedError} if the type code is not recognized
 * @throws {RangeError} if the buffer is shorter than the numeric type requires
 */
function parseTagData(tagType, buffer) {
  const littleEndian = true
  // built lazily: the string-like types never need a DataView
  const dataView = () =>
    new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)

  switch (tagType) {
    case 'Z':
      return readNullTerminatedString(buffer)
    case 'A':
      return String.fromCharCode(buffer[0])
    case 'I':
      return dataView().getUint32(0, littleEndian)
    case 'i':
      return dataView().getInt32(0, littleEndian)
    case 's':
      return dataView().getInt16(0, littleEndian)
    case 'S':
      return dataView().getUint16(0, littleEndian)
    case 'c':
      return dataView().getInt8(0)
    case 'C':
      return buffer[0]
    case 'f':
      return dataView().getFloat32(0, littleEndian)
    case 'H':
      return Number.parseInt(
        readNullTerminatedString(buffer).replace(/^0x/, ''),
        16,
      )
    case 'B':
      return parseTagValueArray(buffer)
    default:
      throw new CramMalformedError(`Unrecognized tag type ${tagType}`)
  }
}
109
+
110
/**
 * Decode the read features of one record.
 *
 * For each feature this reads the feature code (FC data series), the
 * position delta (FP data series), and then the feature's payload from the
 * data series associated with that code. Positions are accumulated from the
 * deltas: `pos` is the running position on the read, `refPos` the running
 * position on the reference, starting from `cramRecord.alignmentStart - 1`.
 *
 * @private
 * @param {object} cramRecord record being decoded; only `alignmentStart` is read
 * @param {number} readFeatureCount number of features to decode
 * @param {function(string): *} decodeDataSeries decodes the next value of the named data series
 * @param {object} compressionScheme unused here; kept for signature compatibility
 * @param {number} majorVersion CRAM major version; selects the data series used for 'S' features
 * @returns {Array<{code: string, data: *, pos: number, refPos: number}>}
 * @throws {CramMalformedError} if a feature code is not recognized
 */
function decodeReadFeatures(
  cramRecord,
  readFeatureCount,
  decodeDataSeries,
  compressionScheme,
  majorVersion,
) {
  let currentReadPos = 0
  let currentRefPos = cramRecord.alignmentStart - 1
  const readFeatures = new Array(readFeatureCount)

  // map of feature code -> [value type, data series name].
  // Built once per call rather than once per feature: it only depends on majorVersion.
  const data1Schemas = {
    B: ['character', 'BA'],
    S: ['string', majorVersion > 1 ? 'SC' : 'IN'], // IN if cram v1, SC otherwise
    X: ['number', 'BS'],
    D: ['number', 'DL'],
    I: ['string', 'IN'],
    i: ['character', 'BA'],
    b: ['string', 'BB'],
    q: ['numArray', 'QQ'],
    Q: ['number', 'QS'],
    H: ['number', 'HC'],
    P: ['number', 'PD'],
    N: ['number', 'RS'],
  }

  // feature codes that carry a second data item after the first
  const data2Schemas = { B: ['number', 'QS'] }

  // decode one value from the given data series and convert it per its declared type
  function decodeRFData([type, dataSeriesName]) {
    const data = decodeDataSeries(dataSeriesName)
    if (type === 'character') {
      return String.fromCharCode(data)
    }
    if (type === 'string') {
      return data.toString('utf8')
    }
    if (type === 'numArray') {
      return data.toArray()
    }
    // 'number' values are returned exactly as decoded
    return data
  }

  for (let i = 0; i < readFeatureCount; i += 1) {
    const code = String.fromCharCode(decodeDataSeries('FC'))
    const readPosDelta = decodeDataSeries('FP')

    const data1Schema = data1Schemas[code]
    if (!data1Schema) {
      throw new CramMalformedError(`invalid read feature code "${code}"`)
    }

    const readFeature = { code }
    readFeature.data = decodeRFData(data1Schema)

    // if this is a feature with two data items, make the data an array and add the second item
    const data2Schema = data2Schemas[code]
    if (data2Schema) {
      readFeature.data = [readFeature.data, decodeRFData(data2Schema)]
    }

    currentReadPos += readPosDelta
    readFeature.pos = currentReadPos

    currentRefPos += readPosDelta
    readFeature.refPos = currentRefPos

    // for gapping features, adjust the reference position for read features that follow
    if (code === 'D' || code === 'N') {
      currentRefPos += readFeature.data
    } else if (code === 'I' || code === 'S') {
      currentRefPos -= readFeature.data.length
    } else if (code === 'i') {
      currentRefPos -= 1
    }

    readFeatures[i] = readFeature
  }
  return readFeatures
}
191
+
192
/**
 * Decode a single CRAM record from the current position of the data-series
 * decoders.
 *
 * The data series are consumed in a fixed order (flags, positions, read
 * name, mate info, tags, then either read features + mapping quality +
 * quality scores for mapped reads, or bases + quality scores for unmapped
 * reads), so the order of the decode calls below must not be changed.
 *
 * @param {object} slice passed through to tag codecs
 * @param {function(string): *} decodeDataSeries decodes the next value of the named data series
 * @param {object} compressionScheme provides `APdelta`, `readNamesIncluded`,
 *   `getTagNames(index)` and `getCodecForTag(tagId)`
 * @param {object} sliceHeader slice header; `content.refSeqId` is read
 * @param {object} coreDataBlock passed through to tag codecs
 * @param {object} blocksByContentId passed through to tag codecs
 * @param {object} cursors decoding cursors; `lastAlignmentStart` is read and updated,
 *   and the object is passed through to tag codecs
 * @param {number} majorVersion CRAM major version
 * @param {number} recordNumber index of this record, used to compute `mateRecordNumber`
 * @returns {CramRecord} the decoded record
 * @throws {CramMalformedError} on a negative TL index or a tag with no codec
 */
export default function decodeRecord(
  slice,
  decodeDataSeries,
  compressionScheme,
  sliceHeader,
  coreDataBlock,
  blocksByContentId,
  cursors,
  majorVersion,
  recordNumber,
) {
  const cramRecord = new CramRecord()

  cramRecord.flags = decodeDataSeries('BF')

  // note: the C data type of compressionFlags is byte in cram v1
  // and int32 in cram v2+, but that does not matter for us here
  // in javascript land.
  cramRecord.cramFlags = decodeDataSeries('CF')

  // NOTE(review): refSeqId === -2 presumably marks a multi-reference slice,
  // where each record carries its own reference id — confirm against the CRAM spec
  if (majorVersion > 1 && sliceHeader.content.refSeqId === -2) {
    cramRecord.sequenceId = decodeDataSeries('RI')
  } else {
    cramRecord.sequenceId = sliceHeader.content.refSeqId
  }

  cramRecord.readLength = decodeDataSeries('RL')
  // with APdelta, AP holds the offset from the previous record's start
  cramRecord.alignmentStart = decodeDataSeries('AP')
  if (compressionScheme.APdelta) {
    cramRecord.alignmentStart += cursors.lastAlignmentStart
  }
  cursors.lastAlignmentStart = cramRecord.alignmentStart
  cramRecord.readGroupId = decodeDataSeries('RG')

  if (compressionScheme.readNamesIncluded) {
    cramRecord.readName = readNullTerminatedString(decodeDataSeries('RN'))
  }

  // mate record
  if (cramRecord.isDetached()) {
    // note: the MF is a byte in 1.0, int32 in 2+, but once again this doesn't matter for javascript
    const mate = {}
    mate.flags = decodeDataSeries('MF')
    if (!compressionScheme.readNamesIncluded) {
      mate.readName = readNullTerminatedString(decodeDataSeries('RN'))
      cramRecord.readName = mate.readName
    }
    mate.sequenceId = decodeDataSeries('NS')
    mate.alignmentStart = decodeDataSeries('NP')
    if (mate.flags || mate.sequenceId > -1) {
      cramRecord.mate = mate
    }
    cramRecord.templateSize = decodeDataSeries('TS')

    // set mate unmapped if needed
    if (mate.flags & Constants.CRAM_M_UNMAP) {
      cramRecord.flags |= Constants.BAM_FMUNMAP
    }
    // set mate reversed if needed
    if (mate.flags & Constants.CRAM_M_REVERSE) {
      cramRecord.flags |= Constants.BAM_FMREVERSE
    }
  } else if (cramRecord.hasMateDownStream()) {
    cramRecord.mateRecordNumber = decodeDataSeries('NF') + recordNumber + 1
  }

  // TODO: the aux tag parsing will have to be refactored if we want to support
  // cram v1
  const TLindex = decodeDataSeries('TL')
  if (TLindex < 0) {
    /* TODO: check nTL: TLindex >= compressionHeader.tagEncoding.size */
    throw new CramMalformedError('invalid TL index')
  }

  // TN = tag names
  const TN = compressionScheme.getTagNames(TLindex)
  const ntags = TN.length

  for (let i = 0; i < ntags; i += 1) {
    // a tag id is the 2-character tag name followed by its 1-character type code
    const tagId = TN[i]
    const tagName = tagId.slice(0, 2)
    const tagType = tagId.slice(2, 3)

    const tagCodec = compressionScheme.getCodecForTag(tagId)
    if (!tagCodec) {
      throw new CramMalformedError(
        `no codec defined for auxiliary tag ${tagId}`,
      )
    }
    const tagData = tagCodec.decode(
      slice,
      coreDataBlock,
      blocksByContentId,
      cursors,
    )
    cramRecord.tags[tagName] = parseTagData(tagType, tagData)
  }

  if (!cramRecord.isSegmentUnmapped()) {
    // reading read features
    const readFeatureCount = decodeDataSeries('FN')
    if (readFeatureCount) {
      cramRecord.readFeatures = decodeReadFeatures(
        cramRecord,
        readFeatureCount,
        decodeDataSeries,
        compressionScheme,
        majorVersion,
      )
    }

    // compute the read's true span on the reference sequence, and the end coordinate of the alignment on the reference
    let lengthOnRef = cramRecord.readLength
    if (cramRecord.readFeatures) {
      cramRecord.readFeatures.forEach(({ code, data }) => {
        if (code === 'D' || code === 'N') {
          lengthOnRef += data
        } else if (code === 'I' || code === 'S') {
          lengthOnRef -= data.length
        } else if (code === 'i') {
          lengthOnRef -= 1
        }
      })
    }
    if (Number.isNaN(lengthOnRef)) {
      // best effort: fall back to the read length rather than failing the whole record
      console.warn(
        `${
          cramRecord.readName ||
          `${cramRecord.sequenceId}:${cramRecord.alignmentStart}`
        } record has invalid read features`,
      )
      lengthOnRef = cramRecord.readLength
    }
    cramRecord.lengthOnRef = lengthOnRef

    // mapping quality
    cramRecord.mappingQuality = decodeDataSeries('MQ')
    if (cramRecord.isPreservingQualityScores()) {
      const qualityScores = new Array(cramRecord.readLength)
      for (let i = 0; i < qualityScores.length; i += 1) {
        qualityScores[i] = decodeDataSeries('QS')
      }
      cramRecord.qualityScores = qualityScores
    }
  } else if (cramRecord.isUnknownBases()) {
    cramRecord.readBases = null
    cramRecord.qualityScores = null
  } else {
    // Convert each base as it is decoded and join at the end. Spreading the
    // whole array into String.fromCharCode(...) passes one argument per base
    // and overflows the engine's argument/stack limit on very long reads.
    const baseChars = new Array(cramRecord.readLength)
    for (let i = 0; i < baseChars.length; i += 1) {
      baseChars[i] = String.fromCharCode(decodeDataSeries('BA'))
    }
    cramRecord.readBases = baseChars.join('')

    if (cramRecord.isPreservingQualityScores()) {
      const qualityScores = new Array(cramRecord.readLength)
      for (let i = 0; i < qualityScores.length; i += 1) {
        qualityScores[i] = decodeDataSeries('QS')
      }
      cramRecord.qualityScores = qualityScores
    }
  }

  return cramRecord
}