npm - @gmod/cram - Versions diffs - 8.0.2 → 8.0.4 - Mend

@gmod/cram 8.0.2 → 8.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

package/dist/cram-bundle.js +1 -1
package/dist/cramFile/codecs/_base.d.ts +1 -0
package/dist/cramFile/codecs/_base.js +3 -0
package/dist/cramFile/codecs/_base.js.map +1 -1
package/dist/cramFile/codecs/byteArrayLength.d.ts +1 -1
package/dist/cramFile/codecs/byteArrayLength.js +14 -7
package/dist/cramFile/codecs/byteArrayLength.js.map +1 -1
package/dist/cramFile/codecs/external.d.ts +1 -1
package/dist/cramFile/codecs/external.js +32 -4
package/dist/cramFile/codecs/external.js.map +1 -1
package/dist/cramFile/codecs/getBits.d.ts +1 -0
package/dist/cramFile/codecs/getBits.js +4 -0
package/dist/cramFile/codecs/getBits.js.map +1 -1
package/dist/cramFile/record.d.ts +39 -9
package/dist/cramFile/record.js +35 -35
package/dist/cramFile/record.js.map +1 -1
package/dist/cramFile/slice/decodeRecord.d.ts +4 -3
package/dist/cramFile/slice/decodeRecord.js +62 -77
package/dist/cramFile/slice/decodeRecord.js.map +1 -1
package/dist/cramFile/slice/index.js +17 -27
package/dist/cramFile/slice/index.js.map +1 -1
package/dist/cramFile/util.d.ts +2 -0
package/dist/cramFile/util.js +13 -0
package/dist/cramFile/util.js.map +1 -1
package/dist/indexedCramFile.js +0 -3
package/dist/indexedCramFile.js.map +1 -1
package/esm/cramFile/codecs/_base.d.ts +1 -0
package/esm/cramFile/codecs/_base.js +3 -0
package/esm/cramFile/codecs/_base.js.map +1 -1
package/esm/cramFile/codecs/byteArrayLength.d.ts +1 -1
package/esm/cramFile/codecs/byteArrayLength.js +14 -7
package/esm/cramFile/codecs/byteArrayLength.js.map +1 -1
package/esm/cramFile/codecs/external.d.ts +1 -1
package/esm/cramFile/codecs/external.js +32 -4
package/esm/cramFile/codecs/external.js.map +1 -1
package/esm/cramFile/codecs/getBits.d.ts +1 -0
package/esm/cramFile/codecs/getBits.js +4 -0
package/esm/cramFile/codecs/getBits.js.map +1 -1
package/esm/cramFile/record.d.ts +39 -9
package/esm/cramFile/record.js +35 -35
package/esm/cramFile/record.js.map +1 -1
package/esm/cramFile/slice/decodeRecord.d.ts +4 -3
package/esm/cramFile/slice/decodeRecord.js +62 -77
package/esm/cramFile/slice/decodeRecord.js.map +1 -1
package/esm/cramFile/slice/index.js +17 -27
package/esm/cramFile/slice/index.js.map +1 -1
package/esm/cramFile/util.d.ts +2 -0
package/esm/cramFile/util.js +11 -0
package/esm/cramFile/util.js.map +1 -1
package/esm/indexedCramFile.js +0 -3
package/esm/indexedCramFile.js.map +1 -1
package/package.json +1 -1
package/src/cramFile/codecs/_base.ts +8 -0
package/src/cramFile/codecs/byteArrayLength.ts +21 -8
package/src/cramFile/codecs/external.ts +41 -9
package/src/cramFile/codecs/getBits.ts +3 -1
package/src/cramFile/record.ts +76 -49
package/src/cramFile/slice/decodeRecord.ts +77 -96
package/src/cramFile/slice/index.ts +31 -47
package/src/cramFile/util.ts +14 -0
package/src/indexedCramFile.ts +0 -4

package/src/cramFile/slice/decodeRecord.ts CHANGED Viewed

@@ -14,72 +14,54 @@ import {
 import CramSlice, { SliceHeader } from './index.ts'
 import { CramFileBlock } from '../file.ts'
 import { isMappedSliceHeader } from '../sectionParsers.ts'
-// Reusable TextDecoder instance for string decoding (ASCII/Latin1)
-const textDecoder = new TextDecoder('latin1')
-/**
- * given a Buffer, read a string up to the first null character
- * @private
- */
-function readNullTerminatedString(buffer: Uint8Array) {
-  // Find the null terminator
-  let end = 0
-  while (end < buffer.length && buffer[end] !== 0) {
-    end++
-  }
-  // Decode using TextDecoder (faster than char-by-char concatenation)
-  return textDecoder.decode(buffer.subarray(0, end))
-}
+import { decodeLatin1, readNullTerminatedStringFromBuffer } from '../util.ts'
 /**
  * parse a BAM tag's array value from a binary buffer
  * @private
  */
+// Uses DataView instead of typed arrays (e.g. new Int32Array(buffer.buffer))
+// because the buffer may be a subarray of a larger ArrayBuffer. Typed array
+// constructors like Int32Array interpret .buffer as the entire underlying
+// ArrayBuffer starting at byte 0, ignoring the subarray's byteOffset. This
+// caused silent data corruption when reading tag values. DataView with explicit
+// byteOffset reads from the correct position within the parent buffer.
 function parseTagValueArray(buffer: Uint8Array) {
   const arrayType = String.fromCharCode(buffer[0]!)
-  const dataView = new DataView(buffer.buffer)
-  const littleEndian = true
-  const length = dataView.getUint32(1, littleEndian)
+  const dv = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
+  const length = dv.getUint32(1, true)
   const array: number[] = new Array(length)
-  buffer = buffer.slice(5)
+  const dataOffset = 5
   if (arrayType === 'c') {
-    const arr = new Int8Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getInt8(dataOffset + i)
     }
   } else if (arrayType === 'C') {
-    const arr = new Uint8Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getUint8(dataOffset + i)
     }
   } else if (arrayType === 's') {
-    const arr = new Int16Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getInt16(dataOffset + i * 2, true)
     }
   } else if (arrayType === 'S') {
-    const arr = new Uint16Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getUint16(dataOffset + i * 2, true)
     }
   } else if (arrayType === 'i') {
-    const arr = new Int32Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getInt32(dataOffset + i * 4, true)
     }
   } else if (arrayType === 'I') {
-    const arr = new Uint32Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getUint32(dataOffset + i * 4, true)
     }
   } else if (arrayType === 'f') {
-    const arr = new Float32Array(buffer.buffer)
     for (let i = 0; i < length; i++) {
-      array[i] = arr[i]!
+      array[i] = dv.getFloat32(dataOffset + i * 4, true)
     }
   } else {
     throw new Error(`unknown type: ${arrayType}`)
@@ -90,35 +72,36 @@ function parseTagValueArray(buffer: Uint8Array) {
 function parseTagData(tagType: string, buffer: Uint8Array) {
   if (tagType === 'Z') {
-    return readNullTerminatedString(buffer)
+    return readNullTerminatedStringFromBuffer(buffer)
   }
   if (tagType === 'A') {
     return String.fromCharCode(buffer[0]!)
   }
+  const dv = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
   if (tagType === 'I') {
-    return new Uint32Array(buffer.buffer)[0]
+    return dv.getUint32(0, true)
   }
   if (tagType === 'i') {
-    return new Int32Array(buffer.buffer)[0]
+    return dv.getInt32(0, true)
   }
   if (tagType === 's') {
-    return new Int16Array(buffer.buffer)[0]
+    return dv.getInt16(0, true)
   }
   if (tagType === 'S') {
-    return new Uint16Array(buffer.buffer)[0]
+    return dv.getUint16(0, true)
   }
   if (tagType === 'c') {
-    return new Int8Array(buffer.buffer)[0]
+    return dv.getInt8(0)
   }
   if (tagType === 'C') {
     return buffer[0]!
   }
   if (tagType === 'f') {
-    return new Float32Array(buffer.buffer)[0]
+    return dv.getFloat32(0, true)
   }
   if (tagType === 'H') {
     return Number.parseInt(
-      readNullTerminatedString(buffer).replace(/^0x/, ''),
+      readNullTerminatedStringFromBuffer(buffer).replace(/^0x/, ''),
       16,
     )
   }
@@ -129,22 +112,25 @@ function parseTagData(tagType: string, buffer: Uint8Array) {
   throw new CramMalformedError(`Unrecognized tag type ${tagType}`)
 }
-// Pre-defined schema lookup tables (version-independent entries)
+// Read feature schema lookup tables. Each entry maps a feature code to
+// [dataType, dataSeriesName] where dataType controls how the raw codec
+// output is converted (character→fromCharCode, string→TextDecoder,
+// numArray→Array.from, number→as-is).
 const data1SchemaBase = {
-  B: ['character', 'BA'] as const,
-  X: ['number', 'BS'] as const,
-  D: ['number', 'DL'] as const,
-  I: ['string', 'IN'] as const,
-  i: ['character', 'BA'] as const,
-  b: ['string', 'BB'] as const,
-  q: ['numArray', 'QQ'] as const,
-  Q: ['number', 'QS'] as const,
-  H: ['number', 'HC'] as const,
-  P: ['number', 'PD'] as const,
-  N: ['number', 'RS'] as const,
+  B: ['character', 'BA'] as const, // base substitution (base component)
+  X: ['number', 'BS'] as const, // base substitution matrix index
+  D: ['number', 'DL'] as const, // deletion length
+  I: ['string', 'IN'] as const, // insertion bases
+  i: ['character', 'BA'] as const, // single-base insertion
+  b: ['string', 'BB'] as const, // stretch of bases
+  q: ['numArray', 'QQ'] as const, // stretch of quality scores
+  Q: ['number', 'QS'] as const, // single quality score
+  H: ['number', 'HC'] as const, // hard clip length
+  P: ['number', 'PD'] as const, // padding length
+  N: ['number', 'RS'] as const, // reference skip length
 } as const
-// Version-specific S entry
+// Soft clip data series changed between CRAM v1 (IN) and v2+ (SC)
 const data1SchemaV1: Record<string, readonly [string, string]> = {
   ...data1SchemaBase,
   S: ['string', 'IN'] as const,
@@ -154,7 +140,7 @@ const data1SchemaV2Plus: Record<string, readonly [string, string]> = {
   S: ['string', 'SC'] as const,
 }
-// Second data item schema for read features that have two values
+// Features with a second data item (B has both a base and a quality score)
 const data2Schema: Record<string, readonly [string, string]> = {
   B: ['number', 'QS'] as const,
 }
@@ -162,8 +148,7 @@ const data2Schema: Record<string, readonly [string, string]> = {
 function decodeReadFeatures(
   alignmentStart: number,
   readFeatureCount: number,
-  decodeDataSeries: any,
-  _compressionScheme: CramContainerCompressionScheme,
+  decodeDataSeries: DataSeriesDecoder,
   majorVersion: number,
 ) {
   let currentReadPos = 0
@@ -176,22 +161,22 @@ function decodeReadFeatures(
   function decodeRFData([type, dataSeriesName]: readonly [
     type: string,
     dataSeriesName: string,
-  ]) {
-    const data = decodeDataSeries(dataSeriesName)
+  ]): string | number | number[] {
+    const data = decodeDataSeries(dataSeriesName as DataSeriesEncodingKey)
     if (type === 'character') {
-      return String.fromCharCode(data)
+      return String.fromCharCode(data as number)
     } else if (type === 'string') {
-      return textDecoder.decode(data)
+      return decodeLatin1(data as Uint8Array)
     } else if (type === 'numArray') {
-      return Array.from(data)
+      return Array.from(data as Uint8Array)
     }
-    return data
+    return data as number
   }
   for (let i = 0; i < readFeatureCount; i++) {
-    const code = String.fromCharCode(decodeDataSeries('FC'))
+    const code = String.fromCharCode(decodeDataSeries('FC')!)
-    const readPosDelta = decodeDataSeries('FP')
+    const readPosDelta = decodeDataSeries('FP')!
     const schema = data1Schema[code]
@@ -199,12 +184,13 @@ function decodeReadFeatures(
       throw new CramMalformedError(`invalid read feature code "${code}"`)
     }
-    let data: any = decodeRFData(schema)
+    let data: string | number | number[] | [string, number] =
+      decodeRFData(schema)
-    // if this is a read feature with two data items, make the data an array
+    // if this is a read feature with two data items, make the data a tuple
     const schema2 = data2Schema[code]
     if (schema2) {
-      data = [data, decodeRFData(schema2)]
+      data = [data as string, decodeRFData(schema2) as number]
     }
     currentReadPos += readPosDelta
@@ -215,14 +201,14 @@ function decodeReadFeatures(
     // for gapping features, adjust the reference position for read features that follow
     if (code === 'D' || code === 'N') {
-      currentRefPos += data
+      currentRefPos += data as number
     } else if (code === 'I' || code === 'S') {
-      currentRefPos -= data.length
+      currentRefPos -= (data as string).length
     } else if (code === 'i') {
       currentRefPos -= 1
     }
-    readFeatures[i] = { code, pos, refPos, data }
+    readFeatures[i] = { code, pos, refPos, data } as ReadFeature
   }
   return readFeatures
 }
@@ -246,6 +232,7 @@ export default function decodeRecord(
   cursors: Cursors,
   majorVersion: number,
   recordNumber: number,
+  uniqueId: number,
   decodeOptions?: Required<DecodeOptions>,
   decodeBulkBytesRaw?: BulkByteRawDecoder,
 ) {
@@ -273,9 +260,9 @@ export default function decodeRecord(
   cursors.lastAlignmentStart = alignmentStart
   const readGroupId = decodeDataSeries('RG')!
-  let readName: string | undefined
+  let readNameRaw: Uint8Array | undefined
   if (compressionScheme.readNamesIncluded) {
-    readName = readNullTerminatedString(decodeDataSeries('RN')!)
+    readNameRaw = decodeDataSeries('RN')!
   }
   let mateToUse:
@@ -295,8 +282,8 @@ export default function decodeRecord(
     const mateFlags = decodeDataSeries('MF')!
     let mateReadName: string | undefined
     if (!compressionScheme.readNamesIncluded) {
-      mateReadName = readNullTerminatedString(decodeDataSeries('RN')!)
-      readName = mateReadName
+      readNameRaw = decodeDataSeries('RN')!
+      mateReadName = readNullTerminatedStringFromBuffer(readNameRaw)
     }
     const mateSequenceId = decodeDataSeries('NS')!
     const mateAlignmentStart = decodeDataSeries('NP')!
@@ -319,8 +306,6 @@ export default function decodeRecord(
     if (MateFlagsDecoder.isOnNegativeStrand(mateFlags)) {
       flags = BamFlagsDecoder.setMateReverseComplemented(flags)
     }
-    // detachedCount++
   } else if (CramFlagsDecoder.isWithMateDownstream(cramFlags)) {
     mateRecordNumber = decodeDataSeries('NF')! + recordNumber + 1
   }
@@ -333,21 +318,19 @@ export default function decodeRecord(
     throw new CramMalformedError('invalid TL index')
   }
-  const tags: Record<string, any> = {}
+  type TagValue = string | number | number[] | undefined
+  const tags: Record<string, TagValue> = {}
   // TN = tag names
   const TN = compressionScheme.getTagNames(TLindex)!
   const ntags = TN.length
   const shouldDecodeTags = decodeOptions?.decodeTags !== false
-  for (let i = 0; i < ntags; i++) {
-    const tagId = TN[i]!
-    // Always decode to advance cursor position
-    const tagData = compressionScheme
-      .getCodecForTag(tagId)
-      .decode(slice, coreDataBlock, blocksByContentId, cursors)
-    // Only parse tags if requested (default: true)
-    if (shouldDecodeTags) {
-      // Use direct character access instead of slice() to avoid string allocation
+  if (shouldDecodeTags) {
+    for (let i = 0; i < ntags; i++) {
+      const tagId = TN[i]!
+      const tagData = compressionScheme
+        .getCodecForTag(tagId)
+        .decode(slice, coreDataBlock, blocksByContentId, cursors)
       const tagName = tagId[0]! + tagId[1]!
       const tagType = tagId[2]!
       tags[tagName] =
@@ -372,7 +355,6 @@ export default function decodeRecord(
         alignmentStart,
         readFeatureCount,
         decodeDataSeries,
-        compressionScheme,
         majorVersion,
       )
     }
@@ -393,9 +375,7 @@ export default function decodeRecord(
     }
     if (Number.isNaN(lengthOnRef)) {
       console.warn(
-        `${
-          readName || `${sequenceId}:${alignmentStart}`
-        } record has invalid read features`,
+        `${sequenceId}:${alignmentStart} record has invalid read features`,
       )
       lengthOnRef = readLength
     }
@@ -423,7 +403,7 @@ export default function decodeRecord(
     // Try raw bytes first for TextDecoder (most efficient)
     const rawBA = decodeBulkBytesRaw?.('BA', readLength)
     if (rawBA) {
-      readBases = textDecoder.decode(rawBA)
+      readBases = decodeLatin1(rawBA)
     } else {
       // Fallback to single-byte decoding
       let s = ''
@@ -455,7 +435,7 @@ export default function decodeRecord(
     flags,
     alignmentStart,
     readGroupId,
-    readName,
+    readNameRaw,
     mateToUse,
     templateSize,
     mateRecordNumber,
@@ -465,5 +445,6 @@ export default function decodeRecord(
     qualityScores,
     readBases,
     tags,
+    uniqueId,
   }
 }

package/src/cramFile/slice/index.ts CHANGED Viewed

@@ -1,13 +1,11 @@
 import { CramArgumentError, CramMalformedError } from '../../errors.ts'
 import { Cursors, DataTypeMapping } from '../codecs/_base.ts'
 import { DataSeriesEncodingKey } from '../codecs/dataSeriesTypes.ts'
-import { CramBufferOverrunError } from '../codecs/getBits.ts'
 import Constants from '../constants.ts'
 import decodeRecord, {
   BulkByteRawDecoder,
   DataSeriesDecoder,
 } from './decodeRecord.ts'
-import ExternalCodec from '../codecs/external.ts'
 import { DataSeriesTypes } from '../container/compressionScheme.ts'
 import CramContainer from '../container/index.ts'
 import CramFile, { CramFileBlock } from '../file.ts'
@@ -111,10 +109,12 @@ function associateIntraSliceMate(
       mateRecord.mateRecordNumber !== currentRecordNumber)
   )
-  // Deal with lossy read names
+  // Deal with lossy read names — assign a synthetic name from uniqueId
+  // so that paired records share the same name
   if (!thisRecord.readName) {
-    thisRecord.readName = String(thisRecord.uniqueId)
-    mateRecord.readName = thisRecord.readName
+    const syntheticName = String(thisRecord.uniqueId)
+    thisRecord._syntheticReadName = syntheticName
+    mateRecord._syntheticReadName = syntheticName
   }
   thisRecord.mate = {
@@ -446,30 +446,15 @@ export default class CramSlice {
       return codec.decode(this, coreDataBlock, blocksByContentId, cursors)
     }
-    // Create bulk byte decoder for QS and BA data series if they use External codec
+    // Bulk byte decoder for QS and BA — getBytesSubarray returns a subarray
+    // view when the codec supports it (e.g. ExternalCodec), or undefined otherwise
     const qsCodec = compressionScheme.getCodecForDataSeries('QS')
     const baCodec = compressionScheme.getCodecForDataSeries('BA')
-    const qsIsExternal = qsCodec instanceof ExternalCodec
-    const baIsExternal = baCodec instanceof ExternalCodec
-    // Create raw byte decoder for QS/BA decoding
     const decodeBulkBytesRaw: BulkByteRawDecoder | undefined =
-      qsIsExternal || baIsExternal
+      qsCodec || baCodec
         ? (dataSeriesName, length) => {
-            if (dataSeriesName === 'QS' && qsIsExternal) {
-              return qsCodec.getBytesSubarray(
-                blocksByContentId,
-                cursors,
-                length,
-              )
-            }
-            if (dataSeriesName === 'BA' && baIsExternal) {
-              return baCodec.getBytesSubarray(
-                blocksByContentId,
-                cursors,
-                length,
-              )
-            }
-            return undefined
+            const codec = dataSeriesName === 'QS' ? qsCodec : baCodec
+            return codec?.getBytesSubarray(blocksByContentId, cursors, length)
           }
         : undefined
@@ -478,35 +463,34 @@ export default class CramSlice {
     )
     for (let i = 0; i < records.length; i += 1) {
       try {
-        const init = decodeRecord(
-          this,
-          decodeDataSeries,
-          compressionScheme,
-          sliceHeader,
-          coreDataBlock,
-          blocksByContentId,
-          cursors,
-          majorVersion,
-          i,
-          decodeOptions,
-          decodeBulkBytesRaw,
-        )
-        records[i] = new CramRecord({
-          ...init,
-          uniqueId:
+        records[i] = new CramRecord(
+          decodeRecord(
+            this,
+            decodeDataSeries,
+            compressionScheme,
+            sliceHeader,
+            coreDataBlock,
+            blocksByContentId,
+            cursors,
+            majorVersion,
+            i,
             sliceHeader.contentPosition +
-            sliceHeader.parsedContent.recordCounter +
-            i +
-            1,
-        })
+              sliceHeader.parsedContent.recordCounter +
+              i +
+              1,
+            decodeOptions,
+            decodeBulkBytesRaw,
+          ),
+        )
       } catch (e) {
-        if (e instanceof CramBufferOverrunError) {
+        const err = e as { code?: string; message?: string }
+        if (err.code === 'CRAM_BUFFER_OVERRUN') {
           const recordsDecoded = i
           const recordsExpected = sliceHeader.parsedContent.numRecords
           throw new CramMalformedError(
             `Failed to decode all records in slice. Decoded ${recordsDecoded} of ${recordsExpected} expected records. ` +
               `Buffer overrun suggests either: (1) file is truncated/corrupted, (2) compression scheme is incorrect, ` +
-              `or (3) there's a bug in the decoder. Original error: ${e.message}`,
+              `or (3) there's a bug in the decoder. Original error: ${err.message}`,
           )
         } else {
           throw e

package/src/cramFile/util.ts CHANGED Viewed

@@ -1,5 +1,19 @@
 import md5 from 'md5'
+const textDecoder = new TextDecoder('latin1')
+export function readNullTerminatedStringFromBuffer(buffer: Uint8Array) {
+  let end = 0
+  while (end < buffer.length && buffer[end] !== 0) {
+    end++
+  }
+  return textDecoder.decode(buffer.subarray(0, end))
+}
+export function decodeLatin1(buffer: Uint8Array) {
+  return textDecoder.decode(buffer)
+}
 export const TWO_PWR_16_DBL = 1 << 16
 export const TWO_PWR_32_DBL = TWO_PWR_16_DBL * TWO_PWR_16_DBL
 export const TWO_PWR_64_DBL = TWO_PWR_32_DBL * TWO_PWR_32_DBL

package/src/indexedCramFile.ts CHANGED Viewed

@@ -65,10 +65,6 @@ export default class IndexedCramFile {
         cacheSize: args.cacheSize,
       })
-    if (!(this.cram instanceof CramFile)) {
-      throw new Error('invalid arguments: no cramfile')
-    }
     this.index = args.index
   }