@gmod/cram 1.6.1 → 1.6.2
This diff represents the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
- package/CHANGELOG.md +4 -0
- package/package.json +2 -1
- package/src/craiIndex.js +180 -0
- package/src/cramFile/codecs/_base.js +49 -0
- package/src/cramFile/codecs/beta.js +23 -0
- package/src/cramFile/codecs/byteArrayLength.js +55 -0
- package/src/cramFile/codecs/byteArrayStop.js +50 -0
- package/src/cramFile/codecs/external.js +54 -0
- package/src/cramFile/codecs/gamma.js +30 -0
- package/src/cramFile/codecs/huffman.js +137 -0
- package/src/cramFile/codecs/index.js +38 -0
- package/src/cramFile/codecs/subexp.js +32 -0
- package/src/cramFile/constants.js +55 -0
- package/src/cramFile/container/compressionScheme.js +144 -0
- package/src/cramFile/container/index.js +119 -0
- package/src/cramFile/file.js +347 -0
- package/src/cramFile/index.js +3 -0
- package/src/cramFile/record.js +337 -0
- package/src/cramFile/sectionParsers.js +379 -0
- package/src/cramFile/slice/decodeRecord.js +362 -0
- package/src/cramFile/slice/index.js +497 -0
- package/src/cramFile/util.js +169 -0
- package/src/errors.js +22 -0
- package/src/index.js +5 -0
- package/src/indexedCramFile.js +191 -0
- package/src/io/bufferCache.js +66 -0
- package/src/io/index.js +26 -0
- package/src/io/localFile.js +35 -0
- package/src/io/remoteFile.js +71 -0
- package/src/rans/README.md +1 -0
- package/src/rans/constants.js +5 -0
- package/src/rans/d04.js +83 -0
- package/src/rans/d14.js +59 -0
- package/src/rans/decoding.js +141 -0
- package/src/rans/frequencies.js +121 -0
- package/src/rans/index.js +249 -0
- package/src/sam.js +15 -0
- package/src/unzip-pako.ts +5 -0
- package/src/unzip.ts +2 -0
package/src/cramFile/constants.js

```diff
@@ -0,0 +1,55 @@
+const Constants = {
+  CRAM_FLAG_PRESERVE_QUAL_SCORES: 1 << 0,
+  CRAM_FLAG_DETACHED: 1 << 1,
+  CRAM_FLAG_MATE_DOWNSTREAM: 1 << 2,
+  CRAM_FLAG_NO_SEQ: 1 << 3,
+  CRAM_FLAG_MASK: (1 << 4) - 1,
+
+  // mate read is reversed
+  CRAM_M_REVERSE: 1,
+  // mated read is unmapped
+  CRAM_M_UNMAP: 2,
+
+  // the read is paired in sequencing, no matter whether it is mapped in a pair
+  BAM_FPAIRED: 1,
+  // the read is mapped in a proper pair
+  BAM_FPROPER_PAIR: 2,
+  // the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
+  BAM_FUNMAP: 4,
+  // the mate is unmapped
+  BAM_FMUNMAP: 8,
+  // the read is mapped to the reverse strand
+  BAM_FREVERSE: 16,
+  // the mate is mapped to the reverse strand
+  BAM_FMREVERSE: 32,
+  // this is read1
+  BAM_FREAD1: 64,
+  // this is read2
+  BAM_FREAD2: 128,
+  // not primary alignment
+  BAM_FSECONDARY: 256,
+  // QC failure
+  BAM_FQCFAIL: 512,
+  // optical or PCR duplicate
+  BAM_FDUP: 1024,
+  // supplementary alignment
+  BAM_FSUPPLEMENTARY: 2048,
+
+  BAM_CMATCH: 0,
+  BAM_CINS: 1,
+  BAM_CDEL: 2,
+  BAM_CREF_SKIP: 3,
+  BAM_CSOFT_CLIP: 4,
+  BAM_CHARD_CLIP: 5,
+  BAM_CPAD: 6,
+  BAM_CEQUAL: 7,
+  BAM_CDIFF: 8,
+  BAM_CBACK: 9,
+
+  BAM_CIGAR_STR: 'MIDNSHP:XB',
+  BAM_CIGAR_SHIFT: 4,
+  BAM_CIGAR_MASK: 0xf,
+  BAM_CIGAR_TYPE: 0x3c1a7,
+}
+
+export default Constants
```
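These constants mirror the standard SAM/BAM flag bits and packed-CIGAR encoding, so a record's `flags` integer can be tested with bitwise AND, and a packed CIGAR element can be split with the shift/mask values. A minimal sketch, assuming `Constants` from the module above is in scope (the `flags` and `cigarElement` values are made up for illustration):

```js
// each BAM_F* constant is a single bit in the flags integer
const flags = 99 // 0b1100011: paired, proper pair, mate reversed, read1
const isPaired = Boolean(flags & Constants.BAM_FPAIRED) // true
const isUnmapped = Boolean(flags & Constants.BAM_FUNMAP) // false

// a packed CIGAR element keeps the op code in the low 4 bits
// and the op length in the bits above them
const cigarElement = (100 << Constants.BAM_CIGAR_SHIFT) | Constants.BAM_CMATCH
const opLength = cigarElement >> Constants.BAM_CIGAR_SHIFT // 100
const opChar = Constants.BAM_CIGAR_STR[cigarElement & Constants.BAM_CIGAR_MASK] // 'M'
```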
package/src/cramFile/container/compressionScheme.js

```diff
@@ -0,0 +1,144 @@
+import { CramMalformedError } from '../../errors'
+import { instantiateCodec } from '../codecs'
+
+// the hardcoded data type to be decoded for each core
+// data field
+const dataSeriesTypes = {
+  BF: 'int',
+  CF: 'int',
+  RI: 'int',
+  RL: 'int',
+  AP: 'int',
+  RG: 'int',
+  MF: 'int',
+  NS: 'int',
+  NP: 'int',
+  TS: 'int',
+  NF: 'int',
+  TC: 'byte',
+  TN: 'int',
+  FN: 'int',
+  FC: 'byte',
+  FP: 'int',
+  BS: 'byte',
+  IN: 'byteArray',
+  SC: 'byteArray',
+  DL: 'int',
+  BA: 'byte',
+  BB: 'byteArray',
+  RS: 'int',
+  PD: 'int',
+  HC: 'int',
+  MQ: 'int',
+  RN: 'byteArray',
+  QS: 'byte',
+  QQ: 'byteArray',
+  TL: 'int',
+  TM: 'ignore',
+  TV: 'ignore',
+}
+
+function parseSubstitutionMatrix(byteArray) {
+  const matrix = new Array(5)
+  for (let i = 0; i < 5; i += 1) {
+    matrix[i] = new Array(4)
+  }
+
+  matrix[0][(byteArray[0] >> 6) & 3] = 'C'
+  matrix[0][(byteArray[0] >> 4) & 3] = 'G'
+  matrix[0][(byteArray[0] >> 2) & 3] = 'T'
+  matrix[0][(byteArray[0] >> 0) & 3] = 'N'
+
+  matrix[1][(byteArray[1] >> 6) & 3] = 'A'
+  matrix[1][(byteArray[1] >> 4) & 3] = 'G'
+  matrix[1][(byteArray[1] >> 2) & 3] = 'T'
+  matrix[1][(byteArray[1] >> 0) & 3] = 'N'
+
+  matrix[2][(byteArray[2] >> 6) & 3] = 'A'
+  matrix[2][(byteArray[2] >> 4) & 3] = 'C'
+  matrix[2][(byteArray[2] >> 2) & 3] = 'T'
+  matrix[2][(byteArray[2] >> 0) & 3] = 'N'
+
+  matrix[3][(byteArray[3] >> 6) & 3] = 'A'
+  matrix[3][(byteArray[3] >> 4) & 3] = 'C'
+  matrix[3][(byteArray[3] >> 2) & 3] = 'G'
+  matrix[3][(byteArray[3] >> 0) & 3] = 'N'
+
+  matrix[4][(byteArray[4] >> 6) & 3] = 'A'
+  matrix[4][(byteArray[4] >> 4) & 3] = 'C'
+  matrix[4][(byteArray[4] >> 2) & 3] = 'G'
+  matrix[4][(byteArray[4] >> 0) & 3] = 'T'
+
+  return matrix
+}
+
+export default class CramContainerCompressionScheme {
+  constructor(content) {
+    Object.assign(this, content)
+    // interpret some of the preservation map tags for convenient use
+    this.readNamesIncluded = content.preservation.RN
+    this.APdelta = content.preservation.AP
+    this.referenceRequired = !!content.preservation.RR
+    this.tagIdsDictionary = content.preservation.TD
+    this.substitutionMatrix = parseSubstitutionMatrix(content.preservation.SM)
+
+    this.dataSeriesCodecCache = {}
+    this.tagCodecCache = {}
+  }
+
+  /**
+   * @param {string} tagName three-character tag name
+   * @private
+   */
+  getCodecForTag(tagName) {
+    if (!this.tagCodecCache[tagName]) {
+      const encodingData = this.tagEncoding[tagName]
+      if (encodingData) {
+        this.tagCodecCache[tagName] = instantiateCodec(
+          encodingData,
+          'byteArray', // all tags are byte array data
+        )
+      }
+    }
+    return this.tagCodecCache[tagName]
+  }
+
+  /**
+   *
+   * @param {number} tagListId ID of the tag list to fetch from the tag dictionary
+   * @private
+   */
+  getTagNames(tagListId) {
+    return this.tagIdsDictionary[tagListId]
+  }
+
+  getCodecForDataSeries(dataSeriesName) {
+    if (!this.dataSeriesCodecCache[dataSeriesName]) {
+      const encodingData = this.dataSeriesEncoding[dataSeriesName]
+      if (encodingData) {
+        const dataType = dataSeriesTypes[dataSeriesName]
+        if (!dataType) {
+          throw new CramMalformedError(
+            `data series name ${dataSeriesName} not defined in file compression header`,
+          )
+        }
+        this.dataSeriesCodecCache[dataSeriesName] = instantiateCodec(
+          encodingData,
+          dataType,
+        )
+      }
+    }
+    return this.dataSeriesCodecCache[dataSeriesName]
+  }
+
+  toJSON() {
+    const data = {}
+    Object.keys(this).forEach(k => {
+      if (/Cache$/.test(k)) {
+        return
+      }
+      data[k] = this[k]
+    })
+    return data
+  }
+}
```
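Each byte of the preservation map's SM value packs four 2-bit codes, one per possible substitute base for one reference base; `parseSubstitutionMatrix` unpacks them row by row (row 0 is reference base A, whose substitutes in order are C, G, T, N). A minimal sketch of decoding a single row, using a made-up byte value:

```js
// decode one SM byte the same way parseSubstitutionMatrix does for row 0
// (reference base A); 0x1b = 0b00_01_10_11 assigns codes C=0, G=1, T=2, N=3
const smByte = 0x1b
const row = []
row[(smByte >> 6) & 3] = 'C'
row[(smByte >> 4) & 3] = 'G'
row[(smByte >> 2) & 3] = 'T'
row[(smByte >> 0) & 3] = 'N'
console.log(row) // [ 'C', 'G', 'T', 'N' ]
```

During decoding, a substitution feature's 2-bit code is looked up in the row for the reference base to recover the substituted read base.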
package/src/cramFile/container/index.js

```diff
@@ -0,0 +1,119 @@
+import { CramMalformedError } from '../../errors'
+
+import { itf8Size, parseItem, tinyMemoize } from '../util'
+import CramSlice from '../slice'
+import CramContainerCompressionScheme from './compressionScheme'
+
+export default class CramContainer {
+  constructor(cramFile, position) {
+    // cram file this container comes from
+    this.file = cramFile
+    // position of this container in the file
+    this.filePosition = position
+    // console.log(`container: ${this.filePosition}`)
+  }
+
+  // memoize
+  getHeader() {
+    return this._readContainerHeader(this.filePosition)
+  }
+
+  // memoize
+  async getCompressionHeaderBlock() {
+    const containerHeader = await this.getHeader()
+
+    // if there are no records in the container, there will be no compression header
+    if (!containerHeader.numRecords) {
+      return null
+    }
+    const sectionParsers = await this.file.getSectionParsers()
+    const block = await this.getFirstBlock()
+    if (block.contentType !== 'COMPRESSION_HEADER') {
+      throw new CramMalformedError(
+        `invalid content type ${block.contentType} in what is supposed to be the compression header block`,
+      )
+    }
+    const content = parseItem(
+      block.content,
+      sectionParsers.cramCompressionHeader.parser,
+      0,
+      block.contentPosition,
+    )
+    block.content = content
+    return block
+  }
+
+  async getFirstBlock() {
+    const containerHeader = await this.getHeader()
+    return this.file.readBlock(containerHeader._endPosition)
+  }
+
+  // parses the compression header data into a CramContainerCompressionScheme object
+  // memoize
+  async getCompressionScheme() {
+    const header = await this.getCompressionHeaderBlock()
+    if (!header) {
+      return undefined
+    }
+    return new CramContainerCompressionScheme(header.content)
+  }
+
+  getSlice(slicePosition, sliceSize) {
+    // note: slicePosition is relative to the end of the container header
+    // TODO: perhaps we should cache slices?
+    return new CramSlice(this, slicePosition, sliceSize)
+  }
+
+  async _readContainerHeader(position) {
+    const sectionParsers = await this.file.getSectionParsers()
+    const { cramContainerHeader1, cramContainerHeader2 } = sectionParsers
+    const { size: fileSize } = await this.file.stat()
+
+    if (position >= fileSize) {
+      return undefined
+    }
+
+    // parse the container header. do it in 2 pieces because you cannot tell
+    // how much to buffer until you read numLandmarks
+    const bytes1 = Buffer.allocUnsafe(cramContainerHeader1.maxLength)
+    await this.file.read(bytes1, 0, cramContainerHeader1.maxLength, position)
+    const header1 = parseItem(bytes1, cramContainerHeader1.parser)
+    const numLandmarksSize = itf8Size(header1.numLandmarks)
+    if (position + header1.length >= fileSize) {
+      console.warn(
+        `${this.file}: container header at ${position} indicates that the container has length ${header1.length}, which extends beyond the length of the file. Skipping this container.`,
+      )
+      return undefined
+    }
+    const bytes2 = Buffer.allocUnsafe(
+      cramContainerHeader2.maxLength(header1.numLandmarks),
+    )
+    await this.file.read(
+      bytes2,
+      0,
+      cramContainerHeader2.maxLength(header1.numLandmarks),
+      position + header1._size - numLandmarksSize,
+    )
+    const header2 = parseItem(bytes2, cramContainerHeader2.parser)
+
+    if (this.file.validateChecksums && header2.crc32 !== undefined) {
+      await this.file.checkCrc32(
+        position,
+        header1._size + header2._size - numLandmarksSize - 4,
+        header2.crc32,
+        `container header beginning at position ${position}`,
+      )
+    }
+
+    const completeHeader = Object.assign(header1, header2, {
+      _size: header1._size + header2._size - numLandmarksSize,
+      _endPosition: header1._size + header2._size - numLandmarksSize + position,
+    })
+
+    return completeHeader
+  }
+}
+
+'getHeader getCompressionHeaderBlock getCompressionScheme'
+  .split(' ')
+  .forEach(method => tinyMemoize(CramContainer, method))
```
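The `// memoize` comments are made real by the trailing `tinyMemoize` calls, which wrap these zero-argument getters so the container header and compression scheme are parsed at most once per container. `tinyMemoize` itself lives in `package/src/cramFile/util.js`, which is not shown in this excerpt; a minimal sketch of the promise-caching pattern such a helper implements:

```js
// hypothetical sketch of a tinyMemoize-style method memoizer
function memoizeMethod(Class, methodName) {
  const original = Class.prototype[methodName]
  const cacheKey = `_memo_${methodName}`
  Class.prototype[methodName] = function memoized() {
    if (!(cacheKey in this)) {
      // cache the promise itself, so concurrent callers share one parse
      this[cacheKey] = original.call(this)
    }
    return this[cacheKey]
  }
}
```

Caching the promise rather than the resolved value means a second call made while the first read is still in flight reuses the same pending I/O instead of issuing another.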
package/src/cramFile/file.js

```diff
@@ -0,0 +1,347 @@
+import { unzip } from '../unzip'
+import crc32 from 'buffer-crc32'
+import LRU from 'quick-lru'
+
+import { CramUnimplementedError, CramMalformedError } from '../errors'
+import ransuncompress from '../rans'
+import {
+  cramFileDefinition as cramFileDefinitionParser,
+  getSectionParsers,
+} from './sectionParsers'
+import htscodecs from '@jkbonfield/htscodecs'
+import CramContainer from './container'
+
+import { open } from '../io'
+import { parseItem, tinyMemoize } from './util'
+import { parseHeaderText } from '../sam'
+
+export default class CramFile {
+  /**
+   * @param {object} args
+   * @param {object} [args.filehandle] - a filehandle that implements the stat() and
+   * read() methods of the Node filehandle API https://nodejs.org/api/fs.html#fs_class_filehandle
+   * @param {object} [args.path] - path to the cram file
+   * @param {object} [args.url] - url for the cram file. also supports file:// urls for local files
+   * @param {function} [args.seqFetch] - a function with signature
+   * `(seqId, startCoordinate, endCoordinate)` that returns a promise for a string of sequence bases
+   * @param {number} [args.cacheSize] optional maximum number of CRAM records to cache. default 20,000
+   * @param {boolean} [args.checkSequenceMD5] - default true. if false, disables verifying the MD5
+   * checksum of the reference sequence underlying a slice. In some applications, this check can cause an inconvenient amount (many megabases) of sequences to be fetched.
+   */
+  constructor(args) {
+    this.file = open(args.url, args.path, args.filehandle)
+    this.validateChecksums = true
+    this.fetchReferenceSequenceCallback = args.seqFetch
+    this.options = {
+      checkSequenceMD5: args.checkSequenceMD5 !== false,
+      cacheSize: args.cacheSize !== undefined ? args.cacheSize : 20000,
+    }
+
+    // cache of features in a slice, keyed by the
+    // slice offset. caches all of the features in a slice, or none.
+    // the cache is actually used by the slice object, it's just
+    // kept here at the level of the file
+    this.featureCache = new LRU({
+      maxSize: this.options.cacheSize,
+    })
+  }
+
+  toString() {
+    if (this.file.filename) {
+      return this.file.filename
+    }
+    if (this.file.url) {
+      return this.file.url
+    }
+
+    return '(cram file)'
+  }
+
+  // can just read this object like a filehandle
+  read(buffer, offset, length, position) {
+    return this.file.read(buffer, offset, length, position)
+  }
+
+  // can just stat this object like a filehandle
+  stat() {
+    return this.file.stat()
+  }
+
+  // memoized
+  async getDefinition() {
+    const headbytes = Buffer.allocUnsafe(cramFileDefinitionParser.maxLength)
+    await this.file.read(headbytes, 0, cramFileDefinitionParser.maxLength, 0)
+    const definition = cramFileDefinitionParser.parser.parse(headbytes).result
+    if (definition.majorVersion !== 2 && definition.majorVersion !== 3) {
+      throw new CramUnimplementedError(
+        `CRAM version ${definition.majorVersion} not supported`,
+      )
+    }
+    return definition
+  }
+
+  // memoize
+  async getSamHeader() {
+    const firstContainer = await this.getContainerById(0)
+    if (!firstContainer) {
+      throw new CramMalformedError('file contains no containers')
+    }
+
+    const { content } = await firstContainer.getFirstBlock()
+    // find the end of the trailing zeros in the header text
+    const headerLength = content.readInt32LE(0)
+    const textStart = 4
+    // let textEnd = content.length - 1
+    // while (textEnd >= textStart && !content[textEnd]) textEnd -= 1
+    // trim off the trailing zeros
+    const text = content.toString('utf8', textStart, textStart + headerLength)
+    this.header = text
+    return parseHeaderText(text)
+  }
+
+  async getHeaderText() {
+    await this.getSamHeader()
+    return this.header
+  }
+
+  // memoize
+  async getSectionParsers() {
+    const { majorVersion } = await this.getDefinition()
+    return getSectionParsers(majorVersion)
+  }
+
+  async getContainerById(containerNumber) {
+    const sectionParsers = await this.getSectionParsers()
+    let position = sectionParsers.cramFileDefinition.maxLength
+    const { size: fileSize } = await this.file.stat()
+    const { cramContainerHeader1 } = sectionParsers
+
+    // skip with a series of reads to the proper container
+    let currentContainer
+    for (let i = 0; i <= containerNumber; i += 1) {
+      // if we are about to go off the end of the file
+      // and have not found that container, it does not exist
+      if (position + cramContainerHeader1.maxLength + 8 >= fileSize) {
+        return undefined
+      }
+
+      currentContainer = this.getContainerAtPosition(position)
+      const currentHeader = await currentContainer.getHeader()
+      if (!currentHeader) {
+        throw new CramMalformedError(
+          `container ${containerNumber} not found in file`,
+        )
+      }
+      // if this is the first container, read all the blocks in the
+      // container to determine its length, because we cannot trust
+      // the container header's given length due to a bug somewhere
+      // in htslib
+      if (i === 0) {
+        position = currentHeader._endPosition
+        for (let j = 0; j < currentHeader.numBlocks; j += 1) {
+          const block = await this.readBlock(position)
+          position = block._endPosition
+        }
+      } else {
+        // otherwise, just traverse to the next container using the container's length
+        position += currentHeader._size + currentHeader.length
+      }
+    }
+
+    return currentContainer
+  }
+
+  async checkCrc32(position, length, recordedCrc32, description) {
+    const b = Buffer.allocUnsafe(length)
+    await this.file.read(b, 0, length, position)
+    const calculatedCrc32 = crc32.unsigned(b)
+    if (calculatedCrc32 !== recordedCrc32) {
+      throw new CramMalformedError(
+        `crc mismatch in ${description}: recorded CRC32 = ${recordedCrc32}, but calculated CRC32 = ${calculatedCrc32}`,
+      )
+    }
+  }
+
+  /**
+   * @returns {Promise[number]} the number of containers in the file
+   */
+  async containerCount() {
+    const sectionParsers = await this.getSectionParsers()
+    const { size: fileSize } = await this.file.stat()
+    const { cramContainerHeader1 } = sectionParsers
+
+    let containerCount = 0
+    let position = sectionParsers.cramFileDefinition.maxLength
+    while (position + cramContainerHeader1.maxLength + 8 < fileSize) {
+      const currentHeader = await this.getContainerAtPosition(
+        position,
+      ).getHeader()
+      if (!currentHeader) {
+        break
+      }
+      // if this is the first container, read all the blocks in the
+      // container, because we cannot trust the container
+      // header's given length due to a bug somewhere in htslib
+      if (containerCount === 0) {
+        position = currentHeader._endPosition
+        for (let j = 0; j < currentHeader.numBlocks; j += 1) {
+          const block = await this.readBlock(position)
+          position = block._endPosition
+        }
+      } else {
+        // otherwise, just traverse to the next container using the container's length
+        position += currentHeader._size + currentHeader.length
+      }
+      containerCount += 1
+    }
+
+    return containerCount
+  }
+
+  getContainerAtPosition(position) {
+    return new CramContainer(this, position)
+  }
+
+  async readBlockHeader(position) {
+    const sectionParsers = await this.getSectionParsers()
+    const { cramBlockHeader } = sectionParsers
+    const { size: fileSize } = await this.file.stat()
+
+    if (position + cramBlockHeader.maxLength >= fileSize) {
+      return undefined
+    }
+
+    const buffer = Buffer.allocUnsafe(cramBlockHeader.maxLength)
+    await this.file.read(buffer, 0, cramBlockHeader.maxLength, position)
+    return parseItem(buffer, cramBlockHeader.parser, 0, position)
+  }
+
+  async _parseSection(
+    section,
+    position,
+    size = section.maxLength,
+    preReadBuffer,
+  ) {
+    let buffer
+    if (preReadBuffer) {
+      buffer = preReadBuffer
+    } else {
+      const { size: fileSize } = await this.file.stat()
+      if (position + size >= fileSize) {
+        return undefined
+      }
+      buffer = Buffer.allocUnsafe(size)
+      await this.file.read(buffer, 0, size, position)
+    }
+    const data = parseItem(buffer, section.parser, 0, position)
+    if (data._size !== size) {
+      throw new CramMalformedError(
+        `section read error: requested size ${size} does not equal parsed size ${data._size}`,
+      )
+    }
+    return data
+  }
+
+  _uncompress(compressionMethod, inputBuffer, outputBuffer) {
+    if (compressionMethod === 'gzip') {
+      const result = unzip(inputBuffer)
+      result.copy(outputBuffer)
+    } else if (compressionMethod === 'bzip2') {
+      var bits = bzip2.array(inputBuffer)
+      var size = bzip2.header(bits)
+      var j = 0
+      do {
+        var chunk = bzip2.decompress(bits, size)
+        if (chunk != -1) {
+          Buffer.from(chunk).copy(outputBuffer, j)
+          j += chunk.length
+          size -= chunk.length
+        }
+      } while (chunk != -1)
+    } else if (compressionMethod === 'rans') {
+      ransuncompress(inputBuffer, outputBuffer)
+      //htscodecs r4x8 is slower, but compatible.
+      //htscodecs.r4x8_uncompress(inputBuffer, outputBuffer);
+    } else if (compressionMethod === 'rans4x16') {
+      htscodecs.r4x16_uncompress(inputBuffer, outputBuffer)
+    } else if (compressionMethod === 'arith') {
+      htscodecs.arith_uncompress(inputBuffer, outputBuffer)
+    } else if (compressionMethod === 'fqzcomp') {
+      htscodecs.fqzcomp_uncompress(inputBuffer, outputBuffer)
+    } else if (compressionMethod === 'tok3') {
+      htscodecs.tok3_uncompress(inputBuffer, outputBuffer)
+    } else {
+      throw new CramUnimplementedError(
+        `${compressionMethod} decompression not yet implemented`,
+      )
+    }
+  }
+
+  async readBlock(position) {
+    const { majorVersion } = await this.getDefinition()
+    const sectionParsers = await this.getSectionParsers()
+    const block = await this.readBlockHeader(position)
+    const blockContentPosition = block._endPosition
+    block.contentPosition = block._endPosition
+
+    const uncompressedData = Buffer.allocUnsafe(block.uncompressedSize)
+
+    if (block.compressionMethod !== 'raw') {
+      const compressedData = Buffer.allocUnsafe(block.compressedSize)
+      await this.read(
+        compressedData,
+        0,
+        block.compressedSize,
+        blockContentPosition,
+      )
+
+      this._uncompress(
+        block.compressionMethod,
+        compressedData,
+        uncompressedData,
+      )
+    } else {
+      await this.read(
+        uncompressedData,
+        0,
+        block.uncompressedSize,
+        blockContentPosition,
+      )
+    }
+
+    block.content = uncompressedData
+
+    if (majorVersion >= 3) {
+      // parse the crc32
+      const crc = await this._parseSection(
+        sectionParsers.cramBlockCrc32,
+        blockContentPosition + block.compressedSize,
+      )
+      block.crc32 = crc.crc32
+
+      // check the block data crc32
+      if (this.validateChecksums) {
+        await this.checkCrc32(
+          position,
+          block._size + block.compressedSize,
+          block.crc32,
+          'block data',
+        )
+      }
+
+      // make the endposition and size reflect the whole block
+      block._endPosition = crc._endPosition
+      block._size =
+        block.compressedSize + sectionParsers.cramBlockCrc32.maxLength
+    } else {
+      block._endPosition = blockContentPosition + block.compressedSize
+      block._size = block.compressedSize
+    }
+
+    return block
+  }
+}
+
+'getDefinition getSectionParsers getSamHeader'
+  .split(' ')
+  .forEach(method => tinyMemoize(CramFile, method))
```
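One quirk worth noting: the bzip2 branch of `_uncompress` references a `bzip2` module that does not appear among this file's imports, so that branch would need `import bzip2 from 'bzip2'` (the npm bzip2 package, whose `array`/`header`/`decompress` API the code matches) in scope before it could run.

Putting the pieces together, a consumer constructs a CramFile and calls the high-level getters; `getContainerById` and `readBlock` are driven internally. A minimal usage sketch, assuming CramFile is re-exported from the package root (the file path is illustrative):

```js
import { CramFile } from '@gmod/cram'

async function dumpSummary() {
  const file = new CramFile({ path: 'test.cram' })
  const header = await file.getSamHeader() // parsed SAM header, memoized
  const containers = await file.containerCount()
  console.log({ containers, header })
}

dumpSummary().catch(console.error)
```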