@exodus/bytes 1.0.0-rc.1 → 1.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,279 @@
1
+ // We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
2
+ // We are also faster than Node.js built-in on both TextEncoder and TextDecoder
3
+
4
+ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
5
+ import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
6
+ import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
7
+ import labels from './encoding.labels.js'
8
+ import { unfinishedBytes } from './encoding.util.js'
9
+
10
+ const E_OPTIONS = 'The "options" argument must be of type object'
11
+ const E_ENCODING = 'Unknown encoding'
12
+ const replacementChar = '\uFFFD'
13
+
14
+ const E_MULTI =
15
+ 'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
16
+ const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
17
+ let createMultibyteDecoder
18
+
19
/**
 * Registers the factory used to create decoders for legacy multi-byte encodings.
 * While no factory is registered, decoding an encoding listed in `multibyteSet` throws.
 *
 * @param {Function} createDecoder - invoked as `createDecoder(encoding, loose)`, returns a decode function
 */
export function setMultibyteDecoder(createDecoder) {
  createMultibyteDecoder = createDecoder
}
22
+
23
+ let labelsMap
24
+
25
/**
 * Resolves an encoding label to its canonical encoding name.
 * See https://encoding.spec.whatwg.org/#names-and-labels
 *
 * Warning: unlike whatwg-encoding, this returns lowercased names. Labels are case-insensitive
 * and lowercase is how the TextDecoder `encoding` getter normalizes them.
 *
 * @param {string} label - encoding label, surrounding ASCII whitespace is ignored
 * @returns {string} canonical lowercase encoding name
 * @throws {RangeError} when the label is not a known encoding label
 */
export function normalizeEncoding(label) {
  // Fast path for the most common spellings: no validation, no table lookups
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Full lookup. Anything outside of ASCII word chars, a few punctuation chars and ASCII whitespace can't be a label
  const looksLikeLabel = /^[\w\t\n\f\r .:-]+$/i.test(label)
  if (!looksLikeLabel) throw new RangeError(E_ENCODING)

  const key = `${label}`.trim().toLowerCase()
  if (Object.hasOwn(labels, key)) return key // already a canonical name

  if (!labelsMap) {
    // The alias -> canonical name index is built lazily, on the first lookup that needs it
    const pairs = Object.entries(labels).flatMap(([name, aliases]) =>
      aliases.map((alias) => [alias, name])
    )
    labelsMap = new Map(pairs)
  }

  const canonical = labelsMap.get(key)
  if (!canonical) throw new RangeError(E_ENCODING)
  return canonical
}
47
+
48
// Installs `value` on `obj` under `key` as a non-writable data property and returns `obj`.
// Only `value` and `writable` are specified, so a newly created property is also non-enumerable and non-configurable.
const define = (obj, key, value) => {
  const descriptor = { value, writable: false }
  return Object.defineProperty(obj, key, descriptor)
}
49
+
50
/**
 * Returns a Uint8Array over the bytes of a buffer-like input, without copying.
 *
 * @param {Uint8Array|ArrayBuffer|ArrayBufferView|SharedArrayBuffer} x - input bytes
 * @returns {Uint8Array} `x` itself if it is already a Uint8Array, otherwise a view over the same memory
 * @throws {TypeError} when `x` is none of the supported types
 */
const fromSource = (x) => {
  if (x instanceof Uint8Array) return x
  if (x instanceof ArrayBuffer) return new Uint8Array(x)
  if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
  // A SharedArrayBuffer is a buffer, not a view: it has no .buffer / .byteOffset.
  // Reading those yields undefined, which would silently produce an empty array, so wrap the buffer itself.
  // globalThis lookup, as SharedArrayBuffer is not defined in every environment
  if (globalThis.SharedArrayBuffer && x instanceof globalThis.SharedArrayBuffer) {
    return new Uint8Array(x)
  }

  throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
}
60
+
61
/**
 * Picks the decode function for a Unicode encoding.
 *
 * @param {string} encoding - 'utf-8', 'utf-16le' or 'utf-16be' (anything that is not one of the first two is handled as 'utf-16be')
 * @param {boolean} loose - use the loose (non-throwing) decoder variants
 * @returns {Function} function taking a Uint8Array and returning a string
 */
function unicodeDecoder(encoding, loose) {
  if (encoding === 'utf-8') {
    // Most likely case, utf8 helpers already accept a single Uint8Array argument
    if (loose) return utf8toStringLoose
    return utf8toString
  }

  const byteOrder = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
  if (loose) return (bytes) => utf16toStringLoose(bytes, byteOrder)
  return (bytes) => utf16toString(bytes, byteOrder)
}
66
+
67
/**
 * TextDecoder implementation following the WHATWG Encoding API shape.
 *
 * Exposes read-only `encoding`, `fatal` and `ignoreBOM` properties and a `decode()` method
 * with streaming support. Unicode encodings (utf-8, utf-16le, utf-16be) are decoded here with
 * BOM and split-sequence handling. Legacy multi-byte encodings are delegated to the factory registered
 * via setMultibyteDecoder(), everything else to createSinglebyteDecoder().
 */
export class TextDecoder {
  #decode // decode function for this.encoding, created lazily on first decode()
  #unicode // true for utf-8 / utf-16le / utf-16be
  #multibyte // true for encodings listed in multibyteSet
  #chunk // unfinished trailing bytes kept from the previous streamed chunk, if any
  #canBOM // whether a BOM at the start of the next decoded data should be stripped

  /**
   * @param {string} [encoding='utf-8'] - encoding label, resolved with normalizeEncoding()
   * @param {object|null} [options] - `{ fatal, ignoreBOM }`, both coerced to boolean
   * @throws {TypeError} when options is not an object
   * @throws {RangeError} when the label is unknown or resolves to 'replacement'
   */
  constructor(encoding = 'utf-8', options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    const enc = normalizeEncoding(encoding)
    if (enc === 'replacement') throw new RangeError(E_ENCODING)
    define(this, 'encoding', enc)
    // null passes the typeof check above and stands for an empty options dictionary, so don't crash on it
    define(this, 'fatal', Boolean(options?.fatal))
    define(this, 'ignoreBOM', Boolean(options?.ignoreBOM))
    this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
    this.#multibyte = !this.#unicode && multibyteSet.has(enc)
    this.#canBOM = this.#unicode && !this.ignoreBOM
  }

  get [Symbol.toStringTag]() {
    return 'TextDecoder'
  }

  /**
   * Decodes bytes to a string.
   *
   * @param {Uint8Array|ArrayBuffer|ArrayBufferView|SharedArrayBuffer} [input] - bytes, undefined means empty input
   * @param {object|null} [options] - `{ stream }`, truthy when more chunks will follow
   * @returns {string}
   * @throws {TypeError} when options is not an object or input has an unsupported type
   * @throws {Error} for multi-byte encodings when no multi-byte decoder factory is registered
   */
  decode(input, options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    const stream = Boolean(options?.stream) // null options are treated as empty, same as in the constructor
    let u = input === undefined ? new Uint8Array() : fromSource(input)

    if (this.#unicode) {
      let prefix
      if (this.#chunk) {
        if (u.length === 0) {
          if (stream) return '' // no change
          u = this.#chunk // process as final chunk to handle errors and state changes
        } else if (u.length < 3) {
          // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
          const a = new Uint8Array(u.length + this.#chunk.length)
          a.set(this.#chunk)
          a.set(u, this.#chunk.length)
          u = a
        } else {
          // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
          const t = new Uint8Array(this.#chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
          t.set(this.#chunk)
          t.set(u.subarray(0, 3), this.#chunk.length)

          // Stop at the first offset where unfinished bytes reaches 0 or fits into u
          // If that doesn't happen (u too short), just concat chunk and u completely
          for (let i = 1; i <= 3; i++) {
            const unfinished = unfinishedBytes(t, this.#chunk.length + i, this.encoding) // 0-3
            if (unfinished <= i) {
              // Always reachable at 3, but we still need 'unfinished' value for it
              const add = i - unfinished // 0-3
              prefix = add > 0 ? t.subarray(0, this.#chunk.length + add) : this.#chunk
              if (add > 0) u = u.subarray(add)
              break
            }
          }
        }

        this.#chunk = null
      } else if (u.byteLength === 0) {
        if (!stream) this.#canBOM = !this.ignoreBOM
        return ''
      }

      // For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
      // For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
      let suffix = ''
      if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
        const trail = unfinishedBytes(u, u.byteLength, this.encoding)
        if (trail > 0) {
          if (stream) {
            this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
          } else {
            // non-fatal mode as already checked
            suffix = replacementChar
          }

          u = u.subarray(0, -trail)
        }
      }

      if (this.#canBOM) {
        const bom = this.#findBom(prefix ?? u)
        if (bom) {
          if (stream) this.#canBOM = false
          if (prefix) {
            prefix = prefix.subarray(bom)
          } else {
            u = u.subarray(bom)
          }
        }
      }

      if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
      try {
        const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
        if (res.length > 0 && stream) this.#canBOM = false

        if (!stream) this.#canBOM = !this.ignoreBOM
        return res
      } catch (err) {
        this.#chunk = null // reset unfinished chunk on errors
        // A failed final (non-stream) call still ends the stream, so the next decode() starts fresh
        // and must be able to strip a BOM again, same as after a successful final call
        if (!stream) this.#canBOM = !this.ignoreBOM
        throw err
      }

      // eslint-disable-next-line no-else-return
    } else if (this.#multibyte) {
      if (!createMultibyteDecoder) throw new Error(E_MULTI)
      if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
      return this.#decode(u, stream)
    } else {
      if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
      return this.#decode(u)
    }
  }

  // Returns the byte length of the BOM for this.encoding at the start of u, or 0 if there is none
  #findBom(u) {
    switch (this.encoding) {
      case 'utf-8':
        return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
      case 'utf-16le':
        return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
      case 'utf-16be':
        return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
    }

    throw new Error('Unreachable')
  }
}
198
+
199
/**
 * TextEncoder implementation following the WHATWG Encoding API shape, always encodes to UTF-8.
 * Non-string inputs are converted to strings first.
 */
export class TextEncoder {
  constructor() {
    define(this, 'encoding', 'utf-8')
  }

  get [Symbol.toStringTag]() {
    return 'TextEncoder'
  }

  /**
   * Encodes a string to UTF-8 using the loose encoder.
   *
   * @param {string} [str=''] - input, converted to string if it is not one
   * @returns {Uint8Array} encoded bytes, always with byteOffset 0
   */
  encode(str = '') {
    if (typeof str !== 'string') str = `${str}`
    const res = utf8fromStringLoose(str)
    return res.byteOffset === 0 ? res : res.slice(0) // Ensure 0-offset. TODO: do we need this?
  }

  /**
   * Encodes as much of `str` as fits into `target` without splitting a character.
   *
   * @param {string} str - input, converted to string if it is not one
   * @param {Uint8Array} target - destination, written from index 0
   * @returns {{ read: number, written: number }} UTF-16 code units consumed and bytes written
   * @throws {TypeError} when target is not a Uint8Array
   */
  encodeInto(str, target) {
    if (typeof str !== 'string') str = `${str}`
    if (!(target instanceof Uint8Array)) throw new TypeError('Target must be an Uint8Array')
    if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved

    const tlen = target.length
    // Every UTF-16 code unit takes at least one byte, so anything past tlen units can never fit
    if (tlen < str.length) str = str.slice(0, tlen)
    let u8 = utf8fromStringLoose(str)
    let read
    if (tlen >= u8.length) {
      read = str.length
    } else {
      // Here str.length <= tlen < u8.length, so the output is never same-length (pure ASCII):
      // a dedicated ASCII truncation branch would be unreachable and is not needed
      u8 = u8.subarray(0, tlen)
      const unfinished = unfinishedBytes(u8, u8.length, 'utf-8')
      if (unfinished > 0) u8 = u8.subarray(0, u8.length - unfinished) // drop the split trailing character

      // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
      // Each unpaired surrogate (1 charcode) is replaced with a single charcode
      read = utf8toStringLoose(u8).length // FIXME: Converting back is very inefficient
    }

    try {
      target.set(u8)
    } catch {
      return { read: 0, written: 0 } // see above, likely detached but no .detached property support
    }

    return { read, written: u8.length }
  }
}
247
+
248
/**
 * Detects a byte order mark at the start of the input.
 *
 * Warning: unlike whatwg-encoding, this returns lowercased names. Labels are case-insensitive
 * and lowercase is how the TextDecoder `encoding` getter normalizes them.
 *
 * @param {Uint8Array|ArrayBuffer|ArrayBufferView|SharedArrayBuffer} input - bytes to inspect
 * @returns {'utf-8'|'utf-16le'|'utf-16be'|null} encoding indicated by the BOM, or null without one
 * @throws {TypeError} when input has an unsupported type
 */
export function getBOMEncoding(input) {
  const bytes = fromSource(input) // asserts the input type
  if (bytes.length < 2) return null // too short for any BOM

  const first = bytes[0]
  const second = bytes[1]
  // bytes[2] is undefined on 2-byte input, so this can only match with 3 or more bytes
  if (first === 0xef && second === 0xbb && bytes[2] === 0xbf) return 'utf-8'
  if (first === 0xff && second === 0xfe) return 'utf-16le'
  if (first === 0xfe && second === 0xff) return 'utf-16be'
  return null
}
258
+
259
/**
 * Lossy one-shot decode, see https://encoding.spec.whatwg.org/#decode
 *
 * Warning: an encoding sniffed from a BOM takes preference over the supplied one
 * ("the byte order mark is more authoritative than anything else").
 * When a BOM is found, fallbackEncoding is ignored completely and not even validated.
 * Warning: lossy, always performs replacement, there is no option of throwing on malformed input.
 *
 * @param {Uint8Array|ArrayBuffer|ArrayBufferView|SharedArrayBuffer} input - bytes to decode
 * @param {string} [fallbackEncoding] - normalized (lowercase) encoding name, 'utf-8' when omitted
 * @returns {string}
 * @throws {TypeError} when input has an unsupported type
 * @throws {RangeError} when the effective encoding is unknown or is 'replacement'
 * @throws {Error} for multi-byte encodings when no multi-byte decoder factory is registered
 */
export function legacyHookDecode(input, fallbackEncoding) {
  const source = fromSource(input)
  const bomEncoding = getBOMEncoding(source)
  const enc = bomEncoding ?? fallbackEncoding ?? 'utf-8'

  // Skip the BOM itself: 3 bytes for utf-8, 2 bytes for both utf-16 variants
  let u8 = source
  if (bomEncoding) {
    const bomLength = bomEncoding === 'utf-8' ? 3 : 2
    u8 = source.subarray(bomLength)
  }

  switch (enc) {
    case 'utf-8':
      return utf8toStringLoose(u8)
    case 'utf-16le':
      return utf16toStringLoose(u8, 'uint8-le')
    case 'utf-16be':
      return utf16toStringLoose(u8, 'uint8-be')
  }

  if (!Object.hasOwn(labels, enc) || enc === 'replacement') throw new RangeError(E_ENCODING)

  if (!multibyteSet.has(enc)) return createSinglebyteDecoder(enc, true)(u8)

  if (!createMultibyteDecoder) throw new Error(E_MULTI)
  const decodeMultibyte = createMultibyteDecoder(enc, true)
  return decodeMultibyte(u8)
}
@@ -0,0 +1,46 @@
1
+ // See https://encoding.spec.whatwg.org/#names-and-labels
2
+
3
+ /* eslint-disable @exodus/export-default/named */
4
+ // prettier-ignore
5
+ export default {
6
+ 'utf-8': ['unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'utf8', 'x-unicode20utf8'],
7
+ ibm866: ['866', 'cp866', 'csibm866'],
8
+ 'iso-8859-2': ['csisolatin2', 'iso-ir-101', 'iso8859-2', 'iso88592', 'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2'],
9
+ 'iso-8859-3': ['csisolatin3', 'iso-ir-109', 'iso8859-3', 'iso88593', 'iso_8859-3', 'iso_8859-3:1988', 'l3', 'latin3'],
10
+ 'iso-8859-4': ['csisolatin4', 'iso-ir-110', 'iso8859-4', 'iso88594', 'iso_8859-4', 'iso_8859-4:1988', 'l4', 'latin4'],
11
+ 'iso-8859-5': ['csisolatincyrillic', 'cyrillic', 'iso-ir-144', 'iso8859-5', 'iso88595', 'iso_8859-5', 'iso_8859-5:1988'],
12
+ 'iso-8859-6': ['arabic', 'asmo-708', 'csiso88596e', 'csiso88596i', 'csisolatinarabic', 'ecma-114', 'iso-8859-6-e', 'iso-8859-6-i', 'iso-ir-127', 'iso8859-6', 'iso88596', 'iso_8859-6', 'iso_8859-6:1987'],
13
+ 'iso-8859-7': ['csisolatingreek', 'ecma-118', 'elot_928', 'greek', 'greek8', 'iso-ir-126', 'iso8859-7', 'iso88597', 'iso_8859-7', 'iso_8859-7:1987', 'sun_eu_greek'],
14
+ 'iso-8859-8': ['csiso88598e', 'csisolatinhebrew', 'hebrew', 'iso-8859-8-e', 'iso-ir-138', 'iso8859-8', 'iso88598', 'iso_8859-8', 'iso_8859-8:1988', 'visual'],
15
+ 'iso-8859-8-i': ['csiso88598i', 'logical'],
16
+ 'iso-8859-10': ['csisolatin6', 'iso-ir-157', 'iso8859-10', 'iso885910', 'l6', 'latin6'],
17
+ 'iso-8859-13': ['iso8859-13', 'iso885913'],
18
+ 'iso-8859-14': ['iso8859-14', 'iso885914'],
19
+ 'iso-8859-15': ['csisolatin9', 'iso8859-15', 'iso885915', 'iso_8859-15', 'l9'],
20
+ 'iso-8859-16': [],
21
+ 'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8_r'],
22
+ 'koi8-u': ['koi8-ru'],
23
+ macintosh: ['csmacintosh', 'mac', 'x-mac-roman'],
24
+ 'windows-874': ['dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620'],
25
+ 'windows-1250': ['cp1250', 'x-cp1250'],
26
+ 'windows-1251': ['cp1251', 'x-cp1251'],
27
+ 'windows-1252': ['ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'x-cp1252'],
28
+ 'windows-1253': ['cp1253', 'x-cp1253'],
29
+ 'windows-1254': ['cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'x-cp1254'],
30
+ 'windows-1255': ['cp1255', 'x-cp1255'],
31
+ 'windows-1256': ['cp1256', 'x-cp1256'],
32
+ 'windows-1257': ['cp1257', 'x-cp1257'],
33
+ 'windows-1258': ['cp1258', 'x-cp1258'],
34
+ 'x-mac-cyrillic': ['x-mac-ukrainian'],
35
+ gbk: ['chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312', 'gb_2312-80', 'iso-ir-58', 'x-gbk'],
36
+ gb18030: [],
37
+ big5: ['big5-hkscs', 'cn-big5', 'csbig5', 'x-x-big5'],
38
+ 'euc-jp': ['cseucpkdfmtjapanese', 'x-euc-jp'],
39
+ 'iso-2022-jp': ['csiso2022jp'],
40
+ shift_jis: ['csshiftjis', 'ms932', 'ms_kanji', 'shift-jis', 'sjis', 'windows-31j', 'x-sjis'],
41
+ 'euc-kr': ['cseuckr', 'csksc56011987', 'iso-ir-149', 'korean', 'ks_c_5601-1987', 'ks_c_5601-1989', 'ksc5601', 'ksc_5601', 'windows-949'],
42
+ replacement: ['csiso2022kr', 'hz-gb-2312', 'iso-2022-cn', 'iso-2022-cn-ext', 'iso-2022-kr'],
43
+ 'utf-16be': ['unicodefffe'],
44
+ 'utf-16le': ['csunicode', 'iso-10646-ucs-2', 'ucs-2', 'unicode', 'unicodefeff', 'utf-16'],
45
+ 'x-user-defined': [],
46
+ }
@@ -0,0 +1,34 @@
1
/**
 * Counts how many bytes at the end of u[0..len) belong to a sequence that is not finished yet
 * and could still be completed by more input.
 *
 * @param {Uint8Array} u - bytes
 * @param {number} len - number of bytes of `u` to consider
 * @param {string} enc - 'utf-8', 'utf-16le' or 'utf-16be'
 * @returns {number} 0-3
 * @throws {Error} on any other encoding
 */
export function unfinishedBytes(u, len, enc) {
  if (enc === 'utf-8') {
    // Walk back over at most 2 continuation bytes (10xxxxxx)
    let trailing = 0
    while (trailing < 2 && trailing < len && (u[len - trailing - 1] & 0xc0) === 0x80) trailing++
    if (trailing === len) return 0 // only continuation bytes, there is no room for a lead

    const lead = u[len - trailing - 1]
    if (lead < 0xc2 || lead > 0xf4) return 0 // not a lead byte
    if (trailing === 0) return 1 // a lead alone is always unfinished, 2-byte leads can only return here

    const twoByteLead = lead < 0xe0
    const threeByteDone = lead < 0xf0 && trailing >= 2
    if (twoByteLead || threeByteDone) return 0 // the sequence already has all of its bytes

    // The byte right after the lead has a narrower allowed range for some leads
    let lower = 0x80
    if (lead === 0xf0) lower = 0x90
    else if (lead === 0xe0) lower = 0xa0

    let upper = 0xbf
    if (lead === 0xf4) upper = 0x8f
    else if (lead === 0xed) upper = 0x9f

    const second = u[len - trailing]
    if (second < lower || second > upper) return 0 // can't become valid, so nothing to wait for
    return trailing + 1
  }

  if (enc === 'utf-16le' || enc === 'utf-16be') {
    const odd = len % 2 === 0 ? 0 : 1 // a dangling single byte
    const even = len - odd
    if (even < 2) return odd

    // Last complete 16-bit code unit, assembled according to the byte order
    const little = enc === 'utf-16le'
    const high = little ? u[even - 1] : u[even - 2]
    const low = little ? u[even - 2] : u[even - 1]
    const unit = (high << 8) ^ low
    const loneLead = unit >= 0xd8_00 && unit < 0xdc_00
    return loneLead ? odd + 2 : odd
  }

  throw new Error('Unsupported encoding')
}
@@ -0,0 +1,127 @@
1
+ import { assertUint8 } from '../assert.js'
2
+ import { nativeDecoder, nativeEncoder, decode2string } from './_utils.js'
3
+ import { encodeAscii, decodeAscii } from './latin1.js'
4
+
5
+ let hexArray // array of 256 bytes converted to two-char hex strings
6
+ let hexCodes // hexArray converted to u16 code pairs
7
+ let dehexArray
8
+ const _00 = 0x30_30 // '00' string in hex, the only allowed char pair to generate 0 byte
9
+ const _ff = 0x66_66 // 'ff' string in hex, max allowed char pair (larger than 'FF' string)
10
+ const allowed = '0123456789ABCDEFabcdef'
11
+
12
+ export const E_HEX = 'Input is not a hex string'
13
+
14
/**
 * Converts bytes to a lowercase hex string.
 *
 * @param {Uint8Array} arr - input bytes, validated with assertUint8()
 * @returns {string} two hex chars per input byte
 */
export function toHex(arr) {
  assertUint8(arr)

  // Lazily built table: byte -> two-char lowercase hex string
  if (!hexArray) {
    hexArray = Array.from({ length: 256 }, (_, byte) => byte.toString(16).padStart(2, '0'))
  }

  const length = arr.length // this helps Hermes

  // No native decoder available: build the string from the precomputed pairs
  if (!nativeDecoder) return decode2string(arr, 0, length, hexArray)

  // Only old browsers use this, barebone engines don't have TextDecoder
  // But Hermes can use this when it (hopefully) implements TextDecoder
  if (!hexCodes) {
    // Each u16 slot holds the two ASCII chars of a pair, written byte-wise so the in-memory order is fixed
    const table = new Uint16Array(256)
    const tableBytes = new Uint8Array(table.buffer, table.byteOffset, table.byteLength)
    hexArray.forEach((pair, byte) => {
      tableBytes[2 * byte] = pair.charCodeAt(0)
      tableBytes[2 * byte + 1] = pair.charCodeAt(1)
    })
    hexCodes = table
  }

  const out = new Uint16Array(length)
  let pos = 0
  // Main loop is unrolled 4x
  for (const last3 = length - 3; ; pos += 4) {
    if (pos >= last3) break // loop is fast enough for moving this here to be useful on JSC
    const b0 = arr[pos]
    const b1 = arr[pos + 1]
    const b2 = arr[pos + 2]
    const b3 = arr[pos + 3]
    out[pos] = hexCodes[b0]
    out[pos + 1] = hexCodes[b1]
    out[pos + 2] = hexCodes[b2]
    out[pos + 3] = hexCodes[b3]
  }

  // Remaining 0-3 bytes
  for (; pos < length; pos++) out[pos] = hexCodes[arr[pos]]

  return decodeAscii(out) // the underlying bytes of `out` are the ASCII hex chars
}
53
+
54
/**
 * Parses a hex string (upper, lower or mixed case) into bytes.
 *
 * @param {string} str - hex string of even length
 * @returns {Uint8Array} decoded bytes
 * @throws {TypeError} when str is not a string
 * @throws {SyntaxError} when str has odd length or contains a non-hex character
 */
export function fromHex(str) {
  if (typeof str !== 'string') throw new TypeError('Input is not a string')
  if (str.length % 2 !== 0) throw new SyntaxError(E_HEX)

  const length = str.length / 2 // this helps Hermes in loops
  const arr = new Uint8Array(length)

  if (!nativeEncoder) {
    if (!dehexArray) {
      // no regex input validation here, so we map all other bytes to -1 and recheck sign
      // non-ASCII chars throw already though, so we should process only 0-127
      const table = new Int8Array(128).fill(-1)
      for (let nibble = 0; nibble < 16; nibble++) {
        const digit = nibble.toString(16)
        table[digit.charCodeAt(0)] = nibble
        table[digit.toUpperCase().charCodeAt(0)] = nibble
      }

      dehexArray = table
    }

    for (let i = 0, j = 0; i < length; i++, j += 2) {
      const hi = str.charCodeAt(j)
      const lo = str.charCodeAt(j + 1)
      const byte = (dehexArray[hi] << 4) | dehexArray[lo]
      // A negative result means a non-hex char, the mask check rejects anything outside of 0-127
      if (byte < 0 || (0x7f | hi | lo) !== 0x7f) throw new SyntaxError(E_HEX)
      arr[i] = byte
    }

    return arr
  }

  // Native encoder path is beneficial even for small arrays in Hermes
  if (!dehexArray) {
    const table = new Uint8Array(_ff + 1) // 26 KiB cache, >2x perf improvement on Hermes
    const pairBytes = new Uint8Array(2)
    const pairCode = new Uint16Array(pairBytes.buffer, pairBytes.byteOffset, 1) // for endianness-agnostic transform
    const digits = [...allowed].map((c) => [c.charCodeAt(0), parseInt(c, 16)])
    for (const [highChar, highValue] of digits) {
      pairBytes[0] = highChar // first we read high hex char
      for (const [lowChar, lowValue] of digits) {
        pairBytes[1] = lowChar // then we read low hex char
        table[pairCode[0]] = (highValue << 4) | lowValue
      }
    }

    dehexArray = table
  }

  const codes = encodeAscii(str, E_HEX)
  const pairs = new Uint16Array(codes.buffer, codes.byteOffset, codes.byteLength / 2)
  let i = 0
  // Main loop is unrolled 4x. A zero table entry is valid only for the '00' pair, anything else is not hex
  for (const last3 = length - 3; i < last3; i += 4) {
    const p0 = pairs[i]
    const p1 = pairs[i + 1]
    const p2 = pairs[i + 2]
    const p3 = pairs[i + 3]
    const v0 = dehexArray[p0]
    const v1 = dehexArray[p1]
    const v2 = dehexArray[p2]
    const v3 = dehexArray[p3]
    if ((!v0 && p0 !== _00) || (!v1 && p1 !== _00) || (!v2 && p2 !== _00) || (!v3 && p3 !== _00)) {
      throw new SyntaxError(E_HEX)
    }

    arr[i] = v0
    arr[i + 1] = v1
    arr[i + 2] = v2
    arr[i + 3] = v3
  }

  // Remaining 0-3 bytes
  for (; i < length; i++) {
    const pair = pairs[i]
    const value = dehexArray[pair]
    if (!value && pair !== _00) throw new SyntaxError(E_HEX)
    arr[i] = value
  }

  return arr
}
@@ -0,0 +1,120 @@
1
+ import {
2
+ nativeEncoder,
3
+ nativeDecoder,
4
+ nativeDecoderLatin1,
5
+ nativeBuffer,
6
+ isHermes,
7
+ isDeno,
8
+ } from './_utils.js'
9
+
10
+ // See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
11
+ // On Hermes, actual max is 0x20_000 minus current stack depth, 1/16 of that should be safe
12
+ const maxFunctionArgs = 0x20_00
13
+
14
/**
 * Returns the length of the leading run of ASCII bytes (all below 0x80) in arr.
 *
 * @param {Uint8Array} arr - input bytes
 * @returns {number} index of the first byte >= 0x80, or arr.length if there is none
 */
export function asciiPrefix(arr) {
  const length = arr.length
  let verified = 0 // count of leading bytes known to be ascii

  // Threshold tested on Hermes (worse on <=48, better on >=52)
  // Also on v8 arrs of size <=64 might be on heap and using Uint32Array on them is suboptimal
  if (length > 64) {
    const highBits = 0x80_80_80_80 // top bit of each of the 4 bytes in a word

    // Check bytes one by one until the address is 4-byte aligned
    const misaligned = (4 - (arr.byteOffset & 3)) % 4
    while (verified < misaligned) {
      if (arr[verified] >= 0x80) return verified
      verified++
    }

    // Speedup with u32: scan 4 bytes at once
    const wordCount = ((arr.byteLength - misaligned) / 4) | 0
    const words = new Uint32Array(arr.buffer, arr.byteOffset + misaligned, wordCount)
    let w = 0
    for (const last3 = wordCount - 3; ; verified += 16, w += 4) {
      if (w >= last3) break // loop is fast enough for moving this here to be _very_ useful, likely due to array access checks
      const a = words[w]
      const b = words[w + 1]
      const c = words[w + 2]
      const d = words[w + 3]
      if ((a | b | c | d) & highBits) break // some byte in these 16 is not ascii
    }

    // Narrow down to the first word that has a non-ascii byte
    while (w < wordCount && !(words[w] & highBits)) {
      verified += 4
      w++
    }
  }

  // Byte-wise tail: short inputs, the word with the offending byte, and bytes past the last full word
  while (verified < length) {
    if (arr[verified] >= 0x80) return verified
    verified++
  }

  return length
}
41
+
42
/**
 * Converts array elements in [start, stop) to a string, one char code per element.
 * Capable of decoding Uint16Array to UTF-16 as well as Uint8Array to Latin-1.
 *
 * @param {Uint8Array|Uint16Array} arr - char codes
 * @param {number} [start=0] - first index, truncated to a 32-bit integer
 * @param {number} [stop=arr.length] - end index (exclusive), truncated to a 32-bit integer
 * @returns {string}
 */
export function decodeLatin1(arr, start = 0, stop = arr.length) {
  const from = start | 0
  const to = stop | 0
  const total = to - from
  if (total === 0) return ''

  if (total <= maxFunctionArgs) {
    // Fits into a single call, avoid creating a subarray when the whole input is used
    const whole = from === 0 && to === arr.length
    return String.fromCharCode.apply(String, whole ? arr : arr.subarray(from, to))
  }

  // Too many elements to pass as arguments at once, convert in chunks of maxFunctionArgs
  let result = ''
  for (let pos = from; pos < to; pos += maxFunctionArgs) {
    const end = Math.min(to, pos + maxFunctionArgs)
    result += String.fromCharCode.apply(String, arr.subarray(pos, end))
  }

  return result
}
62
+
63
// Decodes a typed array whose bytes are known to be ASCII into a string
// Does not check input, uses best available method (selected once, at module load)
// Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
export const decodeAscii = (() => {
  if (nativeBuffer) {
    return (a) => {
      const size = a.byteLength
      // On Node.js, utf8 decoder is faster than latin1
      if (size < 0x3_00 || isDeno) return nativeDecoder.decode(a)
      // Buffer is faster on Node.js (but only for long enough data), if we know that output is ascii
      // .latin1Slice is faster than .asciiSlice
      return nativeBuffer.from(a.buffer, a.byteOffset, size).latin1Slice(0, size)
    }
  }

  // On browsers (specifically WebKit), latin1 decoder is faster than utf8
  if (nativeDecoderLatin1) return (a) => nativeDecoderLatin1.decode(a)

  // Fallback. We shouldn't get here, constructing with strings directly is faster
  return (a) => decodeLatin1(new Uint8Array(a.buffer, a.byteOffset, a.byteLength))
})()
74
+
75
+ /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
76
+
77
// Writes the char code of every UTF-16 code unit of `str` into `arr` at the same index, returns `arr`
// No range checks are done here, the implementation is selected once, at module load
export const encodeCharcodes = (() => {
  if (!isHermes) {
    return (str, arr) => {
      // Can be optimized with unrolling, but this is not used on non-Hermes atm
      const count = str.length
      for (let pos = 0; pos < count; pos++) arr[pos] = str.charCodeAt(pos)
      return arr
    }
  }

  return (str, arr) => {
    const count = str.length
    if (count <= 64) {
      for (let pos = 0; pos < count; pos++) arr[pos] = str.charCodeAt(pos)
      return arr
    }

    // A bound charCodeAt is faster on strings from ~64 chars on Hermes, but can be 10x slower on e.g. JSC
    const codeAt = str.charCodeAt.bind(str)
    for (let pos = 0; pos < count; pos++) arr[pos] = codeAt(pos)
    return arr
  }
})()
95
+
96
+ /* eslint-enable @exodus/mutable/no-param-reassign-prop-only */
97
+
98
// Encodes `str` into a new Uint8Array with one byte per UTF-16 code unit, via encodeCharcodes
// The input is not validated here
export const encodeLatin1 = (str) => {
  const bytes = new Uint8Array(str.length)
  return encodeCharcodes(str, bytes)
}
99
+
100
// Encodes an ASCII-only string to bytes, throws SyntaxError(ERR) if `str` contains any non-ASCII char
// The check is by length: UTF-8 output has the same length as the string only when every char is ASCII
// Expects nativeEncoder to be present. The implementation is selected once, at module load
export const encodeAscii = (() => {
  if (isHermes) {
    // Much faster in Hermes
    return (str, ERR) => {
      const size = str.length
      const codes = new Uint8Array(size + 4) // overshoot by a full utf8 char
      const { read, written } = nativeEncoder.encodeInto(str, codes)
      if (read !== size || written !== size) throw new SyntaxError(ERR) // non-ascii
      return codes.subarray(0, size)
    }
  }

  if (nativeBuffer) {
    // TextEncoder is slow on Node.js 24 / 25 (was ok on 22)
    return (str, ERR) => {
      const encoded = nativeBuffer.from(str, 'utf8') // ascii/latin1 coerces, we need to check
      if (encoded.length !== str.length) throw new SyntaxError(ERR) // non-ascii
      return new Uint8Array(encoded.buffer, encoded.byteOffset, encoded.byteLength)
    }
  }

  return (str, ERR) => {
    const encoded = nativeEncoder.encode(str)
    if (encoded.length !== str.length) throw new SyntaxError(ERR) // non-ascii
    return encoded
  }
})()
@@ -0,0 +1 @@
1
+ module.exports = () => require('./multi-byte.encodings.json') // lazy-load