@exodus/bytes 1.0.0-rc.1 → 1.0.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +169 -2
- package/array.js +1 -1
- package/assert.js +10 -6
- package/base32.js +40 -0
- package/base58.js +212 -0
- package/base58check.js +69 -0
- package/base64.js +133 -161
- package/bech32.js +254 -0
- package/encoding-lite.js +7 -0
- package/encoding.js +12 -0
- package/fallback/_utils.js +118 -0
- package/fallback/base32.js +233 -0
- package/fallback/base64.js +192 -0
- package/fallback/encoding.js +279 -0
- package/fallback/encoding.labels.js +46 -0
- package/fallback/encoding.util.js +34 -0
- package/fallback/hex.js +127 -0
- package/fallback/latin1.js +120 -0
- package/fallback/multi-byte.encodings.cjs +1 -0
- package/fallback/multi-byte.encodings.json +545 -0
- package/fallback/multi-byte.js +448 -0
- package/fallback/multi-byte.table.js +114 -0
- package/fallback/single-byte.encodings.js +45 -0
- package/fallback/single-byte.js +83 -0
- package/fallback/utf16.js +180 -0
- package/fallback/utf8.js +245 -0
- package/hex.js +12 -71
- package/hex.node.js +28 -0
- package/multi-byte.js +13 -0
- package/multi-byte.node.js +25 -0
- package/package.json +105 -14
- package/single-byte.js +55 -0
- package/single-byte.node.js +62 -0
- package/utf16.js +73 -0
- package/utf16.node.js +79 -0
- package/utf8.js +80 -0
- package/utf8.node.js +54 -0
- package/wif.js +42 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
// We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
|
|
2
|
+
// We are also faster than Node.js built-in on both TextEncoder and TextDecoder
|
|
3
|
+
|
|
4
|
+
import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
|
|
5
|
+
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
|
|
6
|
+
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
|
|
7
|
+
import labels from './encoding.labels.js'
|
|
8
|
+
import { unfinishedBytes } from './encoding.util.js'
|
|
9
|
+
|
|
10
|
+
const E_OPTIONS = 'The "options" argument must be of type object'
|
|
11
|
+
const E_ENCODING = 'Unknown encoding'
|
|
12
|
+
const replacementChar = '\uFFFD'
|
|
13
|
+
|
|
14
|
+
const E_MULTI =
|
|
15
|
+
'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
|
|
16
|
+
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
|
|
17
|
+
let createMultibyteDecoder
|
|
18
|
+
|
|
19
|
+
/**
 * Registers the factory used to decode legacy multi-byte encodings.
 *
 * Until this is called, decoding any encoding listed in `multibyteSet` throws an Error (E_MULTI).
 * The factory is invoked as `createDecoder(encoding, loose)` and must return a decode function.
 *
 * @param {Function} createDecoder - multi-byte decoder factory; stored as-is, not validated
 */
export function setMultibyteDecoder(createDecoder) {
  createMultibyteDecoder = createDecoder
}
|
|
22
|
+
|
|
23
|
+
let labelsMap
|
|
24
|
+
|
|
25
|
+
// Warning: unlike whatwg-encoding, returns lowercased labels
|
|
26
|
+
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
|
|
27
|
+
// https://encoding.spec.whatwg.org/#names-and-labels
|
|
28
|
+
/**
 * Resolves an encoding label to the lowercase encoding name used as a key of `labels`.
 *
 * @param {string} label - encoding label; matched case-insensitively, surrounding ASCII whitespace is ignored
 * @returns {string} the normalized (lowercase) encoding name
 * @throws {RangeError} if the label contains disallowed characters or is not a known name or alias
 */
export function normalizeEncoding(label) {
  // Fast path: the most common spellings skip the regex and the lookups entirely
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Slow path: only ASCII word chars, '.', ':', '-' and ASCII whitespace may appear in a label
  if (!/^[\w\t\n\f\r .:-]+$/i.test(label)) throw new RangeError(E_ENCODING)

  const key = `${label}`.trim().toLowerCase()
  if (Object.hasOwn(labels, key)) return key // already an encoding name

  if (!labelsMap) {
    // Build the reverse alias -> name index once, on the first lookup that needs it
    labelsMap = new Map(
      Object.entries(labels).flatMap(([name, aliases]) => aliases.map((alias) => [alias, name]))
    )
  }

  const name = labelsMap.get(key)
  if (!name) throw new RangeError(E_ENCODING)
  return name
}
|
|
47
|
+
|
|
48
|
+
// Defines `key` on `obj` as a read-only data property (also non-enumerable and non-configurable,
// as those descriptor fields are omitted and default to false). Returns `obj`.
const define = (obj, key, value) => {
  const descriptor = { value, writable: false }
  return Object.defineProperty(obj, key, descriptor)
}
|
|
49
|
+
|
|
50
|
+
/**
 * Converts a supported binary input into a Uint8Array over the same memory (no copy).
 *
 * @param {Uint8Array|ArrayBuffer|SharedArrayBuffer|ArrayBufferView} x - input bytes
 * @returns {Uint8Array} `x` itself if it already is a Uint8Array, otherwise a new view on its buffer
 * @throws {TypeError} if `x` is none of the supported types
 */
const fromSource = (x) => {
  if (x instanceof Uint8Array) return x
  if (x instanceof ArrayBuffer) return new Uint8Array(x)
  if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
  if (globalThis.SharedArrayBuffer && x instanceof globalThis.SharedArrayBuffer) {
    // A SharedArrayBuffer is a buffer, not a view: it has no .buffer / .byteOffset.
    // Passing those (undefined) values to the constructor would produce an empty array.
    return new Uint8Array(x)
  }

  throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
}
|
|
60
|
+
|
|
61
|
+
/**
 * Picks the bytes -> string function for one of the Unicode encodings.
 *
 * @param {string} encoding - 'utf-8', 'utf-16le', or anything else (treated as 'utf-16be')
 * @param {boolean} loose - truthy selects the *Loose variant, falsy the strict one
 * @returns {Function} a function taking a single Uint8Array argument
 */
function unicodeDecoder(encoding, loose) {
  // utf-8 is the likely case: the imported functions are returned directly
  if (encoding === 'utf-8') {
    if (loose) return utf8toStringLoose
    return utf8toString
  }

  // Both utf-16 flavours share an implementation and differ only in the byte-order form
  const form = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
  if (loose) return (u) => utf16toStringLoose(u, form)
  return (u) => utf16toString(u, form)
}
|
|
66
|
+
|
|
67
|
+
/**
 * TextDecoder implementation covering Unicode (utf-8 / utf-16le / utf-16be), legacy multi-byte
 * (via the factory registered with setMultibyteDecoder) and single-byte encodings.
 *
 * Exposes read-only `encoding`, `fatal` and `ignoreBOM` properties and a `decode()` method
 * with `{ stream }` support.
 */
export class TextDecoder {
  #decode // decode function for this.encoding, created lazily on first decode()
  #unicode // true for utf-8 / utf-16le / utf-16be
  #multibyte // true for encodings listed in multibyteSet
  #chunk // copy of the unfinished trailing bytes carried over between { stream: true } calls (Unicode only)
  #canBOM // whether a BOM at the start of the next decoded data should still be stripped

  /**
   * @param {string} [encoding='utf-8'] - encoding label, resolved with normalizeEncoding()
   * @param {object|null} [options] - `{ fatal, ignoreBOM }`; null is treated as an empty dictionary
   * @throws {TypeError} if options is neither an object, null nor undefined
   * @throws {RangeError} if the label is unknown or resolves to 'replacement'
   */
  constructor(encoding = 'utf-8', options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    // typeof null is 'object', so null passes the check above; treat it as "no options"
    // instead of failing on a property read of null
    const opts = options ?? {}
    const enc = normalizeEncoding(encoding)
    if (enc === 'replacement') throw new RangeError(E_ENCODING)
    define(this, 'encoding', enc)
    define(this, 'fatal', Boolean(opts.fatal))
    define(this, 'ignoreBOM', Boolean(opts.ignoreBOM))
    this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
    this.#multibyte = !this.#unicode && multibyteSet.has(enc)
    this.#canBOM = this.#unicode && !this.ignoreBOM
  }

  get [Symbol.toStringTag]() {
    return 'TextDecoder'
  }

  /**
   * Decodes bytes into a string.
   *
   * @param {Uint8Array|ArrayBuffer|SharedArrayBuffer|ArrayBufferView} [input] - bytes; undefined means empty input
   * @param {object|null} [options] - `{ stream }`; null is treated as an empty dictionary
   * @returns {string} decoded text
   * @throws {TypeError} if options is neither an object, null nor undefined, or input has an unsupported type
   * @throws {Error} for multi-byte encodings when no multi-byte decoder factory is registered
   */
  decode(input, options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    const stream = Boolean(options?.stream) // ?. so that null behaves like {}
    let u = input === undefined ? new Uint8Array() : fromSource(input)

    if (this.#unicode) {
      let prefix
      if (this.#chunk) {
        if (u.length === 0) {
          if (stream) return '' // no change
          u = this.#chunk // process as final chunk to handle errors and state changes
        } else if (u.length < 3) {
          // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
          const a = new Uint8Array(u.length + this.#chunk.length)
          a.set(this.#chunk)
          a.set(u, this.#chunk.length)
          u = a
        } else {
          // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
          const t = new Uint8Array(this.#chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
          t.set(this.#chunk)
          t.set(u.subarray(0, 3), this.#chunk.length)

          // Stop at the first offset where unfinished bytes reaches 0 or fits into u
          // If that doesn't happen (u too short), just concat chunk and u completely
          for (let i = 1; i <= 3; i++) {
            const unfinished = unfinishedBytes(t, this.#chunk.length + i, this.encoding) // 0-3
            if (unfinished <= i) {
              // Always reachable at 3, but we still need 'unfinished' value for it
              const add = i - unfinished // 0-3
              prefix = add > 0 ? t.subarray(0, this.#chunk.length + add) : this.#chunk
              if (add > 0) u = u.subarray(add)
              break
            }
          }
        }

        this.#chunk = null
      } else if (u.byteLength === 0) {
        if (!stream) this.#canBOM = !this.ignoreBOM
        return ''
      }

      // For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
      // For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
      let suffix = ''
      if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
        const trail = unfinishedBytes(u, u.byteLength, this.encoding)
        if (trail > 0) {
          if (stream) {
            this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
          } else {
            // non-fatal mode as already checked
            suffix = replacementChar
          }

          u = u.subarray(0, -trail)
        }
      }

      if (this.#canBOM) {
        const bom = this.#findBom(prefix ?? u)
        if (bom) {
          if (stream) this.#canBOM = false
          if (prefix) {
            prefix = prefix.subarray(bom)
          } else {
            u = u.subarray(bom)
          }
        }
      }

      if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
      try {
        const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
        if (res.length > 0 && stream) this.#canBOM = false // BOM can only be at the very start of a stream

        if (!stream) this.#canBOM = !this.ignoreBOM // a non-stream call ends the stream, reset for the next one
        return res
      } catch (err) {
        this.#chunk = null // reset unfinished chunk on errors
        throw err
      }

      // eslint-disable-next-line no-else-return
    } else if (this.#multibyte) {
      if (!createMultibyteDecoder) throw new Error(E_MULTI)
      if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
      return this.#decode(u, stream)
    } else {
      if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
      return this.#decode(u)
    }
  }

  // Returns the byte length of a BOM for this.encoding at the start of `u`, or 0 if there is none
  #findBom(u) {
    switch (this.encoding) {
      case 'utf-8':
        return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
      case 'utf-16le':
        return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
      case 'utf-16be':
        return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
    }

    throw new Error('Unreachable')
  }
}
|
|
198
|
+
|
|
199
|
+
/**
 * TextEncoder implementation: always UTF-8, built on utf8fromStringLoose.
 */
export class TextEncoder {
  constructor() {
    define(this, 'encoding', 'utf-8')
  }

  get [Symbol.toStringTag]() {
    return 'TextEncoder'
  }

  /**
   * Encodes a string as UTF-8.
   *
   * @param {*} [str=''] - value to encode; non-strings are converted with a template literal
   * @returns {Uint8Array} encoded bytes, always with byteOffset 0
   */
  encode(str = '') {
    const text = typeof str === 'string' ? str : `${str}`
    const bytes = utf8fromStringLoose(text)
    if (bytes.byteOffset !== 0) return bytes.slice(0) // Ensure 0-offset. TODO: do we need this?
    return bytes
  }

  /**
   * Encodes as much of `str` as fits into `target` without splitting a UTF-8 sequence.
   *
   * @param {*} str - value to encode; non-strings are converted with a template literal
   * @param {Uint8Array} target - destination, written from index 0
   * @returns {{ read: number, written: number }} UTF-16 code units consumed and bytes written
   * @throws {TypeError} if target is not a Uint8Array
   */
  encodeInto(str, target) {
    let text = typeof str === 'string' ? str : `${str}`
    if (!(target instanceof Uint8Array)) throw new TypeError('Target must be an Uint8Array')
    // Until https://github.com/whatwg/encoding/issues/324 is resolved
    if (target.buffer.detached) return { read: 0, written: 0 }

    // Every code unit encodes to at least one byte, so anything past `capacity` chars can't fit
    const capacity = target.length
    if (text.length > capacity) text = text.slice(0, capacity)

    let bytes = utf8fromStringLoose(text)
    let read
    if (bytes.length <= capacity) {
      // Everything fits
      read = text.length
    } else if (bytes.length === text.length) {
      // One byte per char means ascii, which can be truncated at any point
      if (bytes.length > capacity) bytes = bytes.subarray(0, capacity)
      read = bytes.length
    } else {
      // Cut to capacity, then drop the incomplete sequence left at the end, if any
      bytes = bytes.subarray(0, capacity)
      const unfinished = unfinishedBytes(bytes, bytes.length, 'utf-8')
      if (unfinished > 0) bytes = bytes.subarray(0, bytes.length - unfinished)

      // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
      // Each unpaired surrogate (1 charcode) is replaced with a single charcode
      read = utf8toStringLoose(bytes).length // FIXME: Converting back is very inefficient
    }

    try {
      target.set(bytes)
    } catch {
      // see above, likely detached but no .detached property support
      return { read: 0, written: 0 }
    }

    return { read, written: bytes.length }
  }
}
|
|
247
|
+
|
|
248
|
+
// Warning: unlike whatwg-encoding, returns lowercased labels
|
|
249
|
+
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
|
|
250
|
+
/**
 * Detects a byte order mark at the start of the input.
 *
 * @param {Uint8Array|ArrayBuffer|SharedArrayBuffer|ArrayBufferView} input - bytes to inspect
 * @returns {'utf-8'|'utf-16le'|'utf-16be'|null} lowercase encoding name for the BOM found, or null
 * @throws {TypeError} from fromSource() if input has an unsupported type
 */
export function getBOMEncoding(input) {
  const bytes = fromSource(input) // asserts
  const size = bytes.length
  if (size < 2) return null // too short for any BOM

  const first = bytes[0]
  const second = bytes[1]
  if (size >= 3 && first === 0xef && second === 0xbb && bytes[2] === 0xbf) return 'utf-8'
  if (first === 0xff && second === 0xfe) return 'utf-16le'
  if (first === 0xfe && second === 0xff) return 'utf-16be'
  return null
}
|
|
258
|
+
|
|
259
|
+
// https://encoding.spec.whatwg.org/#decode
|
|
260
|
+
// Warning: encoding sniffed from BOM takes preference over the supplied one
|
|
261
|
+
// Warning: lossy, performs replacement, no option of throwing
|
|
262
|
+
// Expects normalized (lower-case) encoding as input. Completely ignores it and even skips validation when BOM is found
|
|
263
|
+
/**
 * Lossy decode in which a BOM, when present, overrides the supplied encoding.
 *
 * @param {Uint8Array|ArrayBuffer|SharedArrayBuffer|ArrayBufferView} input - bytes to decode
 * @param {string} [fallbackEncoding] - normalized (lowercase) encoding name used when there is no BOM;
 *   'utf-8' if null or undefined. It is not looked at, or validated, when a BOM is found
 * @returns {string} decoded text; decoding is always loose
 * @throws {RangeError} if the effective encoding is unknown or is 'replacement'
 * @throws {Error} for multi-byte encodings when no multi-byte decoder factory is registered
 */
export function legacyHookDecode(input, fallbackEncoding) {
  const source = fromSource(input)
  const bomEncoding = getBOMEncoding(source)
  // Drop the BOM itself: 3 bytes for utf-8, 2 for either utf-16 flavour
  const u8 = bomEncoding ? source.subarray(bomEncoding === 'utf-8' ? 3 : 2) : source

  // "the byte order mark is more authoritative than anything else"
  const enc = bomEncoding ?? fallbackEncoding ?? 'utf-8'
  switch (enc) {
    case 'utf-8':
      return utf8toStringLoose(u8)
    case 'utf-16le':
      return utf16toStringLoose(u8, 'uint8-le')
    case 'utf-16be':
      return utf16toStringLoose(u8, 'uint8-be')
  }

  if (!Object.hasOwn(labels, enc) || enc === 'replacement') throw new RangeError(E_ENCODING)

  if (!multibyteSet.has(enc)) return createSinglebyteDecoder(enc, true)(u8)

  if (!createMultibyteDecoder) throw new Error(E_MULTI)
  return createMultibyteDecoder(enc, true)(u8)
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// See https://encoding.spec.whatwg.org/#names-and-labels
|
|
2
|
+
|
|
3
|
+
/* eslint-disable @exodus/export-default/named */
|
|
4
|
+
// prettier-ignore
|
|
5
|
+
export default {
|
|
6
|
+
'utf-8': ['unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'utf8', 'x-unicode20utf8'],
|
|
7
|
+
ibm866: ['866', 'cp866', 'csibm866'],
|
|
8
|
+
'iso-8859-2': ['csisolatin2', 'iso-ir-101', 'iso8859-2', 'iso88592', 'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2'],
|
|
9
|
+
'iso-8859-3': ['csisolatin3', 'iso-ir-109', 'iso8859-3', 'iso88593', 'iso_8859-3', 'iso_8859-3:1988', 'l3', 'latin3'],
|
|
10
|
+
'iso-8859-4': ['csisolatin4', 'iso-ir-110', 'iso8859-4', 'iso88594', 'iso_8859-4', 'iso_8859-4:1988', 'l4', 'latin4'],
|
|
11
|
+
'iso-8859-5': ['csisolatincyrillic', 'cyrillic', 'iso-ir-144', 'iso8859-5', 'iso88595', 'iso_8859-5', 'iso_8859-5:1988'],
|
|
12
|
+
'iso-8859-6': ['arabic', 'asmo-708', 'csiso88596e', 'csiso88596i', 'csisolatinarabic', 'ecma-114', 'iso-8859-6-e', 'iso-8859-6-i', 'iso-ir-127', 'iso8859-6', 'iso88596', 'iso_8859-6', 'iso_8859-6:1987'],
|
|
13
|
+
'iso-8859-7': ['csisolatingreek', 'ecma-118', 'elot_928', 'greek', 'greek8', 'iso-ir-126', 'iso8859-7', 'iso88597', 'iso_8859-7', 'iso_8859-7:1987', 'sun_eu_greek'],
|
|
14
|
+
'iso-8859-8': ['csiso88598e', 'csisolatinhebrew', 'hebrew', 'iso-8859-8-e', 'iso-ir-138', 'iso8859-8', 'iso88598', 'iso_8859-8', 'iso_8859-8:1988', 'visual'],
|
|
15
|
+
'iso-8859-8-i': ['csiso88598i', 'logical'],
|
|
16
|
+
'iso-8859-10': ['csisolatin6', 'iso-ir-157', 'iso8859-10', 'iso885910', 'l6', 'latin6'],
|
|
17
|
+
'iso-8859-13': ['iso8859-13', 'iso885913'],
|
|
18
|
+
'iso-8859-14': ['iso8859-14', 'iso885914'],
|
|
19
|
+
'iso-8859-15': ['csisolatin9', 'iso8859-15', 'iso885915', 'iso_8859-15', 'l9'],
|
|
20
|
+
'iso-8859-16': [],
|
|
21
|
+
'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8_r'],
|
|
22
|
+
'koi8-u': ['koi8-ru'],
|
|
23
|
+
macintosh: ['csmacintosh', 'mac', 'x-mac-roman'],
|
|
24
|
+
'windows-874': ['dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620'],
|
|
25
|
+
'windows-1250': ['cp1250', 'x-cp1250'],
|
|
26
|
+
'windows-1251': ['cp1251', 'x-cp1251'],
|
|
27
|
+
'windows-1252': ['ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'x-cp1252'],
|
|
28
|
+
'windows-1253': ['cp1253', 'x-cp1253'],
|
|
29
|
+
'windows-1254': ['cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'x-cp1254'],
|
|
30
|
+
'windows-1255': ['cp1255', 'x-cp1255'],
|
|
31
|
+
'windows-1256': ['cp1256', 'x-cp1256'],
|
|
32
|
+
'windows-1257': ['cp1257', 'x-cp1257'],
|
|
33
|
+
'windows-1258': ['cp1258', 'x-cp1258'],
|
|
34
|
+
'x-mac-cyrillic': ['x-mac-ukrainian'],
|
|
35
|
+
gbk: ['chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312', 'gb_2312-80', 'iso-ir-58', 'x-gbk'],
|
|
36
|
+
gb18030: [],
|
|
37
|
+
big5: ['big5-hkscs', 'cn-big5', 'csbig5', 'x-x-big5'],
|
|
38
|
+
'euc-jp': ['cseucpkdfmtjapanese', 'x-euc-jp'],
|
|
39
|
+
'iso-2022-jp': ['csiso2022jp'],
|
|
40
|
+
shift_jis: ['csshiftjis', 'ms932', 'ms_kanji', 'shift-jis', 'sjis', 'windows-31j', 'x-sjis'],
|
|
41
|
+
'euc-kr': ['cseuckr', 'csksc56011987', 'iso-ir-149', 'korean', 'ks_c_5601-1987', 'ks_c_5601-1989', 'ksc5601', 'ksc_5601', 'windows-949'],
|
|
42
|
+
replacement: ['csiso2022kr', 'hz-gb-2312', 'iso-2022-cn', 'iso-2022-cn-ext', 'iso-2022-kr'],
|
|
43
|
+
'utf-16be': ['unicodefffe'],
|
|
44
|
+
'utf-16le': ['csunicode', 'iso-10646-ucs-2', 'ucs-2', 'unicode', 'unicodefeff', 'utf-16'],
|
|
45
|
+
'x-user-defined': [],
|
|
46
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Counts how many bytes at the end of `u[0..len)` form a valid but still incomplete sequence,
 * i.e. one that following bytes could complete.
 *
 * @param {Uint8Array} u - bytes
 * @param {number} len - number of leading bytes of `u` to consider
 * @param {string} enc - 'utf-8', 'utf-16le' or 'utf-16be'
 * @returns {number} 0-3; 0 when the data ends on a complete (or plainly invalid) sequence
 * @throws {Error} for any other encoding
 */
export function unfinishedBytes(u, len, enc) {
  if (enc === 'utf-8') {
    // Step back over at most 2 trailing continuation bytes (10xxxxxx)
    let trailing = 0
    while (trailing < 2 && trailing < len && (u[len - trailing - 1] & 0xc0) === 0x80) trailing++
    if (trailing === len) return 0 // no space for lead

    const lead = u[len - trailing - 1]
    if (lead < 0xc2 || lead > 0xf4) return 0 // not a lead
    if (trailing === 0) return 1 // a lead alone is always unfinished
    if (lead < 0xe0) return 0 // 2-byte lead followed by a continuation
    if (lead < 0xf0 && trailing >= 2) return 0 // 3-byte lead followed by 2 continuations

    // The first continuation byte has a narrower valid range after some leads
    let min = 0x80
    if (lead === 0xf0) min = 0x90
    else if (lead === 0xe0) min = 0xa0
    let max = 0xbf
    if (lead === 0xf4) max = 0x8f
    else if (lead === 0xed) max = 0x9f

    const first = u[len - trailing]
    return first >= min && first <= max ? trailing + 1 : 0
  }

  if (enc === 'utf-16le' || enc === 'utf-16be') {
    const odd = len % 2 === 0 ? 0 : 1 // a dangling half of a code unit
    const whole = len - odd
    if (whole < 2) return odd

    // Assemble the last whole code unit according to byte order
    const little = enc === 'utf-16le'
    const high = little ? u[whole - 1] : u[whole - 2]
    const low = little ? u[whole - 2] : u[whole - 1]
    const unit = (high << 8) ^ low
    const loneLead = unit >= 0xd8_00 && unit < 0xdc_00 // lead surrogate with no trail after it
    return loneLead ? odd + 2 : odd
  }

  throw new Error('Unsupported encoding')
}
|
package/fallback/hex.js
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { assertUint8 } from '../assert.js'
|
|
2
|
+
import { nativeDecoder, nativeEncoder, decode2string } from './_utils.js'
|
|
3
|
+
import { encodeAscii, decodeAscii } from './latin1.js'
|
|
4
|
+
|
|
5
|
+
let hexArray // array of 256 bytes converted to two-char hex strings
|
|
6
|
+
let hexCodes // hexArray converted to u16 code pairs
|
|
7
|
+
let dehexArray
|
|
8
|
+
const _00 = 0x30_30 // '00' string in hex, the only allowed char pair to generate 0 byte
|
|
9
|
+
const _ff = 0x66_66 // 'ff' string in hex, max allowed char pair (larger than 'FF' string)
|
|
10
|
+
const allowed = '0123456789ABCDEFabcdef'
|
|
11
|
+
|
|
12
|
+
export const E_HEX = 'Input is not a hex string'
|
|
13
|
+
|
|
14
|
+
/**
 * Encodes bytes as a lowercase hex string.
 *
 * @param {Uint8Array} arr - bytes to encode; checked with assertUint8()
 * @returns {string} two lowercase hex chars per input byte
 */
export function toHex(arr) {
  assertUint8(arr)

  // Lazily built table: byte value -> two-char hex string
  if (!hexArray) {
    hexArray = Array.from({ length: 256 }, (_, byte) => byte.toString(16).padStart(2, '0'))
  }

  const length = arr.length // this helps Hermes

  // Without a native decoder, join the per-byte strings directly
  if (!nativeDecoder) return decode2string(arr, 0, length, hexArray)

  // Only old browsers use this, barebone engines don't have TextDecoder
  // But Hermes can use this when it (hopefully) implements TextDecoder
  if (!hexCodes) {
    // Lazily built table: byte value -> both char codes packed into one u16.
    // It is filled through a byte view so the in-memory order is always high char, low char
    hexCodes = new Uint16Array(256)
    const bytes = new Uint8Array(hexCodes.buffer, hexCodes.byteOffset, hexCodes.byteLength)
    hexArray.forEach((pair, byte) => {
      bytes[2 * byte] = pair.charCodeAt(0)
      bytes[2 * byte + 1] = pair.charCodeAt(1)
    })
  }

  // One u16 (= two ascii chars) per input byte, then decode the whole buffer as ascii
  const out = new Uint16Array(length)
  let i = 0
  for (const last3 = length - 3; ; i += 4) {
    if (i >= last3) break // loop is fast enough for moving this here to be useful on JSC
    out[i] = hexCodes[arr[i]]
    out[i + 1] = hexCodes[arr[i + 1]]
    out[i + 2] = hexCodes[arr[i + 2]]
    out[i + 3] = hexCodes[arr[i + 3]]
  }

  // Remaining 0-3 bytes
  while (i < length) {
    out[i] = hexCodes[arr[i]]
    i++
  }

  return decodeAscii(out)
}
|
|
53
|
+
|
|
54
|
+
/**
 * Decodes a hex string (upper, lower or mixed case) into bytes.
 *
 * @param {string} str - hex string of even length
 * @returns {Uint8Array} decoded bytes
 * @throws {TypeError} if str is not a string
 * @throws {SyntaxError} if str has odd length or contains a non-hex character
 */
export function fromHex(str) {
  if (typeof str !== 'string') throw new TypeError('Input is not a string')
  if (str.length % 2 !== 0) throw new SyntaxError(E_HEX)

  const length = str.length / 2 // this helps Hermes in loops
  const arr = new Uint8Array(length)

  if (!nativeEncoder) {
    if (!dehexArray) {
      // no regex input validation here, so we map all other bytes to -1 and recheck sign
      // non-ASCII chars throw already though, so we should process only 0-127
      dehexArray = new Int8Array(128).fill(-1)
      for (let value = 0; value < 16; value++) {
        const digit = value.toString(16)
        dehexArray[digit.toUpperCase().charCodeAt(0)] = value
        dehexArray[digit.charCodeAt(0)] = value
      }
    }

    for (let i = 0, pos = 0; i < length; i++, pos += 2) {
      const hi = str.charCodeAt(pos)
      const lo = str.charCodeAt(pos + 1)
      // A -1 nibble makes the result negative; codes above 127 miss the table and are caught by the mask
      const byte = (dehexArray[hi] << 4) | dehexArray[lo]
      if (byte < 0 || (0x7f | hi | lo) !== 0x7f) throw new SyntaxError(E_HEX) // 0-127
      arr[i] = byte
    }

    return arr
  }

  // Native encoder path is beneficial even for small arrays in Hermes
  if (!dehexArray) {
    // Table indexed by a pair of char codes read as one u16; entries for invalid pairs stay 0
    dehexArray = new Uint8Array(_ff + 1) // 26 KiB cache, >2x perf improvement on Hermes
    const pair = new Uint8Array(2)
    const pairCode = new Uint16Array(pair.buffer, pair.byteOffset, 1) // for endianness-agnostic transform
    const digits = [...allowed].map((c) => [c.charCodeAt(0), parseInt(c, 16)])
    for (const [hiChar, hiValue] of digits) {
      pair[0] = hiChar // first we read high hex char
      for (const [loChar, loValue] of digits) {
        pair[1] = loChar // then we read low hex char
        dehexArray[pairCode[0]] = (hiValue << 4) | loValue
      }
    }
  }

  const codes = encodeAscii(str, E_HEX)
  const codes16 = new Uint16Array(codes.buffer, codes.byteOffset, codes.byteLength / 2)

  // A 0 from the table is only legitimate for the '00' pair, anything else is an invalid pair
  let i = 0
  const last3 = length - 3
  while (i < last3) {
    const c0 = codes16[i]
    const c1 = codes16[i + 1]
    const c2 = codes16[i + 2]
    const c3 = codes16[i + 3]
    const b0 = dehexArray[c0]
    const b1 = dehexArray[c1]
    const b2 = dehexArray[c2]
    const b3 = dehexArray[c3]
    if ((!b0 && c0 !== _00) || (!b1 && c1 !== _00) || (!b2 && c2 !== _00) || (!b3 && c3 !== _00)) {
      throw new SyntaxError(E_HEX)
    }

    arr[i] = b0
    arr[i + 1] = b1
    arr[i + 2] = b2
    arr[i + 3] = b3
    i += 4
  }

  // Remaining 0-3 bytes
  for (; i < length; i++) {
    const code = codes16[i]
    const byte = dehexArray[code]
    if (!byte && code !== _00) throw new SyntaxError(E_HEX)
    arr[i] = byte
  }

  return arr
}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import {
|
|
2
|
+
nativeEncoder,
|
|
3
|
+
nativeDecoder,
|
|
4
|
+
nativeDecoderLatin1,
|
|
5
|
+
nativeBuffer,
|
|
6
|
+
isHermes,
|
|
7
|
+
isDeno,
|
|
8
|
+
} from './_utils.js'
|
|
9
|
+
|
|
10
|
+
// See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
|
|
11
|
+
// On Hermes, actual max is 0x20_000 minus current stack depth, 1/16 of that should be safe
|
|
12
|
+
const maxFunctionArgs = 0x20_00
|
|
13
|
+
|
|
14
|
+
/**
 * Measures the leading run of ASCII bytes (values below 0x80).
 *
 * @param {Uint8Array} arr - bytes to scan
 * @returns {number} index of the first byte >= 0x80, or arr.length if there is none
 */
export function asciiPrefix(arr) {
  const length = arr.length
  let pos = 0 // verified ascii bytes

  // Threshold tested on Hermes (worse on <=48, better on >=52)
  // Also on v8 arrs of size <=64 might be on heap and using Uint32Array on them is suboptimal
  if (length > 64) {
    // Check bytes one by one until the address is 4-byte aligned, as a Uint32Array view requires
    const lead = (4 - (arr.byteOffset & 3)) % 4
    while (pos < lead) {
      if (arr[pos] >= 0x80) return pos
      pos++
    }

    // Then test 4 bytes at a time: a word contains a non-ascii byte iff it has any high bit set
    const wordCount = ((arr.byteLength - lead) / 4) | 0
    const words = new Uint32Array(arr.buffer, arr.byteOffset + lead, wordCount)
    let w = 0
    for (const last3 = wordCount - 3; w < last3; w += 4, pos += 16) {
      if ((words[w] | words[w + 1] | words[w + 2] | words[w + 3]) & 0x80_80_80_80) break
    }

    // Narrow down to the first offending word (or finish the 0-3 leftover words)
    while (w < wordCount && !(words[w] & 0x80_80_80_80)) {
      pos += 4
      w++
    }
  }

  // Locate the exact byte, and cover short inputs and the unaligned tail
  for (; pos < length; pos++) if (arr[pos] >= 0x80) return pos
  return length
}
|
|
41
|
+
|
|
42
|
+
// Capable of decoding Uint16Array to UTF-16 as well as Uint8Array to Latin-1
|
|
43
|
+
/**
 * Builds a string with one UTF-16 code unit per array element.
 * Capable of decoding Uint16Array to UTF-16 as well as Uint8Array to Latin-1.
 *
 * @param {Uint8Array|Uint16Array} arr - source elements
 * @param {number} [start=0] - first index to decode, truncated to int32
 * @param {number} [stop=arr.length] - index to stop before, truncated to int32
 * @returns {string} decoded text, '' for an empty range
 */
export function decodeLatin1(arr, start = 0, stop = arr.length) {
  const from = start | 0
  const to = stop | 0
  const total = to - from
  if (total === 0) return ''

  // Small enough to pass all elements as arguments of a single call
  if (total <= maxFunctionArgs) {
    const view = from === 0 && to === arr.length ? arr : arr.subarray(from, to)
    return String.fromCharCode.apply(String, view)
  }

  // Otherwise go in slices of at most maxFunctionArgs elements, to stay below engine argument limits
  let out = ''
  for (let pos = from; pos < to; pos += maxFunctionArgs) {
    const end = Math.min(to, pos + maxFunctionArgs)
    out += String.fromCharCode.apply(String, arr.subarray(pos, end))
  }

  return out
}
|
|
62
|
+
|
|
63
|
+
// Does not check input, uses best available method
|
|
64
|
+
// Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
|
|
65
|
+
/**
 * Decodes the raw bytes of a typed array as ASCII. Does not check input, uses best available method.
 * The implementation is picked once, at module load.
 *
 * @param {Uint8Array|Uint16Array} a - typed array whose underlying bytes are all ASCII
 * @returns {string} one char per byte
 */
export const decodeAscii = (() => {
  if (nativeBuffer) {
    return (a) => {
      // Buffer is faster on Node.js (but only for long enough data), if we know that output is ascii
      if (a.byteLength >= 0x3_00 && !isDeno) {
        // .latin1Slice is faster than .asciiSlice
        return nativeBuffer.from(a.buffer, a.byteOffset, a.byteLength).latin1Slice(0, a.byteLength)
      }

      return nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
    }
  }

  if (nativeDecoderLatin1) {
    // On browsers (specifically WebKit), latin1 decoder is faster than utf8
    return (a) => nativeDecoderLatin1.decode(a)
  }

  // Fallback. We shouldn't get here, constructing with strings directly is faster
  return (a) => decodeLatin1(new Uint8Array(a.buffer, a.byteOffset, a.byteLength))
})()
|
|
74
|
+
|
|
75
|
+
/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
|
|
76
|
+
|
|
77
|
+
/**
 * Writes the char code of every char of `str` into the same index of `arr`.
 * Values are stored with the typed array's usual truncation; input is not validated.
 *
 * @param {string} str - source string
 * @param {Uint8Array|Uint16Array} arr - destination; presumably at least str.length long — not checked here
 * @returns {Uint8Array|Uint16Array} the same `arr`
 */
export const encodeCharcodes = isHermes
  ? (str, arr) => {
      const size = str.length
      if (size <= 64) {
        for (let k = 0; k < size; k++) arr[k] = str.charCodeAt(k)
        return arr
      }

      // faster on strings from ~64 chars on Hermes, but can be 10x slower on e.g. JSC
      const codeAt = str.charCodeAt.bind(str)
      for (let k = 0; k < size; k++) arr[k] = codeAt(k)
      return arr
    }
  : (str, arr) => {
      // Can be optimized with unrolling, but this is not used on non-Hermes atm
      const size = str.length
      let k = 0
      while (k < size) {
        arr[k] = str.charCodeAt(k)
        k++
      }

      return arr
    }
|
|
95
|
+
|
|
96
|
+
/* eslint-enable @exodus/mutable/no-param-reassign-prop-only */
|
|
97
|
+
|
|
98
|
+
// Encodes a string as one byte per char by delegating to encodeCharcodes with a fresh Uint8Array.
// Input is not validated here.
export const encodeLatin1 = (str) => {
  const bytes = new Uint8Array(str.length)
  return encodeCharcodes(str, bytes)
}
|
|
99
|
+
|
|
100
|
+
// Expects nativeEncoder to be present
|
|
101
|
+
/**
 * Encodes an ASCII-only string into bytes, rejecting anything else.
 * Expects nativeEncoder to be present. The implementation is picked once, at module load.
 *
 * @param {string} str - string to encode
 * @param {string} ERR - message of the SyntaxError thrown for non-ASCII input
 * @returns {Uint8Array} one byte per char
 * @throws {SyntaxError} if the UTF-8 encoding of str is not exactly str.length bytes long
 */
export const encodeAscii = (() => {
  if (isHermes) {
    // Much faster in Hermes
    return (str, ERR) => {
      const size = str.length
      const codes = new Uint8Array(size + 4) // overshoot by a full utf8 char
      const { read, written } = nativeEncoder.encodeInto(str, codes)
      if (read !== size || written !== size) throw new SyntaxError(ERR) // non-ascii
      return codes.subarray(0, size)
    }
  }

  if (nativeBuffer) {
    // TextEncoder is slow on Node.js 24 / 25 (was ok on 22)
    return (str, ERR) => {
      const codes = nativeBuffer.from(str, 'utf8') // ascii/latin1 coerces, we need to check
      if (codes.length !== str.length) throw new SyntaxError(ERR) // non-ascii
      return new Uint8Array(codes.buffer, codes.byteOffset, codes.byteLength)
    }
  }

  return (str, ERR) => {
    const codes = nativeEncoder.encode(str)
    if (codes.length !== str.length) throw new SyntaxError(ERR) // non-ascii
    return codes
  }
})()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
module.exports = () => require('./multi-byte.encodings.json') // lazy-load
|