@exodus/bytes 1.0.0-rc.8 → 1.0.0-rc.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -4
- package/encoding.js +234 -0
- package/fallback/_utils.js +88 -10
- package/fallback/encoding.labels.js +46 -0
- package/fallback/encoding.util.js +34 -0
- package/fallback/hex.js +2 -70
- package/fallback/latin1.js +2 -1
- package/fallback/multi-byte.encodings.cjs +1 -0
- package/fallback/multi-byte.encodings.json +545 -0
- package/fallback/multi-byte.js +449 -0
- package/fallback/multi-byte.table.js +114 -0
- package/fallback/single-byte.encodings.js +45 -0
- package/fallback/single-byte.js +83 -0
- package/fallback/utf16.js +180 -0
- package/hex.node.js +2 -0
- package/multi-byte.js +13 -0
- package/multi-byte.node.js +25 -0
- package/package.json +39 -8
- package/single-byte.js +55 -0
- package/single-byte.node.js +62 -0
- package/utf16.js +73 -0
- package/utf16.node.js +79 -0
- package/utf8.js +7 -9
- package/utf8.node.js +8 -5
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { decodeLatin1, encodeCharcodes } from './latin1.js'
|
|
2
|
+
import { isLE } from './_utils.js'
|
|
3
|
+
|
|
4
|
+
export const E_STRICT = 'Input is not well-formed utf16'
|
|
5
|
+
export const E_STRICT_UNICODE = 'Input is not well-formed Unicode'
|
|
6
|
+
|
|
7
|
+
const replacementCodepoint = 0xff_fd
|
|
8
|
+
const replacementCodepointSwapped = 0xfd_ff
|
|
9
|
+
|
|
10
|
+
const to16 = (a) => new Uint16Array(a.buffer, a.byteOffset, a.byteLength / 2) // Requires checked length and alignment!
|
|
11
|
+
|
|
12
|
+
export function to16input(u8, le) {
|
|
13
|
+
// Assume even number of bytes
|
|
14
|
+
if (le === isLE) return to16(u8.byteOffset % 2 === 0 ? u8 : Uint8Array.from(u8))
|
|
15
|
+
|
|
16
|
+
const res = new Uint8Array(u8.length)
|
|
17
|
+
|
|
18
|
+
let i = 0
|
|
19
|
+
for (const last3 = u8.length - 3; i < last3; i += 4) {
|
|
20
|
+
const x0 = u8[i]
|
|
21
|
+
const x1 = u8[i + 1]
|
|
22
|
+
const x2 = u8[i + 2]
|
|
23
|
+
const x3 = u8[i + 3]
|
|
24
|
+
res[i] = x1
|
|
25
|
+
res[i + 1] = x0
|
|
26
|
+
res[i + 2] = x3
|
|
27
|
+
res[i + 3] = x2
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
for (const last = u8.length - 1; i < last; i += 2) {
|
|
31
|
+
const x0 = u8[i]
|
|
32
|
+
const x1 = u8[i + 1]
|
|
33
|
+
res[i] = x1
|
|
34
|
+
res[i + 1] = x0
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
return to16(res)
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export const decode = (u16, loose = false, checked = false) => {
|
|
41
|
+
if (checked || isWellFormed(u16)) return decodeLatin1(u16, 0, u16.length) // it's capable of decoding Uint16Array to UTF-16 as well
|
|
42
|
+
if (!loose) throw new TypeError(E_STRICT)
|
|
43
|
+
return decodeLatin1(toWellFormed(Uint16Array.from(u16)), 0, u16.length) // cloned for replacement
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
export function encode(str, loose = false, checked = false, swapped = false) {
|
|
47
|
+
const arr = new Uint16Array(str.length)
|
|
48
|
+
if (checked) return swapped ? encodeCheckedSwapped(str, arr) : encodeChecked(str, arr)
|
|
49
|
+
return swapped ? encodeUncheckedSwapped(str, arr, loose) : encodeUnchecked(str, arr, loose)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// Splitting paths into small functions helps (at least on SpiderMonkey)
|
|
53
|
+
/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
|
|
54
|
+
|
|
55
|
+
const encodeChecked = (str, arr) => encodeCharcodes(str, arr) // Same as encodeLatin1, but with Uint16Array
|
|
56
|
+
|
|
57
|
+
function encodeCheckedSwapped(str, arr) {
|
|
58
|
+
// TODO: faster path for Hermes? See encodeCharcodes
|
|
59
|
+
const length = str.length
|
|
60
|
+
for (let i = 0; i < length; i++) {
|
|
61
|
+
const x = str.charCodeAt(i)
|
|
62
|
+
arr[i] = ((x & 0xff) << 8) | (x >> 8)
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
return arr
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// lead: d800 - dbff, trail: dc00 - dfff
|
|
69
|
+
|
|
70
|
+
function encodeUnchecked(str, arr, loose = false) {
|
|
71
|
+
// TODO: faster path for Hermes? See encodeCharcodes
|
|
72
|
+
const length = str.length
|
|
73
|
+
for (let i = 0; i < length; i++) {
|
|
74
|
+
const code = str.charCodeAt(i)
|
|
75
|
+
arr[i] = code
|
|
76
|
+
if (code >= 0xd8_00 && code < 0xe0_00) {
|
|
77
|
+
// An unexpected trail or a lead at the very end of input
|
|
78
|
+
if (code > 0xdb_ff || i + 1 >= length) {
|
|
79
|
+
if (!loose) throw new TypeError(E_STRICT_UNICODE)
|
|
80
|
+
arr[i] = replacementCodepoint
|
|
81
|
+
} else {
|
|
82
|
+
const next = str.charCodeAt(i + 1) // Process valid pairs immediately
|
|
83
|
+
if (next < 0xdc_00 || next >= 0xe0_00) {
|
|
84
|
+
if (!loose) throw new TypeError(E_STRICT_UNICODE)
|
|
85
|
+
arr[i] = replacementCodepoint
|
|
86
|
+
} else {
|
|
87
|
+
i++ // consume next
|
|
88
|
+
arr[i] = next
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return arr
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
function encodeUncheckedSwapped(str, arr, loose = false) {
|
|
98
|
+
// TODO: faster path for Hermes? See encodeCharcodes
|
|
99
|
+
const length = str.length
|
|
100
|
+
for (let i = 0; i < length; i++) {
|
|
101
|
+
const code = str.charCodeAt(i)
|
|
102
|
+
arr[i] = ((code & 0xff) << 8) | (code >> 8)
|
|
103
|
+
if (code >= 0xd8_00 && code < 0xe0_00) {
|
|
104
|
+
// An unexpected trail or a lead at the very end of input
|
|
105
|
+
if (code > 0xdb_ff || i + 1 >= length) {
|
|
106
|
+
if (!loose) throw new TypeError(E_STRICT_UNICODE)
|
|
107
|
+
arr[i] = replacementCodepointSwapped
|
|
108
|
+
} else {
|
|
109
|
+
const next = str.charCodeAt(i + 1) // Process valid pairs immediately
|
|
110
|
+
if (next < 0xdc_00 || next >= 0xe0_00) {
|
|
111
|
+
if (!loose) throw new TypeError(E_STRICT_UNICODE)
|
|
112
|
+
arr[i] = replacementCodepointSwapped
|
|
113
|
+
} else {
|
|
114
|
+
i++ // consume next
|
|
115
|
+
arr[i] = ((next & 0xff) << 8) | (next >> 8)
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return arr
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
export function toWellFormed(u16) {
|
|
125
|
+
const length = u16.length
|
|
126
|
+
for (let i = 0; i < length; i++) {
|
|
127
|
+
const code = u16[i]
|
|
128
|
+
if (code >= 0xd8_00 && code < 0xe0_00) {
|
|
129
|
+
// An unexpected trail or a lead at the very end of input
|
|
130
|
+
if (code > 0xdb_ff || i + 1 >= length) {
|
|
131
|
+
u16[i] = replacementCodepoint
|
|
132
|
+
} else {
|
|
133
|
+
const next = u16[i + 1] // Process valid pairs immediately
|
|
134
|
+
if (next < 0xdc_00 || next >= 0xe0_00) {
|
|
135
|
+
u16[i] = replacementCodepoint
|
|
136
|
+
} else {
|
|
137
|
+
i++ // consume next
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
return u16
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
export function isWellFormed(u16) {
|
|
147
|
+
const length = u16.length
|
|
148
|
+
let i = 0
|
|
149
|
+
|
|
150
|
+
// Speedup with u32, by skipping to the first surrogate
|
|
151
|
+
// Only implemented for aligned input for now, but almost all input is aligned (pooled Buffer or 0 offset)
|
|
152
|
+
if (length > 32 && u16.byteOffset % 4 === 0) {
|
|
153
|
+
const u32length = (u16.byteLength / 4) | 0
|
|
154
|
+
const u32 = new Uint32Array(u16.buffer, u16.byteOffset, u32length)
|
|
155
|
+
for (const last3 = u32length - 3; ; i += 4) {
|
|
156
|
+
if (i >= last3) break // loop is fast enough for moving this here to be _very_ useful, likely due to array access checks
|
|
157
|
+
const a = u32[i]
|
|
158
|
+
const b = u32[i + 1]
|
|
159
|
+
const c = u32[i + 2]
|
|
160
|
+
const d = u32[i + 3]
|
|
161
|
+
if (a & 0x80_00_80_00 || b & 0x80_00_80_00 || c & 0x80_00_80_00 || d & 0x80_00_80_00) break
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
for (; i < u32length; i++) if (u32[i] & 0x80_00_80_00) break
|
|
165
|
+
i *= 2
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
for (; i < length; i++) {
|
|
169
|
+
const code = u16[i]
|
|
170
|
+
if (code >= 0xd8_00 && code < 0xe0_00) {
|
|
171
|
+
// An unexpected trail or a lead at the very end of input
|
|
172
|
+
if (code > 0xdb_ff || i + 1 >= length) return false
|
|
173
|
+
i++ // consume next
|
|
174
|
+
const next = u16[i] // Process valid pairs immediately
|
|
175
|
+
if (next < 0xdc_00 || next >= 0xe0_00) return false
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
return true
|
|
180
|
+
}
|
package/hex.node.js
CHANGED
|
@@ -5,6 +5,7 @@ import { E_HEX } from './fallback/hex.js'
|
|
|
5
5
|
if (Buffer.TYPED_ARRAY_SUPPORT) throw new Error('Unexpected Buffer polyfill')
|
|
6
6
|
|
|
7
7
|
const { toHex: webHex } = Uint8Array.prototype // Modern engines have this
|
|
8
|
+
const denoBug = Buffer.from('ag', 'hex').length > 0
|
|
8
9
|
|
|
9
10
|
export function toHex(arr) {
|
|
10
11
|
assertUint8(arr)
|
|
@@ -20,6 +21,7 @@ export const fromHex = Uint8Array.fromHex
|
|
|
20
21
|
: (str, format = 'uint8') => {
|
|
21
22
|
if (typeof str !== 'string') throw new TypeError('Input is not a string')
|
|
22
23
|
if (str.length % 2 !== 0) throw new SyntaxError(E_HEX)
|
|
24
|
+
if (denoBug && /[^\dA-Fa-f]/.test(str)) throw new SyntaxError(E_HEX)
|
|
23
25
|
const buf = Buffer.from(str, 'hex') // will stop on first non-hex character, so we can just validate length
|
|
24
26
|
if (buf.length * 2 !== str.length) throw new SyntaxError(E_HEX)
|
|
25
27
|
return typedView(buf, format)
|
package/multi-byte.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { assertUint8 } from './assert.js'
|
|
2
|
+
import { multibyteDecoder } from './fallback/multi-byte.js'
|
|
3
|
+
|
|
4
|
+
export function createMultibyteDecoder(encoding, loose = false) {
|
|
5
|
+
const jsDecoder = multibyteDecoder(encoding, loose) // asserts
|
|
6
|
+
let streaming = false
|
|
7
|
+
return (arr, stream = false) => {
|
|
8
|
+
assertUint8(arr)
|
|
9
|
+
if (!streaming && arr.byteLength === 0) return ''
|
|
10
|
+
streaming = stream
|
|
11
|
+
return jsDecoder(arr, stream)
|
|
12
|
+
}
|
|
13
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { assertUint8 } from './assert.js'
|
|
2
|
+
import { isDeno } from './fallback/_utils.js'
|
|
3
|
+
import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
|
|
4
|
+
import { isAscii } from 'node:buffer'
|
|
5
|
+
|
|
6
|
+
const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
|
7
|
+
|
|
8
|
+
export function createMultibyteDecoder(encoding, loose = false) {
|
|
9
|
+
const jsDecoder = multibyteDecoder(encoding, loose) // asserts
|
|
10
|
+
let streaming = false
|
|
11
|
+
const asciiSuperset = isAsciiSuperset(encoding)
|
|
12
|
+
return (arr, stream = false) => {
|
|
13
|
+
assertUint8(arr)
|
|
14
|
+
if (!streaming) {
|
|
15
|
+
if (arr.byteLength === 0) return ''
|
|
16
|
+
if (asciiSuperset && isAscii(arr)) {
|
|
17
|
+
if (isDeno) return toBuf(arr).toString()
|
|
18
|
+
return toBuf(arr).latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
streaming = stream
|
|
23
|
+
return jsDecoder(arr, stream)
|
|
24
|
+
}
|
|
25
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@exodus/bytes",
|
|
3
|
-
"version": "1.0.0-rc.
|
|
3
|
+
"version": "1.0.0-rc.9",
|
|
4
4
|
"description": "Various operations on Uint8Array data",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"lint": "eslint .",
|
|
@@ -10,6 +10,10 @@
|
|
|
10
10
|
"test:spidermonkey": "exodus-test --engine=spidermonkey:bundle",
|
|
11
11
|
"test:hermes": "exodus-test --engine=hermes:bundle",
|
|
12
12
|
"test:quickjs": "exodus-test --engine=quickjs:bundle",
|
|
13
|
+
"test:xs": "exodus-test --engine=xs:bundle",
|
|
14
|
+
"test:engine262": "exodus-test --engine=engine262:bundle",
|
|
15
|
+
"test:deno": "exodus-test --engine=deno:pure",
|
|
16
|
+
"test:bun": "exodus-test --engine=bun:pure",
|
|
13
17
|
"test:electron:bundle": "exodus-test --engine=electron:bundle",
|
|
14
18
|
"test:electron:as-node": "exodus-test --engine=electron-as-node:test",
|
|
15
19
|
"test:chrome:puppeteer": "exodus-test --engine=chrome:puppeteer",
|
|
@@ -17,6 +21,7 @@
|
|
|
17
21
|
"test:webkit:playwright": "exodus-test --engine=webkit:playwright",
|
|
18
22
|
"test:firefox:puppeteer": "exodus-test --engine=firefox:puppeteer",
|
|
19
23
|
"test:firefox:playwright": "exodus-test --engine=firefox:playwright",
|
|
24
|
+
"test:servo:bundle": "exodus-test --engine=servo:bundle",
|
|
20
25
|
"test": "exodus-test",
|
|
21
26
|
"jsvu": "jsvu",
|
|
22
27
|
"playwright": "exodus-test --playwright",
|
|
@@ -25,24 +30,33 @@
|
|
|
25
30
|
},
|
|
26
31
|
"repository": {
|
|
27
32
|
"type": "git",
|
|
28
|
-
"url": "git+https://github.com/
|
|
33
|
+
"url": "git+https://github.com/ExodusOSS/bytes.git"
|
|
29
34
|
},
|
|
30
35
|
"author": "Exodus Movement, Inc.",
|
|
31
36
|
"license": "MIT",
|
|
32
37
|
"bugs": {
|
|
33
|
-
"url": "https://github.com/
|
|
38
|
+
"url": "https://github.com/ExodusOSS/bytes/issues"
|
|
34
39
|
},
|
|
35
|
-
"homepage": "https://github.com/
|
|
40
|
+
"homepage": "https://github.com/ExodusOSS/bytes#readme",
|
|
36
41
|
"engines": {
|
|
37
42
|
"node": "^20.19.0 || >=22.13.0"
|
|
38
43
|
},
|
|
39
44
|
"type": "module",
|
|
40
45
|
"files": [
|
|
41
46
|
"/fallback/_utils.js",
|
|
42
|
-
"/fallback/latin1.js",
|
|
43
47
|
"/fallback/base32.js",
|
|
44
48
|
"/fallback/base64.js",
|
|
49
|
+
"/fallback/encoding.labels.js",
|
|
50
|
+
"/fallback/encoding.util.js",
|
|
45
51
|
"/fallback/hex.js",
|
|
52
|
+
"/fallback/latin1.js",
|
|
53
|
+
"/fallback/multi-byte.encodings.cjs",
|
|
54
|
+
"/fallback/multi-byte.encodings.json",
|
|
55
|
+
"/fallback/multi-byte.js",
|
|
56
|
+
"/fallback/multi-byte.table.js",
|
|
57
|
+
"/fallback/single-byte.encodings.js",
|
|
58
|
+
"/fallback/single-byte.js",
|
|
59
|
+
"/fallback/utf16.js",
|
|
46
60
|
"/fallback/utf8.js",
|
|
47
61
|
"/array.js",
|
|
48
62
|
"/assert.js",
|
|
@@ -51,10 +65,17 @@
|
|
|
51
65
|
"/base58check.js",
|
|
52
66
|
"/base64.js",
|
|
53
67
|
"/bech32.js",
|
|
54
|
-
"/
|
|
68
|
+
"/encoding.js",
|
|
55
69
|
"/hex.js",
|
|
56
|
-
"/
|
|
70
|
+
"/hex.node.js",
|
|
71
|
+
"/multi-byte.js",
|
|
72
|
+
"/multi-byte.node.js",
|
|
73
|
+
"/single-byte.js",
|
|
74
|
+
"/single-byte.node.js",
|
|
75
|
+
"/utf16.js",
|
|
76
|
+
"/utf16.node.js",
|
|
57
77
|
"/utf8.js",
|
|
78
|
+
"/utf8.node.js",
|
|
58
79
|
"/wif.js"
|
|
59
80
|
],
|
|
60
81
|
"exports": {
|
|
@@ -68,6 +89,15 @@
|
|
|
68
89
|
"node": "./hex.node.js",
|
|
69
90
|
"default": "./hex.js"
|
|
70
91
|
},
|
|
92
|
+
"./multi-byte.js": {
|
|
93
|
+
"node": "./multi-byte.node.js",
|
|
94
|
+
"default": "./multi-byte.js"
|
|
95
|
+
},
|
|
96
|
+
"./single-byte.js": {
|
|
97
|
+
"node": "./single-byte.node.js",
|
|
98
|
+
"default": "./single-byte.js"
|
|
99
|
+
},
|
|
100
|
+
"./encoding.js": "./encoding.js",
|
|
71
101
|
"./utf16.js": {
|
|
72
102
|
"node": "./utf16.node.js",
|
|
73
103
|
"default": "./utf16.js"
|
|
@@ -91,7 +121,7 @@
|
|
|
91
121
|
"@exodus/crypto": "^1.0.0-rc.30",
|
|
92
122
|
"@exodus/eslint-config": "^5.24.0",
|
|
93
123
|
"@exodus/prettier": "^1.0.0",
|
|
94
|
-
"@exodus/test": "^1.0.0-rc.
|
|
124
|
+
"@exodus/test": "^1.0.0-rc.109",
|
|
95
125
|
"@noble/hashes": "^2.0.1",
|
|
96
126
|
"@scure/base": "^1.2.6",
|
|
97
127
|
"@stablelib/base64": "^2.0.1",
|
|
@@ -119,6 +149,7 @@
|
|
|
119
149
|
"typescript": "^5.9.3",
|
|
120
150
|
"uint8array-tools": "^0.0.9",
|
|
121
151
|
"utf8": "^3.0.0",
|
|
152
|
+
"whatwg-encoding": "^3.1.1",
|
|
122
153
|
"wif": "^5.0.0"
|
|
123
154
|
},
|
|
124
155
|
"prettier": "@exodus/prettier",
|
package/single-byte.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { assertUint8 } from './assert.js'
|
|
2
|
+
import { canDecoders } from './fallback/_utils.js'
|
|
3
|
+
import { assertEncoding, encodingDecoder } from './fallback/single-byte.js'
|
|
4
|
+
|
|
5
|
+
const { TextDecoder } = globalThis
|
|
6
|
+
|
|
7
|
+
let windows1252works
|
|
8
|
+
|
|
9
|
+
function shouldUseNative(enc) {
|
|
10
|
+
// https://issues.chromium.org/issues/468458388
|
|
11
|
+
// Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
|
|
12
|
+
// This is the most significant single-byte encoding, 'ascii' and 'latin1' alias to this
|
|
13
|
+
// Even after Chrome bug is fixed, this should serve as a quick correctness check that it's actually windows-1252
|
|
14
|
+
if (enc === 'windows-1252') {
|
|
15
|
+
if (windows1252works === undefined) {
|
|
16
|
+
windows1252works = false
|
|
17
|
+
try {
|
|
18
|
+
const u = new Uint8Array(9) // using 9 bytes is significant to catch the bug
|
|
19
|
+
u[8] = 128
|
|
20
|
+
windows1252works = new TextDecoder(enc).decode(u).codePointAt(8) === 0x20_ac
|
|
21
|
+
} catch {}
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
return windows1252works
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// iso-8859-16 is somehow broken in WebKit, at least on CI
|
|
28
|
+
return enc !== 'iso-8859-16'
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export function createSinglebyteDecoder(encoding, loose = false) {
|
|
32
|
+
if (encoding === 'iso-8859-8-i') encoding = 'iso-8859-8'
|
|
33
|
+
assertEncoding(encoding)
|
|
34
|
+
|
|
35
|
+
if (canDecoders && shouldUseNative(encoding)) {
|
|
36
|
+
// In try, as not all encodings might be implemented in all engines which have native TextDecoder
|
|
37
|
+
try {
|
|
38
|
+
const decoder = new TextDecoder(encoding, { fatal: !loose })
|
|
39
|
+
return (arr) => {
|
|
40
|
+
assertUint8(arr)
|
|
41
|
+
if (arr.byteLength === 0) return ''
|
|
42
|
+
return decoder.decode(arr)
|
|
43
|
+
}
|
|
44
|
+
} catch {}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const jsDecoder = encodingDecoder(encoding)
|
|
48
|
+
return (arr) => {
|
|
49
|
+
assertUint8(arr)
|
|
50
|
+
if (arr.byteLength === 0) return ''
|
|
51
|
+
return jsDecoder(arr, loose)
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
export const windows1252toString = createSinglebyteDecoder('windows-1252')
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { assertUint8 } from './assert.js'
|
|
2
|
+
import { isAscii } from 'node:buffer'
|
|
3
|
+
import { isDeno, isLE } from './fallback/_utils.js'
|
|
4
|
+
import { asciiPrefix, decodeLatin1 } from './fallback/latin1.js'
|
|
5
|
+
import { encodingMapper, encodingDecoder, E_STRICT } from './fallback/single-byte.js'
|
|
6
|
+
|
|
7
|
+
const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
|
8
|
+
|
|
9
|
+
function latin1Prefix(arr, start) {
|
|
10
|
+
let p = start | 0
|
|
11
|
+
const length = arr.length
|
|
12
|
+
for (const len3 = length - 3; p < len3; p += 4) {
|
|
13
|
+
if ((arr[p] & 0xe0) === 0x80) return p
|
|
14
|
+
if ((arr[p + 1] & 0xe0) === 0x80) return p + 1
|
|
15
|
+
if ((arr[p + 2] & 0xe0) === 0x80) return p + 2
|
|
16
|
+
if ((arr[p + 3] & 0xe0) === 0x80) return p + 3
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
for (; p < length; p++) {
|
|
20
|
+
if ((arr[p] & 0xe0) === 0x80) return p
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
return length
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
export function createSinglebyteDecoder(encoding, loose = false) {
|
|
27
|
+
if (encoding === 'iso-8859-8-i') encoding = 'iso-8859-8'
|
|
28
|
+
const latin1path = encoding === 'windows-1252' // TODO: are there more?
|
|
29
|
+
if (isDeno) {
|
|
30
|
+
const jsDecoder = encodingDecoder(encoding) // asserts
|
|
31
|
+
return (arr) => {
|
|
32
|
+
assertUint8(arr)
|
|
33
|
+
if (arr.byteLength === 0) return ''
|
|
34
|
+
if (isAscii(arr)) return toBuf(arr).toString()
|
|
35
|
+
return jsDecoder(arr, loose) // somewhy faster on Deno anyway, TODO: optimize?
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
const { incomplete, mapper } = encodingMapper(encoding) // asserts
|
|
40
|
+
return (arr) => {
|
|
41
|
+
assertUint8(arr)
|
|
42
|
+
if (arr.byteLength === 0) return ''
|
|
43
|
+
if (isAscii(arr)) return toBuf(arr).latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
|
|
44
|
+
|
|
45
|
+
// Node.js TextDecoder is broken, so we can't use it. It's also slow anyway
|
|
46
|
+
|
|
47
|
+
let prefixBytes = asciiPrefix(arr)
|
|
48
|
+
let prefix = ''
|
|
49
|
+
if (latin1path) prefixBytes = latin1Prefix(arr, prefixBytes)
|
|
50
|
+
if (prefixBytes > 64 || prefixBytes === arr.length) {
|
|
51
|
+
prefix = toBuf(arr).latin1Slice(0, prefixBytes) // .latin1Slice is faster than .asciiSlice
|
|
52
|
+
if (prefixBytes === arr.length) return prefix
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
const b = mapper(arr, prefix.length) // prefix.length can mismatch prefixBytes
|
|
56
|
+
const suffix = isLE ? toBuf(b).ucs2Slice(0, b.byteLength) : decodeLatin1(b, 0, b.length) // decodeLatin1 is actually capable of decoding 16-bit codepoints
|
|
57
|
+
if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
|
|
58
|
+
return prefix + suffix
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
export const windows1252toString = createSinglebyteDecoder('windows-1252')
|
package/utf16.js
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import * as js from './fallback/utf16.js'
|
|
2
|
+
import { canDecoders, isLE } from './fallback/_utils.js'
|
|
3
|
+
|
|
4
|
+
const { TextDecoder } = globalThis // Buffer is optional
|
|
5
|
+
const ignoreBOM = true
|
|
6
|
+
const decoderFatalLE = canDecoders ? new TextDecoder('utf-16le', { ignoreBOM, fatal: true }) : null
|
|
7
|
+
const decoderLooseLE = canDecoders ? new TextDecoder('utf-16le', { ignoreBOM }) : null
|
|
8
|
+
const decoderFatalBE = canDecoders ? new TextDecoder('utf-16be', { ignoreBOM, fatal: true }) : null
|
|
9
|
+
const decoderLooseBE = canDecoders ? new TextDecoder('utf-16be', { ignoreBOM }) : null
|
|
10
|
+
const decoderFatal16 = isLE ? decoderFatalLE : decoderFatalBE
|
|
11
|
+
// Loose (non-fatal) decoder matching the platform byte order, used for Uint16Array input.
// Both branches must select a loose decoder so malformed input is replaced instead of throwing.
const decoderLoose16 = isLE ? decoderLooseLE : decoderLooseBE
|
|
12
|
+
const { isWellFormed } = String.prototype
|
|
13
|
+
|
|
14
|
+
const { E_STRICT, E_STRICT_UNICODE } = js
|
|
15
|
+
|
|
16
|
+
// Unlike utf8, operates on Uint16Arrays by default
|
|
17
|
+
|
|
18
|
+
const to8 = (a) => new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
|
|
19
|
+
|
|
20
|
+
function encode(str, loose = false, format = 'uint16') {
|
|
21
|
+
if (typeof str !== 'string') throw new TypeError('Input is not a string')
|
|
22
|
+
if (format !== 'uint16' && format !== 'uint8-le' && format !== 'uint8-be') {
|
|
23
|
+
throw new TypeError('Unknown format')
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
const shouldSwap = (isLE && format === 'uint8-be') || (!isLE && format === 'uint8-le')
|
|
27
|
+
|
|
28
|
+
// On v8 and SpiderMonkey, check via isWellFormed is faster than js
|
|
29
|
+
// On JSC, check during loop is faster than isWellFormed
|
|
30
|
+
// If isWellFormed is available, we skip check during decoding and recheck after
|
|
31
|
+
// If isWellFormed is unavailable, we check in js during decoding
|
|
32
|
+
if (!loose && isWellFormed && !isWellFormed.call(str)) throw new TypeError(E_STRICT_UNICODE)
|
|
33
|
+
const u16 = js.encode(str, loose, !loose && isWellFormed, shouldSwap)
|
|
34
|
+
|
|
35
|
+
if (format === 'uint8-le' || format === 'uint8-be') return to8(u16) // Already swapped
|
|
36
|
+
if (format === 'uint16') return u16
|
|
37
|
+
throw new Error('Unreachable')
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
function decode(input, loose = false, format = 'uint16') {
|
|
41
|
+
let u16
|
|
42
|
+
switch (format) {
|
|
43
|
+
case 'uint16':
|
|
44
|
+
if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
|
|
45
|
+
if (canDecoders) return loose ? decoderLoose16.decode(input) : decoderFatal16.decode(input)
|
|
46
|
+
u16 = input
|
|
47
|
+
break
|
|
48
|
+
case 'uint8-le':
|
|
49
|
+
if (!(input instanceof Uint8Array)) throw new TypeError('Expected an Uint8Array')
|
|
50
|
+
if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
|
|
51
|
+
if (canDecoders) return loose ? decoderLooseLE.decode(input) : decoderFatalLE.decode(input)
|
|
52
|
+
u16 = js.to16input(input, true)
|
|
53
|
+
break
|
|
54
|
+
case 'uint8-be':
|
|
55
|
+
if (!(input instanceof Uint8Array)) throw new TypeError('Expected an Uint8Array')
|
|
56
|
+
if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
|
|
57
|
+
if (canDecoders) return loose ? decoderLooseBE.decode(input) : decoderFatalBE.decode(input)
|
|
58
|
+
u16 = js.to16input(input, false)
|
|
59
|
+
break
|
|
60
|
+
default:
|
|
61
|
+
throw new TypeError('Unknown format')
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const str = js.decode(u16, loose, !loose && isWellFormed)
|
|
65
|
+
if (!loose && isWellFormed && !isWellFormed.call(str)) throw new TypeError(E_STRICT)
|
|
66
|
+
|
|
67
|
+
return str
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
export const utf16fromString = (str, format = 'uint16') => encode(str, false, format)
|
|
71
|
+
export const utf16fromStringLoose = (str, format = 'uint16') => encode(str, true, format)
|
|
72
|
+
export const utf16toString = (arr, format = 'uint16') => decode(arr, false, format)
|
|
73
|
+
export const utf16toStringLoose = (arr, format = 'uint16') => decode(arr, true, format)
|
package/utf16.node.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { nativeDecoder, isDeno, isLE } from './fallback/_utils.js'
|
|
2
|
+
import { E_STRICT, E_STRICT_UNICODE } from './fallback/utf16.js'
|
|
3
|
+
|
|
4
|
+
if (Buffer.TYPED_ARRAY_SUPPORT) throw new Error('Unexpected Buffer polyfill')
|
|
5
|
+
|
|
6
|
+
const { isWellFormed } = String.prototype
|
|
7
|
+
const to8 = (a) => new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
|
|
8
|
+
|
|
9
|
+
// Unlike utf8, operates on Uint16Arrays by default
|
|
10
|
+
|
|
11
|
+
function encode(str, loose = false, format = 'uint16') {
|
|
12
|
+
if (typeof str !== 'string') throw new TypeError('Input is not a string')
|
|
13
|
+
if (format !== 'uint16' && format !== 'uint8-le' && format !== 'uint8-be') {
|
|
14
|
+
throw new TypeError('Unknown format')
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
if (!isWellFormed.call(str)) {
|
|
18
|
+
if (!loose) throw new TypeError(E_STRICT_UNICODE)
|
|
19
|
+
str = nativeDecoder.decode(Buffer.from(str)) // well, let's fix up (Buffer doesn't do this with utf16 encoding)
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
const ble = Buffer.from(str, 'utf-16le')
|
|
23
|
+
|
|
24
|
+
if (format === 'uint8-le') return to8(ble)
|
|
25
|
+
if (format === 'uint8-be') return to8(ble.swap16())
|
|
26
|
+
if (format === 'uint16') {
|
|
27
|
+
const b = ble.byteOffset % 2 === 0 ? ble : Buffer.from(ble) // it should be already aligned, but just in case
|
|
28
|
+
if (!isLE) b.swap16()
|
|
29
|
+
return new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
throw new Error('Unreachable')
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Returns a Buffer over the bytes of x (Uint8Array or Uint16Array), optionally byte-swapped per 16-bit unit.
// Without swap this is a zero-copy view over the same memory.
// With swap the bytes are copied first, so the caller's data is never mutated by swap16().
// The copy is made from a byte view: Buffer.from(typedArray) copies element values truncated to
// a single byte each, which would corrupt Uint16Array input.
const swapped = (x, swap) => {
  const view = Buffer.from(x.buffer, x.byteOffset, x.byteLength)
  return swap ? Buffer.from(view).swap16() : view
}
|
|
37
|
+
|
|
38
|
+
// We skip TextDecoder on Node.js, as it is for some reason significantly slower than Buffer for utf16
|
|
39
|
+
function decodeNode(input, loose = false, format = 'uint16') {
|
|
40
|
+
let ble
|
|
41
|
+
if (format === 'uint16') {
|
|
42
|
+
if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
|
|
43
|
+
ble = swapped(input, !isLE)
|
|
44
|
+
} else if (format === 'uint8-le' || format === 'uint8-be') {
|
|
45
|
+
if (!(input instanceof Uint8Array)) throw new TypeError('Expected an Uint8Array')
|
|
46
|
+
if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
|
|
47
|
+
ble = swapped(input, format === 'uint8-be')
|
|
48
|
+
} else {
|
|
49
|
+
throw new TypeError('Unknown format')
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const str = ble.ucs2Slice(0, ble.byteLength)
|
|
53
|
+
if (isWellFormed.call(str)) return str
|
|
54
|
+
if (!loose) throw new TypeError(E_STRICT)
|
|
55
|
+
return nativeDecoder.decode(Buffer.from(str)) // fixup (see above)
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function decodeDecoder(input, loose = false, format = 'uint16') {
|
|
59
|
+
let encoding
|
|
60
|
+
if (format === 'uint16') {
|
|
61
|
+
if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
|
|
62
|
+
encoding = isLE ? 'utf-16le' : 'utf-16be'
|
|
63
|
+
} else if (format === 'uint8-le' || format === 'uint8-be') {
|
|
64
|
+
if (!(input instanceof Uint8Array)) throw new TypeError('Expected an Uint8Array')
|
|
65
|
+
if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
|
|
66
|
+
encoding = format === 'uint8-le' ? 'utf-16le' : 'utf-16be'
|
|
67
|
+
} else {
|
|
68
|
+
throw new TypeError('Unknown format')
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
return new TextDecoder(encoding, { ignoreBOM: true, fatal: !loose }).decode(input) // TODO: cache decoder?
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
const decode = isDeno ? decodeDecoder : decodeNode
|
|
75
|
+
|
|
76
|
+
export const utf16fromString = (str, format = 'uint16') => encode(str, false, format)
|
|
77
|
+
export const utf16fromStringLoose = (str, format = 'uint16') => encode(str, true, format)
|
|
78
|
+
export const utf16toString = (arr, format = 'uint16') => decode(arr, false, format)
|
|
79
|
+
export const utf16toStringLoose = (arr, format = 'uint16') => decode(arr, true, format)
|
package/utf8.js
CHANGED
|
@@ -1,18 +1,16 @@
|
|
|
1
1
|
import { assertUint8 } from './assert.js'
|
|
2
2
|
import { typedView } from './array.js'
|
|
3
|
-
import { isHermes } from './fallback/_utils.js'
|
|
3
|
+
import { isHermes, nativeDecoder, nativeEncoder } from './fallback/_utils.js'
|
|
4
4
|
import { asciiPrefix, decodeLatin1 } from './fallback/latin1.js'
|
|
5
5
|
import * as js from './fallback/utf8.js'
|
|
6
6
|
|
|
7
|
-
const {
|
|
8
|
-
const haveNativeBuffer = Buffer && !Buffer.TYPED_ARRAY_SUPPORT
|
|
9
|
-
const isNative = (x) => x && (haveNativeBuffer || `${x}`.includes('[native code]')) // we consider Node.js TextDecoder/TextEncoder native
|
|
10
|
-
const haveDecoder = isNative(TextDecoder)
|
|
11
|
-
const nativeEncoder = isNative(TextEncoder) ? new TextEncoder() : null
|
|
7
|
+
const { TextDecoder, decodeURIComponent, escape } = globalThis // Buffer is optional
|
|
12
8
|
// ignoreBOM: true means that BOM will be left as-is, i.e. will be present in the output
|
|
13
9
|
// We don't want to strip anything unexpectedly
|
|
14
|
-
const
|
|
15
|
-
const
|
|
10
|
+
const decoderLoose = nativeDecoder
|
|
11
|
+
const decoderFatal = nativeDecoder
|
|
12
|
+
? new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
|
|
13
|
+
: null
|
|
16
14
|
const { isWellFormed } = String.prototype
|
|
17
15
|
|
|
18
16
|
const { E_STRICT, E_STRICT_UNICODE } = js
|
|
@@ -56,7 +54,7 @@ function encode(str, loose = false) {
|
|
|
56
54
|
function decode(arr, loose = false) {
|
|
57
55
|
assertUint8(arr)
|
|
58
56
|
if (arr.byteLength === 0) return ''
|
|
59
|
-
if (
|
|
57
|
+
if (nativeDecoder) return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr) // Node.js and browsers
|
|
60
58
|
|
|
61
59
|
// Fast path for ASCII prefix, this is faster than all alternatives below
|
|
62
60
|
const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
|