@exodus/bytes 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -14
- package/fallback/encoding.js +3 -3
- package/fallback/encoding.labels.js +10 -10
- package/package.json +1 -1
- package/utf16.node.js +1 -0
- package/utf8.node.js +16 -3
package/README.md
CHANGED
|
@@ -41,6 +41,7 @@ Spec compliant, passing WPT and covered with extra tests.
|
|
|
41
41
|
Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
|
|
42
42
|
|
|
43
43
|
[Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
|
|
44
|
+
Runs (and passes WPT) on Node.js built without ICU.
|
|
44
45
|
|
|
45
46
|
### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
|
|
46
47
|
|
|
@@ -205,16 +206,10 @@ as an ASCII-lowercased string.
|
|
|
205
206
|
If an encoding with that label does not exist, returns `null`.
|
|
206
207
|
|
|
207
208
|
This is the same as [`decoder.encoding` getter](https://encoding.spec.whatwg.org/#dom-textdecoder-encoding),
|
|
208
|
-
except that it
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
if (!label) return null // does not default to 'utf-8'
|
|
213
|
-
return new TextDecoder(label).encoding
|
|
214
|
-
} catch {
|
|
215
|
-
return null
|
|
216
|
-
}
|
|
217
|
-
```
|
|
209
|
+
except that it:
|
|
210
|
+
1. Supports [`replacement` encoding](https://encoding.spec.whatwg.org/#replacement) and its
|
|
211
|
+
[labels](https://encoding.spec.whatwg.org/#ref-for-replacement%E2%91%A1)
|
|
212
|
+
2. Does not throw for invalid labels and instead returns `null`
|
|
218
213
|
|
|
219
214
|
All encoding names are also valid labels for corresponding encodings.
|
|
220
215
|
|
|
@@ -233,15 +228,14 @@ Given a `TypedArray` or an `ArrayBuffer` instance `input`, returns either of:
|
|
|
233
228
|
Implements [decode](https://encoding.spec.whatwg.org/#decode) legacy hook.
|
|
234
229
|
|
|
235
230
|
Given a `TypedArray` or an `ArrayBuffer` instance `input` and an optional `fallbackEncoding`
|
|
236
|
-
|
|
231
|
+
encoding [label](https://encoding.spec.whatwg.org/#names-and-labels),
|
|
232
|
+
sniffs encoding from BOM with `fallbackEncoding` fallback and then
|
|
237
233
|
decodes the `input` using that encoding, skipping BOM if it was present.
|
|
238
234
|
|
|
239
235
|
Notes:
|
|
240
236
|
|
|
241
237
|
* BOM-sniffed encoding takes precedence over `fallbackEncoding` option per spec.
|
|
242
238
|
Use with care.
|
|
243
|
-
* `fallbackEncoding` must be ASCII-lowercased encoding name,
|
|
244
|
-
e.g. a result of `normalizeEncoding(label)` call.
|
|
245
239
|
* Always operates in non-fatal [mode](https://encoding.spec.whatwg.org/#textdecoder-error-mode),
|
|
246
240
|
aka replacement. It can convert different byte sequences to equal strings.
|
|
247
241
|
|
|
@@ -249,7 +243,7 @@ This method is similar to the following code, except that it doesn't support enc
|
|
|
249
243
|
only expects lowercased encoding name:
|
|
250
244
|
|
|
251
245
|
```js
|
|
252
|
-
new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding
|
|
246
|
+
new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
|
|
253
247
|
```
|
|
254
248
|
|
|
255
249
|
### `@exodus/bytes/encoding-lite.js`
|
package/fallback/encoding.js
CHANGED
|
@@ -256,13 +256,13 @@ export function getBOMEncoding(input) {
|
|
|
256
256
|
// https://encoding.spec.whatwg.org/#decode
|
|
257
257
|
// Warning: encoding sniffed from BOM takes precedence over the supplied one
|
|
258
258
|
// Warning: lossy, performs replacement, no option of throwing
|
|
259
|
-
//
|
|
259
|
+
// Completely ignores fallbackEncoding, and even skips validating it, when a BOM is found
|
|
260
260
|
// Unlike TextDecoder public API, additionally supports 'replacement' encoding
|
|
261
|
-
export function legacyHookDecode(input, fallbackEncoding) {
|
|
261
|
+
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
|
|
262
262
|
let u8 = fromSource(input)
|
|
263
263
|
const bomEncoding = getBOMEncoding(u8)
|
|
264
264
|
if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
|
|
265
|
-
const enc = bomEncoding ?? fallbackEncoding
|
|
265
|
+
const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
|
|
266
266
|
|
|
267
267
|
if (enc === 'utf-8') return utf8toStringLoose(u8)
|
|
268
268
|
if (enc === 'utf-16le' || enc === 'utf-16be') {
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
/* eslint-disable @exodus/export-default/named */
|
|
4
4
|
// prettier-ignore
|
|
5
|
-
|
|
5
|
+
const labels = {
|
|
6
6
|
'utf-8': ['unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'utf8', 'x-unicode20utf8'],
|
|
7
7
|
ibm866: ['866', 'cp866', 'csibm866'],
|
|
8
8
|
'iso-8859-2': ['csisolatin2', 'iso-ir-101', 'iso8859-2', 'iso88592', 'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2'],
|
|
@@ -22,15 +22,6 @@ export default {
|
|
|
22
22
|
'koi8-u': ['koi8-ru'],
|
|
23
23
|
macintosh: ['csmacintosh', 'mac', 'x-mac-roman'],
|
|
24
24
|
'windows-874': ['dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620'],
|
|
25
|
-
'windows-1250': ['cp1250', 'x-cp1250'],
|
|
26
|
-
'windows-1251': ['cp1251', 'x-cp1251'],
|
|
27
|
-
'windows-1252': ['ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'x-cp1252'],
|
|
28
|
-
'windows-1253': ['cp1253', 'x-cp1253'],
|
|
29
|
-
'windows-1254': ['cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'x-cp1254'],
|
|
30
|
-
'windows-1255': ['cp1255', 'x-cp1255'],
|
|
31
|
-
'windows-1256': ['cp1256', 'x-cp1256'],
|
|
32
|
-
'windows-1257': ['cp1257', 'x-cp1257'],
|
|
33
|
-
'windows-1258': ['cp1258', 'x-cp1258'],
|
|
34
25
|
'x-mac-cyrillic': ['x-mac-ukrainian'],
|
|
35
26
|
gbk: ['chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312', 'gb_2312-80', 'iso-ir-58', 'x-gbk'],
|
|
36
27
|
gb18030: [],
|
|
@@ -44,3 +35,12 @@ export default {
|
|
|
44
35
|
'utf-16le': ['csunicode', 'iso-10646-ucs-2', 'ucs-2', 'unicode', 'unicodefeff', 'utf-16'],
|
|
45
36
|
'x-user-defined': [],
|
|
46
37
|
}
|
|
38
|
+
|
|
39
|
+
for (let i = 0; i < 9; i++) labels[`windows-125${i}`] = [`cp125${i}`, `x-cp125${i}`]
|
|
40
|
+
|
|
41
|
+
// prettier-ignore
|
|
42
|
+
labels['windows-1252'].push('ansi_x3.4-1968', 'ascii', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii')
|
|
43
|
+
// prettier-ignore
|
|
44
|
+
labels['windows-1254'].push('csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5')
|
|
45
|
+
|
|
46
|
+
export default labels
|
package/package.json
CHANGED
package/utf16.node.js
CHANGED
|
@@ -36,6 +36,7 @@ const swapped = (x, swap) =>
|
|
|
36
36
|
swap ? Buffer.from(x).swap16() : Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
|
37
37
|
|
|
38
38
|
// We skip TextDecoder on Node.js, as it is for some reason significantly slower than Buffer for utf16
|
|
39
|
+
// Also, it incorrectly misses replacements when Node.js is built without ICU; we fix that
|
|
39
40
|
function decodeNode(input, loose = false, format = 'uint16') {
|
|
40
41
|
let ble
|
|
41
42
|
if (format === 'uint16') {
|
package/utf8.node.js
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
import { assertUint8 } from './assert.js'
|
|
2
2
|
import { typedView } from './array.js'
|
|
3
|
-
import { E_STRICT_UNICODE } from './fallback/utf8.js'
|
|
3
|
+
import { E_STRICT, E_STRICT_UNICODE } from './fallback/utf8.js'
|
|
4
4
|
import { isAscii } from 'node:buffer'
|
|
5
5
|
|
|
6
6
|
if (Buffer.TYPED_ARRAY_SUPPORT) throw new Error('Unexpected Buffer polyfill')
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
let decoderFatal
|
|
9
9
|
const decoderLoose = new TextDecoder('utf-8', { ignoreBOM: true })
|
|
10
10
|
const { isWellFormed } = String.prototype
|
|
11
11
|
const isDeno = Boolean(globalThis.Deno)
|
|
12
12
|
|
|
13
|
+
try {
|
|
14
|
+
decoderFatal = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
|
|
15
|
+
} catch {
|
|
16
|
+
// Without ICU, Node.js doesn't support the fatal option for utf-8
|
|
17
|
+
}
|
|
18
|
+
|
|
13
19
|
function encode(str, loose = false) {
|
|
14
20
|
if (typeof str !== 'string') throw new TypeError('Input is not a string')
|
|
15
21
|
const strLength = str.length
|
|
@@ -45,7 +51,14 @@ function decode(arr, loose = false) {
|
|
|
45
51
|
return buf.latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
|
|
46
52
|
}
|
|
47
53
|
|
|
48
|
-
|
|
54
|
+
if (loose) return decoderLoose.decode(arr)
|
|
55
|
+
if (decoderFatal) return decoderFatal.decode(arr)
|
|
56
|
+
|
|
57
|
+
// We are in an env without native fatal decoder support (non-fixed Node.js without ICU)
|
|
58
|
+
// So decode loosely and, if the result contains a replacement character, re-encode it and compare with the input; this is still faster than the JS implementation
|
|
59
|
+
const str = decoderLoose.decode(arr)
|
|
60
|
+
if (str.includes('\uFFFD') && !Buffer.from(str).equals(arr)) throw new TypeError(E_STRICT)
|
|
61
|
+
return str
|
|
49
62
|
}
|
|
50
63
|
|
|
51
64
|
export const utf8fromString = (str, format = 'uint8') => typedView(encode(str, false), format)
|