@exodus/bytes 1.8.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1,62 @@
1
+ /**
2
+ * The exact same exports as `@exodus/bytes/encoding.js` are also exported as
3
+ * `@exodus/bytes/encoding-lite.js`, with the difference that the lite version does not load
4
+ * multi-byte `TextDecoder` encodings by default to reduce bundle size 10x.
5
+ *
6
+ * ```js
7
+ * import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding-lite.js'
8
+ * import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding-lite.js' // Requires Streams
9
+ *
10
+ * // Hooks for standards
11
+ * import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding-lite.js'
12
+ * ```
13
+ *
14
+ * The only affected encodings are: `gbk`, `gb18030`, `big5`, `euc-kr`, `euc-jp`, `iso-2022-jp`, `shift_jis`
15
+ * and their [labels](https://encoding.spec.whatwg.org/#names-and-labels) when used with `TextDecoder`.
16
+ *
17
+ * Legacy single-byte encodings are loaded by default in both cases.
18
+ *
19
+ * `TextEncoder` and hooks for standards (including `labelToName` / `normalizeEncoding`) do not have any behavior
20
+ * differences in the lite version and support the full range of inputs.
21
+ *
22
+ * To avoid inconsistencies, the exported classes and methods are exactly the same objects.
23
+ *
24
+ * ```console
25
+ * > lite = require('@exodus/bytes/encoding-lite.js')
26
+ * [Module: null prototype] {
27
+ * TextDecoder: [class TextDecoder],
28
+ * TextDecoderStream: [class TextDecoderStream],
29
+ * TextEncoder: [class TextEncoder],
30
+ * TextEncoderStream: [class TextEncoderStream],
31
+ * getBOMEncoding: [Function: getBOMEncoding],
32
+ * labelToName: [Function: labelToName],
33
+ * legacyHookDecode: [Function: legacyHookDecode],
34
+ * normalizeEncoding: [Function: normalizeEncoding]
35
+ * }
36
+ * > new lite.TextDecoder('big5').decode(Uint8Array.of(0x25))
37
+ * Uncaught:
38
+ * Error: import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support
39
+ *
40
+ * > full = require('@exodus/bytes/encoding.js')
41
+ * [Module: null prototype] {
42
+ * TextDecoder: [class TextDecoder],
43
+ * TextDecoderStream: [class TextDecoderStream],
44
+ * TextEncoder: [class TextEncoder],
45
+ * TextEncoderStream: [class TextEncoderStream],
46
+ * getBOMEncoding: [Function: getBOMEncoding],
47
+ * labelToName: [Function: labelToName],
48
+ * legacyHookDecode: [Function: legacyHookDecode],
49
+ * normalizeEncoding: [Function: normalizeEncoding]
50
+ * }
51
+ * > full.TextDecoder === lite.TextDecoder
52
+ * true
53
+ * > new full.TextDecoder('big5').decode(Uint8Array.of(0x25))
54
+ * '%'
55
+ * > new lite.TextDecoder('big5').decode(Uint8Array.of(0x25))
56
+ * '%'
57
+ * ```
58
+ *
59
+ * @module @exodus/bytes/encoding-lite.js
60
+ */
61
+
1
62
  export * from './encoding.js'
package/encoding.d.ts CHANGED
@@ -1,14 +1,57 @@
1
+ /**
2
+ * Implements the [Encoding standard](https://encoding.spec.whatwg.org/):
3
+ * [TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder),
4
+ * [TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder),
5
+ * [TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream),
6
+ * [TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream),
7
+ * some [hooks](https://encoding.spec.whatwg.org/#specification-hooks).
8
+ *
9
+ * ```js
10
+ * import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
11
+ * import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
12
+ *
13
+ * // Hooks for standards
14
+ * import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding.js'
15
+ * ```
16
+ *
17
+ * @module @exodus/bytes/encoding.js
18
+ */
19
+
1
20
  /// <reference types="node" />
2
21
 
3
22
  /**
4
- * Converts an encoding label to its name, as an ASCII-lowercased string
23
+ * Convert an encoding [label](https://encoding.spec.whatwg.org/#names-and-labels) to its name,
24
+ * as an ASCII-lowercased string.
25
+ *
26
+ * If an encoding with that label does not exist, returns `null`.
27
+ *
28
+ * This is the same as [`decoder.encoding` getter](https://encoding.spec.whatwg.org/#dom-textdecoder-encoding),
29
+ * except that it:
30
+ * 1. Supports [`replacement` encoding](https://encoding.spec.whatwg.org/#replacement) and its
31
+ * [labels](https://encoding.spec.whatwg.org/#ref-for-replacement%E2%91%A1)
32
+ * 2. Does not throw for invalid labels and instead returns `null`
33
+ *
34
+ * It is identical to:
35
+ * ```js
36
+ * labelToName(label)?.toLowerCase() ?? null
37
+ * ```
38
+ *
39
+ * All encoding names are also valid labels for corresponding encodings.
40
+ *
5
41
  * @param label - The encoding label to normalize
6
42
  * @returns The normalized encoding name, or null if invalid
7
43
  */
8
44
  export function normalizeEncoding(label: string): string | null;
9
45
 
10
46
  /**
11
- * Implements BOM sniff (https://encoding.spec.whatwg.org/#bom-sniff) legacy hook.
47
+ * Implements [BOM sniff](https://encoding.spec.whatwg.org/#bom-sniff) legacy hook.
48
+ *
49
+ * Given a `TypedArray` or an `ArrayBuffer` instance `input`, returns one of:
50
+ * - `'utf-8'`, if `input` starts with UTF-8 byte order mark.
51
+ * - `'utf-16le'`, if `input` starts with UTF-16LE byte order mark.
52
+ * - `'utf-16be'`, if `input` starts with UTF-16BE byte order mark.
53
+ * - `null` otherwise.
54
+ *
12
55
  * @param input - The bytes to check for BOM
13
56
  * @returns The encoding ('utf-8', 'utf-16le', 'utf-16be'), or null if no BOM found
14
57
  */
@@ -17,7 +60,27 @@ export function getBOMEncoding(
17
60
  ): 'utf-8' | 'utf-16le' | 'utf-16be' | null;
18
61
 
19
62
  /**
20
- * Implements decode (https://encoding.spec.whatwg.org/#decode) legacy hook.
63
+ * Implements [decode](https://encoding.spec.whatwg.org/#decode) legacy hook.
64
+ *
65
+ * Given a `TypedArray` or an `ArrayBuffer` instance `input` and an optional `fallbackEncoding`
66
+ * encoding [label](https://encoding.spec.whatwg.org/#names-and-labels),
67
+ * sniffs encoding from BOM with `fallbackEncoding` fallback and then
68
+ * decodes the `input` using that encoding, skipping BOM if it was present.
69
+ *
70
+ * Notes:
71
+ *
72
+ * - BOM-sniffed encoding takes precedence over `fallbackEncoding` option per spec.
73
+ * Use with care.
74
+ * - Always operates in non-fatal [mode](https://encoding.spec.whatwg.org/#textdecoder-error-mode),
75
+ * aka replacement. It can convert different byte sequences to equal strings.
76
+ *
77
+ * This method is similar to the following code, except that it doesn't support encoding labels and
78
+ * only expects lowercased encoding name:
79
+ *
80
+ * ```js
81
+ * new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
82
+ * ```
83
+ *
21
84
  * @param input - The bytes to decode
22
85
  * @param fallbackEncoding - The encoding to use if no BOM detected (default: 'utf-8')
23
86
  * @returns The decoded string
@@ -28,31 +91,50 @@ export function legacyHookDecode(
28
91
  ): string;
29
92
 
30
93
  /**
31
- * Converts an encoding label to its name, as a case-sensitive string.
94
+ * Implements [get an encoding from a string `label`](https://encoding.spec.whatwg.org/#concept-encoding-get).
95
+ *
96
+ * Convert an encoding [label](https://encoding.spec.whatwg.org/#names-and-labels) to its name,
97
+ * as a case-sensitive string.
98
+ *
99
+ * If an encoding with that label does not exist, returns `null`.
100
+ *
101
+ * All encoding names are also valid labels for corresponding encodings.
102
+ *
32
103
  * @param label - The encoding label
33
104
  * @returns The proper case encoding name, or null if invalid
34
105
  */
35
106
  export function labelToName(label: string): string | null;
36
107
 
37
108
  /**
38
- * Text decoder for decoding bytes to strings in various encodings
39
- * Supports strict and lossy modes
109
+ * [TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder) implementation/polyfill.
110
+ *
111
+ * Decode bytes to strings according to [WHATWG Encoding](https://encoding.spec.whatwg.org) specification.
40
112
  */
41
113
  export const TextDecoder: typeof globalThis.TextDecoder;
42
114
 
43
115
  /**
44
- * Text encoder for encoding strings to UTF-8 bytes
116
+ * [TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder) implementation/polyfill.
117
+ *
118
+ * Encode strings to UTF-8 bytes according to [WHATWG Encoding](https://encoding.spec.whatwg.org) specification.
45
119
  */
46
120
  export const TextEncoder: typeof globalThis.TextEncoder;
47
121
 
48
122
  /**
49
- * Transform stream wrapper for TextDecoder
50
- * Decodes chunks of bytes to strings
123
+ * [TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream) implementation/polyfill.
124
+ *
125
+ * A [Streams](https://streams.spec.whatwg.org/) wrapper for `TextDecoder`.
126
+ *
127
+ * Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
128
+ * [polyfilled](https://npmjs.com/package/web-streams-polyfill).
51
129
  */
52
130
  export const TextDecoderStream: typeof globalThis.TextDecoderStream;
53
131
 
54
132
  /**
55
- * Transform stream wrapper for TextEncoder
56
- * Encodes chunks of strings to UTF-8 bytes
133
+ * [TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream) implementation/polyfill.
134
+ *
135
+ * A [Streams](https://streams.spec.whatwg.org/) wrapper for `TextEncoder`.
136
+ *
137
+ * Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
138
+ * [polyfilled](https://npmjs.com/package/web-streams-polyfill).
57
139
  */
58
140
  export const TextEncoderStream: typeof globalThis.TextEncoderStream;
package/encoding.js CHANGED
@@ -1,7 +1,8 @@
1
- import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js' // eslint-disable-line @exodus/import/no-unresolved
2
- import { setMultibyteDecoder } from './fallback/encoding.js'
1
+ import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js'
2
+ import { multibyteEncoder } from './fallback/multi-byte.js'
3
+ import { setMultibyte } from './fallback/encoding.js'
3
4
 
4
- setMultibyteDecoder(createMultibyteDecoder)
5
+ setMultibyte(createMultibyteDecoder, multibyteEncoder)
5
6
 
6
7
  export {
7
8
  TextDecoder,
@@ -1,9 +1,9 @@
1
1
  const { Buffer, TextEncoder, TextDecoder } = globalThis
2
2
  const haveNativeBuffer = Buffer && !Buffer.TYPED_ARRAY_SUPPORT
3
3
  export const nativeBuffer = haveNativeBuffer ? Buffer : null
4
- export const isHermes = Boolean(globalThis.HermesInternal)
5
- export const isDeno = Boolean(globalThis.Deno)
6
- export const isLE = new Uint8Array(Uint16Array.of(258).buffer)[0] === 2
4
+ export const isHermes = !!globalThis.HermesInternal
5
+ export const isDeno = !!globalThis.Deno
6
+ export const isLE = /* @__PURE__ */ (() => new Uint8Array(Uint16Array.of(258).buffer)[0] === 2)()
7
7
 
8
8
  // We consider Node.js TextDecoder/TextEncoder native
9
9
  let isNative = (x) => x && (haveNativeBuffer || `${x}`.includes('[native code]'))
@@ -17,16 +17,19 @@ export const nativeDecoder = isNative(TextDecoder)
17
17
  // Actually windows-1252, compatible with ascii and latin1 decoding
18
18
  // Beware that on non-latin1, i.e. on windows-1252, this is broken in ~all Node.js versions released
19
19
  // in 2025 due to a regression, so we call it Latin1 as it's usable only for that
20
- let nativeDecoderLatin1impl = null
21
- if (nativeDecoder) {
20
+ const getNativeLain1 = () => {
22
21
  // Not all barebone engines with TextDecoder support something except utf-8, detect
23
- try {
24
- nativeDecoderLatin1impl = new TextDecoder('latin1', { ignoreBOM: true })
25
- } catch {}
22
+ if (nativeDecoder) {
23
+ try {
24
+ return new TextDecoder('latin1', { ignoreBOM: true })
25
+ } catch {}
26
+ }
27
+
28
+ return null
26
29
  }
27
30
 
28
- export const nativeDecoderLatin1 = nativeDecoderLatin1impl
29
- export const canDecoders = Boolean(nativeDecoderLatin1impl)
31
+ export const nativeDecoderLatin1 = /* @__PURE__ */ getNativeLain1()
32
+ export const canDecoders = !!nativeDecoderLatin1
30
33
 
31
34
  // Block Firefox < 146 specifically from using native hex/base64, as it's very slow there
32
35
  // Refs: https://bugzilla.mozilla.org/show_bug.cgi?id=1994067 (and linked issues), fixed in 146
@@ -47,10 +50,11 @@ function shouldSkipBuiltins() {
47
50
  return /firefox/i.test(g.navigator.userAgent || '') // as simple as we can
48
51
  }
49
52
 
53
+ /* c8 ignore next */
50
54
  return false // eslint-disable-line no-unreachable
51
55
  }
52
56
 
53
- export const skipWeb = shouldSkipBuiltins()
57
+ export const skipWeb = /* @__PURE__ */ shouldSkipBuiltins()
54
58
 
55
59
  function decodePartAddition(a, start, end, m) {
56
60
  let o = ''
@@ -0,0 +1,81 @@
1
+ import labels from './encoding.labels.js'
2
+
3
+ let labelsMap
4
+
5
+ export const E_ENCODING = 'Unknown encoding'
6
+
7
+ // Warning: unlike whatwg-encoding, returns lowercased labels
8
+ // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
9
+ // https://encoding.spec.whatwg.org/#names-and-labels
10
+ export function normalizeEncoding(label) {
11
+ // fast path
12
+ if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
13
+ if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
14
+ // full map
15
+ if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
16
+ const low = `${label}`.trim().toLowerCase()
17
+ if (Object.hasOwn(labels, low)) return low
18
+ if (!labelsMap) {
19
+ labelsMap = new Map()
20
+ for (const [label, aliases] of Object.entries(labels)) {
21
+ for (const alias of aliases) labelsMap.set(alias, label)
22
+ }
23
+ }
24
+
25
+ const mapped = labelsMap.get(low)
26
+ if (mapped) return mapped
27
+ return null
28
+ }
29
+
30
+ // TODO: make this more strict against Symbol.toStringTag
31
+ // Is not very significant though, anything faking Symbol.toStringTag could as well override
32
+ // prototypes, which is not something we protect against
33
+
34
+ function isAnyArrayBuffer(x) {
35
+ if (x instanceof ArrayBuffer) return true
36
+ if (globalThis.SharedArrayBuffer && x instanceof SharedArrayBuffer) return true
37
+ if (!x || typeof x.byteLength !== 'number') return false
38
+ const s = Object.prototype.toString.call(x)
39
+ return s === '[object ArrayBuffer]' || s === '[object SharedArrayBuffer]'
40
+ }
41
+
42
+ export function fromSource(x) {
43
+ if (x instanceof Uint8Array) return x
44
+ if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
45
+ if (isAnyArrayBuffer(x)) {
46
+ if ('detached' in x) return x.detached === true ? new Uint8Array() : new Uint8Array(x)
47
+ // Old engines without .detached, try-catch
48
+ try {
49
+ return new Uint8Array(x)
50
+ } catch {
51
+ return new Uint8Array()
52
+ }
53
+ }
54
+
55
+ throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
56
+ }
57
+
58
+ // Warning: unlike whatwg-encoding, returns lowercased labels
59
+ // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
60
+ export function getBOMEncoding(input) {
61
+ const u8 = fromSource(input) // asserts
62
+ if (u8.length >= 3 && u8[0] === 0xef && u8[1] === 0xbb && u8[2] === 0xbf) return 'utf-8'
63
+ if (u8.length < 2) return null
64
+ if (u8[0] === 0xff && u8[1] === 0xfe) return 'utf-16le'
65
+ if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
66
+ return null
67
+ }
68
+
69
+ const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
70
+
71
+ // Unlike normalizeEncoding, case-sensitive
72
+ // https://encoding.spec.whatwg.org/#names-and-labels
73
+ export function labelToName(label) {
74
+ const enc = normalizeEncoding(label)
75
+ if (enc === 'utf-8') return 'UTF-8' // fast path
76
+ if (!enc) return enc
77
+ if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
78
+ if (enc === 'big5') return 'Big5'
79
+ if (enc === 'shift_jis') return 'Shift_JIS'
80
+ return enc
81
+ }
@@ -5,82 +5,36 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
5
5
  import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
6
6
  import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
7
7
  import labels from './encoding.labels.js'
8
- import { unfinishedBytes } from './encoding.util.js'
8
+ import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
9
+ import { unfinishedBytes, mergePrefix } from './encoding.util.js'
9
10
 
11
+ export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
12
+
13
+ const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
10
14
  const E_OPTIONS = 'The "options" argument must be of type object'
11
- const E_ENCODING = 'Unknown encoding'
12
15
  const replacementChar = '\uFFFD'
13
-
14
- const E_MULTI =
15
- 'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
16
16
  const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
17
- let createMultibyteDecoder
17
+ let createMultibyteDecoder, multibyteEncoder
18
18
 
19
- export function setMultibyteDecoder(createDecoder) {
19
+ export const isMultibyte = (enc) => multibyteSet.has(enc)
20
+ export function setMultibyte(createDecoder, createEncoder) {
20
21
  createMultibyteDecoder = createDecoder
22
+ multibyteEncoder = createEncoder
21
23
  }
22
24
 
23
- let labelsMap
24
-
25
- // Warning: unlike whatwg-encoding, returns lowercased labels
26
- // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
27
- // https://encoding.spec.whatwg.org/#names-and-labels
28
- export function normalizeEncoding(label) {
29
- // fast path
30
- if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8'
31
- if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252'
32
- // full map
33
- if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
34
- const low = `${label}`.trim().toLowerCase()
35
- if (Object.hasOwn(labels, low)) return low
36
- if (!labelsMap) {
37
- labelsMap = new Map()
38
- for (const [label, aliases] of Object.entries(labels)) {
39
- for (const alias of aliases) labelsMap.set(alias, label)
40
- }
41
- }
42
-
43
- const mapped = labelsMap.get(low)
44
- if (mapped) return mapped
45
- return null
25
+ export function getMultibyteEncoder() {
26
+ if (!multibyteEncoder) throw new Error(E_MULTI)
27
+ return multibyteEncoder
46
28
  }
47
29
 
48
30
  const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
49
31
 
50
- // TODO: make this more strict against Symbol.toStringTag
51
- // Is not very significant though, anything faking Symbol.toStringTag could as well override
52
- // prototypes, which is not something we protect against
53
-
54
- function isAnyArrayBuffer(x) {
55
- if (x instanceof ArrayBuffer) return true
56
- if (globalThis.SharedArrayBuffer && x instanceof SharedArrayBuffer) return true
57
- if (!x || typeof x.byteLength !== 'number') return false
58
- const s = Object.prototype.toString.call(x)
59
- return s === '[object ArrayBuffer]' || s === '[object SharedArrayBuffer]'
60
- }
61
-
62
32
  function isAnyUint8Array(x) {
63
33
  if (x instanceof Uint8Array) return true
64
34
  if (!x || !ArrayBuffer.isView(x) || x.BYTES_PER_ELEMENT !== 1) return false
65
35
  return Object.prototype.toString.call(x) === '[object Uint8Array]'
66
36
  }
67
37
 
68
- const fromSource = (x) => {
69
- if (x instanceof Uint8Array) return x
70
- if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
71
- if (isAnyArrayBuffer(x)) {
72
- if ('detached' in x) return x.detached === true ? new Uint8Array() : new Uint8Array(x)
73
- // Old engines without .detached, try-catch
74
- try {
75
- return new Uint8Array(x)
76
- } catch {
77
- return new Uint8Array()
78
- }
79
- }
80
-
81
- throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
82
- }
83
-
84
38
  function unicodeDecoder(encoding, loose) {
85
39
  if (encoding === 'utf-8') return loose ? utf8toStringLoose : utf8toString // likely
86
40
  const form = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
@@ -99,10 +53,10 @@ export class TextDecoder {
99
53
  const enc = normalizeEncoding(encoding)
100
54
  if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
101
55
  define(this, 'encoding', enc)
102
- define(this, 'fatal', Boolean(options.fatal))
103
- define(this, 'ignoreBOM', Boolean(options.ignoreBOM))
56
+ define(this, 'fatal', !!options.fatal)
57
+ define(this, 'ignoreBOM', !!options.ignoreBOM)
104
58
  this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
105
- this.#multibyte = !this.#unicode && multibyteSet.has(enc)
59
+ this.#multibyte = !this.#unicode && isMultibyte(enc)
106
60
  this.#canBOM = this.#unicode && !this.ignoreBOM
107
61
  }
108
62
 
@@ -112,44 +66,26 @@ export class TextDecoder {
112
66
 
113
67
  decode(input, options = {}) {
114
68
  if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
115
- const stream = Boolean(options.stream)
69
+ const stream = !!options.stream
116
70
  let u = input === undefined ? new Uint8Array() : fromSource(input)
71
+ const empty = u.length === 0 // also can't be streaming after next line
72
+ if (empty && stream) return '' // no state change
117
73
 
118
74
  if (this.#unicode) {
119
75
  let prefix
120
76
  if (this.#chunk) {
121
- if (u.length === 0) {
122
- if (stream) return '' // no change
123
- u = this.#chunk // process as final chunk to handle errors and state changes
124
- } else if (u.length < 3) {
125
- // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
126
- const a = new Uint8Array(u.length + this.#chunk.length)
127
- a.set(this.#chunk)
128
- a.set(u, this.#chunk.length)
129
- u = a
77
+ const merged = mergePrefix(u, this.#chunk, this.encoding)
78
+ if (u.length < 3) {
79
+ u = merged // might be unfinished, but fully consumed old u
130
80
  } else {
131
- // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
132
- const t = new Uint8Array(this.#chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
133
- t.set(this.#chunk)
134
- t.set(u.subarray(0, 3), this.#chunk.length)
135
-
136
- // Stop at the first offset where unfinished bytes reaches 0 or fits into u
137
- // If that doesn't happen (u too short), just concat chunk and u completely
138
- for (let i = 1; i <= 3; i++) {
139
- const unfinished = unfinishedBytes(t, this.#chunk.length + i, this.encoding) // 0-3
140
- if (unfinished <= i) {
141
- // Always reachable at 3, but we still need 'unfinished' value for it
142
- const add = i - unfinished // 0-3
143
- prefix = add > 0 ? t.subarray(0, this.#chunk.length + add) : this.#chunk
144
- if (add > 0) u = u.subarray(add)
145
- break
146
- }
147
- }
81
+ prefix = merged // stops at complete chunk
82
+ const add = prefix.length - this.#chunk.length
83
+ if (add > 0) u = u.subarray(add)
148
84
  }
149
85
 
150
86
  this.#chunk = null
151
- } else if (u.byteLength === 0) {
152
- if (!stream) this.#canBOM = !this.ignoreBOM
87
+ } else if (empty) {
88
+ this.#canBOM = !this.ignoreBOM // not streaming
153
89
  return ''
154
90
  }
155
91
 
@@ -170,27 +106,31 @@ export class TextDecoder {
170
106
  }
171
107
  }
172
108
 
109
+ let seenBOM = false
173
110
  if (this.#canBOM) {
174
111
  const bom = this.#findBom(prefix ?? u)
175
112
  if (bom) {
176
- if (stream) this.#canBOM = false
113
+ seenBOM = true
177
114
  if (prefix) {
178
115
  prefix = prefix.subarray(bom)
179
116
  } else {
180
117
  u = u.subarray(bom)
181
118
  }
182
119
  }
120
+ } else if (!stream && !this.ignoreBOM) {
121
+ this.#canBOM = true
183
122
  }
184
123
 
185
124
  if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
186
125
  try {
187
126
  const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
188
- if (res.length > 0 && stream) this.#canBOM = false
189
-
190
- if (!stream) this.#canBOM = !this.ignoreBOM
127
+ // "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
128
+ if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
191
129
  return res
192
130
  } catch (err) {
193
131
  this.#chunk = null // reset unfinished chunk on errors
132
+ // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
133
+ // See also multi-byte.js
194
134
  throw err
195
135
  }
196
136
 
@@ -215,6 +155,7 @@ export class TextDecoder {
215
155
  return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
216
156
  }
217
157
 
158
+ /* c8 ignore next */
218
159
  throw new Error('Unreachable')
219
160
  }
220
161
  }
@@ -341,17 +282,6 @@ export class TextEncoderStream {
341
282
  }
342
283
  }
343
284
 
344
- // Warning: unlike whatwg-encoding, returns lowercased labels
345
- // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
346
- export function getBOMEncoding(input) {
347
- const u8 = fromSource(input) // asserts
348
- if (u8.length >= 3 && u8[0] === 0xef && u8[1] === 0xbb && u8[2] === 0xbf) return 'utf-8'
349
- if (u8.length < 2) return null
350
- if (u8[0] === 0xff && u8[1] === 0xfe) return 'utf-16le'
351
- if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be'
352
- return null
353
- }
354
-
355
285
  // https://encoding.spec.whatwg.org/#decode
356
286
  // Warning: encoding sniffed from BOM takes preference over the supplied one
357
287
  // Warning: lossy, performs replacement, no option of throwing
@@ -368,7 +298,7 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
368
298
  let suffix = ''
369
299
  if (u8.byteLength % 2 !== 0) {
370
300
  suffix = replacementChar
371
- u8 = u8.subarray(0, -1)
301
+ u8 = u8.subarray(0, -unfinishedBytes(u8, u8.byteLength, enc))
372
302
  }
373
303
 
374
304
  return utf16toStringLoose(u8, enc === 'utf-16le' ? 'uint8-le' : 'uint8-be') + suffix
@@ -376,7 +306,7 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
376
306
 
377
307
  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
378
308
 
379
- if (multibyteSet.has(enc)) {
309
+ if (isMultibyte(enc)) {
380
310
  if (!createMultibyteDecoder) throw new Error(E_MULTI)
381
311
  return createMultibyteDecoder(enc, true)(u8)
382
312
  }
@@ -387,17 +317,3 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
387
317
 
388
318
  return createSinglebyteDecoder(enc, true)(u8)
389
319
  }
390
-
391
- const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
392
-
393
- // Unlike normalizeEncoding, case-sensitive
394
- // https://encoding.spec.whatwg.org/#names-and-labels
395
- export function labelToName(label) {
396
- const enc = normalizeEncoding(label)
397
- if (enc === 'utf-8') return 'UTF-8' // fast path
398
- if (!enc) return enc
399
- if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase()
400
- if (enc === 'big5') return 'Big5'
401
- if (enc === 'shift_jis') return 'Shift_JIS'
402
- return enc
403
- }
@@ -1,3 +1,5 @@
1
+ // Get the number of trailing bytes in a Uint8Array `u` ending at `len` that don't
2
+ // form a codepoint yet, but can be a part of a single codepoint on more data
1
3
  export function unfinishedBytes(u, len, enc) {
2
4
  switch (enc) {
3
5
  case 'utf-8': {
@@ -32,3 +34,35 @@ export function unfinishedBytes(u, len, enc) {
32
34
 
33
35
  throw new Error('Unsupported encoding')
34
36
  }
37
+
38
+ // Merge prefix `chunk` with `u` and return new combined prefix
39
+ // For u.length < 3, fully consumes u and can return unfinished data,
40
+ // otherwise returns a prefix with no unfinished bytes
41
+ export function mergePrefix(u, chunk, enc) {
42
+ if (u.length === 0) return chunk
43
+ if (u.length < 3) {
44
+ // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
45
+ const a = new Uint8Array(u.length + chunk.length)
46
+ a.set(chunk)
47
+ a.set(u, chunk.length)
48
+ return a
49
+ }
50
+
51
+ // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
52
+ const t = new Uint8Array(chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
53
+ t.set(chunk)
54
+ t.set(u.subarray(0, 3), chunk.length)
55
+
56
+ // Stop at the first offset where unfinished bytes reaches 0 or fits into u
57
+ // If that doesn't happen (u too short), just concat chunk and u completely (above)
58
+ for (let i = 1; i <= 3; i++) {
59
+ const unfinished = unfinishedBytes(t, chunk.length + i, enc) // 0-3
60
+ if (unfinished <= i) {
61
+ // Always reachable at 3, but we still need 'unfinished' value for it
62
+ const add = i - unfinished // 0-3
63
+ return add > 0 ? t.subarray(0, chunk.length + add) : chunk
64
+ }
65
+ }
66
+
67
+ // Unreachable
68
+ }
@@ -37,6 +37,7 @@ export function asciiPrefix(arr) {
37
37
  const b = u32[i + 1]
38
38
  const c = u32[i + 2]
39
39
  const d = u32[i + 3]
40
+ // "(a | b | c | d) & mask" is slower on Hermes though faster on v8
40
41
  if (a & 0x80_80_80_80 || b & 0x80_80_80_80 || c & 0x80_80_80_80 || d & 0x80_80_80_80) break
41
42
  }
42
43