@exodus/bytes 1.9.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/encoding.d.ts CHANGED
@@ -1,14 +1,57 @@
1
+ /**
2
+ * Implements the [Encoding standard](https://encoding.spec.whatwg.org/):
3
+ * [TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder),
4
+ * [TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder),
5
+ * [TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream),
6
+ * [TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream),
7
+ * some [hooks](https://encoding.spec.whatwg.org/#specification-hooks).
8
+ *
9
+ * ```js
10
+ * import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
11
+ * import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
12
+ *
13
+ * // Hooks for standards
14
+ * import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding.js'
15
+ * ```
16
+ *
17
+ * @module @exodus/bytes/encoding.js
18
+ */
19
+
1
20
  /// <reference types="node" />
2
21
 
3
22
  /**
4
- * Converts an encoding label to its name, as an ASCII-lowercased string
23
+ * Convert an encoding [label](https://encoding.spec.whatwg.org/#names-and-labels) to its name,
24
+ * as an ASCII-lowercased string.
25
+ *
26
+ * If an encoding with that label does not exist, returns `null`.
27
+ *
28
+ * This is the same as [`decoder.encoding` getter](https://encoding.spec.whatwg.org/#dom-textdecoder-encoding),
29
+ * except that it:
30
+ * 1. Supports [`replacement` encoding](https://encoding.spec.whatwg.org/#replacement) and its
31
+ * [labels](https://encoding.spec.whatwg.org/#ref-for-replacement%E2%91%A1)
32
+ * 2. Does not throw for invalid labels and instead returns `null`
33
+ *
34
+ * It is identical to:
35
+ * ```js
36
+ * labelToName(label)?.toLowerCase() ?? null
37
+ * ```
38
+ *
39
+ * All encoding names are also valid labels for corresponding encodings.
40
+ *
5
41
  * @param label - The encoding label to normalize
6
42
  * @returns The normalized encoding name, or null if invalid
7
43
  */
8
44
  export function normalizeEncoding(label: string): string | null;
9
45
 
10
46
  /**
11
- * Implements BOM sniff (https://encoding.spec.whatwg.org/#bom-sniff) legacy hook.
47
+ * Implements [BOM sniff](https://encoding.spec.whatwg.org/#bom-sniff) legacy hook.
48
+ *
49
+ * Given a `TypedArray` or an `ArrayBuffer` instance `input`, returns one of:
50
+ * - `'utf-8'`, if `input` starts with UTF-8 byte order mark.
51
+ * - `'utf-16le'`, if `input` starts with UTF-16LE byte order mark.
52
+ * - `'utf-16be'`, if `input` starts with UTF-16BE byte order mark.
53
+ * - `null` otherwise.
54
+ *
12
55
  * @param input - The bytes to check for BOM
13
56
  * @returns The encoding ('utf-8', 'utf-16le', 'utf-16be'), or null if no BOM found
14
57
  */
@@ -17,7 +60,27 @@ export function getBOMEncoding(
17
60
  ): 'utf-8' | 'utf-16le' | 'utf-16be' | null;
18
61
 
19
62
  /**
20
- * Implements decode (https://encoding.spec.whatwg.org/#decode) legacy hook.
63
+ * Implements [decode](https://encoding.spec.whatwg.org/#decode) legacy hook.
64
+ *
65
+ * Given a `TypedArray` or an `ArrayBuffer` instance `input` and an optional `fallbackEncoding`
66
+ * encoding [label](https://encoding.spec.whatwg.org/#names-and-labels),
67
+ * sniffs encoding from BOM with `fallbackEncoding` fallback and then
68
+ * decodes the `input` using that encoding, skipping BOM if it was present.
69
+ *
70
+ * Notes:
71
+ *
72
+ * - BOM-sniffed encoding takes precedence over `fallbackEncoding` option per spec.
73
+ * Use with care.
74
+ * - Always operates in non-fatal [mode](https://encoding.spec.whatwg.org/#textdecoder-error-mode),
75
+ * aka replacement. It can convert different byte sequences to equal strings.
76
+ *
77
+ * This method is similar to the following code, except that it doesn't support encoding labels and
78
+ * expects only a lowercased encoding name:
79
+ *
80
+ * ```js
81
+ * new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
82
+ * ```
83
+ *
21
84
  * @param input - The bytes to decode
22
85
  * @param fallbackEncoding - The encoding to use if no BOM detected (default: 'utf-8')
23
86
  * @returns The decoded string
@@ -28,31 +91,50 @@ export function legacyHookDecode(
28
91
  ): string;
29
92
 
30
93
  /**
31
- * Converts an encoding label to its name, as a case-sensitive string.
94
+ * Implements [get an encoding from a string `label`](https://encoding.spec.whatwg.org/#concept-encoding-get).
95
+ *
96
+ * Convert an encoding [label](https://encoding.spec.whatwg.org/#names-and-labels) to its name,
97
+ * as a case-sensitive string.
98
+ *
99
+ * If an encoding with that label does not exist, returns `null`.
100
+ *
101
+ * All encoding names are also valid labels for corresponding encodings.
102
+ *
32
103
  * @param label - The encoding label
33
104
  * @returns The proper case encoding name, or null if invalid
34
105
  */
35
106
  export function labelToName(label: string): string | null;
36
107
 
37
108
  /**
38
- * Text decoder for decoding bytes to strings in various encodings
39
- * Supports strict and lossy modes
109
+ * [TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder) implementation/polyfill.
110
+ *
111
+ * Decode bytes to strings according to [WHATWG Encoding](https://encoding.spec.whatwg.org) specification.
40
112
  */
41
113
  export const TextDecoder: typeof globalThis.TextDecoder;
42
114
 
43
115
  /**
44
- * Text encoder for encoding strings to UTF-8 bytes
116
+ * [TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder) implementation/polyfill.
117
+ *
118
+ * Encode strings to UTF-8 bytes according to [WHATWG Encoding](https://encoding.spec.whatwg.org) specification.
45
119
  */
46
120
  export const TextEncoder: typeof globalThis.TextEncoder;
47
121
 
48
122
  /**
49
- * Transform stream wrapper for TextDecoder
50
- * Decodes chunks of bytes to strings
123
+ * [TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream) implementation/polyfill.
124
+ *
125
+ * A [Streams](https://streams.spec.whatwg.org/) wrapper for `TextDecoder`.
126
+ *
127
+ * Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
128
+ * [polyfilled](https://npmjs.com/package/web-streams-polyfill).
51
129
  */
52
130
  export const TextDecoderStream: typeof globalThis.TextDecoderStream;
53
131
 
54
132
  /**
55
- * Transform stream wrapper for TextEncoder
56
- * Encodes chunks of strings to UTF-8 bytes
133
+ * [TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream) implementation/polyfill.
134
+ *
135
+ * A [Streams](https://streams.spec.whatwg.org/) wrapper for `TextEncoder`.
136
+ *
137
+ * Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
138
+ * [polyfilled](https://npmjs.com/package/web-streams-polyfill).
57
139
  */
58
140
  export const TextEncoderStream: typeof globalThis.TextEncoderStream;
package/encoding.js CHANGED
@@ -1,7 +1,8 @@
1
- import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js' // eslint-disable-line @exodus/import/no-unresolved
2
- import { setMultibyteDecoder } from './fallback/encoding.js'
1
+ import { createMultibyteDecoder } from '@exodus/bytes/multi-byte.js'
2
+ import { multibyteEncoder } from './fallback/multi-byte.js'
3
+ import { setMultibyte } from './fallback/encoding.js'
3
4
 
4
- setMultibyteDecoder(createMultibyteDecoder)
5
+ setMultibyte(createMultibyteDecoder, multibyteEncoder)
5
6
 
6
7
  export {
7
8
  TextDecoder,
@@ -1,9 +1,9 @@
1
1
  const { Buffer, TextEncoder, TextDecoder } = globalThis
2
2
  const haveNativeBuffer = Buffer && !Buffer.TYPED_ARRAY_SUPPORT
3
3
  export const nativeBuffer = haveNativeBuffer ? Buffer : null
4
- export const isHermes = Boolean(globalThis.HermesInternal)
5
- export const isDeno = Boolean(globalThis.Deno)
6
- export const isLE = new Uint8Array(Uint16Array.of(258).buffer)[0] === 2
4
+ export const isHermes = !!globalThis.HermesInternal
5
+ export const isDeno = !!globalThis.Deno
6
+ export const isLE = /* @__PURE__ */ (() => new Uint8Array(Uint16Array.of(258).buffer)[0] === 2)()
7
7
 
8
8
  // We consider Node.js TextDecoder/TextEncoder native
9
9
  let isNative = (x) => x && (haveNativeBuffer || `${x}`.includes('[native code]'))
@@ -17,16 +17,19 @@ export const nativeDecoder = isNative(TextDecoder)
17
17
  // Actually windows-1252, compatible with ascii and latin1 decoding
18
18
  // Beware that on non-latin1, i.e. on windows-1252, this is broken in ~all Node.js versions released
19
19
  // in 2025 due to a regression, so we call it Latin1 as it's usable only for that
20
- let nativeDecoderLatin1impl = null
21
- if (nativeDecoder) {
20
+ const getNativeLatin1 = () => {
22
21
  // Not all barebone engines with TextDecoder support anything other than utf-8, so detect it
23
- try {
24
- nativeDecoderLatin1impl = new TextDecoder('latin1', { ignoreBOM: true })
25
- } catch {}
22
+ if (nativeDecoder) {
23
+ try {
24
+ return new TextDecoder('latin1', { ignoreBOM: true })
25
+ } catch {}
26
+ }
27
+
28
+ return null
26
29
  }
27
30
 
28
- export const nativeDecoderLatin1 = nativeDecoderLatin1impl
29
- export const canDecoders = Boolean(nativeDecoderLatin1impl)
31
+ export const nativeDecoderLatin1 = /* @__PURE__ */ getNativeLatin1()
32
+ export const canDecoders = !!nativeDecoderLatin1
30
33
 
31
34
  // Block Firefox < 146 specifically from using native hex/base64, as it's very slow there
32
35
  // Refs: https://bugzilla.mozilla.org/show_bug.cgi?id=1994067 (and linked issues), fixed in 146
@@ -51,7 +54,7 @@ function shouldSkipBuiltins() {
51
54
  return false // eslint-disable-line no-unreachable
52
55
  }
53
56
 
54
- export const skipWeb = shouldSkipBuiltins()
57
+ export const skipWeb = /* @__PURE__ */ shouldSkipBuiltins()
55
58
 
56
59
  function decodePartAddition(a, start, end, m) {
57
60
  let o = ''
@@ -6,19 +6,25 @@ import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/by
6
6
  import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
7
7
  import labels from './encoding.labels.js'
8
8
  import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js'
9
- import { unfinishedBytes } from './encoding.util.js'
9
+ import { unfinishedBytes, mergePrefix } from './encoding.util.js'
10
10
 
11
11
  export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js'
12
12
 
13
+ const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
13
14
  const E_OPTIONS = 'The "options" argument must be of type object'
14
- const E_MULTI =
15
- 'Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encoding.js for full encodings range support'
16
15
  const replacementChar = '\uFFFD'
17
16
  const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
18
- let createMultibyteDecoder
17
+ let createMultibyteDecoder, multibyteEncoder
19
18
 
20
- export function setMultibyteDecoder(createDecoder) {
19
+ export const isMultibyte = (enc) => multibyteSet.has(enc)
20
+ export function setMultibyte(createDecoder, createEncoder) {
21
21
  createMultibyteDecoder = createDecoder
22
+ multibyteEncoder = createEncoder
23
+ }
24
+
25
+ export function getMultibyteEncoder() {
26
+ if (!multibyteEncoder) throw new Error(E_MULTI)
27
+ return multibyteEncoder
22
28
  }
23
29
 
24
30
  const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
@@ -47,10 +53,10 @@ export class TextDecoder {
47
53
  const enc = normalizeEncoding(encoding)
48
54
  if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
49
55
  define(this, 'encoding', enc)
50
- define(this, 'fatal', Boolean(options.fatal))
51
- define(this, 'ignoreBOM', Boolean(options.ignoreBOM))
56
+ define(this, 'fatal', !!options.fatal)
57
+ define(this, 'ignoreBOM', !!options.ignoreBOM)
52
58
  this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
53
- this.#multibyte = !this.#unicode && multibyteSet.has(enc)
59
+ this.#multibyte = !this.#unicode && isMultibyte(enc)
54
60
  this.#canBOM = this.#unicode && !this.ignoreBOM
55
61
  }
56
62
 
@@ -60,44 +66,26 @@ export class TextDecoder {
60
66
 
61
67
  decode(input, options = {}) {
62
68
  if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
63
- const stream = Boolean(options.stream)
69
+ const stream = !!options.stream
64
70
  let u = input === undefined ? new Uint8Array() : fromSource(input)
71
+ const empty = u.length === 0 // also can't be streaming after next line
72
+ if (empty && stream) return '' // no state change
65
73
 
66
74
  if (this.#unicode) {
67
75
  let prefix
68
76
  if (this.#chunk) {
69
- if (u.length === 0) {
70
- if (stream) return '' // no change
71
- u = this.#chunk // process as final chunk to handle errors and state changes
72
- } else if (u.length < 3) {
73
- // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
74
- const a = new Uint8Array(u.length + this.#chunk.length)
75
- a.set(this.#chunk)
76
- a.set(u, this.#chunk.length)
77
- u = a
77
+ const merged = mergePrefix(u, this.#chunk, this.encoding)
78
+ if (u.length < 3) {
79
+ u = merged // might be unfinished, but fully consumed old u
78
80
  } else {
79
- // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
80
- const t = new Uint8Array(this.#chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
81
- t.set(this.#chunk)
82
- t.set(u.subarray(0, 3), this.#chunk.length)
83
-
84
- // Stop at the first offset where unfinished bytes reaches 0 or fits into u
85
- // If that doesn't happen (u too short), just concat chunk and u completely
86
- for (let i = 1; i <= 3; i++) {
87
- const unfinished = unfinishedBytes(t, this.#chunk.length + i, this.encoding) // 0-3
88
- if (unfinished <= i) {
89
- // Always reachable at 3, but we still need 'unfinished' value for it
90
- const add = i - unfinished // 0-3
91
- prefix = add > 0 ? t.subarray(0, this.#chunk.length + add) : this.#chunk
92
- if (add > 0) u = u.subarray(add)
93
- break
94
- }
95
- }
81
+ prefix = merged // stops at complete chunk
82
+ const add = prefix.length - this.#chunk.length
83
+ if (add > 0) u = u.subarray(add)
96
84
  }
97
85
 
98
86
  this.#chunk = null
99
- } else if (u.byteLength === 0) {
100
- if (!stream) this.#canBOM = !this.ignoreBOM
87
+ } else if (empty) {
88
+ this.#canBOM = !this.ignoreBOM // not streaming
101
89
  return ''
102
90
  }
103
91
 
@@ -118,27 +106,31 @@ export class TextDecoder {
118
106
  }
119
107
  }
120
108
 
109
+ let seenBOM = false
121
110
  if (this.#canBOM) {
122
111
  const bom = this.#findBom(prefix ?? u)
123
112
  if (bom) {
124
- if (stream) this.#canBOM = false
113
+ seenBOM = true
125
114
  if (prefix) {
126
115
  prefix = prefix.subarray(bom)
127
116
  } else {
128
117
  u = u.subarray(bom)
129
118
  }
130
119
  }
120
+ } else if (!stream && !this.ignoreBOM) {
121
+ this.#canBOM = true
131
122
  }
132
123
 
133
124
  if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
134
125
  try {
135
126
  const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
136
- if (res.length > 0 && stream) this.#canBOM = false
137
-
138
- if (!stream) this.#canBOM = !this.ignoreBOM
127
+ // "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
128
+ if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
139
129
  return res
140
130
  } catch (err) {
141
131
  this.#chunk = null // reset unfinished chunk on errors
132
+ // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
133
+ // See also multi-byte.js
142
134
  throw err
143
135
  }
144
136
 
@@ -314,7 +306,7 @@ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
314
306
 
315
307
  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
316
308
 
317
- if (multibyteSet.has(enc)) {
309
+ if (isMultibyte(enc)) {
318
310
  if (!createMultibyteDecoder) throw new Error(E_MULTI)
319
311
  return createMultibyteDecoder(enc, true)(u8)
320
312
  }
@@ -1,3 +1,5 @@
1
+ // Get the number of trailing bytes in a Uint8Array `u` ending at `len` that don't
2
+ // form a codepoint yet, but could become part of a single codepoint given more data
1
3
  export function unfinishedBytes(u, len, enc) {
2
4
  switch (enc) {
3
5
  case 'utf-8': {
@@ -18,17 +20,45 @@ export function unfinishedBytes(u, len, enc) {
18
20
  case 'utf-16le':
19
21
  case 'utf-16be': {
20
22
  // 0-3
21
- let p = 0
22
- if (len % 2 !== 0) p++ // uneven bytes
23
+ const p = len % 2 // uneven byte length adds 1
24
+ if (len < 2) return p
23
25
  const l = len - p - 1
24
- if (len - p >= 2) {
25
- const last = enc === 'utf-16le' ? (u[l] << 8) ^ u[l - 1] : (u[l - 1] << 8) ^ u[l]
26
- if (last >= 0xd8_00 && last < 0xdc_00) p += 2 // lone lead
27
- }
28
-
29
- return p
26
+ const last = enc === 'utf-16le' ? (u[l] << 8) ^ u[l - 1] : (u[l - 1] << 8) ^ u[l]
27
+ return last >= 0xd8_00 && last < 0xdc_00 ? p + 2 : p // lone lead adds 2
30
28
  }
31
29
  }
32
30
 
33
31
  throw new Error('Unsupported encoding')
34
32
  }
33
+
34
+ // Merge prefix `chunk` with `u` and return new combined prefix
35
+ // For u.length < 3, fully consumes u and can return unfinished data,
36
+ // otherwise returns a prefix with no unfinished bytes
37
+ export function mergePrefix(u, chunk, enc) {
38
+ if (u.length === 0) return chunk
39
+ if (u.length < 3) {
40
+ // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
41
+ const a = new Uint8Array(u.length + chunk.length)
42
+ a.set(chunk)
43
+ a.set(u, chunk.length)
44
+ return a
45
+ }
46
+
47
+ // Slice off a small portion of u into prefix chunk so we can decode them separately without extending array size
48
+ const t = new Uint8Array(chunk.length + 3) // We have 1-3 bytes and need 1-3 more bytes
49
+ t.set(chunk)
50
+ t.set(u.subarray(0, 3), chunk.length)
51
+
52
+ // Stop at the first offset where unfinished bytes reaches 0 or fits into u
53
+ // If that doesn't happen (u too short), just concat chunk and u completely (above)
54
+ for (let i = 1; i <= 3; i++) {
55
+ const unfinished = unfinishedBytes(t, chunk.length + i, enc) // 0-3
56
+ if (unfinished <= i) {
57
+ // Always reachable at 3, but we still need 'unfinished' value for it
58
+ const add = i - unfinished // 0-3
59
+ return add > 0 ? t.subarray(0, chunk.length + add) : chunk
60
+ }
61
+ }
62
+
63
+ // Unreachable
64
+ }
@@ -10,7 +10,7 @@
10
10
  "$7": [2,12541,2,-98,1,-156,1,7897,3,-7897,1,244],
11
11
  "$8": ["E9UxzELB4htgpd4feI7ZJNwBAQEfAf8B_wH_Af8B_wH_Af8B_wEAAAD_"],
12
12
  "$9": [1,26142,4,2,1,1,4,2,1,1,1,1],
13
- "$jis0208tail": [
13
+ "$j": [
14
14
  "ipErP1Ps8XWWMAFJ4rgaAwI1HDv3D_k4cuHcHicp0VFf43EZOCAtKIYJGRokBhxNIB4qMI3tHlWG0gtGG_5HAI0TWEcHAQVAFZtpbqgTMYcTOjErvTAqSgAWUBIyTyZ-JwRT9krRHiX4Z3qSTmo8MH-xFCXNJO8FQPEBuGAlBhEMOhMaPFSWbUBCikNUq4NJTTraLApjAfFoHCnoaimC5yYVIij5CTwiyhSCyCw_DwEgXCVj9FfpAM2rPLIMZfFgRQsMDO407TAD_gQzJhVhbRIZAfwKcC5ocSwVFbV-Cwr_8ssh9gIq1PnvAAABAAAAAAAAAAABAP8BAAAAAAAAAQAAAAABAAAAAAEAAAAAAACnWgAAAAECAAAAAKMAXgABAAAAAAAAAgAa5gABAAAAAQCdYwAAAAACAAAAAAEAAAAAAf8BAAAAAQABAQAAAQEAAAAAAAAAAACUbAAAAAAAAZJuAAH_AQAAkm-RbwABAAAAAAAAAQEAAAAAAQAAAAAAAAAAAQAAAAABAAABAAAAAAEAAAAAiXcAAQABh3kAAAAAAAH_AQAAAAAAAAEAAQAAAACEfdsmAAAAAQ",
15
15
  3,32999,
16
16
  "lIZ_NRU0zrJ-KhNa6DV79Fl84mAcRy5Ra54FEbOQbwDl7RwkQS0WIELTXCtwAx1jrKtUAEF2R-4RsvwGDgD1ACAJ-S8F-xEK9-ctP88Abu8B9latCvJR-9ks9eAd5G3mTCEXGTgTAklJTHMRgwcHCQEBAwENxAD7BHGvigKY_BwhCURv-sHrt3mBfwEAgIABf4MAAAAAAQEAe4kAAAAAAQABAAB1i3UAjgACAAAAAABwkAEAAQABAG2UbACWAGqXAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAABomAAAAQD_aJn_AQAAAAAAAAABAAABZAGcAAEAAAAAAAABAGIAn2GfAQAAAQAAAAABAQBdpAA"
@@ -39,8 +39,8 @@
39
39
  1,8,1,13,1,-11,1,-36,1,54,1,17,1,11,1,13,1,-4,1,-3,-2,1,-6,1,22,1,-9,1,13,1,-5,1,7,-2,-2,1,-2,1,4,-1,1,-2,1,66,-2,1,11,-3,1,14,-1,2,1,2,9,2,2,-2,1,8,1,-5,-2,1,5,1,-4,1,5,-3,1,10,1,17,-1,1,4,2,2,
40
40
  -1,4,1,2,3,
41
41
  "MhH-C_0AAgYbHu8NBAAHAAcLCwnuDAgBABD-Ae8MWLf-_iT-9An4-QEJ9_kp7An-_A4W6hP3GwUGDgFuBAMY_uIXAiP7iHP7JND8Lv3-BwEUAAXs_ggYFv0jwwofCQQIEfz5_AELCPIW-STtBgENHAACVgMDAwYCAAUC-wsHAw7xArNqmwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAf8AAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAP8BAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAPkH",
42
- 3,1,-3,1,7,-1,1,-2,-2,1,4,-2,1,6,1,4,2,1,-1,3,1,2,9,"t10LChELBwEOdikAAAAAAAAA",3,-5,2,4,"bP0M-gMeBgOOl5EKd7yfAAAAAAAAALkRJ-TdIA",0,464,"$jis0208tail",0,2,10,-20522,"4gEi-v8AAAA",0,2068,10,-56723,
43
- 10,-26,"4gEi-i7kChP_AAAAM-8AAQ","$jis0208tail"
42
+ 3,1,-3,1,7,-1,1,-2,-2,1,4,-2,1,6,1,4,2,1,-1,3,1,2,9,"t10LChELBwEOdikAAAAAAAAA",3,-5,2,4,"bP0M-gMeBgOOl5EKd7yfAAAAAAAAALkRJ-TdIA",0,464,"$j",0,2,10,-20522,"4gEi-v8AAAA",0,2068,10,-56723,
43
+ 10,-26,"4gEi-i7kChP_AAAAM-8AAQ","$j"
44
44
  ],
45
45
  "jis0212": [
46
46
  0,108,"2O7wIAPRK_6DJQACAP4CAP4CAP0EAA",0,8,1,-741,1,4,1,24,0,38,"uu_-BHOBcQAAAAAh3yE",0,359,1,-7569,3,1,1,31,0,1,1,-31,0,1,-1,1,28,0,1,1,-29,0,4,4,28,"ysU79Qr95B0DAAAAAAAAAA",0,35,11,51,2,1,0,35,
@@ -255,6 +255,7 @@
255
255
  0,96,"jTDyPYHPZsuW6DaOAK_MYbVV1Cc3hQPtxUtKV3vkuxWVbAKIJVtbpKg5G_YAYBudaZjpT-o5ZSoE-Ts1a64cjtcxYm6cmFG_wfn_U3Wf0xu-7iPUmOmX1uo8Q9XtOHkACQEHAQELY2oBEwAC1AcEAgMCAwj7AgYDAwkTCgLRAQUZCbwNAQEAAQAFAAMDAgABAgMCBQEHBAkGCLUDCAEBAAEFAQMDAgEDCwMCAwQCxiHoDyrAAgUKAAECBQA",
256
256
  0,96,"O3jA12FpgHMtTDt5TLAodifVF3BpNYprtGJ0H_7cxeYXTyeXYzCavakroFIokJoA_OmpTq4RLcHYQw10PC11jQ4JHAjxN8yXMs0Hvc5hHhwCjP6YLIeyHwDs_1ixvXIOEQm1AQMBAAEACQEICgIdwgAGABkAABQF3B_QEQsTzwIW9AAEAwsZBQ-xFQwFEdorErQAAhoTHsgFBQXfCCEN4QAA6QwNEgPOAQQBAAMCAwMAAQQACwAAAQcBBQs"
257
257
  ],
258
+ "iso-2022-jp-katakana":[12290,9,0,-13,249,-10,-82,1,1,1,1,57,1,1,-37,56,-91,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,0,0,0,0,0,2,2,2,2,2,0,0,0,0,1,1,1,0,0,0,0,0,1,3,-89,0],
258
259
  "gb18030-ranges": [0,128,36,37,2,4,7,9,5,6,31,32,8,10,6,9,1,3,4,6,3,4,1,3,1,2,4,5,17,18,7,8,15,16,24,25,3,4,4,5,29,30,98,99,1,2,1,2,1,2,1,2,1,2,1,2,1,2,28,29,87,88,15,16,101,102,1,4,13,14,183,200,1,8,7,24,1,8,55,56,14,78,1,2,7102,7103,2,6,1,3,2,4,7,9,9,10,1,3,1,2,5,6,112,113,86,87,1,2,3,4,12,13,10,11,62,74,4,14,22,26,2,6,110,111,6,7,1,2,3,4,4,5,2,6,2,3,1,2,1,6,2,3,5,9,5,6,10,11,3,4,5,6,13,15,2,6,6,8,37,38,3,4,11,12,25,26,82,83,333,343,10,50,100,176,4,40,13,28,3,6,10,12,16,18,8,10,8,10,3,4,2,4,18,22,31,33,2,3,54,55,1,2,2110,2111,2,3,3,4,2,4,10,11,15,16,2,3,3,4,4,5,2,4,3,4,14,15,293,305,4,8,1,20,5,7,2,11,20,21,2,85,7,11,2,88,5,8,6,43,246,256,7,8,113,114,234,236,12,15,2,3,34,35,9,10,2,4,2,3,113,114,43,44,298,299,111,112,11,12,765,766,85,86,96,98,14,15,147,148,218,219,287,288,113,114,885,886,264,265,471,472,116,117,4,5,43,44,248,249,373,374,20,21,193,194,5,6,82,83,16,17,441,442,50,51,2,3,4,6,1,3,20,21,3,4,22,24,703,704,39,44,111,118,148,149,81,20983,14426,18374,1,92,1,31,13,46,1,4,5,6,7,8,4,6,4,6,8,9,7,8,16,18,14,15,4295,4296,76,77,27,28,81,82,9,10,26,30,1,2,1,3,3,4,6,9,1,3,2,5,1030,1032,1,19,4,14,1,5,1,15,1,5,149,243,129,135,149606,26],
259
260
  "gb18030": [
260
261
  1,19970,3,1,1,8,-2,1,4,3,7,-1,-2,-2,2,4,-1,-1,-1,-1,1,4,3,3,-1,-1,-3,1,6,-3,-1,2,2,4,6,2,1,6,1,-2,10,1,7,1,-1,-2,1,5,2,5,-1,3,2,1,4,1,6,3,4,-2,4,1,3,2,1,9,-3,2,2,-1,3,7,-3,-1,2,3,-1,3,3,-1,-2,3,3,
@@ -688,6 +688,7 @@ const preencoders = {
688
688
  const t = p % 188
689
689
  return ((l + (l < 0x1f ? 0x81 : 0xc1)) << 8) | ((t < 0x3f ? 0x40 : 0x41) + t)
690
690
  },
691
+ 'iso-2022-jp': (p) => ((((p / 94) | 0) + 0x21) << 8) | ((p % 94) + 0x21),
691
692
  'euc-jp': (p) => ((((p / 94) | 0) + 0xa1) << 8) | ((p % 94) + 0xa1),
692
693
  'euc-kr': (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190) + 0x41),
693
694
  gb18030: (p) => ((((p / 190) | 0) + 0x81) << 8) | ((p % 190 < 0x3f ? 0x40 : 0x41) + (p % 190)),
@@ -697,11 +698,13 @@ preencoders.gbk = preencoders.gb18030
697
698
 
698
699
  // We accept that encoders use non-trivial amount of mem, for perf
699
700
  // most are 128 KiB mem, big5 is 380 KiB, lazy-loaded at first use
700
- function getMap(id, size) {
701
+ function getMap(id, size, ascii) {
701
702
  const cached = maps.get(id)
702
703
  if (cached) return cached
703
704
  let tname = id
704
705
  const sjis = id === 'shift_jis'
706
+ const iso2022jp = id === 'iso-2022-jp'
707
+ if (iso2022jp) tname = 'jis0208'
705
708
  if (id === 'gbk') tname = 'gb18030'
706
709
  if (id === 'euc-jp' || sjis) tname = 'jis0208'
707
710
  const table = getTable(tname)
@@ -738,7 +741,7 @@ function getMap(id, size) {
738
741
  }
739
742
  }
740
743
 
741
- for (let i = 0; i < 0x80; i++) map[i] = i
744
+ if (ascii) for (let i = 0; i < 0x80; i++) map[i] = i
742
745
  if (sjis || id === 'euc-jp') {
743
746
  if (sjis) map[0x80] = 0x80
744
747
  const d = sjis ? 0xfe_c0 : 0x70_c0
@@ -757,32 +760,38 @@ function getMap(id, size) {
757
760
  return map
758
761
  }
759
762
 
760
- const encoders = new Set(['big5', 'euc-kr', 'euc-jp', 'shift_jis', 'gbk', 'gb18030'])
761
763
  const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
762
- let gb18030r
764
+ let gb18030r, katakana
763
765
 
764
766
  export function multibyteEncoder(enc, onError) {
765
- if (!encoders.has(enc)) throw new RangeError('Unsupported encoding')
767
+ if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
766
768
  const size = enc === 'big5' ? 0x2_f8_a7 : 0x1_00_00 // for big5, max codepoint in table + 1
767
- const width = enc === 'gb18030' ? 4 : 2
768
- const map = getMap(enc, size)
769
- if (enc === 'gb18030' && !gb18030r) gb18030r = getTable('gb18030-ranges')
770
-
769
+ const iso2022jp = enc === 'iso-2022-jp'
770
+ const gb18030 = enc === 'gb18030'
771
+ const ascii = isAsciiSuperset(enc)
772
+ const width = iso2022jp ? 5 : gb18030 ? 4 : 2
773
+ const tailsize = iso2022jp ? 3 : 0
774
+ const map = getMap(enc, size, ascii)
775
+ if (gb18030 && !gb18030r) gb18030r = getTable('gb18030-ranges')
776
+ if (iso2022jp && !katakana) katakana = getTable('iso-2022-jp-katakana')
771
777
  return (str) => {
772
778
  if (typeof str !== 'string') throw new TypeError(E_STRING)
773
- if (!NON_LATIN.test(str)) {
779
+ if (ascii && !NON_LATIN.test(str)) {
774
780
  try {
775
781
  return encodeAscii(str, E_STRICT)
776
782
  } catch {}
777
783
  }
778
784
 
779
785
  const length = str.length
780
- const u8 = new Uint8Array(length * width)
786
+ const u8 = new Uint8Array(length * width + tailsize)
781
787
  let i = 0
782
- while (i < length) {
783
- const x = str.charCodeAt(i)
784
- if (x >= 128) break
785
- u8[i++] = x
788
+
789
+ if (ascii) {
790
+ while (i < length) {
791
+ const x = str.charCodeAt(i)
792
+ if (x >= 128) break
793
+ u8[i++] = x
794
+ }
786
795
  }
787
796
 
788
797
  // eslint-disable-next-line unicorn/consistent-function-scoping
@@ -793,7 +802,69 @@ export function multibyteEncoder(enc, onError) {
793
802
 
794
803
  if (!map || map.length < size) /* c8 ignore next */ throw new Error('Unreachable') // Important for perf
795
804
 
796
- if (enc === 'gb18030') {
805
+ if (iso2022jp) {
806
+ let state = 0 // 0 = ASCII, 1 = Roman, 2 = jis0208
807
+ const restore = () => {
808
+ state = 0
809
+ u8[i++] = 0x1b
810
+ u8[i++] = 0x28
811
+ u8[i++] = 0x42
812
+ }
813
+
814
+ for (let j = 0; j < length; j++) {
815
+ let x = str.charCodeAt(j)
816
+ if (x >= 0xd8_00 && x < 0xe0_00) {
817
+ if (state === 2) restore()
818
+ if (x >= 0xdc_00 || j + 1 === length) {
819
+ i += err(x) // lone
820
+ } else {
821
+ const x1 = str.charCodeAt(j + 1)
822
+ if (x1 < 0xdc_00 || x1 >= 0xe0_00) {
823
+ i += err(x) // lone
824
+ } else {
825
+ j++ // consume x1
826
+ i += err(0x1_00_00 + ((x1 & 0x3_ff) | ((x & 0x3_ff) << 10)))
827
+ }
828
+ }
829
+ } else if (x < 0x80) {
830
+ if (state === 2 || (state === 1 && (x === 0x5c || x === 0x7e))) restore()
831
+ if (x === 0xe || x === 0xf || x === 0x1b) {
832
+ i += err(0xff_fd) // 12.2.2. step 3: This returns U+FFFD rather than codePoint to prevent attacks
833
+ } else {
834
+ u8[i++] = x
835
+ }
836
+ } else if (x === 0xa5 || x === 0x20_3e) {
837
+ if (state !== 1) {
838
+ state = 1
839
+ u8[i++] = 0x1b
840
+ u8[i++] = 0x28
841
+ u8[i++] = 0x4a
842
+ }
843
+
844
+ u8[i++] = x === 0xa5 ? 0x5c : 0x7e
845
+ } else {
846
+ if (x === 0x22_12) x = 0xff_0d
847
+ if (x >= 0xff_61 && x <= 0xff_9f) x = katakana[x - 0xff_61]
848
+ const e = map[x]
849
+ if (e) {
850
+ if (state !== 2) {
851
+ state = 2
852
+ u8[i++] = 0x1b
853
+ u8[i++] = 0x24
854
+ u8[i++] = 0x42
855
+ }
856
+
857
+ u8[i++] = e >> 8
858
+ u8[i++] = e & 0xff
859
+ } else {
860
+ if (state === 2) restore()
861
+ i += err(x)
862
+ }
863
+ }
864
+ }
865
+
866
+ if (state) restore()
867
+ } else if (gb18030) {
797
868
  // Deduping this branch hurts other encoders perf
798
869
  const encode = (cp) => {
799
870
  let a = 0, b = 0 // prettier-ignore
@@ -104,6 +104,9 @@ export function getTable(id) {
104
104
  let a = 0, b = 0 // prettier-ignore
105
105
  const idx = indices[id]
106
106
  while (idx.length > 0) res.push([(a += idx.shift()), (b += idx.shift())]) // destroying, we remove it later anyway
107
+ } else if (id.endsWith('-katakana')) {
108
+ let a = -1
109
+ res = new Uint16Array(indices[id].map((x) => (a += x + 1)))
107
110
  } else if (id === 'big5') {
108
111
  if (!Object.hasOwn(sizes, id)) throw new Error('Unknown encoding')
109
112
  res = new Uint32Array(sizes[id]) // array of strings or undefined
@@ -0,0 +1,31 @@
1
+ import { decodeAscii, encodeLatin1 } from './latin1.js'
2
+ import { decode2string } from './_utils.js'
3
+
4
+ const ERR = 'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'
5
+ const percentMap = new Map()
6
+ let hex, base
7
+
8
+ export function percentEncoder(set, spaceAsPlus = false) {
9
+ if (typeof set !== 'string' || /[^\x20-\x7E]/.test(set)) throw new TypeError(ERR)
10
+ if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')
11
+ const id = set + +spaceAsPlus
12
+ const cached = percentMap.get(id)
13
+ if (cached) return cached
14
+
15
+ const n = encodeLatin1(set).sort() // string checked above to be ascii
16
+ if (decodeAscii(n) !== set || new Set(n).size !== n.length) throw new TypeError(ERR)
17
+
18
+ if (!base) {
19
+ hex = Array.from({ length: 256 }, (_, i) => `%${i.toString(16).padStart(2, '0').toUpperCase()}`)
20
+ base = hex.map((h, i) => (i < 0x20 || i > 0x7e ? h : String.fromCharCode(i)))
21
+ }
22
+
23
+ const map = base.slice() // copy
24
+ for (const c of n) map[c] = hex[c]
25
+ if (spaceAsPlus) map[0x20] = '+' // overrides whatever percentEncodeSet thinks about it
26
+
27
+ // Input is not typechecked, for internal use only
28
+ const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, map)
29
+ percentMap.set(id, percentEncode)
30
+ return percentEncode
31
+ }