@exodus/bytes 1.5.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +71 -11
- package/bech32.js +4 -0
- package/encoding-lite.js +2 -0
- package/encoding.js +2 -0
- package/fallback/_utils.js +8 -8
- package/fallback/encoding.js +83 -1
- package/fallback/latin1.js +16 -1
- package/fallback/multi-byte.js +124 -81
- package/fallback/multi-byte.table.js +10 -3
- package/fallback/single-byte.js +25 -5
- package/fallback/utf16.js +3 -3
- package/multi-byte.node.js +1 -3
- package/package.json +3 -1
- package/single-byte.js +52 -4
- package/single-byte.node.js +50 -4
- package/utf8.js +1 -1
package/README.md
CHANGED
|
@@ -31,18 +31,33 @@ See [Performance](./Performance.md) for more info
|
|
|
31
31
|
|
|
32
32
|
```js
|
|
33
33
|
import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
|
|
34
|
+
import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
|
|
34
35
|
```
|
|
35
36
|
|
|
36
|
-
Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not)
|
|
37
|
-
|
|
37
|
+
Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not).\
|
|
38
|
+
Also [much faster](#fast) than all of those.
|
|
38
39
|
|
|
39
|
-
|
|
40
|
+
> [!TIP]
|
|
41
|
+
> See also the [lite version](#lite-version) to get this down to 9 KiB gzipped.
|
|
40
42
|
|
|
41
|
-
|
|
43
|
+
Spec compliant, passing WPT and covered with extra tests.\
|
|
44
|
+
Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).\
|
|
45
|
+
This includes all three major browser engines getting UTF-8 wrong.\
|
|
46
|
+
See [WPT pull request](https://github.com/web-platform-tests/wpt/pull/56892).
|
|
42
47
|
|
|
43
|
-
|
|
48
|
+
It works correctly even in environments whose native implementations are broken (currently, that is all of them).\
|
|
44
49
|
Runs (and passes WPT) on Node.js built without ICU.
|
|
45
50
|
|
|
51
|
+
> [!NOTE]
|
|
52
|
+
> [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
|
|
53
|
+
>
|
|
54
|
+
> The JS multi-byte version is as fast as native impl in Node.js and browsers, but (unlike them) returns correct results.
|
|
55
|
+
>
|
|
56
|
+
> For encodings where native version is known to be fast and correct, it is automatically used.\
|
|
57
|
+
> Some single-byte encodings are faster than native in all three major browser engines.
|
|
58
|
+
|
|
59
|
+
See [analysis table](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit) for more info.
|
|
60
|
+
|
|
46
61
|
### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
|
|
47
62
|
|
|
48
63
|
_These are only provided as a compatibility layer, prefer hardened APIs instead in new code._
|
|
@@ -63,6 +78,7 @@ _These are only provided as a compatibility layer, prefer hardened APIs instead
|
|
|
63
78
|
If you don't need support for legacy multi-byte encodings, you can use the lite import:
|
|
64
79
|
```js
|
|
65
80
|
import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding-lite.js'
|
|
81
|
+
import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding-lite.js' // Requires Streams
|
|
66
82
|
```
|
|
67
83
|
|
|
68
84
|
This reduces the bundle size 10x:\
|
|
@@ -107,8 +123,8 @@ import { utf16fromStringLoose, utf16toStringLoose } from '@exodus/bytes/utf16.js
|
|
|
107
123
|
### `@exodus/bytes/single-byte.js`
|
|
108
124
|
|
|
109
125
|
```js
|
|
110
|
-
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
|
|
111
|
-
import { windows1252toString } from '@exodus/bytes/single-byte.js'
|
|
126
|
+
import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
|
|
127
|
+
import { windows1252toString, windows1252fromString } from '@exodus/bytes/single-byte.js'
|
|
112
128
|
```
|
|
113
129
|
|
|
114
130
|
Decode the legacy single-byte encodings according to the [Encoding standard](https://encoding.spec.whatwg.org/)
|
|
@@ -123,10 +139,19 @@ Supports all single-byte encodings listed in the standard:
|
|
|
123
139
|
|
|
124
140
|
##### `createSinglebyteDecoder(encoding, loose = false)`
|
|
125
141
|
|
|
126
|
-
Create a decoder for a supported one-byte `encoding`, given
|
|
142
|
+
Create a decoder for a supported one-byte `encoding`, given its lowercased name `encoding`.
|
|
127
143
|
|
|
128
144
|
Returns a function `decode(arr)` that decodes bytes to a string.
|
|
129
145
|
|
|
146
|
+
##### `createSinglebyteEncoder(encoding, { mode = 'fatal' })`
|
|
147
|
+
|
|
148
|
+
Create an encoder for a supported one-byte `encoding`, given its lowercased name `encoding`.
|
|
149
|
+
|
|
150
|
+
Returns a function `encode(string)` that encodes a string to bytes.
|
|
151
|
+
|
|
152
|
+
In `'fatal'` mode (default), it throws on strings that are not well-formed or on any code points that could
|
|
153
|
+
not be encoded in the target encoding.
|
|
154
|
+
|
|
130
155
|
##### `windows1252toString(arr)`
|
|
131
156
|
|
|
132
157
|
Decode `windows-1252` bytes to a string.
|
|
@@ -140,6 +165,19 @@ Same as:
|
|
|
140
165
|
const windows1252toString = createSinglebyteDecoder('windows-1252')
|
|
141
166
|
```
|
|
142
167
|
|
|
168
|
+
##### `windows1252fromString(string)`
|
|
169
|
+
|
|
170
|
+
Encode a string to `windows-1252` bytes.
|
|
171
|
+
|
|
172
|
+
Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
|
|
173
|
+
|
|
174
|
+
Throws on strings that are not well-formed or that contain code points which cannot be encoded in `windows-1252`.
|
|
175
|
+
|
|
176
|
+
Same as:
|
|
177
|
+
```js
|
|
178
|
+
const windows1252fromString = createSinglebyteEncoder('windows-1252', { mode: 'fatal' })
|
|
179
|
+
```
|
|
180
|
+
|
|
143
181
|
### `@exodus/bytes/multi-byte.js`
|
|
144
182
|
|
|
145
183
|
```js
|
|
@@ -157,7 +195,7 @@ Supports all legacy multi-byte encodings listed in the standard:
|
|
|
157
195
|
|
|
158
196
|
##### `createMultibyteDecoder(encoding, loose = false)`
|
|
159
197
|
|
|
160
|
-
Create a decoder for a supported legacy multi-byte `encoding`, given
|
|
198
|
+
Create a decoder for a supported legacy multi-byte `encoding`, given its lowercased name `encoding`.
|
|
161
199
|
|
|
162
200
|
Returns a function `decode(arr, stream = false)` that decodes bytes to a string.
|
|
163
201
|
|
|
@@ -270,6 +308,7 @@ On non-Node.js, requires peer dependency [@exodus/crypto](https://www.npmjs.com/
|
|
|
270
308
|
|
|
271
309
|
```js
|
|
272
310
|
import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
|
|
311
|
+
import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
|
|
273
312
|
|
|
274
313
|
// Hooks for standards
|
|
275
314
|
import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding.js'
|
|
@@ -277,7 +316,9 @@ import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from
|
|
|
277
316
|
|
|
278
317
|
Implements the [Encoding standard](https://encoding.spec.whatwg.org/):
|
|
279
318
|
[TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder),
|
|
280
|
-
[TextEncoder](https://encoding.spec.whatwg.org/#interface-
|
|
319
|
+
[TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder),
|
|
320
|
+
[TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream),
|
|
321
|
+
[TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream),
|
|
281
322
|
some [hooks](https://encoding.spec.whatwg.org/#specification-hooks) (see below).
|
|
282
323
|
|
|
283
324
|
#### `new TextDecoder(label = 'utf-8', { fatal = false, ignoreBOM = false })`
|
|
@@ -286,7 +327,21 @@ some [hooks](https://encoding.spec.whatwg.org/#specification-hooks) (see below).
|
|
|
286
327
|
|
|
287
328
|
#### `new TextEncoder()`
|
|
288
329
|
|
|
289
|
-
[TextEncoder](https://encoding.spec.whatwg.org/#interface-
|
|
330
|
+
[TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder) implementation/polyfill.
|
|
331
|
+
|
|
332
|
+
#### `new TextDecoderStream(label = 'utf-8', { fatal = false, ignoreBOM = false })`
|
|
333
|
+
|
|
334
|
+
[TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream) implementation/polyfill.
|
|
335
|
+
|
|
336
|
+
Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
|
|
337
|
+
[polyfilled](https://npmjs.com/package/web-streams-polyfill).
|
|
338
|
+
|
|
339
|
+
#### `new TextEncoderStream()`
|
|
340
|
+
|
|
341
|
+
[TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream) implementation/polyfill.
|
|
342
|
+
|
|
343
|
+
Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
|
|
344
|
+
[polyfilled](https://npmjs.com/package/web-streams-polyfill).
|
|
290
345
|
|
|
291
346
|
#### `labelToName(label)`
|
|
292
347
|
|
|
@@ -356,6 +411,7 @@ new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
|
|
|
356
411
|
|
|
357
412
|
```js
|
|
358
413
|
import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding-lite.js'
|
|
414
|
+
import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding-lite.js' // Requires Streams
|
|
359
415
|
|
|
360
416
|
// Hooks for standards
|
|
361
417
|
import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding-lite.js'
|
|
@@ -379,7 +435,9 @@ To avoid inconsistencies, the exported classes and methods are exactly the same
|
|
|
379
435
|
> lite = require('@exodus/bytes/encoding-lite.js')
|
|
380
436
|
[Module: null prototype] {
|
|
381
437
|
TextDecoder: [class TextDecoder],
|
|
438
|
+
TextDecoderStream: [class TextDecoderStream],
|
|
382
439
|
TextEncoder: [class TextEncoder],
|
|
440
|
+
TextEncoderStream: [class TextEncoderStream],
|
|
383
441
|
getBOMEncoding: [Function: getBOMEncoding],
|
|
384
442
|
labelToName: [Function: labelToName],
|
|
385
443
|
legacyHookDecode: [Function: legacyHookDecode],
|
|
@@ -392,7 +450,9 @@ Error: Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encod
|
|
|
392
450
|
> full = require('@exodus/bytes/encoding.js')
|
|
393
451
|
[Module: null prototype] {
|
|
394
452
|
TextDecoder: [class TextDecoder],
|
|
453
|
+
TextDecoderStream: [class TextDecoderStream],
|
|
395
454
|
TextEncoder: [class TextEncoder],
|
|
455
|
+
TextEncoderStream: [class TextEncoderStream],
|
|
396
456
|
getBOMEncoding: [Function: getBOMEncoding],
|
|
397
457
|
labelToName: [Function: labelToName],
|
|
398
458
|
legacyHookDecode: [Function: legacyHookDecode],
|
package/bech32.js
CHANGED
|
@@ -179,6 +179,9 @@ function assertDecodeArgs(str, limit) {
|
|
|
179
179
|
if (typeof limit !== 'number' || str.length < 8 || !(str.length <= limit)) throw new Error(E_SIZE)
|
|
180
180
|
}
|
|
181
181
|
|
|
182
|
+
// this is instant on 8-bit strings
|
|
183
|
+
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
|
|
184
|
+
|
|
182
185
|
function fromBech32enc(str, limit, encoding) {
|
|
183
186
|
assertDecodeArgs(str, limit)
|
|
184
187
|
const lower = str.toLowerCase()
|
|
@@ -195,6 +198,7 @@ function fromBech32enc(str, limit, encoding) {
|
|
|
195
198
|
if (wordsLength < 0) throw new Error(E_SIZE)
|
|
196
199
|
const bytesLength = (wordsLength * 5) >> 3
|
|
197
200
|
const slice = str.slice(split + 1)
|
|
201
|
+
if (!nativeEncoder && NON_LATIN.test(slice)) throw new SyntaxError(E_CHARACTER) // otherwise can't use encodeLatin1
|
|
198
202
|
const c = nativeEncoder ? encodeAscii(slice, E_CHARACTER) : encodeLatin1(slice) // suboptimal, but only affects non-Hermes barebones
|
|
199
203
|
const bytes = new Uint8Array(bytesLength)
|
|
200
204
|
|
package/encoding-lite.js
CHANGED
package/encoding.js
CHANGED
package/fallback/_utils.js
CHANGED
|
@@ -5,14 +5,8 @@ export const isHermes = Boolean(globalThis.HermesInternal)
|
|
|
5
5
|
export const isDeno = Boolean(globalThis.Deno)
|
|
6
6
|
export const isLE = new Uint8Array(Uint16Array.of(258).buffer)[0] === 2
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
if (haveNativeBuffer) return true // we consider Node.js TextDecoder/TextEncoder native
|
|
11
|
-
const s = `${x}`
|
|
12
|
-
// See https://github.com/facebook/hermes/pull/1855#issuecomment-3659386410
|
|
13
|
-
return s.includes('[native code]') || s.includes(`[bytecode]`) // Static Hermes has [bytecode] for contrib, which includes TextEncoder/TextDecoder
|
|
14
|
-
}
|
|
15
|
-
|
|
8
|
+
// We consider Node.js TextDecoder/TextEncoder native
|
|
9
|
+
let isNative = (x) => x && (haveNativeBuffer || `${x}`.includes('[native code]'))
|
|
16
10
|
if (!haveNativeBuffer && isNative(() => {})) isNative = () => false // e.g. XS, we don't want false positives
|
|
17
11
|
|
|
18
12
|
export const nativeEncoder = isNative(TextEncoder) ? new TextEncoder() : null
|
|
@@ -128,3 +122,9 @@ export function decode2string(arr, start, end, m) {
|
|
|
128
122
|
export function assert(condition, msg) {
|
|
129
123
|
if (!condition) throw new Error(msg)
|
|
130
124
|
}
|
|
125
|
+
|
|
126
|
+
// On arrays in heap (<= 64) it's cheaper to copy into a pooled buffer than lazy-create the ArrayBuffer storage
|
|
127
|
+
export const toBuf = (x) =>
|
|
128
|
+
x.byteLength <= 64 && x.BYTES_PER_ELEMENT === 1
|
|
129
|
+
? Buffer.from(x)
|
|
130
|
+
: Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
package/fallback/encoding.js
CHANGED
|
@@ -68,7 +68,16 @@ function isAnyUint8Array(x) {
|
|
|
68
68
|
const fromSource = (x) => {
|
|
69
69
|
if (x instanceof Uint8Array) return x
|
|
70
70
|
if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
|
|
71
|
-
if (isAnyArrayBuffer(x))
|
|
71
|
+
if (isAnyArrayBuffer(x)) {
|
|
72
|
+
if ('detached' in x) return x.detached === true ? new Uint8Array() : new Uint8Array(x)
|
|
73
|
+
// Old engines without .detached, try-catch
|
|
74
|
+
try {
|
|
75
|
+
return new Uint8Array(x)
|
|
76
|
+
} catch {
|
|
77
|
+
return new Uint8Array()
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
72
81
|
throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
|
|
73
82
|
}
|
|
74
83
|
|
|
@@ -259,6 +268,79 @@ export class TextEncoder {
|
|
|
259
268
|
}
|
|
260
269
|
}
|
|
261
270
|
|
|
271
|
+
const E_NO_STREAMS = 'TransformStream global not present in the environment'
|
|
272
|
+
|
|
273
|
+
// https://encoding.spec.whatwg.org/#interface-textdecoderstream
|
|
274
|
+
export class TextDecoderStream {
|
|
275
|
+
constructor(encoding = 'utf-8', options = {}) {
|
|
276
|
+
if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
|
|
277
|
+
const decoder = new TextDecoder(encoding, options)
|
|
278
|
+
const transform = new TransformStream({
|
|
279
|
+
transform: (chunk, controller) => {
|
|
280
|
+
const value = decoder.decode(fromSource(chunk), { stream: true })
|
|
281
|
+
if (value) controller.enqueue(value)
|
|
282
|
+
},
|
|
283
|
+
flush: (controller) => {
|
|
284
|
+
// https://streams.spec.whatwg.org/#dom-transformer-flush
|
|
285
|
+
const value = decoder.decode()
|
|
286
|
+
if (value) controller.enqueue(value)
|
|
287
|
+
// No need to call .terminate() (Node.js is wrong)
|
|
288
|
+
},
|
|
289
|
+
})
|
|
290
|
+
|
|
291
|
+
define(this, 'encoding', decoder.encoding)
|
|
292
|
+
define(this, 'fatal', decoder.fatal)
|
|
293
|
+
define(this, 'ignoreBOM', decoder.ignoreBOM)
|
|
294
|
+
define(this, 'readable', transform.readable)
|
|
295
|
+
define(this, 'writable', transform.writable)
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
get [Symbol.toStringTag]() {
|
|
299
|
+
return 'TextDecoderStream'
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
// https://encoding.spec.whatwg.org/#interface-textencoderstream
|
|
304
|
+
// Only UTF-8 per spec
|
|
305
|
+
export class TextEncoderStream {
|
|
306
|
+
constructor() {
|
|
307
|
+
if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
|
|
308
|
+
let lead
|
|
309
|
+
const transform = new TransformStream({
|
|
310
|
+
// https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
|
|
311
|
+
// Not identical in code, but reuses loose mode to have identical behavior
|
|
312
|
+
transform: (chunk, controller) => {
|
|
313
|
+
let s = String(chunk) // DOMString, might contain unpaired surrogates
|
|
314
|
+
if (s.length === 0) return
|
|
315
|
+
if (lead) {
|
|
316
|
+
s = lead + s
|
|
317
|
+
lead = null
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
const last = s.charCodeAt(s.length - 1) // Can't come from previous lead due to length check
|
|
321
|
+
if ((last & 0xfc_00) === 0xd8_00) {
|
|
322
|
+
lead = s[s.length - 1]
|
|
323
|
+
s = s.slice(0, -1)
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
if (s) controller.enqueue(utf8fromStringLoose(s))
|
|
327
|
+
},
|
|
328
|
+
// https://encoding.spec.whatwg.org/#encode-and-flush
|
|
329
|
+
flush: (controller) => {
|
|
330
|
+
if (lead) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
|
|
331
|
+
},
|
|
332
|
+
})
|
|
333
|
+
|
|
334
|
+
define(this, 'encoding', 'utf-8')
|
|
335
|
+
define(this, 'readable', transform.readable)
|
|
336
|
+
define(this, 'writable', transform.writable)
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
get [Symbol.toStringTag]() {
|
|
340
|
+
return 'TextEncoderStream'
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
|
|
262
344
|
// Warning: unlike whatwg-encoding, returns lowercased labels
|
|
263
345
|
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
|
|
264
346
|
export function getBOMEncoding(input) {
|
package/fallback/latin1.js
CHANGED
|
@@ -5,6 +5,7 @@ import {
|
|
|
5
5
|
nativeBuffer,
|
|
6
6
|
isHermes,
|
|
7
7
|
isDeno,
|
|
8
|
+
isLE,
|
|
8
9
|
} from './_utils.js'
|
|
9
10
|
|
|
10
11
|
// See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
|
|
@@ -60,6 +61,16 @@ export function decodeLatin1(arr, start = 0, stop = arr.length) {
|
|
|
60
61
|
return String.fromCharCode.apply(String, sliced)
|
|
61
62
|
}
|
|
62
63
|
|
|
64
|
+
// Unchecked for well-formedness, raw. Expects Uint16Array input
|
|
65
|
+
export const decodeUCS2 =
|
|
66
|
+
nativeBuffer && isLE && !isDeno
|
|
67
|
+
? (u16, stop = u16.length) => {
|
|
68
|
+
// TODO: fast path for BE, perhaps faster path for Deno. Note that decoder replaces, this function doesn't
|
|
69
|
+
if (stop > 32) return nativeBuffer.from(u16.buffer, u16.byteOffset, stop * 2).ucs2Slice() // from 64 bytes, below are in heap
|
|
70
|
+
return decodeLatin1(u16, 0, stop)
|
|
71
|
+
}
|
|
72
|
+
: (u16, stop = u16.length) => decodeLatin1(u16, 0, stop)
|
|
73
|
+
|
|
63
74
|
// Does not check input, uses best available method
|
|
64
75
|
// Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
|
|
65
76
|
export const decodeAscii = nativeBuffer
|
|
@@ -70,7 +81,10 @@ export const decodeAscii = nativeBuffer
|
|
|
70
81
|
: nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
|
|
71
82
|
: nativeDecoderLatin1
|
|
72
83
|
? (a) => nativeDecoderLatin1.decode(a) // On browsers (specifically WebKit), latin1 decoder is faster than utf8
|
|
73
|
-
: (a) =>
|
|
84
|
+
: (a) =>
|
|
85
|
+
decodeLatin1(
|
|
86
|
+
a instanceof Uint8Array ? a : new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
|
|
87
|
+
)
|
|
74
88
|
|
|
75
89
|
/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
|
|
76
90
|
|
|
@@ -95,6 +109,7 @@ export const encodeCharcodes = isHermes
|
|
|
95
109
|
|
|
96
110
|
/* eslint-enable @exodus/mutable/no-param-reassign-prop-only */
|
|
97
111
|
|
|
112
|
+
// Warning: can be used only on checked strings, converts strings to 8-bit
|
|
98
113
|
export const encodeLatin1 = (str) => encodeCharcodes(str, new Uint8Array(str.length))
|
|
99
114
|
|
|
100
115
|
// Expects nativeEncoder to be present
|
package/fallback/multi-byte.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { asciiPrefix, decodeLatin1 } from './latin1.js'
|
|
1
|
+
import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
|
|
2
2
|
import { getTable } from './multi-byte.table.js'
|
|
3
3
|
|
|
4
4
|
export const E_STRICT = 'Input is not well-formed for this encoding'
|
|
@@ -11,36 +11,48 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
|
|
|
11
11
|
// Common between euc-kr and big5
|
|
12
12
|
function bigDecoder(err, pair) {
|
|
13
13
|
let lead = 0
|
|
14
|
+
let oi = 0
|
|
15
|
+
let o16
|
|
14
16
|
|
|
15
17
|
const decodeLead = (b) => {
|
|
16
|
-
const
|
|
18
|
+
const p = pair(lead, b)
|
|
17
19
|
lead = 0
|
|
18
|
-
if (
|
|
19
|
-
|
|
20
|
+
if (typeof p === 'number') {
|
|
21
|
+
o16[oi++] = p
|
|
22
|
+
} else if (p) {
|
|
23
|
+
// This is still faster than string concatenation. Can we optimize strings though?
|
|
24
|
+
for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
|
|
25
|
+
} else {
|
|
26
|
+
o16[oi++] = err()
|
|
27
|
+
if (b < 128) o16[oi++] = b
|
|
28
|
+
}
|
|
20
29
|
}
|
|
21
30
|
|
|
22
31
|
const decode = (arr, start, end, stream) => {
|
|
23
|
-
let res = ''
|
|
24
32
|
let i = start
|
|
33
|
+
o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
|
|
34
|
+
oi = 0
|
|
25
35
|
|
|
26
|
-
if (lead && i < end)
|
|
36
|
+
if (lead && i < end) decodeLead(arr[i++])
|
|
27
37
|
while (i < end) {
|
|
28
38
|
const b = arr[i++]
|
|
29
39
|
if (b < 128) {
|
|
30
|
-
|
|
40
|
+
o16[oi++] = b
|
|
31
41
|
} else if (b === 0x80 || b === 0xff) {
|
|
32
|
-
|
|
42
|
+
o16[oi++] = err()
|
|
33
43
|
} else {
|
|
34
44
|
lead = b
|
|
35
|
-
if (i < end)
|
|
45
|
+
if (i < end) decodeLead(arr[i++])
|
|
36
46
|
}
|
|
37
47
|
}
|
|
38
48
|
|
|
39
49
|
if (lead && !stream) {
|
|
40
50
|
lead = 0
|
|
41
|
-
|
|
51
|
+
o16[oi++] = err()
|
|
42
52
|
}
|
|
43
53
|
|
|
54
|
+
const res = decodeUCS2(o16, oi)
|
|
55
|
+
o16 = null
|
|
44
56
|
return res
|
|
45
57
|
}
|
|
46
58
|
|
|
@@ -57,7 +69,7 @@ const mappers = {
|
|
|
57
69
|
return bigDecoder(err, (l, b) => {
|
|
58
70
|
if (b < 0x41 || b > 0xfe) return
|
|
59
71
|
const cp = euc[(l - 0x81) * 190 + b - 0x41]
|
|
60
|
-
return cp !== undefined && cp !== REP ?
|
|
72
|
+
return cp !== undefined && cp !== REP ? cp : undefined
|
|
61
73
|
})
|
|
62
74
|
},
|
|
63
75
|
// https://encoding.spec.whatwg.org/#euc-jp-decoder
|
|
@@ -66,55 +78,61 @@ const mappers = {
|
|
|
66
78
|
const jis0212 = getTable('jis0212')
|
|
67
79
|
let j12 = false
|
|
68
80
|
let lead = 0
|
|
81
|
+
let oi = 0
|
|
82
|
+
let o16
|
|
69
83
|
|
|
70
84
|
const decodeLead = (b) => {
|
|
71
85
|
if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
|
|
72
86
|
lead = 0
|
|
73
|
-
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
|
|
87
|
+
o16[oi++] = 0xfe_c0 + b
|
|
88
|
+
} else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
|
|
77
89
|
j12 = true
|
|
78
90
|
lead = b
|
|
79
|
-
|
|
80
|
-
|
|
91
|
+
} else {
|
|
92
|
+
let cp
|
|
93
|
+
if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
|
|
94
|
+
cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
|
|
95
|
+
}
|
|
81
96
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
97
|
+
lead = 0
|
|
98
|
+
j12 = false
|
|
99
|
+
if (cp !== undefined && cp !== REP) {
|
|
100
|
+
o16[oi++] = cp
|
|
101
|
+
} else {
|
|
102
|
+
o16[oi++] = err()
|
|
103
|
+
if (b < 128) o16[oi++] = b
|
|
104
|
+
}
|
|
85
105
|
}
|
|
86
|
-
|
|
87
|
-
lead = 0
|
|
88
|
-
j12 = false
|
|
89
|
-
if (cp !== undefined && cp !== REP) return String.fromCharCode(cp)
|
|
90
|
-
return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
|
|
91
106
|
}
|
|
92
107
|
|
|
93
108
|
const decode = (arr, start, end, stream) => {
|
|
94
|
-
let res = ''
|
|
95
109
|
let i = start
|
|
110
|
+
o16 = new Uint16Array(end - start + (lead ? 1 : 0))
|
|
111
|
+
oi = 0
|
|
96
112
|
|
|
97
|
-
if (lead && i < end)
|
|
98
|
-
if (lead && i < end)
|
|
113
|
+
if (lead && i < end) decodeLead(arr[i++])
|
|
114
|
+
if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
|
|
99
115
|
while (i < end) {
|
|
100
116
|
const b = arr[i++]
|
|
101
117
|
if (b < 128) {
|
|
102
|
-
|
|
118
|
+
o16[oi++] = b
|
|
103
119
|
} else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
|
|
104
|
-
|
|
120
|
+
o16[oi++] = err()
|
|
105
121
|
} else {
|
|
106
122
|
lead = b
|
|
107
|
-
if (i < end)
|
|
108
|
-
if (lead && i < end)
|
|
123
|
+
if (i < end) decodeLead(arr[i++])
|
|
124
|
+
if (lead && i < end) decodeLead(arr[i++]) // could be two leads
|
|
109
125
|
}
|
|
110
126
|
}
|
|
111
127
|
|
|
112
128
|
if (lead && !stream) {
|
|
113
129
|
lead = 0
|
|
114
130
|
j12 = false // can be true only when lead is non-zero
|
|
115
|
-
|
|
131
|
+
o16[oi++] = err()
|
|
116
132
|
}
|
|
117
133
|
|
|
134
|
+
const res = decodeUCS2(o16, oi)
|
|
135
|
+
o16 = null
|
|
118
136
|
return res
|
|
119
137
|
}
|
|
120
138
|
|
|
@@ -238,7 +256,8 @@ const mappers = {
|
|
|
238
256
|
}
|
|
239
257
|
|
|
240
258
|
const decode = (arr, start, end, stream) => {
|
|
241
|
-
|
|
259
|
+
const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
|
|
260
|
+
let oi = 0
|
|
242
261
|
let i = start
|
|
243
262
|
const pushback = [] // local and auto-cleared
|
|
244
263
|
|
|
@@ -246,7 +265,7 @@ const mappers = {
|
|
|
246
265
|
// Same as the full loop, but without EOF handling
|
|
247
266
|
while (i < end || pushback.length > 0) {
|
|
248
267
|
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
|
|
249
|
-
if (c !== undefined)
|
|
268
|
+
if (c !== undefined) o16[oi++] = c // 16-bit
|
|
250
269
|
}
|
|
251
270
|
|
|
252
271
|
// Then, dump EOF. This needs the same loop as the characters can be pushed back
|
|
@@ -254,11 +273,11 @@ const mappers = {
|
|
|
254
273
|
while (i <= end || pushback.length > 0) {
|
|
255
274
|
if (i < end || pushback.length > 0) {
|
|
256
275
|
const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
|
|
257
|
-
if (c !== undefined)
|
|
276
|
+
if (c !== undefined) o16[oi++] = c // 16-bit
|
|
258
277
|
} else {
|
|
259
278
|
const c = eof(pushback)
|
|
260
279
|
if (c === null) break // clean exit
|
|
261
|
-
|
|
280
|
+
o16[oi++] = c
|
|
262
281
|
}
|
|
263
282
|
}
|
|
264
283
|
}
|
|
@@ -272,7 +291,7 @@ const mappers = {
|
|
|
272
291
|
out = false
|
|
273
292
|
}
|
|
274
293
|
|
|
275
|
-
return
|
|
294
|
+
return decodeUCS2(o16, oi)
|
|
276
295
|
}
|
|
277
296
|
|
|
278
297
|
return { decode, isAscii: () => false }
|
|
@@ -281,44 +300,57 @@ const mappers = {
|
|
|
281
300
|
shift_jis: (err) => {
|
|
282
301
|
const jis0208 = getTable('jis0208')
|
|
283
302
|
let lead = 0
|
|
303
|
+
let oi = 0
|
|
304
|
+
let o16
|
|
284
305
|
|
|
285
306
|
const decodeLead = (b) => {
|
|
286
307
|
const l = lead
|
|
287
308
|
lead = 0
|
|
288
309
|
if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
|
|
289
310
|
const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
|
|
290
|
-
if (p >= 8836 && p <= 10_715)
|
|
311
|
+
if (p >= 8836 && p <= 10_715) {
|
|
312
|
+
o16[oi++] = 0xe0_00 - 8836 + p
|
|
313
|
+
return
|
|
314
|
+
}
|
|
315
|
+
|
|
291
316
|
const cp = jis0208[p]
|
|
292
|
-
if (cp !== undefined && cp !== REP)
|
|
317
|
+
if (cp !== undefined && cp !== REP) {
|
|
318
|
+
o16[oi++] = cp
|
|
319
|
+
return
|
|
320
|
+
}
|
|
293
321
|
}
|
|
294
322
|
|
|
295
|
-
|
|
323
|
+
o16[oi++] = err()
|
|
324
|
+
if (b < 128) o16[oi++] = b
|
|
296
325
|
}
|
|
297
326
|
|
|
298
327
|
const decode = (arr, start, end, stream) => {
|
|
299
|
-
|
|
328
|
+
o16 = new Uint16Array(end - start + (lead ? 1 : 0))
|
|
329
|
+
oi = 0
|
|
300
330
|
let i = start
|
|
301
331
|
|
|
302
|
-
if (lead && i < end)
|
|
332
|
+
if (lead && i < end) decodeLead(arr[i++])
|
|
303
333
|
while (i < end) {
|
|
304
334
|
const b = arr[i++]
|
|
305
335
|
if (b <= 0x80) {
|
|
306
|
-
|
|
336
|
+
o16[oi++] = b // 0x80 is allowed
|
|
307
337
|
} else if (b >= 0xa1 && b <= 0xdf) {
|
|
308
|
-
|
|
338
|
+
o16[oi++] = 0xfe_c0 + b
|
|
309
339
|
} else if (b === 0xa0 || b > 0xfc) {
|
|
310
|
-
|
|
340
|
+
o16[oi++] = err()
|
|
311
341
|
} else {
|
|
312
342
|
lead = b
|
|
313
|
-
if (i < end)
|
|
343
|
+
if (i < end) decodeLead(arr[i++])
|
|
314
344
|
}
|
|
315
345
|
}
|
|
316
346
|
|
|
317
347
|
if (lead && !stream) {
|
|
318
348
|
lead = 0
|
|
319
|
-
|
|
349
|
+
o16[oi++] = err()
|
|
320
350
|
}
|
|
321
351
|
|
|
352
|
+
const res = decodeUCS2(o16, oi)
|
|
353
|
+
o16 = null
|
|
322
354
|
return res
|
|
323
355
|
}
|
|
324
356
|
|
|
@@ -349,7 +381,8 @@ const mappers = {
|
|
|
349
381
|
// g3 is 0 or 0x81-0xfe
|
|
350
382
|
|
|
351
383
|
const decode = (arr, start, end, stream) => {
|
|
352
|
-
|
|
384
|
+
const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
|
|
385
|
+
let oi = 0
|
|
353
386
|
let i = start
|
|
354
387
|
const pushback = [] // local and auto-cleared
|
|
355
388
|
|
|
@@ -357,30 +390,38 @@ const mappers = {
|
|
|
357
390
|
// Same as the full loop, but without EOF handling
|
|
358
391
|
while (i < end || pushback.length > 0) {
|
|
359
392
|
const b = pushback.length > 0 ? pushback.pop() : arr[i++]
|
|
360
|
-
if (
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
393
|
+
if (g1) {
|
|
394
|
+
// g2 can be set only when g1 is set, g3 can be set only when g2 is set
|
|
395
|
+
// hence, 3 checks for g3 is faster than 3 checks for g1
|
|
396
|
+
if (g2) {
|
|
397
|
+
if (g3) {
|
|
398
|
+
if (b < 0x30 || b > 0x39) {
|
|
399
|
+
pushback.push(b, g3, g2)
|
|
400
|
+
g1 = g2 = g3 = 0
|
|
401
|
+
o16[oi++] = err()
|
|
402
|
+
} else {
|
|
403
|
+
const p = index(
|
|
404
|
+
(g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
|
|
405
|
+
)
|
|
406
|
+
g1 = g2 = g3 = 0
|
|
407
|
+
if (p === undefined) {
|
|
408
|
+
o16[oi++] = err()
|
|
409
|
+
} else if (p <= 0xff_ff) {
|
|
410
|
+
o16[oi++] = p // Can validly return replacement
|
|
411
|
+
} else {
|
|
412
|
+
const d = p - 0x1_00_00
|
|
413
|
+
o16[oi++] = 0xd8_00 | (d >> 10)
|
|
414
|
+
o16[oi++] = 0xdc_00 | (d & 0x3_ff)
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
} else if (b >= 0x81 && b <= 0xfe) {
|
|
418
|
+
g3 = b
|
|
370
419
|
} else {
|
|
371
|
-
|
|
420
|
+
pushback.push(b, g2)
|
|
421
|
+
g1 = g2 = 0
|
|
422
|
+
o16[oi++] = err()
|
|
372
423
|
}
|
|
373
|
-
}
|
|
374
|
-
} else if (g2) {
|
|
375
|
-
if (b >= 0x81 && b <= 0xfe) {
|
|
376
|
-
g3 = b
|
|
377
|
-
} else {
|
|
378
|
-
pushback.push(b, g2)
|
|
379
|
-
g1 = g2 = 0
|
|
380
|
-
res += String.fromCharCode(err())
|
|
381
|
-
}
|
|
382
|
-
} else if (g1) {
|
|
383
|
-
if (b >= 0x30 && b <= 0x39) {
|
|
424
|
+
} else if (b >= 0x30 && b <= 0x39) {
|
|
384
425
|
g2 = b
|
|
385
426
|
} else {
|
|
386
427
|
let cp
|
|
@@ -390,18 +431,18 @@ const mappers = {
|
|
|
390
431
|
|
|
391
432
|
g1 = 0
|
|
392
433
|
if (cp !== undefined && cp !== REP) {
|
|
393
|
-
|
|
434
|
+
o16[oi++] = cp // 16-bit
|
|
394
435
|
} else {
|
|
395
|
-
|
|
396
|
-
if (b < 128)
|
|
436
|
+
o16[oi++] = err()
|
|
437
|
+
if (b < 128) o16[oi++] = b // can be processed immediately
|
|
397
438
|
}
|
|
398
439
|
}
|
|
399
440
|
} else if (b < 128) {
|
|
400
|
-
|
|
441
|
+
o16[oi++] = b
|
|
401
442
|
} else if (b === 0x80) {
|
|
402
|
-
|
|
443
|
+
o16[oi++] = 0x20_ac
|
|
403
444
|
} else if (b === 0xff) {
|
|
404
|
-
|
|
445
|
+
o16[oi++] = err()
|
|
405
446
|
} else {
|
|
406
447
|
g1 = b
|
|
407
448
|
}
|
|
@@ -410,10 +451,10 @@ const mappers = {
|
|
|
410
451
|
// if g1 = 0 then g2 = g3 = 0
|
|
411
452
|
if (g1 && !stream) {
|
|
412
453
|
g1 = g2 = g3 = 0
|
|
413
|
-
|
|
454
|
+
o16[oi++] = err()
|
|
414
455
|
}
|
|
415
456
|
|
|
416
|
-
return
|
|
457
|
+
return decodeUCS2(o16, oi)
|
|
417
458
|
}
|
|
418
459
|
|
|
419
460
|
return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
|
|
@@ -433,6 +474,7 @@ const mappers = {
|
|
|
433
474
|
export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
|
|
434
475
|
|
|
435
476
|
export function multibyteDecoder(enc, loose = false) {
|
|
477
|
+
if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
|
|
436
478
|
if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
|
|
437
479
|
|
|
438
480
|
// Input is assumed to be typechecked already
|
|
@@ -452,8 +494,9 @@ export function multibyteDecoder(enc, loose = false) {
|
|
|
452
494
|
return (arr, stream = false) => {
|
|
453
495
|
let res = ''
|
|
454
496
|
if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
|
|
455
|
-
|
|
456
|
-
if (
|
|
497
|
+
const prefixLen = asciiPrefix(arr)
|
|
498
|
+
if (prefixLen === arr.length) return decodeAscii(arr) // ascii
|
|
499
|
+
res = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
|
|
457
500
|
}
|
|
458
501
|
|
|
459
502
|
streaming = stream // affects onErr
|
|
@@ -56,7 +56,9 @@ function unwrap(res, t, pos, stringMode = false) {
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
if (stringMode) {
|
|
59
|
-
for (let k = 0; k < x; k++, pos++, code++)
|
|
59
|
+
for (let k = 0; k < x; k++, pos++, code++) {
|
|
60
|
+
res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
|
|
61
|
+
}
|
|
60
62
|
} else {
|
|
61
63
|
for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
|
|
62
64
|
}
|
|
@@ -65,8 +67,13 @@ function unwrap(res, t, pos, stringMode = false) {
|
|
|
65
67
|
pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
|
|
66
68
|
} else if (stringMode) {
|
|
67
69
|
const s = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
|
|
68
|
-
|
|
69
|
-
|
|
70
|
+
let char
|
|
71
|
+
for (let i = 0; i < s.length; ) {
|
|
72
|
+
char = s[i++]
|
|
73
|
+
res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
code = char.codePointAt(0) + 1
|
|
70
77
|
} else {
|
|
71
78
|
const u16 = to16input(loadBase64(x), true) // data is little-endian
|
|
72
79
|
res.set(u16, pos)
|
package/fallback/single-byte.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { asciiPrefix, decodeLatin1 } from './latin1.js'
|
|
1
|
+
import { asciiPrefix, decodeAscii, decodeLatin1 } from './latin1.js'
|
|
2
2
|
import encodings from './single-byte.encodings.js'
|
|
3
3
|
import { decode2string } from './_utils.js'
|
|
4
4
|
|
|
@@ -23,6 +23,7 @@ function getEncoding(encoding) {
|
|
|
23
23
|
|
|
24
24
|
const mappers = new Map()
|
|
25
25
|
const decoders = new Map()
|
|
26
|
+
const encmaps = new Map()
|
|
26
27
|
|
|
27
28
|
// Used only on Node.js, no reason to optimize for anything else
|
|
28
29
|
// E.g. avoiding .from and filling zero-initialized arr manually is faster on Hermes, but we avoid this codepath on Hermes completely
|
|
@@ -31,7 +32,7 @@ export function encodingMapper(encoding) {
|
|
|
31
32
|
if (cached) return cached
|
|
32
33
|
|
|
33
34
|
const codes = getEncoding(encoding)
|
|
34
|
-
const incomplete = codes.includes(
|
|
35
|
+
const incomplete = codes.includes(r)
|
|
35
36
|
let map
|
|
36
37
|
const mapper = (arr, start = 0) => {
|
|
37
38
|
if (!map) {
|
|
@@ -66,7 +67,7 @@ export function encodingDecoder(encoding) {
|
|
|
66
67
|
|
|
67
68
|
let strings
|
|
68
69
|
const codes = getEncoding(encoding)
|
|
69
|
-
const incomplete = codes.includes(
|
|
70
|
+
const incomplete = codes.includes(r)
|
|
70
71
|
const decoder = (arr, loose = false) => {
|
|
71
72
|
if (!strings) {
|
|
72
73
|
const allCodes = Array.from({ length: 128 }, (_, i) => i).concat(codes)
|
|
@@ -74,8 +75,9 @@ export function encodingDecoder(encoding) {
|
|
|
74
75
|
strings = allCodes.map((c) => String.fromCharCode(c))
|
|
75
76
|
}
|
|
76
77
|
|
|
77
|
-
const
|
|
78
|
-
if (
|
|
78
|
+
const prefixLen = asciiPrefix(arr)
|
|
79
|
+
if (prefixLen === arr.length) return decodeAscii(arr)
|
|
80
|
+
const prefix = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
|
|
79
81
|
const suffix = decode2string(arr, prefix.length, arr.length, strings)
|
|
80
82
|
if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
|
|
81
83
|
return prefix + suffix
|
|
@@ -84,3 +86,21 @@ export function encodingDecoder(encoding) {
|
|
|
84
86
|
decoders.set(encoding, decoder)
|
|
85
87
|
return decoder
|
|
86
88
|
}
|
|
89
|
+
|
|
90
|
+
/**
 * Builds the reverse lookup table used when encoding strings to a single-byte encoding.
 *
 * The table is indexed by UTF-16 code unit and holds the byte to emit for it.
 * Entries that were never assigned stay 0, so a 0 at a non-zero index means "no mapping".
 * Tables are cached per encoding name in `encmaps`.
 *
 * @param {string} encoding - encoding name, forwarded to getEncoding
 * @returns {Uint8Array} lookup table of length (highest mapped code unit + 1)
 */
export function encodeMap(encoding) {
  const cached = encmaps.get(encoding)
  if (cached) return cached

  // codes[i] is the code unit that byte (128 + i) decodes to
  const codes = getEncoding(encoding)
  let max = 128
  // A short list is padded so that the remaining high bytes map to themselves
  while (codes.length < 128) codes.push(128 + codes.length)
  // NOTE(review): r is presumably the placeholder for unmapped bytes; it is excluded from sizing and from the table
  for (const code of codes) if (code > max && code !== r) max = code
  const map = new Uint8Array(max + 1) // < 10 KiB for all except macintosh, 63 KiB for macintosh
  for (let i = 0; i < 128; i++) {
    map[i] = i // lower half is identity
    if (codes[i] !== r) map[codes[i]] = 128 + i
  }

  encmaps.set(encoding, map)
  return map
}
|
package/fallback/utf16.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { decodeUCS2, encodeCharcodes } from './latin1.js'
|
|
2
2
|
import { isLE } from './_utils.js'
|
|
3
3
|
|
|
4
4
|
export const E_STRICT = 'Input is not well-formed utf16'
|
|
@@ -38,9 +38,9 @@ export function to16input(u8, le) {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
/**
 * Decodes UTF-16 code units into a string.
 *
 * @param {Uint16Array} u16 - input code units
 * @param {boolean} [loose=false] - when true, ill-formed input is repaired instead of rejected
 * @param {boolean} [checked=false] - when true, the well-formedness check is skipped and input is decoded as-is
 * @returns {string}
 * @throws {TypeError} with E_STRICT when input is not well-formed and `loose` is false
 */
export const decode = (u16, loose = false, checked = false) => {
  if (checked || isWellFormed(u16)) return decodeUCS2(u16)
  if (!loose) throw new TypeError(E_STRICT)
  return decodeUCS2(toWellFormed(Uint16Array.from(u16))) // cloned for replacement
}
|
|
45
45
|
|
|
46
46
|
export function encode(str, loose = false, checked = false, swapped = false) {
|
package/multi-byte.node.js
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
import { assertUint8 } from './assert.js'
|
|
2
|
-
import { isDeno } from './fallback/_utils.js'
|
|
2
|
+
import { isDeno, toBuf } from './fallback/_utils.js'
|
|
3
3
|
import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
|
|
4
4
|
import { isAscii } from 'node:buffer'
|
|
5
5
|
|
|
6
|
-
const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
|
7
|
-
|
|
8
6
|
export function createMultibyteDecoder(encoding, loose = false) {
|
|
9
7
|
const jsDecoder = multibyteDecoder(encoding, loose) // asserts
|
|
10
8
|
let streaming = false
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@exodus/bytes",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Various operations on Uint8Array data",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"lint": "eslint .",
|
|
@@ -145,6 +145,7 @@
|
|
|
145
145
|
"@exodus/prettier": "^1.0.0",
|
|
146
146
|
"@exodus/test": "^1.0.0-rc.109",
|
|
147
147
|
"@noble/hashes": "^2.0.1",
|
|
148
|
+
"@petamoriken/float16": "^3.9.3",
|
|
148
149
|
"@scure/base": "^1.2.6",
|
|
149
150
|
"@stablelib/base64": "^2.0.1",
|
|
150
151
|
"@stablelib/hex": "^2.0.1",
|
|
@@ -172,6 +173,7 @@
|
|
|
172
173
|
"typescript": "^5.9.3",
|
|
173
174
|
"uint8array-tools": "^0.0.9",
|
|
174
175
|
"utf8": "^3.0.0",
|
|
176
|
+
"web-streams-polyfill": "^4.2.0",
|
|
175
177
|
"whatwg-encoding": "^3.1.1",
|
|
176
178
|
"wif": "^5.0.0"
|
|
177
179
|
},
|
package/single-byte.js
CHANGED
|
@@ -1,11 +1,18 @@
|
|
|
1
1
|
import { assertUint8 } from './assert.js'
|
|
2
|
-
import { canDecoders } from './fallback/_utils.js'
|
|
3
|
-
import {
|
|
2
|
+
import { canDecoders, nativeEncoder } from './fallback/_utils.js'
|
|
3
|
+
import { encodeAscii } from './fallback/latin1.js'
|
|
4
|
+
import { assertEncoding, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
|
|
4
5
|
|
|
5
6
|
const { TextDecoder } = globalThis
|
|
6
7
|
|
|
7
8
|
let windows1252works
|
|
8
9
|
|
|
10
|
+
// prettier-ignore
|
|
11
|
+
const skipNative = new Set([
|
|
12
|
+
'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
|
|
13
|
+
'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
|
|
14
|
+
])
|
|
15
|
+
|
|
9
16
|
function shouldUseNative(enc) {
|
|
10
17
|
// https://issues.chromium.org/issues/468458388
|
|
11
18
|
// Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
|
|
@@ -24,11 +31,11 @@ function shouldUseNative(enc) {
|
|
|
24
31
|
return windows1252works
|
|
25
32
|
}
|
|
26
33
|
|
|
27
|
-
|
|
28
|
-
return enc !== 'iso-8859-16'
|
|
34
|
+
return !skipNative.has(enc)
|
|
29
35
|
}
|
|
30
36
|
|
|
31
37
|
export function createSinglebyteDecoder(encoding, loose = false) {
|
|
38
|
+
if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
|
|
32
39
|
assertEncoding(encoding)
|
|
33
40
|
|
|
34
41
|
if (canDecoders && shouldUseNative(encoding)) {
|
|
@@ -51,4 +58,45 @@ export function createSinglebyteDecoder(encoding, loose = false) {
|
|
|
51
58
|
}
|
|
52
59
|
}
|
|
53
60
|
|
|
61
|
+
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
|
|
62
|
+
|
|
63
|
+
/**
 * Maps every UTF-16 code unit of a string through a byte lookup table.
 *
 * @param {string} s - input string
 * @param {Uint8Array} m - table indexed by code unit, holding the byte to emit
 * @returns {Uint8Array|null} one byte per code unit, or null as soon as a non-zero
 *   code unit has a zero or missing table entry
 */
function encode(s, m) {
  const len = s.length
  const x = new Uint8Array(len)
  for (let i = 0; i < len; i++) {
    const x0 = s.charCodeAt(i)
    const c0 = m[x0] // undefined when x0 is past the end of the table
    if (!c0 && x0) return null // a falsy entry is acceptable only for NUL itself
    x[i] = c0
  }

  return x
}
|
|
75
|
+
|
|
76
|
+
/**
 * Creates a string -> Uint8Array encoder for a single-byte encoding.
 *
 * @param {string} encoding - encoding name, validated by encodeMap
 * @param {{ mode?: string }} [options] - only `mode: 'fatal'` (the default) is supported
 * @returns {(s: string) => Uint8Array} encoder function; it throws TypeError when the input
 *   is not a string or contains a code unit that has no mapping in the encoding
 * @throws {Error} 'Unsupported mode' when `mode` is anything other than 'fatal'
 */
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
  // TODO: replacement, truncate (replacement will need varying length)
  if (mode !== 'fatal') throw new Error('Unsupported mode')
  const m = encodeMap(encoding) // asserts

  // No single-byte encoder produces surrogate pairs, so any surrogate is invalid
  // This needs special treatment only to decide how many replacement chars to output, one or two
  // Not much use in running isWellFormed, most likely cause of error is unmapped chars, not surrogate pairs
  return (s) => {
    if (typeof s !== 'string') throw new TypeError('Input is not a string')

    // Instead of an ASCII regex check, encode optimistically - this is faster
    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
    if (nativeEncoder && !NON_LATIN.test(s)) {
      try {
        return encodeAscii(s, E_STRICT)
      } catch {} // deliberate: a failed optimistic ASCII attempt falls through to the table-based path
    }

    const res = encode(s, m)
    if (!res) throw new TypeError(E_STRICT)
    return res
  }
}
|
|
100
|
+
|
|
54
101
|
export const windows1252toString = createSinglebyteDecoder('windows-1252')
|
|
102
|
+
export const windows1252fromString = createSinglebyteEncoder('windows-1252')
|
package/single-byte.node.js
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
import { assertUint8 } from './assert.js'
|
|
2
2
|
import { isAscii } from 'node:buffer'
|
|
3
|
-
import { isDeno, isLE } from './fallback/_utils.js'
|
|
3
|
+
import { isDeno, isLE, toBuf } from './fallback/_utils.js'
|
|
4
4
|
import { asciiPrefix } from './fallback/latin1.js'
|
|
5
|
-
import { encodingMapper, encodingDecoder, E_STRICT } from './fallback/single-byte.js'
|
|
6
|
-
|
|
7
|
-
const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
|
|
5
|
+
import { encodingMapper, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
|
|
8
6
|
|
|
9
7
|
function latin1Prefix(arr, start) {
|
|
10
8
|
let p = start | 0
|
|
@@ -24,6 +22,7 @@ function latin1Prefix(arr, start) {
|
|
|
24
22
|
}
|
|
25
23
|
|
|
26
24
|
export function createSinglebyteDecoder(encoding, loose = false) {
|
|
25
|
+
if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
|
|
27
26
|
const latin1path = encoding === 'windows-1252'
|
|
28
27
|
if (isDeno) {
|
|
29
28
|
const jsDecoder = encodingDecoder(encoding) // asserts
|
|
@@ -59,4 +58,51 @@ export function createSinglebyteDecoder(encoding, loose = false) {
|
|
|
59
58
|
}
|
|
60
59
|
}
|
|
61
60
|
|
|
61
|
+
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
|
|
62
|
+
|
|
63
|
+
/**
 * Creates a string -> Uint8Array encoder for a single-byte encoding (Buffer-based variant).
 *
 * @param {string} encoding - encoding name, validated by encodeMap
 * @param {{ mode?: string }} [options] - only `mode: 'fatal'` (the default) is supported
 * @returns {(s: string) => Uint8Array} encoder function; it throws TypeError when the input
 *   is not a string or contains a code unit that has no mapping in the encoding
 * @throws {Error} 'Unsupported mode' when `mode` is anything other than 'fatal'
 */
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
  // TODO: replacement, truncate (replacement will need varying length)
  if (mode !== 'fatal') throw new Error('Unsupported mode')
  const m = encodeMap(encoding) // asserts

  return (s) => {
    if (typeof s !== 'string') throw new TypeError('Input is not a string')

    // Instead of an ASCII regex check, encode optimistically - this is faster
    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
    if (!NON_LATIN.test(s)) {
      const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
      // Equal lengths mean every char took one UTF-8 byte, i.e. the string is pure ASCII.
      // The result is a view over the Buffer's memory, not a copy.
      if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
    }

    const len = s.length
    let i = 0
    const b = Buffer.from(s, 'utf-16le') // aligned
    if (!isLE) b.swap16() // make the bytes match platform order before viewing them as Uint16Array
    const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)

    // Main loop handles four code units at a time, translating them in place through the table
    for (const len3 = len - 3; i < len3; i += 4) {
      const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
      const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
      // The detailed check only runs when some entry is falsy; a falsy entry is valid only for NUL
      if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) {
        throw new TypeError(E_STRICT)
      }

      x[i] = c0
      x[i + 1] = c1
      x[i + 2] = c2
      x[i + 3] = c3
    }

    // Remaining 0-3 code units
    for (; i < len; i++) {
      const x0 = x[i]
      const c0 = m[x0]
      if (!c0 && x0) throw new TypeError(E_STRICT)
      x[i] = c0
    }

    return new Uint8Array(x) // copies element-wise, narrowing each translated unit to a byte
  }
}
|
|
106
|
+
|
|
62
107
|
export const windows1252toString = createSinglebyteDecoder('windows-1252')
|
|
108
|
+
export const windows1252fromString = createSinglebyteEncoder('windows-1252')
|
package/utf8.js
CHANGED
|
@@ -57,7 +57,7 @@ function decode(arr, loose = false) {
|
|
|
57
57
|
if (nativeDecoder) return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr) // Node.js and browsers
|
|
58
58
|
|
|
59
59
|
// Fast path for ASCII prefix, this is faster than all alternatives below
|
|
60
|
-
const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
|
|
60
|
+
const prefix = decodeLatin1(arr, 0, asciiPrefix(arr)) // No native decoder to use, so decodeAscii is useless here
|
|
61
61
|
if (prefix.length === arr.length) return prefix
|
|
62
62
|
|
|
63
63
|
// This codepath gives a ~3x perf boost on Hermes
|