@exodus/bytes 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -33,16 +33,30 @@ See [Performance](./Performance.md) for more info
33
33
  import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
34
34
  ```
35
35
 
36
- Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not), and [is much faster](#fast).
37
- See also [lite version](#lite-version).
36
+ Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not).\
37
+ Also [much faster](#fast) than all of those.
38
38
 
39
- Spec compliant, passing WPT and covered with extra tests.
39
+ > [!TIP]
40
+ > See also the [lite version](#lite-version) to get this down to 9 KiB gzipped.
40
41
 
41
- Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
42
+ Spec compliant, passing WPT and covered with extra tests.\
43
+ Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).\
44
+ Including all three major browser engines being wrong at UTF-8.\
45
+ See [WPT pull request](https://github.com/web-platform-tests/wpt/pull/56892).
42
46
 
43
- [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
47
+ It works correctly even in environments that have native implementations broken (that's all of them currently).\
44
48
  Runs (and passes WPT) on Node.js built without ICU.
45
49
 
50
+ > [!NOTE]
51
+ > [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
52
+ >
53
+ > The JS multi-byte version is as fast as native impl in Node.js and browsers, but (unlike them) returns correct results.
54
+ >
55
+ > For encodings where native version is known to be fast and correct, it is automatically used.\
56
+ > Some single-byte encodings are faster than native in all three major browser engines.
57
+
58
+ See [analysis table](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit) for more info.
59
+
46
60
  ### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
47
61
 
48
62
  _These are only provided as a compatibility layer, prefer hardened APIs instead in new code._
@@ -128,3 +128,9 @@ export function decode2string(arr, start, end, m) {
128
128
  export function assert(condition, msg) {
129
129
  if (!condition) throw new Error(msg)
130
130
  }
131
+
132
+ // On arrays in heap (<= 64) it's cheaper to copy into a pooled buffer than lazy-create the ArrayBuffer storage
133
+ export const toBuf = (x) =>
134
+ x.byteLength <= 64 && x.BYTES_PER_ELEMENT === 1
135
+ ? Buffer.from(x)
136
+ : Buffer.from(x.buffer, x.byteOffset, x.byteLength)
@@ -47,6 +47,10 @@ export function normalizeEncoding(label) {
47
47
 
48
48
  const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
49
49
 
50
+ // TODO: make this more strict against Symbol.toStringTag
51
+ // Is not very significant though, anything faking Symbol.toStringTag could as well override
52
+ // prototypes, which is not something we protect against
53
+
50
54
  function isAnyArrayBuffer(x) {
51
55
  if (x instanceof ArrayBuffer) return true
52
56
  if (globalThis.SharedArrayBuffer && x instanceof SharedArrayBuffer) return true
@@ -55,6 +59,12 @@ function isAnyArrayBuffer(x) {
55
59
  return s === '[object ArrayBuffer]' || s === '[object SharedArrayBuffer]'
56
60
  }
57
61
 
62
+ function isAnyUint8Array(x) {
63
+ if (x instanceof Uint8Array) return true
64
+ if (!x || !ArrayBuffer.isView(x) || x.BYTES_PER_ELEMENT !== 1) return false
65
+ return Object.prototype.toString.call(x) === '[object Uint8Array]'
66
+ }
67
+
58
68
  const fromSource = (x) => {
59
69
  if (x instanceof Uint8Array) return x
60
70
  if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
@@ -217,7 +227,7 @@ export class TextEncoder {
217
227
 
218
228
  encodeInto(str, target) {
219
229
  if (typeof str !== 'string') str = `${str}`
220
- if (!(target instanceof Uint8Array)) throw new TypeError('Target must be an Uint8Array')
230
+ if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
221
231
  if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
222
232
 
223
233
  const tlen = target.length
@@ -5,6 +5,7 @@ import {
5
5
  nativeBuffer,
6
6
  isHermes,
7
7
  isDeno,
8
+ isLE,
8
9
  } from './_utils.js'
9
10
 
10
11
  // See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
@@ -60,6 +61,16 @@ export function decodeLatin1(arr, start = 0, stop = arr.length) {
60
61
  return String.fromCharCode.apply(String, sliced)
61
62
  }
62
63
 
64
+ // Unchecked for well-formedness, raw. Expects Uint16Array input
65
+ export const decodeUCS2 =
66
+ nativeBuffer && isLE && !isDeno
67
+ ? (u16, stop = u16.length) => {
68
+ // TODO: fast path for BE, perhaps faster path for Deno. Note that decoder replaces, this function doesn't
69
+ if (stop > 32) return nativeBuffer.from(u16.buffer, u16.byteOffset, stop * 2).ucs2Slice() // from 64 bytes, below are in heap
70
+ return decodeLatin1(u16, 0, stop)
71
+ }
72
+ : (u16, stop = u16.length) => decodeLatin1(u16, 0, stop)
73
+
63
74
  // Does not check input, uses best available method
64
75
  // Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
65
76
  export const decodeAscii = nativeBuffer
@@ -70,7 +81,10 @@ export const decodeAscii = nativeBuffer
70
81
  : nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
71
82
  : nativeDecoderLatin1
72
83
  ? (a) => nativeDecoderLatin1.decode(a) // On browsers (specifically WebKit), latin1 decoder is faster than utf8
73
- : (a) => decodeLatin1(new Uint8Array(a.buffer, a.byteOffset, a.byteLength)) // Fallback. We shouldn't get here, constructing with strings directly is faster
84
+ : (a) =>
85
+ decodeLatin1(
86
+ a instanceof Uint8Array ? a : new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
87
+ )
74
88
 
75
89
  /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
76
90
 
@@ -1,4 +1,4 @@
1
- import { asciiPrefix, decodeLatin1 } from './latin1.js'
1
+ import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
2
2
  import { getTable } from './multi-byte.table.js'
3
3
 
4
4
  export const E_STRICT = 'Input is not well-formed for this encoding'
@@ -8,37 +8,69 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
8
8
  // If the decoder is not cleared properly, state can be preserved between non-streaming calls!
9
9
  // See comment about fatal stream
10
10
 
11
- // All except iso-2022-jp are ASCII supersets
12
- // When adding something that is not an ASCII superset, adjust the ASCII fast path
13
- const REP = 0xff_fd
14
- const mappers = {
15
- // https://encoding.spec.whatwg.org/#euc-kr-decoder
16
- 'euc-kr': (err) => {
17
- const euc = getTable('euc-kr')
18
- let lead = 0
11
+ // Common between euc-kr and big5
12
+ function bigDecoder(err, pair) {
13
+ let lead = 0
14
+ let oi = 0
15
+ let o16
16
+
17
+ const decodeLead = (b) => {
18
+ const p = pair(lead, b)
19
+ lead = 0
20
+ if (typeof p === 'number') {
21
+ o16[oi++] = p
22
+ } else if (p) {
23
+ // This is still faster than string concatenation. Can we optimize strings though?
24
+ for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
25
+ } else {
26
+ o16[oi++] = err()
27
+ if (b < 128) o16[oi++] = b
28
+ }
29
+ }
19
30
 
20
- const pushback = []
21
- const bytes = (b) => {
22
- if (lead) {
23
- const cp = b >= 0x41 && b <= 0xfe ? euc[(lead - 0x81) * 190 + b - 0x41] : undefined
24
- lead = 0
25
- if (cp !== undefined && cp !== REP) return cp
26
- if (b < 128) pushback.push(b)
27
- return err()
31
+ const decode = (arr, start, end, stream) => {
32
+ let i = start
33
+ o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
34
+ oi = 0
35
+
36
+ if (lead && i < end) decodeLead(arr[i++])
37
+ while (i < end) {
38
+ const b = arr[i++]
39
+ if (b < 128) {
40
+ o16[oi++] = b
41
+ } else if (b === 0x80 || b === 0xff) {
42
+ o16[oi++] = err()
43
+ } else {
44
+ lead = b
45
+ if (i < end) decodeLead(arr[i++])
28
46
  }
29
-
30
- if (b < 128) return b
31
- if (b < 0x81 || b === 0xff) return err()
32
- lead = b
33
47
  }
34
48
 
35
- const eof = () => {
36
- if (!lead) return null
49
+ if (lead && !stream) {
37
50
  lead = 0
38
- return err()
51
+ o16[oi++] = err()
39
52
  }
40
53
 
41
- return { bytes, eof, pushback }
54
+ const res = decodeUCS2(o16, oi)
55
+ o16 = null
56
+ return res
57
+ }
58
+
59
+ return { decode, isAscii: () => lead === 0 }
60
+ }
61
+
62
+ // All except iso-2022-jp are ASCII supersets
63
+ // When adding something that is not an ASCII superset, adjust the ASCII fast path
64
+ const REP = 0xff_fd
65
+ const mappers = {
66
+ // https://encoding.spec.whatwg.org/#euc-kr-decoder
67
+ 'euc-kr': (err) => {
68
+ const euc = getTable('euc-kr')
69
+ return bigDecoder(err, (l, b) => {
70
+ if (b < 0x41 || b > 0xfe) return
71
+ const cp = euc[(l - 0x81) * 190 + b - 0x41]
72
+ return cp !== undefined && cp !== REP ? cp : undefined
73
+ })
42
74
  },
43
75
  // https://encoding.spec.whatwg.org/#euc-jp-decoder
44
76
  'euc-jp': (err) => {
@@ -46,21 +78,17 @@ const mappers = {
46
78
  const jis0212 = getTable('jis0212')
47
79
  let j12 = false
48
80
  let lead = 0
81
+ let oi = 0
82
+ let o16
49
83
 
50
- const pushback = []
51
- const bytes = (b) => {
84
+ const decodeLead = (b) => {
52
85
  if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
53
86
  lead = 0
54
- return 0xfe_c0 + b
55
- }
56
-
57
- if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
87
+ o16[oi++] = 0xfe_c0 + b
88
+ } else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
58
89
  j12 = true
59
90
  lead = b
60
- return
61
- }
62
-
63
- if (lead) {
91
+ } else {
64
92
  let cp
65
93
  if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
66
94
  cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
@@ -68,43 +96,60 @@ const mappers = {
68
96
 
69
97
  lead = 0
70
98
  j12 = false
71
- if (cp !== undefined && cp !== REP) return cp
72
- if (b < 128) pushback.push(b)
73
- return err()
99
+ if (cp !== undefined && cp !== REP) {
100
+ o16[oi++] = cp
101
+ } else {
102
+ o16[oi++] = err()
103
+ if (b < 128) o16[oi++] = b
104
+ }
74
105
  }
75
-
76
- if (b < 128) return b
77
- if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return err()
78
- lead = b
79
106
  }
80
107
 
81
- // eslint-disable-next-line sonarjs/no-identical-functions
82
- const eof = () => {
83
- if (!lead) return null
84
- lead = 0
85
- return err()
108
+ const decode = (arr, start, end, stream) => {
109
+ let i = start
110
+ o16 = new Uint16Array(end - start + (lead ? 1 : 0))
111
+ oi = 0
112
+
113
+ if (lead && i < end) decodeLead(arr[i++])
114
+ if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
115
+ while (i < end) {
116
+ const b = arr[i++]
117
+ if (b < 128) {
118
+ o16[oi++] = b
119
+ } else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
120
+ o16[oi++] = err()
121
+ } else {
122
+ lead = b
123
+ if (i < end) decodeLead(arr[i++])
124
+ if (lead && i < end) decodeLead(arr[i++]) // could be two leads
125
+ }
126
+ }
127
+
128
+ if (lead && !stream) {
129
+ lead = 0
130
+ j12 = false // can be true only when lead is non-zero
131
+ o16[oi++] = err()
132
+ }
133
+
134
+ const res = decodeUCS2(o16, oi)
135
+ o16 = null
136
+ return res
86
137
  }
87
138
 
88
- return { bytes, eof, pushback }
139
+ return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
89
140
  },
90
141
  // https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
91
- // Per-letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped but preserving the logic
92
142
  'iso-2022-jp': (err) => {
93
143
  const jis0208 = getTable('jis0208')
94
- const EOF = -1
95
144
  let dState = 1
96
145
  let oState = 1
97
- let lead = 0
146
+ let lead = 0 // 0 or 0x21-0x7e
98
147
  let out = false
99
148
 
100
- const pushback = []
101
- const bytes = (b) => {
102
- if (dState < 5) {
103
- if (b === EOF) return null
104
- if (b === 0x1b) {
105
- dState = 6 // escape start
106
- return
107
- }
149
+ const bytes = (pushback, b) => {
150
+ if (dState < 5 && b === 0x1b) {
151
+ dState = 6 // escape start
152
+ return
108
153
  }
109
154
 
110
155
  switch (dState) {
@@ -156,7 +201,7 @@ const mappers = {
156
201
 
157
202
  out = false
158
203
  dState = oState
159
- if (b !== EOF) pushback.push(b)
204
+ pushback.push(b)
160
205
  return err()
161
206
  case 7: {
162
207
  // Escape
@@ -185,52 +230,131 @@ const mappers = {
185
230
 
186
231
  out = false
187
232
  dState = oState
188
- if (b !== EOF) pushback.push(b)
189
- pushback.push(l)
233
+ pushback.push(b, l)
234
+ return err()
235
+ }
236
+ }
237
+ }
238
+
239
+ const eof = (pushback) => {
240
+ if (dState < 5) return null
241
+ out = false
242
+ switch (dState) {
243
+ case 5:
244
+ dState = 4
245
+ return err()
246
+ case 6:
247
+ dState = oState
248
+ return err()
249
+ case 7: {
250
+ dState = oState
251
+ pushback.push(lead)
252
+ lead = 0
190
253
  return err()
191
254
  }
192
255
  }
193
256
  }
194
257
 
195
- const eof = () => bytes(EOF)
258
+ const decode = (arr, start, end, stream) => {
259
+ const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
260
+ let oi = 0
261
+ let i = start
262
+ const pushback = [] // local and auto-cleared
263
+
264
+ // First, dump everything until EOF
265
+ // Same as the full loop, but without EOF handling
266
+ while (i < end || pushback.length > 0) {
267
+ const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
268
+ if (c !== undefined) o16[oi++] = c // 16-bit
269
+ }
270
+
271
+ // Then, dump EOF. This needs the same loop as the characters can be pushed back
272
+ if (!stream) {
273
+ while (i <= end || pushback.length > 0) {
274
+ if (i < end || pushback.length > 0) {
275
+ const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
276
+ if (c !== undefined) o16[oi++] = c // 16-bit
277
+ } else {
278
+ const c = eof(pushback)
279
+ if (c === null) break // clean exit
280
+ o16[oi++] = c
281
+ }
282
+ }
283
+ }
284
+
285
+ // Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
286
+ // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
287
+ // > Set this’s do not flush to options["stream"]
288
+ if (!stream) {
289
+ dState = oState = 1
290
+ lead = 0
291
+ out = false
292
+ }
293
+
294
+ return decodeUCS2(o16, oi)
295
+ }
196
296
 
197
- return { bytes, eof, pushback }
297
+ return { decode, isAscii: () => false }
198
298
  },
199
299
  // https://encoding.spec.whatwg.org/#shift_jis-decoder
200
300
  shift_jis: (err) => {
201
301
  const jis0208 = getTable('jis0208')
202
302
  let lead = 0
303
+ let oi = 0
304
+ let o16
203
305
 
204
- const pushback = []
205
- const bytes = (b) => {
206
- if (lead) {
207
- const l = lead
208
- lead = 0
209
- if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
210
- const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
211
- if (p >= 8836 && p <= 10_715) return 0xe0_00 - 8836 + p // 16-bit
212
- const cp = jis0208[p]
213
- if (cp !== undefined && cp !== REP) return cp
306
+ const decodeLead = (b) => {
307
+ const l = lead
308
+ lead = 0
309
+ if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
310
+ const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
311
+ if (p >= 8836 && p <= 10_715) {
312
+ o16[oi++] = 0xe0_00 - 8836 + p
313
+ return
214
314
  }
215
315
 
216
- if (b < 128) pushback.push(b)
217
- return err()
316
+ const cp = jis0208[p]
317
+ if (cp !== undefined && cp !== REP) {
318
+ o16[oi++] = cp
319
+ return
320
+ }
218
321
  }
219
322
 
220
- if (b <= 0x80) return b // 0x80 is allowed
221
- if (b >= 0xa1 && b <= 0xdf) return 0xff_61 - 0xa1 + b
222
- if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return err()
223
- lead = b
323
+ o16[oi++] = err()
324
+ if (b < 128) o16[oi++] = b
224
325
  }
225
326
 
226
- // eslint-disable-next-line sonarjs/no-identical-functions
227
- const eof = () => {
228
- if (!lead) return null
229
- lead = 0 // this clears state completely on EOF
230
- return err()
327
+ const decode = (arr, start, end, stream) => {
328
+ o16 = new Uint16Array(end - start + (lead ? 1 : 0))
329
+ oi = 0
330
+ let i = start
331
+
332
+ if (lead && i < end) decodeLead(arr[i++])
333
+ while (i < end) {
334
+ const b = arr[i++]
335
+ if (b <= 0x80) {
336
+ o16[oi++] = b // 0x80 is allowed
337
+ } else if (b >= 0xa1 && b <= 0xdf) {
338
+ o16[oi++] = 0xfe_c0 + b
339
+ } else if (b === 0xa0 || b > 0xfc) {
340
+ o16[oi++] = err()
341
+ } else {
342
+ lead = b
343
+ if (i < end) decodeLead(arr[i++])
344
+ }
345
+ }
346
+
347
+ if (lead && !stream) {
348
+ lead = 0
349
+ o16[oi++] = err()
350
+ }
351
+
352
+ const res = decodeUCS2(o16, oi)
353
+ o16 = null
354
+ return res
231
355
  }
232
356
 
233
- return { bytes, eof, pushback }
357
+ return { decode, isAscii: () => lead === 0 }
234
358
  },
235
359
  // https://encoding.spec.whatwg.org/#gbk-decoder
236
360
  gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
@@ -252,179 +376,130 @@ const mappers = {
252
376
  return b + p - a
253
377
  }
254
378
 
255
- const pushback = []
256
- const bytes = (b) => {
257
- if (g3) {
258
- if (b < 0x30 || b > 0x39) {
259
- pushback.push(b, g3, g2)
260
- g1 = g2 = g3 = 0
261
- return err()
262
- }
263
-
264
- const cp = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
265
- g1 = g2 = g3 = 0
266
- if (cp !== undefined) return cp // Can validly return replacement
267
- return err()
268
- }
379
+ // g1 is 0 or 0x81-0xfe
380
+ // g2 is 0 or 0x30-0x39
381
+ // g3 is 0 or 0x81-0xfe
382
+
383
+ const decode = (arr, start, end, stream) => {
384
+ const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
385
+ let oi = 0
386
+ let i = start
387
+ const pushback = [] // local and auto-cleared
388
+
389
+ // First, dump everything until EOF
390
+ // Same as the full loop, but without EOF handling
391
+ while (i < end || pushback.length > 0) {
392
+ const b = pushback.length > 0 ? pushback.pop() : arr[i++]
393
+ if (g1) {
394
+ // g2 can be set only when g1 is set, g3 can be set only when g2 is set
395
+ // hence, 3 checks for g3 is faster than 3 checks for g1
396
+ if (g2) {
397
+ if (g3) {
398
+ if (b < 0x30 || b > 0x39) {
399
+ pushback.push(b, g3, g2)
400
+ g1 = g2 = g3 = 0
401
+ o16[oi++] = err()
402
+ } else {
403
+ const p = index(
404
+ (g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
405
+ )
406
+ g1 = g2 = g3 = 0
407
+ if (p === undefined) {
408
+ o16[oi++] = err()
409
+ } else if (p <= 0xff_ff) {
410
+ o16[oi++] = p // Can validly return replacement
411
+ } else {
412
+ const d = p - 0x1_00_00
413
+ o16[oi++] = 0xd8_00 | (d >> 10)
414
+ o16[oi++] = 0xdc_00 | (d & 0x3_ff)
415
+ }
416
+ }
417
+ } else if (b >= 0x81 && b <= 0xfe) {
418
+ g3 = b
419
+ } else {
420
+ pushback.push(b, g2)
421
+ g1 = g2 = 0
422
+ o16[oi++] = err()
423
+ }
424
+ } else if (b >= 0x30 && b <= 0x39) {
425
+ g2 = b
426
+ } else {
427
+ let cp
428
+ if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
429
+ cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
430
+ }
269
431
 
270
- if (g2) {
271
- if (b >= 0x81 && b <= 0xfe) {
272
- g3 = b
273
- return
432
+ g1 = 0
433
+ if (cp !== undefined && cp !== REP) {
434
+ o16[oi++] = cp // 16-bit
435
+ } else {
436
+ o16[oi++] = err()
437
+ if (b < 128) o16[oi++] = b // can be processed immediately
438
+ }
439
+ }
440
+ } else if (b < 128) {
441
+ o16[oi++] = b
442
+ } else if (b === 0x80) {
443
+ o16[oi++] = 0x20_ac
444
+ } else if (b === 0xff) {
445
+ o16[oi++] = err()
446
+ } else {
447
+ g1 = b
274
448
  }
275
-
276
- pushback.push(b, g2)
277
- g1 = g2 = 0
278
- return err()
279
449
  }
280
450
 
281
- if (g1) {
282
- if (b >= 0x30 && b <= 0x39) {
283
- g2 = b
284
- return
285
- }
286
-
287
- let cp
288
- if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
289
- cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
290
- }
291
-
292
- g1 = 0
293
- if (cp !== undefined && cp !== REP) return cp
294
- if (b < 128) pushback.push(b)
295
- return err()
451
+ // if g1 = 0 then g2 = g3 = 0
452
+ if (g1 && !stream) {
453
+ g1 = g2 = g3 = 0
454
+ o16[oi++] = err()
296
455
  }
297
456
 
298
- if (b < 128) return b
299
- if (b === 0x80) return 0x20_ac
300
- if (b === 0xff) return err()
301
- g1 = b
302
- }
303
-
304
- const eof = () => {
305
- if (!g1 && !g2 && !g3) return null
306
- g1 = g2 = g3 = 0
307
- return err()
457
+ return decodeUCS2(o16, oi)
308
458
  }
309
459
 
310
- return { bytes, eof, pushback }
460
+ return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
461
+ },
462
+ // https://encoding.spec.whatwg.org/#big5
463
+ big5: (err) => {
464
+ // The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
465
+ // We store that as strings
466
+ const big5 = getTable('big5')
467
+ return bigDecoder(err, (l, b) => {
468
+ if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) return
469
+ return big5[(l - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)] // strings
470
+ })
311
471
  },
312
472
  }
313
473
 
314
474
  export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
315
475
 
316
476
  export function multibyteDecoder(enc, loose = false) {
317
- if (enc === 'big5') return big5decoder(loose)
318
477
  if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
319
478
 
320
479
  // Input is assumed to be typechecked already
321
480
  let mapper
322
481
  const asciiSuperset = isAsciiSuperset(enc)
323
- return (arr, stream = false) => {
324
- const onErr = loose
325
- ? () => REP
326
- : () => {
327
- mapper.pushback.length = 0 // the queue is cleared on returning an error
328
- // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
329
- // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
330
- // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
331
- if (!stream) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
332
- throw new TypeError(E_STRICT)
333
- }
334
-
335
- let res = ''
336
- const length = arr.length
337
- if (asciiSuperset && !mapper) {
338
- res = decodeLatin1(arr, 0, asciiPrefix(arr))
339
- if (res.length === arr.length) return res // ascii
340
- }
341
-
342
- if (!mapper) mapper = mappers[enc](onErr)
343
- const { bytes, eof, pushback } = mapper
344
- let i = res.length
345
-
346
- // First, dump everything until EOF
347
- // Same as the full loop, but without EOF handling
348
- while (i < length || pushback.length > 0) {
349
- const c = bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
350
- if (c === undefined) continue // consuming
351
- res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
352
- }
353
-
354
- // Then, dump EOF. This needs the same loop as the characters can be pushed back
355
- // TODO: only some encodings need this, most can be optimized
356
- if (!stream) {
357
- while (i <= length || pushback.length > 0) {
358
- const isEOF = i === length && pushback.length === 0
359
- const c = isEOF ? eof() : bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
360
- if (isEOF && c === null) break // clean exit
361
- if (c === undefined) continue // consuming
362
- res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
482
+ let streaming // because onErr is cached in mapper
483
+ const onErr = loose
484
+ ? () => REP
485
+ : () => {
486
+ // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
487
+ // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
488
+ // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
489
+ if (!streaming) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
490
+ throw new TypeError(E_STRICT)
363
491
  }
364
- }
365
-
366
- // Chrome and WebKit fail on this, we don't: completely destroy the old decoder instance when finished streaming
367
- // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
368
- // > Set this’s do not flush to options["stream"]
369
- if (!stream) mapper = null
370
492
 
371
- return res
372
- }
373
- }
374
-
375
- // The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
376
- // We store that as strings
377
- function big5decoder(loose) {
378
- // Input is assumed to be typechecked already
379
- let lead = 0
380
- let big5
381
493
  return (arr, stream = false) => {
382
- const onErr = loose
383
- ? () => '\uFFFD'
384
- : () => {
385
- // Lead is always already cleared before throwing
386
- throw new TypeError(E_STRICT)
387
- }
388
-
389
494
  let res = ''
390
- const length = arr.length
391
- if (!lead) {
392
- res = decodeLatin1(arr, 0, asciiPrefix(arr))
393
- if (res.length === arr.length) return res // ascii
394
- }
395
-
396
- if (!big5) big5 = getTable('big5')
397
- for (let i = res.length; i < length; i++) {
398
- const b = arr[i]
399
- if (lead) {
400
- let cp
401
- if ((b >= 0x40 && b <= 0x7e) || (b >= 0xa1 && b !== 0xff)) {
402
- cp = big5[(lead - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
403
- }
404
-
405
- lead = 0
406
- if (cp) {
407
- res += cp // strings
408
- } else {
409
- res += onErr()
410
- // same as pushing it back: lead is cleared, pushed back can't contain more than 1 byte
411
- if (b < 128) res += String.fromCharCode(b)
412
- }
413
- } else if (b < 128) {
414
- res += String.fromCharCode(b)
415
- } else if (b < 0x81 || b === 0xff) {
416
- res += onErr()
417
- } else {
418
- lead = b
419
- }
495
+ if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
496
+ const prefixLen = asciiPrefix(arr)
497
+ if (prefixLen === arr.length) return decodeAscii(arr) // ascii
498
+ res = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
420
499
  }
421
500
 
422
- if (!stream && lead) {
423
- // Destroy decoder state
424
- lead = 0
425
- res += onErr()
426
- }
427
-
428
- return res
501
+ streaming = stream // affects onErr
502
+ if (!mapper) mapper = mappers[enc](onErr)
503
+ return res + mapper.decode(arr, res.length, arr.length, stream)
429
504
  }
430
505
  }
@@ -56,7 +56,9 @@ function unwrap(res, t, pos, stringMode = false) {
56
56
  }
57
57
 
58
58
  if (stringMode) {
59
- for (let k = 0; k < x; k++, pos++, code++) res[pos] = String.fromCodePoint(code)
59
+ for (let k = 0; k < x; k++, pos++, code++) {
60
+ res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
61
+ }
60
62
  } else {
61
63
  for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
62
64
  }
@@ -65,8 +67,13 @@ function unwrap(res, t, pos, stringMode = false) {
65
67
  pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
66
68
  } else if (stringMode) {
67
69
  const s = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
68
- for (let i = 0; i < s.length; ) res[pos++] = s[i++] // TODO: splice?
69
- code = s[s.length - 1].codePointAt(0) + 1
70
+ let char
71
+ for (let i = 0; i < s.length; ) {
72
+ char = s[i++]
73
+ res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
74
+ }
75
+
76
+ code = char.codePointAt(0) + 1
70
77
  } else {
71
78
  const u16 = to16input(loadBase64(x), true) // data is little-endian
72
79
  res.set(u16, pos)
@@ -1,4 +1,4 @@
1
- import { asciiPrefix, decodeLatin1 } from './latin1.js'
1
+ import { asciiPrefix, decodeAscii, decodeLatin1 } from './latin1.js'
2
2
  import encodings from './single-byte.encodings.js'
3
3
  import { decode2string } from './_utils.js'
4
4
 
@@ -74,8 +74,9 @@ export function encodingDecoder(encoding) {
74
74
  strings = allCodes.map((c) => String.fromCharCode(c))
75
75
  }
76
76
 
77
- const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
78
- if (prefix.length === arr.length) return prefix
77
+ const prefixLen = asciiPrefix(arr)
78
+ if (prefixLen === arr.length) return decodeAscii(arr)
79
+ const prefix = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
79
80
  const suffix = decode2string(arr, prefix.length, arr.length, strings)
80
81
  if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
81
82
  return prefix + suffix
package/fallback/utf16.js CHANGED
@@ -1,4 +1,4 @@
1
- import { decodeLatin1, encodeCharcodes } from './latin1.js'
1
+ import { decodeUCS2, encodeCharcodes } from './latin1.js'
2
2
  import { isLE } from './_utils.js'
3
3
 
4
4
  export const E_STRICT = 'Input is not well-formed utf16'
@@ -38,9 +38,9 @@ export function to16input(u8, le) {
38
38
  }
39
39
 
40
40
  export const decode = (u16, loose = false, checked = false) => {
41
- if (checked || isWellFormed(u16)) return decodeLatin1(u16, 0, u16.length) // it's capable of decoding Uint16Array to UTF-16 as well
41
+ if (checked || isWellFormed(u16)) return decodeUCS2(u16)
42
42
  if (!loose) throw new TypeError(E_STRICT)
43
- return decodeLatin1(toWellFormed(Uint16Array.from(u16)), 0, u16.length) // cloned for replacement
43
+ return decodeUCS2(toWellFormed(Uint16Array.from(u16))) // cloned for replacement
44
44
  }
45
45
 
46
46
  export function encode(str, loose = false, checked = false, swapped = false) {
@@ -1,10 +1,8 @@
1
1
  import { assertUint8 } from './assert.js'
2
- import { isDeno } from './fallback/_utils.js'
2
+ import { isDeno, toBuf } from './fallback/_utils.js'
3
3
  import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
4
4
  import { isAscii } from 'node:buffer'
5
5
 
6
- const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
7
-
8
6
  export function createMultibyteDecoder(encoding, loose = false) {
9
7
  const jsDecoder = multibyteDecoder(encoding, loose) // asserts
10
8
  let streaming = false
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@exodus/bytes",
3
- "version": "1.4.0",
3
+ "version": "1.6.0",
4
4
  "description": "Various operations on Uint8Array data",
5
5
  "scripts": {
6
6
  "lint": "eslint .",
package/single-byte.js CHANGED
@@ -6,6 +6,12 @@ const { TextDecoder } = globalThis
6
6
 
7
7
  let windows1252works
8
8
 
9
+ // prettier-ignore
10
+ const skipNative = new Set([
11
+ 'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
12
+ 'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
13
+ ])
14
+
9
15
  function shouldUseNative(enc) {
10
16
  // https://issues.chromium.org/issues/468458388
11
17
  // Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
@@ -24,8 +30,7 @@ function shouldUseNative(enc) {
24
30
  return windows1252works
25
31
  }
26
32
 
27
- // iso-8859-16 is somehow broken in WebKit, at least on CI
28
- return enc !== 'iso-8859-16'
33
+ return !skipNative.has(enc)
29
34
  }
30
35
 
31
36
  export function createSinglebyteDecoder(encoding, loose = false) {
@@ -1,11 +1,9 @@
1
1
  import { assertUint8 } from './assert.js'
2
2
  import { isAscii } from 'node:buffer'
3
- import { isDeno, isLE } from './fallback/_utils.js'
3
+ import { isDeno, isLE, toBuf } from './fallback/_utils.js'
4
4
  import { asciiPrefix } from './fallback/latin1.js'
5
5
  import { encodingMapper, encodingDecoder, E_STRICT } from './fallback/single-byte.js'
6
6
 
7
- const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
8
-
9
7
  function latin1Prefix(arr, start) {
10
8
  let p = start | 0
11
9
  const length = arr.length
package/utf8.js CHANGED
@@ -57,7 +57,7 @@ function decode(arr, loose = false) {
57
57
  if (nativeDecoder) return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr) // Node.js and browsers
58
58
 
59
59
  // Fast path for ASCII prefix, this is faster than all alternatives below
60
- const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
60
+ const prefix = decodeLatin1(arr, 0, asciiPrefix(arr)) // No native decoder to use, so decodeAscii is useless here
61
61
  if (prefix.length === arr.length) return prefix
62
62
 
63
63
  // This codepath gives a ~3x perf boost on Hermes