npm - @exodus/bytes - Versions diffs - 1.4.0 → 1.6.0 - Mend

@exodus/bytes 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +19 -5
package/fallback/_utils.js +6 -0
package/fallback/encoding.js +11 -1
package/fallback/latin1.js +15 -1
package/fallback/multi-byte.js +310 -235
package/fallback/multi-byte.table.js +10 -3
package/fallback/single-byte.js +4 -3
package/fallback/utf16.js +3 -3
package/multi-byte.node.js +1 -3
package/package.json +1 -1
package/single-byte.js +7 -2
package/single-byte.node.js +1 -3
package/utf8.js +1 -1

package/README.md CHANGED Viewed

@@ -33,16 +33,30 @@ See [Performance](./Performance.md) for more info
 import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
 ```
-Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not), and [is much faster](#fast).
-See also [lite version](#lite-version).
+Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not).\
+Also [much faster](#fast) than all of those.
-Spec compliant, passing WPT and covered with extra tests.
+> [!TIP]
+> See also the [lite version](#lite-version) to get this down to 9 KiB gzipped.
-Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
+Spec compliant, passing WPT and covered with extra tests.\
+Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).\
+Including all three major browser engines being wrong at UTF-8.\
+See [WPT pull request](https://github.com/web-platform-tests/wpt/pull/56892).
-[Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
+It works correctly even in environments whose native implementations are broken (that's all of them currently).\
 Runs (and passes WPT) on Node.js built without ICU.
+> [!NOTE]
+> [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
+>
+> The JS multi-byte version is as fast as the native implementations in Node.js and browsers, but (unlike them) returns correct results.
+>
+> For encodings where the native version is known to be fast and correct, it is automatically used.\
+> Some single-byte encodings are faster than native in all three major browser engines.
+See [analysis table](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit) for more info.
 ### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
 _These are only provided as a compatibility layer, prefer hardened APIs instead in new code._

package/fallback/_utils.js CHANGED Viewed

@@ -128,3 +128,9 @@ export function decode2string(arr, start, end, m) {
 export function assert(condition, msg) {
   if (!condition) throw new Error(msg)
 }
+// On arrays in heap (<= 64 bytes) it's cheaper to copy into a pooled buffer than lazy-create the ArrayBuffer storage
+export const toBuf = (x) =>
+  x.byteLength <= 64 && x.BYTES_PER_ELEMENT === 1
+    ? Buffer.from(x)
+    : Buffer.from(x.buffer, x.byteOffset, x.byteLength)

package/fallback/encoding.js CHANGED Viewed

@@ -47,6 +47,10 @@ export function normalizeEncoding(label) {
 const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
+// TODO: make this more strict against Symbol.toStringTag
+// Is not very significant though, anything faking Symbol.toStringTag could as well override
+// prototypes, which is not something we protect against
 function isAnyArrayBuffer(x) {
   if (x instanceof ArrayBuffer) return true
   if (globalThis.SharedArrayBuffer && x instanceof SharedArrayBuffer) return true
@@ -55,6 +59,12 @@ function isAnyArrayBuffer(x) {
   return s === '[object ArrayBuffer]' || s === '[object SharedArrayBuffer]'
 }
+function isAnyUint8Array(x) {
+  if (x instanceof Uint8Array) return true
+  if (!x || !ArrayBuffer.isView(x) || x.BYTES_PER_ELEMENT !== 1) return false
+  return Object.prototype.toString.call(x) === '[object Uint8Array]'
+}
 const fromSource = (x) => {
   if (x instanceof Uint8Array) return x
   if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
@@ -217,7 +227,7 @@ export class TextEncoder {
   encodeInto(str, target) {
     if (typeof str !== 'string') str = `${str}`
-    if (!(target instanceof Uint8Array)) throw new TypeError('Target must be an Uint8Array')
+    if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
     if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
     const tlen = target.length

package/fallback/latin1.js CHANGED Viewed

@@ -5,6 +5,7 @@ import {
   nativeBuffer,
   isHermes,
   isDeno,
+  isLE,
 } from './_utils.js'
 // See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
@@ -60,6 +61,16 @@ export function decodeLatin1(arr, start = 0, stop = arr.length) {
   return String.fromCharCode.apply(String, sliced)
 }
+// Unchecked for well-formedness, raw. Expects Uint16Array input
+export const decodeUCS2 =
+  nativeBuffer && isLE && !isDeno
+    ? (u16, stop = u16.length) => {
+        // TODO: fast path for BE, perhaps faster path for Deno. Note that decoder replaces, this function doesn't
+        if (stop > 32) return nativeBuffer.from(u16.buffer, u16.byteOffset, stop * 2).ucs2Slice() // from 64 bytes, below are in heap
+        return decodeLatin1(u16, 0, stop)
+      }
+    : (u16, stop = u16.length) => decodeLatin1(u16, 0, stop)
 // Does not check input, uses best available method
 // Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
 export const decodeAscii = nativeBuffer
@@ -70,7 +81,10 @@ export const decodeAscii = nativeBuffer
         : nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
   : nativeDecoderLatin1
     ? (a) => nativeDecoderLatin1.decode(a) // On browsers (specifically WebKit), latin1 decoder is faster than utf8
-    : (a) => decodeLatin1(new Uint8Array(a.buffer, a.byteOffset, a.byteLength)) // Fallback. We shouldn't get here, constructing with strings directly is faster
+    : (a) =>
+        decodeLatin1(
+          a instanceof Uint8Array ? a : new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
+        )
 /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */

package/fallback/multi-byte.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { asciiPrefix, decodeLatin1 } from './latin1.js'
+import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
 import { getTable } from './multi-byte.table.js'
 export const E_STRICT = 'Input is not well-formed for this encoding'
@@ -8,37 +8,69 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
 // If the decoder is not cleared properly, state can be preserved between non-streaming calls!
 // See comment about fatal stream
-// All except iso-2022-jp are ASCII supersets
-// When adding something that is not an ASCII superset, adjust the ASCII fast path
-const REP = 0xff_fd
-const mappers = {
-  // https://encoding.spec.whatwg.org/#euc-kr-decoder
-  'euc-kr': (err) => {
-    const euc = getTable('euc-kr')
-    let lead = 0
+// Common between euc-kr and big5
+function bigDecoder(err, pair) {
+  let lead = 0
+  let oi = 0
+  let o16
+  const decodeLead = (b) => {
+    const p = pair(lead, b)
+    lead = 0
+    if (typeof p === 'number') {
+      o16[oi++] = p
+    } else if (p) {
+      // This is still faster than string concatenation. Can we optimize strings though?
+      for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
+    } else {
+      o16[oi++] = err()
+      if (b < 128) o16[oi++] = b
+    }
+  }
-    const pushback = []
-    const bytes = (b) => {
-      if (lead) {
-        const cp = b >= 0x41 && b <= 0xfe ? euc[(lead - 0x81) * 190 + b - 0x41] : undefined
-        lead = 0
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
+  const decode = (arr, start, end, stream) => {
+    let i = start
+    o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
+    oi = 0
+    if (lead && i < end) decodeLead(arr[i++])
+    while (i < end) {
+      const b = arr[i++]
+      if (b < 128) {
+        o16[oi++] = b
+      } else if (b === 0x80 || b === 0xff) {
+        o16[oi++] = err()
+      } else {
+        lead = b
+        if (i < end) decodeLead(arr[i++])
       }
-      if (b < 128) return b
-      if (b < 0x81 || b === 0xff) return err()
-      lead = b
     }
-    const eof = () => {
-      if (!lead) return null
+    if (lead && !stream) {
       lead = 0
-      return err()
+      o16[oi++] = err()
     }
-    return { bytes, eof, pushback }
+    const res = decodeUCS2(o16, oi)
+    o16 = null
+    return res
+  }
+  return { decode, isAscii: () => lead === 0 }
+}
+// All except iso-2022-jp are ASCII supersets
+// When adding something that is not an ASCII superset, adjust the ASCII fast path
+const REP = 0xff_fd
+const mappers = {
+  // https://encoding.spec.whatwg.org/#euc-kr-decoder
+  'euc-kr': (err) => {
+    const euc = getTable('euc-kr')
+    return bigDecoder(err, (l, b) => {
+      if (b < 0x41 || b > 0xfe) return
+      const cp = euc[(l - 0x81) * 190 + b - 0x41]
+      return cp !== undefined && cp !== REP ? cp : undefined
+    })
   },
   // https://encoding.spec.whatwg.org/#euc-jp-decoder
   'euc-jp': (err) => {
@@ -46,21 +78,17 @@ const mappers = {
     const jis0212 = getTable('jis0212')
     let j12 = false
     let lead = 0
+    let oi = 0
+    let o16
-    const pushback = []
-    const bytes = (b) => {
+    const decodeLead = (b) => {
       if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
         lead = 0
-        return 0xfe_c0 + b
-      }
-      if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
+        o16[oi++] = 0xfe_c0 + b
+      } else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
         j12 = true
         lead = b
-        return
-      }
-      if (lead) {
+      } else {
         let cp
         if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
           cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
@@ -68,43 +96,60 @@ const mappers = {
         lead = 0
         j12 = false
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
+        if (cp !== undefined && cp !== REP) {
+          o16[oi++] = cp
+        } else {
+          o16[oi++] = err()
+          if (b < 128) o16[oi++] = b
+        }
       }
-      if (b < 128) return b
-      if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return err()
-      lead = b
     }
-    // eslint-disable-next-line sonarjs/no-identical-functions
-    const eof = () => {
-      if (!lead) return null
-      lead = 0
-      return err()
+    const decode = (arr, start, end, stream) => {
+      let i = start
+      o16 = new Uint16Array(end - start + (lead ? 1 : 0))
+      oi = 0
+      if (lead && i < end) decodeLead(arr[i++])
+      if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
+      while (i < end) {
+        const b = arr[i++]
+        if (b < 128) {
+          o16[oi++] = b
+        } else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
+          o16[oi++] = err()
+        } else {
+          lead = b
+          if (i < end) decodeLead(arr[i++])
+          if (lead && i < end) decodeLead(arr[i++]) // could be two leads
+        }
+      }
+      if (lead && !stream) {
+        lead = 0
+        j12 = false // can be true only when lead is non-zero
+        o16[oi++] = err()
+      }
+      const res = decodeUCS2(o16, oi)
+      o16 = null
+      return res
     }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
   },
   // https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
-  // Per-letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped but preserving the logic
   'iso-2022-jp': (err) => {
     const jis0208 = getTable('jis0208')
-    const EOF = -1
     let dState = 1
     let oState = 1
-    let lead = 0
+    let lead = 0 // 0 or 0x21-0x7e
     let out = false
-    const pushback = []
-    const bytes = (b) => {
-      if (dState < 5) {
-        if (b === EOF) return null
-        if (b === 0x1b) {
-          dState = 6 // escape start
-          return
-        }
+    const bytes = (pushback, b) => {
+      if (dState < 5 && b === 0x1b) {
+        dState = 6 // escape start
+        return
       }
       switch (dState) {
@@ -156,7 +201,7 @@ const mappers = {
           out = false
           dState = oState
-          if (b !== EOF) pushback.push(b)
+          pushback.push(b)
           return err()
         case 7: {
           // Escape
@@ -185,52 +230,131 @@ const mappers = {
           out = false
           dState = oState
-          if (b !== EOF) pushback.push(b)
-          pushback.push(l)
+          pushback.push(b, l)
+          return err()
+        }
+      }
+    }
+    const eof = (pushback) => {
+      if (dState < 5) return null
+      out = false
+      switch (dState) {
+        case 5:
+          dState = 4
+          return err()
+        case 6:
+          dState = oState
+          return err()
+        case 7: {
+          dState = oState
+          pushback.push(lead)
+          lead = 0
           return err()
         }
       }
     }
-    const eof = () => bytes(EOF)
+    const decode = (arr, start, end, stream) => {
+      const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
+      let oi = 0
+      let i = start
+      const pushback = [] // local and auto-cleared
+      // First, dump everything until EOF
+      // Same as the full loop, but without EOF handling
+      while (i < end || pushback.length > 0) {
+        const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
+        if (c !== undefined) o16[oi++] = c // 16-bit
+      }
+      // Then, dump EOF. This needs the same loop as the characters can be pushed back
+      if (!stream) {
+        while (i <= end || pushback.length > 0) {
+          if (i < end || pushback.length > 0) {
+            const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
+            if (c !== undefined) o16[oi++] = c // 16-bit
+          } else {
+            const c = eof(pushback)
+            if (c === null) break // clean exit
+            o16[oi++] = c
+          }
+        }
+      }
+      // Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
+      // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
+      // > Set this’s do not flush to options["stream"]
+      if (!stream) {
+        dState = oState = 1
+        lead = 0
+        out = false
+      }
+      return decodeUCS2(o16, oi)
+    }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => false }
   },
   // https://encoding.spec.whatwg.org/#shift_jis-decoder
   shift_jis: (err) => {
     const jis0208 = getTable('jis0208')
     let lead = 0
+    let oi = 0
+    let o16
-    const pushback = []
-    const bytes = (b) => {
-      if (lead) {
-        const l = lead
-        lead = 0
-        if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
-          const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
-          if (p >= 8836 && p <= 10_715) return 0xe0_00 - 8836 + p // 16-bit
-          const cp = jis0208[p]
-          if (cp !== undefined && cp !== REP) return cp
+    const decodeLead = (b) => {
+      const l = lead
+      lead = 0
+      if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
+        const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
+        if (p >= 8836 && p <= 10_715) {
+          o16[oi++] = 0xe0_00 - 8836 + p
+          return
         }
-        if (b < 128) pushback.push(b)
-        return err()
+        const cp = jis0208[p]
+        if (cp !== undefined && cp !== REP) {
+          o16[oi++] = cp
+          return
+        }
       }
-      if (b <= 0x80) return b // 0x80 is allowed
-      if (b >= 0xa1 && b <= 0xdf) return 0xff_61 - 0xa1 + b
-      if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return err()
-      lead = b
+      o16[oi++] = err()
+      if (b < 128) o16[oi++] = b
     }
-    // eslint-disable-next-line sonarjs/no-identical-functions
-    const eof = () => {
-      if (!lead) return null
-      lead = 0 // this clears state completely on EOF
-      return err()
+    const decode = (arr, start, end, stream) => {
+      o16 = new Uint16Array(end - start + (lead ? 1 : 0))
+      oi = 0
+      let i = start
+      if (lead && i < end) decodeLead(arr[i++])
+      while (i < end) {
+        const b = arr[i++]
+        if (b <= 0x80) {
+          o16[oi++] = b // 0x80 is allowed
+        } else if (b >= 0xa1 && b <= 0xdf) {
+          o16[oi++] = 0xfe_c0 + b
+        } else if (b === 0xa0 || b > 0xfc) {
+          o16[oi++] = err()
+        } else {
+          lead = b
+          if (i < end) decodeLead(arr[i++])
+        }
+      }
+      if (lead && !stream) {
+        lead = 0
+        o16[oi++] = err()
+      }
+      const res = decodeUCS2(o16, oi)
+      o16 = null
+      return res
     }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => lead === 0 }
   },
   // https://encoding.spec.whatwg.org/#gbk-decoder
   gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
@@ -252,179 +376,130 @@ const mappers = {
       return b + p - a
     }
-    const pushback = []
-    const bytes = (b) => {
-      if (g3) {
-        if (b < 0x30 || b > 0x39) {
-          pushback.push(b, g3, g2)
-          g1 = g2 = g3 = 0
-          return err()
-        }
-        const cp = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
-        g1 = g2 = g3 = 0
-        if (cp !== undefined) return cp // Can validly return replacement
-        return err()
-      }
+    // g1 is 0 or 0x81-0xfe
+    // g2 is 0 or 0x30-0x39
+    // g3 is 0 or 0x81-0xfe
+    const decode = (arr, start, end, stream) => {
+      const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
+      let oi = 0
+      let i = start
+      const pushback = [] // local and auto-cleared
+      // First, dump everything until EOF
+      // Same as the full loop, but without EOF handling
+      while (i < end || pushback.length > 0) {
+        const b = pushback.length > 0 ? pushback.pop() : arr[i++]
+        if (g1) {
+          // g2 can be set only when g1 is set, g3 can be set only when g2 is set
+          // hence, 3 checks for g3 is faster than 3 checks for g1
+          if (g2) {
+            if (g3) {
+              if (b < 0x30 || b > 0x39) {
+                pushback.push(b, g3, g2)
+                g1 = g2 = g3 = 0
+                o16[oi++] = err()
+              } else {
+                const p = index(
+                  (g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
+                )
+                g1 = g2 = g3 = 0
+                if (p === undefined) {
+                  o16[oi++] = err()
+                } else if (p <= 0xff_ff) {
+                  o16[oi++] = p // Can validly return replacement
+                } else {
+                  const d = p - 0x1_00_00
+                  o16[oi++] = 0xd8_00 | (d >> 10)
+                  o16[oi++] = 0xdc_00 | (d & 0x3_ff)
+                }
+              }
+            } else if (b >= 0x81 && b <= 0xfe) {
+              g3 = b
+            } else {
+              pushback.push(b, g2)
+              g1 = g2 = 0
+              o16[oi++] = err()
+            }
+          } else if (b >= 0x30 && b <= 0x39) {
+            g2 = b
+          } else {
+            let cp
+            if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
+              cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
+            }
-      if (g2) {
-        if (b >= 0x81 && b <= 0xfe) {
-          g3 = b
-          return
+            g1 = 0
+            if (cp !== undefined && cp !== REP) {
+              o16[oi++] = cp // 16-bit
+            } else {
+              o16[oi++] = err()
+              if (b < 128) o16[oi++] = b // can be processed immediately
+            }
+          }
+        } else if (b < 128) {
+          o16[oi++] = b
+        } else if (b === 0x80) {
+          o16[oi++] = 0x20_ac
+        } else if (b === 0xff) {
+          o16[oi++] = err()
+        } else {
+          g1 = b
         }
-        pushback.push(b, g2)
-        g1 = g2 = 0
-        return err()
       }
-      if (g1) {
-        if (b >= 0x30 && b <= 0x39) {
-          g2 = b
-          return
-        }
-        let cp
-        if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
-          cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
-        }
-        g1 = 0
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
+      // if g1 = 0 then g2 = g3 = 0
+      if (g1 && !stream) {
+        g1 = g2 = g3 = 0
+        o16[oi++] = err()
       }
-      if (b < 128) return b
-      if (b === 0x80) return 0x20_ac
-      if (b === 0xff) return err()
-      g1 = b
-    }
-    const eof = () => {
-      if (!g1 && !g2 && !g3) return null
-      g1 = g2 = g3 = 0
-      return err()
+      return decodeUCS2(o16, oi)
     }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
+  },
+  // https://encoding.spec.whatwg.org/#big5
+  big5: (err) => {
+    // The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
+    // We store that as strings
+    const big5 = getTable('big5')
+    return bigDecoder(err, (l, b) => {
+      if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) return
+      return big5[(l - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)] // strings
+    })
   },
 }
 export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
 export function multibyteDecoder(enc, loose = false) {
-  if (enc === 'big5') return big5decoder(loose)
   if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
   // Input is assumed to be typechecked already
   let mapper
   const asciiSuperset = isAsciiSuperset(enc)
-  return (arr, stream = false) => {
-    const onErr = loose
-      ? () => REP
-      : () => {
-          mapper.pushback.length = 0 // the queue is cleared on returning an error
-          // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
-          // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
-          // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
-          if (!stream) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
-          throw new TypeError(E_STRICT)
-        }
-    let res = ''
-    const length = arr.length
-    if (asciiSuperset && !mapper) {
-      res = decodeLatin1(arr, 0, asciiPrefix(arr))
-      if (res.length === arr.length) return res // ascii
-    }
-    if (!mapper) mapper = mappers[enc](onErr)
-    const { bytes, eof, pushback } = mapper
-    let i = res.length
-    // First, dump everything until EOF
-    // Same as the full loop, but without EOF handling
-    while (i < length || pushback.length > 0) {
-      const c = bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
-      if (c === undefined) continue // consuming
-      res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
-    }
-    // Then, dump EOF. This needs the same loop as the characters can be pushed back
-    // TODO: only some encodings need this, most can be optimized
-    if (!stream) {
-      while (i <= length || pushback.length > 0) {
-        const isEOF = i === length && pushback.length === 0
-        const c = isEOF ? eof() : bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
-        if (isEOF && c === null) break // clean exit
-        if (c === undefined) continue // consuming
-        res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
+  let streaming // because onErr is cached in mapper
+  const onErr = loose
+    ? () => REP
+    : () => {
+        // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
+        // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
+        // iso-2022-jp is the only tricky one where this !streaming check matters in non-stream mode
+        if (!streaming) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
+        throw new TypeError(E_STRICT)
       }
-    }
-    // Chrome and WebKit fail on this, we don't: completely destroy the old decoder instance when finished streaming
-    // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
-    // > Set this’s do not flush to options["stream"]
-    if (!stream) mapper = null
-    return res
-  }
-}
-// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
-// We store that as strings
-function big5decoder(loose) {
-  // Input is assumed to be typechecked already
-  let lead = 0
-  let big5
   return (arr, stream = false) => {
-    const onErr = loose
-      ? () => '\uFFFD'
-      : () => {
-          // Lead is always already cleared before throwing
-          throw new TypeError(E_STRICT)
-        }
     let res = ''
-    const length = arr.length
-    if (!lead) {
-      res = decodeLatin1(arr, 0, asciiPrefix(arr))
-      if (res.length === arr.length) return res // ascii
-    }
-    if (!big5) big5 = getTable('big5')
-    for (let i = res.length; i < length; i++) {
-      const b = arr[i]
-      if (lead) {
-        let cp
-        if ((b >= 0x40 && b <= 0x7e) || (b >= 0xa1 && b !== 0xff)) {
-          cp = big5[(lead - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
-        }
-        lead = 0
-        if (cp) {
-          res += cp // strings
-        } else {
-          res += onErr()
-          // same as pushing it back: lead is cleared, pushed back can't contain more than 1 byte
-          if (b < 128) res += String.fromCharCode(b)
-        }
-      } else if (b < 128) {
-        res += String.fromCharCode(b)
-      } else if (b < 0x81 || b === 0xff) {
-        res += onErr()
-      } else {
-        lead = b
-      }
+    if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
+      const prefixLen = asciiPrefix(arr)
+      if (prefixLen === arr.length) return decodeAscii(arr) // ascii
+      res = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
     }
-    if (!stream && lead) {
-      // Destroy decoder state
-      lead = 0
-      res += onErr()
-    }
-    return res
+    streaming = stream // affects onErr
+    if (!mapper) mapper = mappers[enc](onErr)
+    return res + mapper.decode(arr, res.length, arr.length, stream)
   }
 }

package/fallback/multi-byte.table.js CHANGED Viewed

@@ -56,7 +56,9 @@ function unwrap(res, t, pos, stringMode = false) {
         }
         if (stringMode) {
-          for (let k = 0; k < x; k++, pos++, code++) res[pos] = String.fromCodePoint(code)
+          for (let k = 0; k < x; k++, pos++, code++) {
+            res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
+          }
         } else {
           for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
         }
@@ -65,8 +67,13 @@ function unwrap(res, t, pos, stringMode = false) {
       pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
     } else if (stringMode) {
       const s = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
-      for (let i = 0; i < s.length; ) res[pos++] = s[i++] // TODO: splice?
-      code = s[s.length - 1].codePointAt(0) + 1
+      let char
+      for (let i = 0; i < s.length; ) {
+        char = s[i++]
+        res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
+      }
+      code = char.codePointAt(0) + 1
     } else {
       const u16 = to16input(loadBase64(x), true) // data is little-endian
       res.set(u16, pos)

package/fallback/single-byte.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { asciiPrefix, decodeLatin1 } from './latin1.js'
+import { asciiPrefix, decodeAscii, decodeLatin1 } from './latin1.js'
 import encodings from './single-byte.encodings.js'
 import { decode2string } from './_utils.js'
@@ -74,8 +74,9 @@ export function encodingDecoder(encoding) {
       strings = allCodes.map((c) => String.fromCharCode(c))
     }
-    const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
-    if (prefix.length === arr.length) return prefix
+    const prefixLen = asciiPrefix(arr)
+    if (prefixLen === arr.length) return decodeAscii(arr)
+    const prefix = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
     const suffix = decode2string(arr, prefix.length, arr.length, strings)
     if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
     return prefix + suffix

package/fallback/utf16.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { decodeLatin1, encodeCharcodes } from './latin1.js'
+import { decodeUCS2, encodeCharcodes } from './latin1.js'
 import { isLE } from './_utils.js'
 export const E_STRICT = 'Input is not well-formed utf16'
@@ -38,9 +38,9 @@ export function to16input(u8, le) {
 }
 export const decode = (u16, loose = false, checked = false) => {
-  if (checked || isWellFormed(u16)) return decodeLatin1(u16, 0, u16.length) // it's capable of decoding Uint16Array to UTF-16 as well
+  if (checked || isWellFormed(u16)) return decodeUCS2(u16)
   if (!loose) throw new TypeError(E_STRICT)
-  return decodeLatin1(toWellFormed(Uint16Array.from(u16)), 0, u16.length) // cloned for replacement
+  return decodeUCS2(toWellFormed(Uint16Array.from(u16))) // cloned for replacement
 }
 export function encode(str, loose = false, checked = false, swapped = false) {

package/multi-byte.node.js CHANGED Viewed

@@ -1,10 +1,8 @@
 import { assertUint8 } from './assert.js'
-import { isDeno } from './fallback/_utils.js'
+import { isDeno, toBuf } from './fallback/_utils.js'
 import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
 import { isAscii } from 'node:buffer'
-const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
 export function createMultibyteDecoder(encoding, loose = false) {
   const jsDecoder = multibyteDecoder(encoding, loose) // asserts
   let streaming = false

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@exodus/bytes",
-  "version": "1.4.0",
+  "version": "1.6.0",
   "description": "Various operations on Uint8Array data",
   "scripts": {
     "lint": "eslint .",

package/single-byte.js CHANGED Viewed

@@ -6,6 +6,12 @@ const { TextDecoder } = globalThis
 let windows1252works
+// prettier-ignore
+const skipNative = new Set([
+  'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
+  'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
+])
 function shouldUseNative(enc) {
   // https://issues.chromium.org/issues/468458388
   // Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
@@ -24,8 +30,7 @@ function shouldUseNative(enc) {
     return windows1252works
   }
-  // iso-8859-16 is somehow broken in WebKit, at least on CI
-  return enc !== 'iso-8859-16'
+  return !skipNative.has(enc)
 }
 export function createSinglebyteDecoder(encoding, loose = false) {

package/single-byte.node.js CHANGED Viewed

@@ -1,11 +1,9 @@
 import { assertUint8 } from './assert.js'
 import { isAscii } from 'node:buffer'
-import { isDeno, isLE } from './fallback/_utils.js'
+import { isDeno, isLE, toBuf } from './fallback/_utils.js'
 import { asciiPrefix } from './fallback/latin1.js'
 import { encodingMapper, encodingDecoder, E_STRICT } from './fallback/single-byte.js'
-const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
 function latin1Prefix(arr, start) {
   let p = start | 0
   const length = arr.length

package/utf8.js CHANGED Viewed

@@ -57,7 +57,7 @@ function decode(arr, loose = false) {
   if (nativeDecoder) return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr) // Node.js and browsers
   // Fast path for ASCII prefix, this is faster than all alternatives below
-  const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
+  const prefix = decodeLatin1(arr, 0, asciiPrefix(arr)) // No native decoder to use, so decodeAscii is useless here
   if (prefix.length === arr.length) return prefix
   // This codepath gives a ~3x perf boost on Hermes