npm - @exodus/bytes - Versions diffs - 1.5.0 → 1.7.0 - Mend

@exodus/bytes 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +71 -11
package/bech32.js +4 -0
package/encoding-lite.js +2 -0
package/encoding.js +2 -0
package/fallback/_utils.js +8 -8
package/fallback/encoding.js +83 -1
package/fallback/latin1.js +16 -1
package/fallback/multi-byte.js +124 -81
package/fallback/multi-byte.table.js +10 -3
package/fallback/single-byte.js +25 -5
package/fallback/utf16.js +3 -3
package/multi-byte.node.js +1 -3
package/package.json +3 -1
package/single-byte.js +52 -4
package/single-byte.node.js +50 -4
package/utf8.js +1 -1

package/README.md CHANGED Viewed

@@ -31,18 +31,33 @@ See [Performance](./Performance.md) for more info
 ```js
 import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
+import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
 ```
-Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not), and [is much faster](#fast).
-See also [lite version](#lite-version).
+Less than half the bundle size of [text-encoding](https://npmjs.com/text-encoding), [whatwg-encoding](https://npmjs.com/whatwg-encoding) or [iconv-lite](https://npmjs.com/iconv-lite) (gzipped or not).\
+Also [much faster](#fast) than all of those.
-Spec compliant, passing WPT and covered with extra tests.
+> [!TIP]
+> See also the [lite version](#lite-version) to get this down to 9 KiB gzipped.
-Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
+Spec compliant, passing WPT and covered with extra tests.\
+Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).\
+This includes all three major browser engines handling UTF-8 incorrectly.\
+See [WPT pull request](https://github.com/web-platform-tests/wpt/pull/56892).
-[Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
+It works correctly even in environments whose native implementations are broken (currently, that is all of them).\
 Runs (and passes WPT) on Node.js built without ICU.
+> [!NOTE]
+> [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
+>
+> The JS multi-byte version is as fast as the native implementation in Node.js and browsers, but (unlike them) returns correct results.
+>
+> For encodings where the native version is known to be fast and correct, it is automatically used.\
+> Some single-byte encodings are faster than native in all three major browser engines.
+See [analysis table](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit) for more info.
 ### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
 _These are only provided as a compatibility layer, prefer hardened APIs instead in new code._
@@ -63,6 +78,7 @@ _These are only provided as a compatibility layer, prefer hardened APIs instead
 If you don't need support for legacy multi-byte encodings, you can use the lite import:
 ```js
 import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding-lite.js'
+import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding-lite.js' // Requires Streams
 ```
 This reduces the bundle size 10x:\
@@ -107,8 +123,8 @@ import { utf16fromStringLoose, utf16toStringLoose } from '@exodus/bytes/utf16.js
 ### `@exodus/bytes/single-byte.js`
 ```js
-import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
-import { windows1252toString } from '@exodus/bytes/single-byte.js'
+import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/single-byte.js'
+import { windows1252toString, windows1252fromString } from '@exodus/bytes/single-byte.js'
 ```
 Decode the legacy single-byte encodings according to the [Encoding standard](https://encoding.spec.whatwg.org/)
@@ -123,10 +139,19 @@ Supports all single-byte encodings listed in the standard:
 ##### `createSinglebyteDecoder(encoding, loose = false)`
-Create a decoder for a supported one-byte `encoding`, given it's lowercased name `encoding`.
+Create a decoder for a supported one-byte `encoding`, given its lowercased name `encoding`.
 Returns a function `decode(arr)` that decodes bytes to a string.
+##### `createSinglebyteEncoder(encoding, { mode = 'fatal' })`
+Create an encoder for a supported one-byte `encoding`, given its lowercased name `encoding`.
+Returns a function `encode(string)` that encodes a string to bytes.
+In `'fatal'` mode (default), throws on strings that are not well-formed or on any code points that
+cannot be encoded in the target encoding.
 ##### `windows1252toString(arr)`
 Decode `windows-1252` bytes to a string.
@@ -140,6 +165,19 @@ Same as:
 const windows1252toString = createSinglebyteDecoder('windows-1252')
 ```
+##### `windows1252fromString(string)`
+Encode a string to `windows-1252` bytes.
+Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
+Throws on strings that are not well-formed or on any code points that cannot be encoded in `windows-1252`.
+Same as:
+```js
+const windows1252fromString = createSinglebyteEncoder('windows-1252', { mode: 'fatal' })
+```
 ### `@exodus/bytes/multi-byte.js`
 ```js
@@ -157,7 +195,7 @@ Supports all legacy multi-byte encodings listed in the standard:
 ##### `createMultibyteDecoder(encoding, loose = false)`
-Create a decoder for a supported legacy multi-byte `encoding`, given it's lowercased name `encoding`.
+Create a decoder for a supported legacy multi-byte `encoding`, given its lowercased name `encoding`.
 Returns a function `decode(arr, stream = false)` that decodes bytes to a string.
@@ -270,6 +308,7 @@ On non-Node.js, requires peer dependency [@exodus/crypto](https://www.npmjs.com/
 ```js
 import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding.js'
+import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding.js' // Requires Streams
 // Hooks for standards
 import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding.js'
@@ -277,7 +316,9 @@ import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from
 Implements the [Encoding standard](https://encoding.spec.whatwg.org/):
 [TextDecoder](https://encoding.spec.whatwg.org/#interface-textdecoder),
-[TextEncoder](https://encoding.spec.whatwg.org/#interface-textdecoder),
+[TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder),
+[TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream),
+[TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream),
 some [hooks](https://encoding.spec.whatwg.org/#specification-hooks) (see below).
 #### `new TextDecoder(label = 'utf-8', { fatal = false, ignoreBOM = false })`
@@ -286,7 +327,21 @@ some [hooks](https://encoding.spec.whatwg.org/#specification-hooks) (see below).
 #### `new TextEncoder()`
-[TextEncoder](https://encoding.spec.whatwg.org/#interface-textdecoder) implementation/polyfill.
+[TextEncoder](https://encoding.spec.whatwg.org/#interface-textencoder) implementation/polyfill.
+#### `new TextDecoderStream(label = 'utf-8', { fatal = false, ignoreBOM = false })`
+[TextDecoderStream](https://encoding.spec.whatwg.org/#interface-textdecoderstream) implementation/polyfill.
+Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
+[polyfilled](https://npmjs.com/package/web-streams-polyfill).
+#### `new TextEncoderStream()`
+[TextEncoderStream](https://encoding.spec.whatwg.org/#interface-textencoderstream) implementation/polyfill.
+Requires [Streams](https://streams.spec.whatwg.org/) to be either supported by the platform or
+[polyfilled](https://npmjs.com/package/web-streams-polyfill).
 #### `labelToName(label)`
@@ -356,6 +411,7 @@ new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
 ```js
 import { TextDecoder, TextEncoder } from '@exodus/bytes/encoding-lite.js'
+import { TextDecoderStream, TextEncoderStream } from '@exodus/bytes/encoding-lite.js' // Requires Streams
 // Hooks for standards
 import { getBOMEncoding, legacyHookDecode, labelToName, normalizeEncoding } from '@exodus/bytes/encoding-lite.js'
@@ -379,7 +435,9 @@ To avoid inconsistencies, the exported classes and methods are exactly the same
 > lite = require('@exodus/bytes/encoding-lite.js')
 [Module: null prototype] {
   TextDecoder: [class TextDecoder],
+  TextDecoderStream: [class TextDecoderStream],
   TextEncoder: [class TextEncoder],
+  TextEncoderStream: [class TextEncoderStream],
   getBOMEncoding: [Function: getBOMEncoding],
   labelToName: [Function: labelToName],
   legacyHookDecode: [Function: legacyHookDecode],
@@ -392,7 +450,9 @@ Error: Legacy multi-byte encodings are disabled in /encoding-lite.js, use /encod
 > full = require('@exodus/bytes/encoding.js')
 [Module: null prototype] {
   TextDecoder: [class TextDecoder],
+  TextDecoderStream: [class TextDecoderStream],
   TextEncoder: [class TextEncoder],
+  TextEncoderStream: [class TextEncoderStream],
   getBOMEncoding: [Function: getBOMEncoding],
   labelToName: [Function: labelToName],
   legacyHookDecode: [Function: legacyHookDecode],

package/bech32.js CHANGED Viewed

@@ -179,6 +179,9 @@ function assertDecodeArgs(str, limit) {
   if (typeof limit !== 'number' || str.length < 8 || !(str.length <= limit)) throw new Error(E_SIZE)
 }
+// this is instant on 8-bit strings
+const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
 function fromBech32enc(str, limit, encoding) {
   assertDecodeArgs(str, limit)
   const lower = str.toLowerCase()
@@ -195,6 +198,7 @@ function fromBech32enc(str, limit, encoding) {
   if (wordsLength < 0) throw new Error(E_SIZE)
   const bytesLength = (wordsLength * 5) >> 3
   const slice = str.slice(split + 1)
+  if (!nativeEncoder && NON_LATIN.test(slice)) throw new SyntaxError(E_CHARACTER) // otherwise can't use encodeLatin1
   const c = nativeEncoder ? encodeAscii(slice, E_CHARACTER) : encodeLatin1(slice) // suboptimal, but only affects non-Hermes barebones
   const bytes = new Uint8Array(bytesLength)

package/encoding-lite.js CHANGED Viewed

@@ -1,6 +1,8 @@
 export {
   TextDecoder,
   TextEncoder,
+  TextDecoderStream,
+  TextEncoderStream,
   normalizeEncoding,
   getBOMEncoding,
   labelToName,

package/encoding.js CHANGED Viewed

@@ -6,6 +6,8 @@ setMultibyteDecoder(createMultibyteDecoder)
 export {
   TextDecoder,
   TextEncoder,
+  TextDecoderStream,
+  TextEncoderStream,
   normalizeEncoding,
   getBOMEncoding,
   labelToName,

package/fallback/_utils.js CHANGED Viewed

@@ -5,14 +5,8 @@ export const isHermes = Boolean(globalThis.HermesInternal)
 export const isDeno = Boolean(globalThis.Deno)
 export const isLE = new Uint8Array(Uint16Array.of(258).buffer)[0] === 2
-let isNative = (x) => {
-  if (!x) return false
-  if (haveNativeBuffer) return true // we consider Node.js TextDecoder/TextEncoder native
-  const s = `${x}`
-  // See https://github.com/facebook/hermes/pull/1855#issuecomment-3659386410
-  return s.includes('[native code]') || s.includes(`[bytecode]`) // Static Hermes has [bytecode] for contrib, which includes TextEncoder/TextDecoder
-}
+// We consider Node.js TextDecoder/TextEncoder native
+let isNative = (x) => x && (haveNativeBuffer || `${x}`.includes('[native code]'))
 if (!haveNativeBuffer && isNative(() => {})) isNative = () => false // e.g. XS, we don't want false positives
 export const nativeEncoder = isNative(TextEncoder) ? new TextEncoder() : null
@@ -128,3 +122,9 @@ export function decode2string(arr, start, end, m) {
 export function assert(condition, msg) {
   if (!condition) throw new Error(msg)
 }
+// Converts a typed array `x` into a Buffer, choosing between a copy and a shared view.
+// On arrays in heap (<= 64) it's cheaper to copy into a pooled buffer than lazy-create the ArrayBuffer storage
+// - byteLength <= 64 and 1-byte elements: Buffer.from(x) copies the element values into a new Buffer
+// - otherwise: Buffer.from(x.buffer, x.byteOffset, x.byteLength) wraps the same memory without copying
+// The BYTES_PER_ELEMENT check keeps wider typed arrays on the view branch, where raw bytes are preserved
+export const toBuf = (x) =>
+  x.byteLength <= 64 && x.BYTES_PER_ELEMENT === 1
+    ? Buffer.from(x)
+    : Buffer.from(x.buffer, x.byteOffset, x.byteLength)

package/fallback/encoding.js CHANGED Viewed

@@ -68,7 +68,16 @@ function isAnyUint8Array(x) {
 const fromSource = (x) => {
   if (x instanceof Uint8Array) return x
   if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
-  if (isAnyArrayBuffer(x)) return new Uint8Array(x)
+  if (isAnyArrayBuffer(x)) {
+    if ('detached' in x) return x.detached === true ? new Uint8Array() : new Uint8Array(x)
+    // Old engines without .detached, try-catch
+    try {
+      return new Uint8Array(x)
+    } catch {
+      return new Uint8Array()
+    }
+  }
   throw new TypeError('Argument must be a SharedArrayBuffer, ArrayBuffer or ArrayBufferView')
 }
@@ -259,6 +268,79 @@ export class TextEncoder {
   }
 }
+// Error message used by both stream classes when the TransformStream global is missing
+const E_NO_STREAMS = 'TransformStream global not present in the environment'
+// https://encoding.spec.whatwg.org/#interface-textdecoderstream
+// Wraps a TextDecoder in a TransformStream: chunks written to `writable` are decoded in streaming
+// mode and the resulting strings are emitted on `readable`.
+export class TextDecoderStream {
+  // `encoding` and `options` are forwarded as-is to the TextDecoder constructor.
+  // Throws Error(E_NO_STREAMS) when globalThis.TransformStream is not available.
+  constructor(encoding = 'utf-8', options = {}) {
+    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
+    const decoder = new TextDecoder(encoding, options)
+    const transform = new TransformStream({
+      transform: (chunk, controller) => {
+        // stream: true keeps decoder state between chunks; empty results are not enqueued
+        const value = decoder.decode(fromSource(chunk), { stream: true })
+        if (value) controller.enqueue(value)
+      },
+      flush: (controller) => {
+        // https://streams.spec.whatwg.org/#dom-transformer-flush
+        // A decode() call without arguments ends the stream and yields whatever is still pending
+        const value = decoder.decode()
+        if (value) controller.enqueue(value)
+        // No need to call .terminate() (Node.js is wrong)
+      },
+    })
+    // NOTE(review): `define` is declared outside this hunk; presumably it sets a read-only property - confirm
+    define(this, 'encoding', decoder.encoding)
+    define(this, 'fatal', decoder.fatal)
+    define(this, 'ignoreBOM', decoder.ignoreBOM)
+    define(this, 'readable', transform.readable)
+    define(this, 'writable', transform.writable)
+  }
+  // Makes Object.prototype.toString report [object TextDecoderStream]
+  get [Symbol.toStringTag]() {
+    return 'TextDecoderStream'
+  }
+}
+// https://encoding.spec.whatwg.org/#interface-textencoderstream
+// Only UTF-8 per spec
+// Wraps UTF-8 encoding in a TransformStream: chunks written to `writable` are stringified, encoded
+// and emitted as bytes on `readable`. A trailing high surrogate is held back until the next chunk.
+export class TextEncoderStream {
+  // Throws Error(E_NO_STREAMS) when globalThis.TransformStream is not available.
+  constructor() {
+    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
+    let lead // pending high surrogate (one-character string) carried over from the previous chunk
+    const transform = new TransformStream({
+      // https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
+      // Not identical in code, but reuses loose mode to have identical behavior
+      transform: (chunk, controller) => {
+        let s = String(chunk) // DOMString, might contain unpaired surrogates
+        if (s.length === 0) return
+        if (lead) {
+          // Prepend the held-back surrogate so it can pair with the start of this chunk
+          s = lead + s
+          lead = null
+        }
+        const last = s.charCodeAt(s.length - 1) // Can't come from previous lead due to length check
+        // Masking with 0xfc00 and comparing to 0xd800 matches code units 0xd800..0xdbff (high surrogates)
+        if ((last & 0xfc_00) === 0xd8_00) {
+          lead = s[s.length - 1]
+          s = s.slice(0, -1)
+        }
+        // NOTE(review): utf8fromStringLoose is defined outside this hunk; presumably it replaces
+        // unpaired surrogates instead of throwing - confirm
+        if (s) controller.enqueue(utf8fromStringLoose(s))
+      },
+      // https://encoding.spec.whatwg.org/#encode-and-flush
+      flush: (controller) => {
+        // A high surrogate left at end of stream is unpaired: emit U+FFFD as UTF-8 (EF BF BD)
+        if (lead) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
+      },
+    })
+    // NOTE(review): `define` is declared outside this hunk; presumably it sets a read-only property - confirm
+    define(this, 'encoding', 'utf-8')
+    define(this, 'readable', transform.readable)
+    define(this, 'writable', transform.writable)
+  }
+  // Makes Object.prototype.toString report [object TextEncoderStream]
+  get [Symbol.toStringTag]() {
+    return 'TextEncoderStream'
+  }
+}
 // Warning: unlike whatwg-encoding, returns lowercased labels
 // Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
 export function getBOMEncoding(input) {

package/fallback/latin1.js CHANGED Viewed

@@ -5,6 +5,7 @@ import {
   nativeBuffer,
   isHermes,
   isDeno,
+  isLE,
 } from './_utils.js'
 // See http://stackoverflow.com/a/22747272/680742, which says that lowest limit is in Chrome, with 0xffff args
@@ -60,6 +61,16 @@ export function decodeLatin1(arr, start = 0, stop = arr.length) {
   return String.fromCharCode.apply(String, sliced)
 }
+// Unchecked for well-formedness, raw. Expects Uint16Array input
+// Converts the first `stop` 16-bit units of `u16` (default: all of them) into a string.
+// The implementation is picked once at module load:
+// - native Buffer on a little-endian, non-Deno runtime: inputs longer than 32 units go through a
+//   Buffer view over the same memory (stop * 2 bytes), shorter ones through decodeLatin1
+// - everywhere else: always decodeLatin1
+// NOTE(review): ucs2Slice is not defined in this diff; presumably it reads the bytes as
+// little-endian UTF-16, which would explain the isLE guard - confirm
+export const decodeUCS2 =
+  nativeBuffer && isLE && !isDeno
+    ? (u16, stop = u16.length) => {
+        // TODO: fast path for BE, perhaps faster path for Deno. Note that decoder replaces, this function doesn't
+        if (stop > 32) return nativeBuffer.from(u16.buffer, u16.byteOffset, stop * 2).ucs2Slice() // from 64 bytes, below are in heap
+        return decodeLatin1(u16, 0, stop)
+      }
+    : (u16, stop = u16.length) => decodeLatin1(u16, 0, stop)
 // Does not check input, uses best available method
 // Building an array for this is only faster than proper string concatenation when TextDecoder or native Buffer are available
 export const decodeAscii = nativeBuffer
@@ -70,7 +81,10 @@ export const decodeAscii = nativeBuffer
         : nativeDecoder.decode(a) // On Node.js, utf8 decoder is faster than latin1
   : nativeDecoderLatin1
     ? (a) => nativeDecoderLatin1.decode(a) // On browsers (specifically WebKit), latin1 decoder is faster than utf8
-    : (a) => decodeLatin1(new Uint8Array(a.buffer, a.byteOffset, a.byteLength)) // Fallback. We shouldn't get here, constructing with strings directly is faster
+    : (a) =>
+        decodeLatin1(
+          a instanceof Uint8Array ? a : new Uint8Array(a.buffer, a.byteOffset, a.byteLength)
+        )
 /* eslint-disable @exodus/mutable/no-param-reassign-prop-only */
@@ -95,6 +109,7 @@ export const encodeCharcodes = isHermes
 /* eslint-enable @exodus/mutable/no-param-reassign-prop-only */
+// Warning: can be used only on checked strings, converts strings to 8-bit
 export const encodeLatin1 = (str) => encodeCharcodes(str, new Uint8Array(str.length))
 // Expects nativeEncoder to be present

package/fallback/multi-byte.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { asciiPrefix, decodeLatin1 } from './latin1.js'
+import { asciiPrefix, decodeAscii, decodeLatin1, decodeUCS2 } from './latin1.js'
 import { getTable } from './multi-byte.table.js'
 export const E_STRICT = 'Input is not well-formed for this encoding'
@@ -11,36 +11,48 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
 // Common between euc-kr and big5
 function bigDecoder(err, pair) {
   let lead = 0
+  let oi = 0
+  let o16
   const decodeLead = (b) => {
-    const str = pair(lead, b)
+    const p = pair(lead, b)
     lead = 0
-    if (str) return str
-    return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
+    if (typeof p === 'number') {
+      o16[oi++] = p
+    } else if (p) {
+      // This is still faster than string concatenation. Can we optimize strings though?
+      for (let i = 0; i < p.length; i++) o16[oi++] = p.charCodeAt(i)
+    } else {
+      o16[oi++] = err()
+      if (b < 128) o16[oi++] = b
+    }
   }
   const decode = (arr, start, end, stream) => {
-    let res = ''
     let i = start
+    o16 = new Uint16Array(end - start + (lead ? 1 : 0)) // there are pairs but they consume more than one byte
+    oi = 0
-    if (lead && i < end) res += decodeLead(arr[i++])
+    if (lead && i < end) decodeLead(arr[i++])
     while (i < end) {
       const b = arr[i++]
       if (b < 128) {
-        res += String.fromCharCode(b)
+        o16[oi++] = b
       } else if (b === 0x80 || b === 0xff) {
-        res += String.fromCharCode(err())
+        o16[oi++] = err()
       } else {
         lead = b
-        if (i < end) res += decodeLead(arr[i++])
+        if (i < end) decodeLead(arr[i++])
       }
     }
     if (lead && !stream) {
       lead = 0
-      res += String.fromCharCode(err())
+      o16[oi++] = err()
     }
+    const res = decodeUCS2(o16, oi)
+    o16 = null
     return res
   }
@@ -57,7 +69,7 @@ const mappers = {
     return bigDecoder(err, (l, b) => {
       if (b < 0x41 || b > 0xfe) return
       const cp = euc[(l - 0x81) * 190 + b - 0x41]
-      return cp !== undefined && cp !== REP ? String.fromCharCode(cp) : undefined
+      return cp !== undefined && cp !== REP ? cp : undefined
     })
   },
   // https://encoding.spec.whatwg.org/#euc-jp-decoder
@@ -66,55 +78,61 @@ const mappers = {
     const jis0212 = getTable('jis0212')
     let j12 = false
     let lead = 0
+    let oi = 0
+    let o16
     const decodeLead = (b) => {
       if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
         lead = 0
-        return String.fromCharCode(0xfe_c0 + b)
-      }
-      if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
+        o16[oi++] = 0xfe_c0 + b
+      } else if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
         j12 = true
         lead = b
-        return ''
-      }
+      } else {
+        let cp
+        if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
+          cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
+        }
-      let cp
-      if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
-        cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
+        lead = 0
+        j12 = false
+        if (cp !== undefined && cp !== REP) {
+          o16[oi++] = cp
+        } else {
+          o16[oi++] = err()
+          if (b < 128) o16[oi++] = b
+        }
       }
-      lead = 0
-      j12 = false
-      if (cp !== undefined && cp !== REP) return String.fromCharCode(cp)
-      return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
     }
     const decode = (arr, start, end, stream) => {
-      let res = ''
       let i = start
+      o16 = new Uint16Array(end - start + (lead ? 1 : 0))
+      oi = 0
-      if (lead && i < end) res += decodeLead(arr[i++])
-      if (lead && i < end) res += decodeLead(arr[i++]) // could be two leads, but no more
+      if (lead && i < end) decodeLead(arr[i++])
+      if (lead && i < end) decodeLead(arr[i++]) // could be two leads, but no more
       while (i < end) {
         const b = arr[i++]
         if (b < 128) {
-          res += String.fromCharCode(b)
+          o16[oi++] = b
         } else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
-          res += String.fromCharCode(err())
+          o16[oi++] = err()
         } else {
           lead = b
-          if (i < end) res += decodeLead(arr[i++])
-          if (lead && i < end) res += decodeLead(arr[i++]) // could be two leads
+          if (i < end) decodeLead(arr[i++])
+          if (lead && i < end) decodeLead(arr[i++]) // could be two leads
         }
       }
       if (lead && !stream) {
         lead = 0
         j12 = false // can be true only when lead is non-zero
-        res += String.fromCharCode(err())
+        o16[oi++] = err()
       }
+      const res = decodeUCS2(o16, oi)
+      o16 = null
       return res
     }
@@ -238,7 +256,8 @@ const mappers = {
     }
     const decode = (arr, start, end, stream) => {
-      let res = ''
+      const o16 = new Uint16Array(end - start + 2) // err in eof + lead from state
+      let oi = 0
       let i = start
       const pushback = [] // local and auto-cleared
@@ -246,7 +265,7 @@ const mappers = {
       // Same as the full loop, but without EOF handling
       while (i < end || pushback.length > 0) {
         const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
-        if (c !== undefined) res += String.fromCodePoint(c)
+        if (c !== undefined) o16[oi++] = c // 16-bit
       }
       // Then, dump EOF. This needs the same loop as the characters can be pushed back
@@ -254,11 +273,11 @@ const mappers = {
         while (i <= end || pushback.length > 0) {
           if (i < end || pushback.length > 0) {
             const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
-            if (c !== undefined) res += String.fromCodePoint(c)
+            if (c !== undefined) o16[oi++] = c // 16-bit
           } else {
             const c = eof(pushback)
             if (c === null) break // clean exit
-            res += String.fromCodePoint(c)
+            o16[oi++] = c
           }
         }
       }
@@ -272,7 +291,7 @@ const mappers = {
         out = false
       }
-      return res
+      return decodeUCS2(o16, oi)
     }
     return { decode, isAscii: () => false }
@@ -281,44 +300,57 @@ const mappers = {
   shift_jis: (err) => {
     const jis0208 = getTable('jis0208')
     let lead = 0
+    let oi = 0
+    let o16
     const decodeLead = (b) => {
       const l = lead
       lead = 0
       if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
         const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
-        if (p >= 8836 && p <= 10_715) return String.fromCharCode(0xe0_00 - 8836 + p)
+        if (p >= 8836 && p <= 10_715) {
+          o16[oi++] = 0xe0_00 - 8836 + p
+          return
+        }
         const cp = jis0208[p]
-        if (cp !== undefined && cp !== REP) return String.fromCharCode(cp)
+        if (cp !== undefined && cp !== REP) {
+          o16[oi++] = cp
+          return
+        }
       }
-      return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
+      o16[oi++] = err()
+      if (b < 128) o16[oi++] = b
     }
     const decode = (arr, start, end, stream) => {
-      let res = ''
+      o16 = new Uint16Array(end - start + (lead ? 1 : 0))
+      oi = 0
       let i = start
-      if (lead && i < end) res += decodeLead(arr[i++])
+      if (lead && i < end) decodeLead(arr[i++])
       while (i < end) {
         const b = arr[i++]
         if (b <= 0x80) {
-          res += String.fromCharCode(b) // 0x80 is allowed
+          o16[oi++] = b // 0x80 is allowed
         } else if (b >= 0xa1 && b <= 0xdf) {
-          res += String.fromCharCode(0xfe_c0 + b)
+          o16[oi++] = 0xfe_c0 + b
         } else if (b === 0xa0 || b > 0xfc) {
-          res += String.fromCharCode(err())
+          o16[oi++] = err()
         } else {
           lead = b
-          if (i < end) res += decodeLead(arr[i++])
+          if (i < end) decodeLead(arr[i++])
         }
       }
       if (lead && !stream) {
         lead = 0
-        res += String.fromCharCode(err())
+        o16[oi++] = err()
       }
+      const res = decodeUCS2(o16, oi)
+      o16 = null
       return res
     }
@@ -349,7 +381,8 @@ const mappers = {
     // g3 is 0 or 0x81-0xfe
     const decode = (arr, start, end, stream) => {
-      let res = ''
+      const o16 = new Uint16Array(end - start + (g1 ? 3 : 0)) // even with pushback it's at most 1 char per byte
+      let oi = 0
       let i = start
       const pushback = [] // local and auto-cleared
@@ -357,30 +390,38 @@ const mappers = {
       // Same as the full loop, but without EOF handling
       while (i < end || pushback.length > 0) {
         const b = pushback.length > 0 ? pushback.pop() : arr[i++]
-        if (g3) {
-          if (b < 0x30 || b > 0x39) {
-            pushback.push(b, g3, g2)
-            g1 = g2 = g3 = 0
-            res += String.fromCharCode(err())
-          } else {
-            const p = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
-            g1 = g2 = g3 = 0
-            if (p === undefined) {
-              res += String.fromCharCode(err())
+        if (g1) {
+          // g2 can be set only when g1 is set, g3 can be set only when g2 is set
+          // hence, 3 checks for g3 is faster than 3 checks for g1
+          if (g2) {
+            if (g3) {
+              if (b < 0x30 || b > 0x39) {
+                pushback.push(b, g3, g2)
+                g1 = g2 = g3 = 0
+                o16[oi++] = err()
+              } else {
+                const p = index(
+                  (g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30
+                )
+                g1 = g2 = g3 = 0
+                if (p === undefined) {
+                  o16[oi++] = err()
+                } else if (p <= 0xff_ff) {
+                  o16[oi++] = p // Can validly return replacement
+                } else {
+                  const d = p - 0x1_00_00
+                  o16[oi++] = 0xd8_00 | (d >> 10)
+                  o16[oi++] = 0xdc_00 | (d & 0x3_ff)
+                }
+              }
+            } else if (b >= 0x81 && b <= 0xfe) {
+              g3 = b
             } else {
-              res += String.fromCodePoint(p) // Can validly return replacement
+              pushback.push(b, g2)
+              g1 = g2 = 0
+              o16[oi++] = err()
             }
-          }
-        } else if (g2) {
-          if (b >= 0x81 && b <= 0xfe) {
-            g3 = b
-          } else {
-            pushback.push(b, g2)
-            g1 = g2 = 0
-            res += String.fromCharCode(err())
-          }
-        } else if (g1) {
-          if (b >= 0x30 && b <= 0x39) {
+          } else if (b >= 0x30 && b <= 0x39) {
             g2 = b
           } else {
             let cp
@@ -390,18 +431,18 @@ const mappers = {
             g1 = 0
             if (cp !== undefined && cp !== REP) {
-              res += String.fromCodePoint(cp)
+              o16[oi++] = cp // 16-bit
             } else {
-              res += String.fromCharCode(err())
-              if (b < 128) res += String.fromCharCode(b) // can be processed immediately
+              o16[oi++] = err()
+              if (b < 128) o16[oi++] = b // can be processed immediately
             }
           }
         } else if (b < 128) {
-          res += String.fromCharCode(b)
+          o16[oi++] = b
         } else if (b === 0x80) {
-          res += '\u20AC'
+          o16[oi++] = 0x20_ac
         } else if (b === 0xff) {
-          res += String.fromCharCode(err())
+          o16[oi++] = err()
         } else {
           g1 = b
         }
@@ -410,10 +451,10 @@ const mappers = {
       // if g1 = 0 then g2 = g3 = 0
       if (g1 && !stream) {
         g1 = g2 = g3 = 0
-        res += String.fromCharCode(err())
+        o16[oi++] = err()
       }
-      return res
+      return decodeUCS2(o16, oi)
     }
     return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
@@ -433,6 +474,7 @@ const mappers = {
 export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
 export function multibyteDecoder(enc, loose = false) {
+  if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
   if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
   // Input is assumed to be typechecked already
@@ -452,8 +494,9 @@ export function multibyteDecoder(enc, loose = false) {
   return (arr, stream = false) => {
     let res = ''
     if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
-      res = decodeLatin1(arr, 0, asciiPrefix(arr))
-      if (res.length === arr.length) return res // ascii
+      const prefixLen = asciiPrefix(arr)
+      if (prefixLen === arr.length) return decodeAscii(arr) // ascii
+      res = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
     }
     streaming = stream // affects onErr

package/fallback/multi-byte.table.js CHANGED Viewed

@@ -56,7 +56,9 @@ function unwrap(res, t, pos, stringMode = false) {
         }
         if (stringMode) {
-          for (let k = 0; k < x; k++, pos++, code++) res[pos] = String.fromCodePoint(code)
+          for (let k = 0; k < x; k++, pos++, code++) {
+            res[pos] = code <= 0xff_ff ? code : String.fromCodePoint(code)
+          }
         } else {
           for (let k = 0; k < x; k++, pos++, code++) res[pos] = code
         }
@@ -65,8 +67,13 @@ function unwrap(res, t, pos, stringMode = false) {
       pos = unwrap(res, indices[x], pos, stringMode) // self-reference using shared chunks
     } else if (stringMode) {
       const s = [...utf16toString(loadBase64(x), 'uint8-le')] // splits by codepoints
-      for (let i = 0; i < s.length; ) res[pos++] = s[i++] // TODO: splice?
-      code = s[s.length - 1].codePointAt(0) + 1
+      let char
+      for (let i = 0; i < s.length; ) {
+        char = s[i++]
+        res[pos++] = char.length === 1 ? char.charCodeAt(0) : char // strings only for high codepoints
+      }
+      code = char.codePointAt(0) + 1
     } else {
       const u16 = to16input(loadBase64(x), true) // data is little-endian
       res.set(u16, pos)

package/fallback/single-byte.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { asciiPrefix, decodeLatin1 } from './latin1.js'
+import { asciiPrefix, decodeAscii, decodeLatin1 } from './latin1.js'
 import encodings from './single-byte.encodings.js'
 import { decode2string } from './_utils.js'
@@ -23,6 +23,7 @@ function getEncoding(encoding) {
 const mappers = new Map()
 const decoders = new Map()
+const encmaps = new Map()
 // Used only on Node.js, no reason to optimize for anything else
 // E.g. avoiding .from and filling zero-initialized arr manually is faster on Hermes, but we avoid this codepath on Hermes completely
@@ -31,7 +32,7 @@ export function encodingMapper(encoding) {
   if (cached) return cached
   const codes = getEncoding(encoding)
-  const incomplete = codes.includes(0xff_fd)
+  const incomplete = codes.includes(r)
   let map
   const mapper = (arr, start = 0) => {
     if (!map) {
@@ -66,7 +67,7 @@ export function encodingDecoder(encoding) {
   let strings
   const codes = getEncoding(encoding)
-  const incomplete = codes.includes(0xff_fd)
+  const incomplete = codes.includes(r)
   const decoder = (arr, loose = false) => {
     if (!strings) {
       const allCodes = Array.from({ length: 128 }, (_, i) => i).concat(codes)
@@ -74,8 +75,9 @@ export function encodingDecoder(encoding) {
       strings = allCodes.map((c) => String.fromCharCode(c))
     }
-    const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
-    if (prefix.length === arr.length) return prefix
+    const prefixLen = asciiPrefix(arr)
+    if (prefixLen === arr.length) return decodeAscii(arr)
+    const prefix = decodeLatin1(arr, 0, prefixLen) // TODO: check if decodeAscii with subarray is faster for small prefixes too
     const suffix = decode2string(arr, prefix.length, arr.length, strings)
     if (!loose && incomplete && suffix.includes('\uFFFD')) throw new TypeError(E_STRICT)
     return prefix + suffix
@@ -84,3 +86,21 @@ export function encodingDecoder(encoding) {
   decoders.set(encoding, decoder)
   return decoder
 }
+export function encodeMap(encoding) { // builds (once per encoding) a Uint8Array lookup table: index = char code, value = encoded byte
+  const cached = encmaps.get(encoding) // tables are memoized in encmaps, keyed by encoding
+  if (cached) return cached
+  const codes = getEncoding(encoding)
+  let max = 128
+  while (codes.length < 128) codes.push(128 + codes.length) // pads in place so index i holds code 128 + i; NOTE(review): mutates the array returned by getEncoding - confirm intended
+  for (const code of codes) if (code > max && code !== r) max = code // highest code other than r; r is defined outside this view (presumably the 0xfffd placeholder - confirm)
+  const map = new Uint8Array(max + 1) // < 10 KiB for all except macintosh, 63 KiB for macintosh
+  for (let i = 0; i < 128; i++) {
+    map[i] = i // codes 0..127 encode to the same byte value
+    if (codes[i] !== r) map[codes[i]] = 128 + i // the code at table index i encodes to byte 128 + i; r entries get no mapping
+  }
+  encmaps.set(encoding, map)
+  return map // entries never assigned above stay 0 (typed arrays are zero-initialized)
+}

package/fallback/utf16.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { decodeLatin1, encodeCharcodes } from './latin1.js'
+import { decodeUCS2, encodeCharcodes } from './latin1.js'
 import { isLE } from './_utils.js'
 export const E_STRICT = 'Input is not well-formed utf16'
@@ -38,9 +38,9 @@ export function to16input(u8, le) {
 }
 export const decode = (u16, loose = false, checked = false) => {
-  if (checked || isWellFormed(u16)) return decodeLatin1(u16, 0, u16.length) // it's capable of decoding Uint16Array to UTF-16 as well
+  if (checked || isWellFormed(u16)) return decodeUCS2(u16)
   if (!loose) throw new TypeError(E_STRICT)
-  return decodeLatin1(toWellFormed(Uint16Array.from(u16)), 0, u16.length) // cloned for replacement
+  return decodeUCS2(toWellFormed(Uint16Array.from(u16))) // cloned for replacement
 }
 export function encode(str, loose = false, checked = false, swapped = false) {

package/multi-byte.node.js CHANGED Viewed

@@ -1,10 +1,8 @@
 import { assertUint8 } from './assert.js'
-import { isDeno } from './fallback/_utils.js'
+import { isDeno, toBuf } from './fallback/_utils.js'
 import { isAsciiSuperset, multibyteDecoder } from './fallback/multi-byte.js'
 import { isAscii } from 'node:buffer'
-const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
 export function createMultibyteDecoder(encoding, loose = false) {
   const jsDecoder = multibyteDecoder(encoding, loose) // asserts
   let streaming = false

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@exodus/bytes",
-  "version": "1.5.0",
+  "version": "1.7.0",
   "description": "Various operations on Uint8Array data",
   "scripts": {
     "lint": "eslint .",
@@ -145,6 +145,7 @@
     "@exodus/prettier": "^1.0.0",
     "@exodus/test": "^1.0.0-rc.109",
     "@noble/hashes": "^2.0.1",
+    "@petamoriken/float16": "^3.9.3",
     "@scure/base": "^1.2.6",
     "@stablelib/base64": "^2.0.1",
     "@stablelib/hex": "^2.0.1",
@@ -172,6 +173,7 @@
     "typescript": "^5.9.3",
     "uint8array-tools": "^0.0.9",
     "utf8": "^3.0.0",
+    "web-streams-polyfill": "^4.2.0",
     "whatwg-encoding": "^3.1.1",
     "wif": "^5.0.0"
   },

package/single-byte.js CHANGED Viewed

@@ -1,11 +1,18 @@
 import { assertUint8 } from './assert.js'
-import { canDecoders } from './fallback/_utils.js'
-import { assertEncoding, encodingDecoder } from './fallback/single-byte.js'
+import { canDecoders, nativeEncoder } from './fallback/_utils.js'
+import { encodeAscii } from './fallback/latin1.js'
+import { assertEncoding, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
 const { TextDecoder } = globalThis
 let windows1252works
+// prettier-ignore
+const skipNative = new Set([
+  'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
+  'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
+])
 function shouldUseNative(enc) {
   // https://issues.chromium.org/issues/468458388
   // Also might be incorrectly implemented on platforms as Latin1 (e.g. in Node.js) or regress
@@ -24,11 +31,11 @@ function shouldUseNative(enc) {
     return windows1252works
   }
-  // iso-8859-16 is somehow broken in WebKit, at least on CI
-  return enc !== 'iso-8859-16'
+  return !skipNative.has(enc)
 }
 export function createSinglebyteDecoder(encoding, loose = false) {
+  if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
   assertEncoding(encoding)
   if (canDecoders && shouldUseNative(encoding)) {
@@ -51,4 +58,45 @@ export function createSinglebyteDecoder(encoding, loose = false) {
   }
 }
+const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
+function encode(s, m) { // maps each UTF-16 code unit of s through lookup table m; returns a Uint8Array, or null if any unit has no mapping
+  const len = s.length
+  const x = new Uint8Array(len) // one output byte per input code unit
+  for (let i = 0; i < len; i++) {
+    const x0 = s.charCodeAt(i)
+    const c0 = m[x0] // falsy (0 or undefined) when the table has no byte for this code unit
+    if (!c0 && x0) return null // a falsy lookup is acceptable only for code unit 0 itself
+    x[i] = c0
+  }
+  return x
+}
+export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) { // returns a (string) => Uint8Array encoder for the given encoding; throws Error for any mode other than 'fatal'
+  // TODO: replacement, truncate (replacement will need varying length)
+  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const m = encodeMap(encoding) // asserts
+  // No single-byte encoder produces surrogate pairs, so any surrogate is invalid
+  // This needs special treatment only to decide how many replacement chars to output, one or two
+  // Not much use in running isWellFormed, most likely cause of error is unmapped chars, not surrogate pairs
+  return (s) => { // the encoder: throws TypeError on non-string input or on unmappable characters
+    if (typeof s !== 'string') throw new TypeError('Input is not a string')
+    // Instead of an ASCII regex check, encode optimistically - this is faster
+    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
+    if (nativeEncoder && !NON_LATIN.test(s)) {
+      try {
+        return encodeAscii(s, E_STRICT)
+      } catch {} // any throw from the optimistic attempt is ignored on purpose; execution falls through to the table-based path
+    }
+    const res = encode(s, m) // null when some code unit has no mapping in m
+    if (!res) throw new TypeError(E_STRICT)
+    return res
+  }
+}
 export const windows1252toString = createSinglebyteDecoder('windows-1252')
+export const windows1252fromString = createSinglebyteEncoder('windows-1252')

package/single-byte.node.js CHANGED Viewed

@@ -1,10 +1,8 @@
 import { assertUint8 } from './assert.js'
 import { isAscii } from 'node:buffer'
-import { isDeno, isLE } from './fallback/_utils.js'
+import { isDeno, isLE, toBuf } from './fallback/_utils.js'
 import { asciiPrefix } from './fallback/latin1.js'
-import { encodingMapper, encodingDecoder, E_STRICT } from './fallback/single-byte.js'
-const toBuf = (x) => Buffer.from(x.buffer, x.byteOffset, x.byteLength)
+import { encodingMapper, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
 function latin1Prefix(arr, start) {
   let p = start | 0
@@ -24,6 +22,7 @@ function latin1Prefix(arr, start) {
 }
 export function createSinglebyteDecoder(encoding, loose = false) {
+  if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
   const latin1path = encoding === 'windows-1252'
   if (isDeno) {
     const jsDecoder = encodingDecoder(encoding) // asserts
@@ -59,4 +58,51 @@ export function createSinglebyteDecoder(encoding, loose = false) {
   }
 }
+const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
+export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) { // returns a (string) => Uint8Array encoder for the given encoding; throws Error for any mode other than 'fatal'
+  // TODO: replacement, truncate (replacement will need varying length)
+  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const m = encodeMap(encoding) // asserts
+  return (s) => { // the encoder: throws TypeError on non-string input or on unmappable characters
+    if (typeof s !== 'string') throw new TypeError('Input is not a string')
+    // Instead of an ASCII regex check, encode optimistically - this is faster
+    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
+    if (!NON_LATIN.test(s)) {
+      const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
+      if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength) // equal lengths mean one UTF-8 byte per char, i.e. pure ASCII; the result is a view over b's memory
+    }
+    const len = s.length
+    let i = 0 // shared by the unrolled loop and the remainder loop below
+    const b = Buffer.from(s, 'utf-16le') // aligned
+    if (!isLE) b.swap16() // NOTE(review): presumably puts the bytes in native order for the Uint16Array view on big-endian hosts - confirm
+    const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2) // 16-bit view over b, one element per code unit
+    for (const len3 = len - 3; i < len3; i += 4) { // four code units per iteration; leftovers are handled by the loop after this one
+      const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
+      const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
+      if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) { // cheap all-truthy test first; a falsy lookup is an error only when its code unit is nonzero
+        throw new TypeError(E_STRICT)
+      }
+      x[i] = c0 // mapped bytes overwrite the code units in place
+      x[i + 1] = c1
+      x[i + 2] = c2
+      x[i + 3] = c3
+    }
+    for (; i < len; i++) { // remaining 0-3 code units, same check as above
+      const x0 = x[i]
+      const c0 = m[x0]
+      if (!c0 && x0) throw new TypeError(E_STRICT)
+      x[i] = c0
+    }
+    return new Uint8Array(x) // element-wise copy into a fresh array, one byte per 16-bit element
+  }
+}
 export const windows1252toString = createSinglebyteDecoder('windows-1252')
+export const windows1252fromString = createSinglebyteEncoder('windows-1252')

package/utf8.js CHANGED Viewed

@@ -57,7 +57,7 @@ function decode(arr, loose = false) {
   if (nativeDecoder) return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr) // Node.js and browsers
   // Fast path for ASCII prefix, this is faster than all alternatives below
-  const prefix = decodeLatin1(arr, 0, asciiPrefix(arr))
+  const prefix = decodeLatin1(arr, 0, asciiPrefix(arr)) // No native decoder to use, so decodeAscii is useless here
   if (prefix.length === arr.length) return prefix
   // This codepath gives a ~3x perf boost on Hermes