npm - @exodus/bytes - Versions diffs - 1.4.0 → 1.5.0 - Mend

@exodus/bytes 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/fallback/encoding.js +11 -1
package/fallback/multi-byte.js +267 -234
package/package.json +1 -1

package/fallback/encoding.js CHANGED Viewed

@@ -47,6 +47,10 @@ export function normalizeEncoding(label) {
 const define = (obj, key, value) => Object.defineProperty(obj, key, { value, writable: false })
+// TODO: make this stricter against Symbol.toStringTag spoofing
+// This is not very significant though: anything faking Symbol.toStringTag could just as well override
+// prototypes, which is not something we protect against
 function isAnyArrayBuffer(x) {
   if (x instanceof ArrayBuffer) return true
   if (globalThis.SharedArrayBuffer && x instanceof SharedArrayBuffer) return true
@@ -55,6 +59,12 @@ function isAnyArrayBuffer(x) {
   return s === '[object ArrayBuffer]' || s === '[object SharedArrayBuffer]'
 }
+function isAnyUint8Array(x) {
+  if (x instanceof Uint8Array) return true
+  if (!x || !ArrayBuffer.isView(x) || x.BYTES_PER_ELEMENT !== 1) return false
+  return Object.prototype.toString.call(x) === '[object Uint8Array]'
+}
 const fromSource = (x) => {
   if (x instanceof Uint8Array) return x
   if (ArrayBuffer.isView(x)) return new Uint8Array(x.buffer, x.byteOffset, x.byteLength)
@@ -217,7 +227,7 @@ export class TextEncoder {
   encodeInto(str, target) {
     if (typeof str !== 'string') str = `${str}`
-    if (!(target instanceof Uint8Array)) throw new TypeError('Target must be an Uint8Array')
+    if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
     if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
     const tlen = target.length

package/fallback/multi-byte.js CHANGED Viewed

@@ -8,37 +8,57 @@ export const E_STRICT = 'Input is not well-formed for this encoding'
 // If the decoder is not cleared properly, state can be preserved between non-streaming calls!
 // See comment about fatal stream
-// All except iso-2022-jp are ASCII supersets
-// When adding something that is not an ASCII superset, adjust the ASCII fast path
-const REP = 0xff_fd
-const mappers = {
-  // https://encoding.spec.whatwg.org/#euc-kr-decoder
-  'euc-kr': (err) => {
-    const euc = getTable('euc-kr')
-    let lead = 0
+// Common between euc-kr and big5
+function bigDecoder(err, pair) {
+  let lead = 0
-    const pushback = []
-    const bytes = (b) => {
-      if (lead) {
-        const cp = b >= 0x41 && b <= 0xfe ? euc[(lead - 0x81) * 190 + b - 0x41] : undefined
-        lead = 0
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
-      }
+  const decodeLead = (b) => {
+    const str = pair(lead, b)
+    lead = 0
+    if (str) return str
+    return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
+  }
+  const decode = (arr, start, end, stream) => {
+    let res = ''
+    let i = start
-      if (b < 128) return b
-      if (b < 0x81 || b === 0xff) return err()
-      lead = b
+    if (lead && i < end) res += decodeLead(arr[i++])
+    while (i < end) {
+      const b = arr[i++]
+      if (b < 128) {
+        res += String.fromCharCode(b)
+      } else if (b === 0x80 || b === 0xff) {
+        res += String.fromCharCode(err())
+      } else {
+        lead = b
+        if (i < end) res += decodeLead(arr[i++])
+      }
     }
-    const eof = () => {
-      if (!lead) return null
+    if (lead && !stream) {
       lead = 0
-      return err()
+      res += String.fromCharCode(err())
     }
-    return { bytes, eof, pushback }
+    return res
+  }
+  return { decode, isAscii: () => lead === 0 }
+}
+// All except iso-2022-jp are ASCII supersets
+// When adding something that is not an ASCII superset, adjust the ASCII fast path
+const REP = 0xff_fd
+const mappers = {
+  // https://encoding.spec.whatwg.org/#euc-kr-decoder
+  'euc-kr': (err) => {
+    const euc = getTable('euc-kr')
+    return bigDecoder(err, (l, b) => {
+      if (b < 0x41 || b > 0xfe) return
+      const cp = euc[(l - 0x81) * 190 + b - 0x41]
+      return cp !== undefined && cp !== REP ? String.fromCharCode(cp) : undefined
+    })
   },
   // https://encoding.spec.whatwg.org/#euc-jp-decoder
   'euc-jp': (err) => {
@@ -47,64 +67,71 @@ const mappers = {
     let j12 = false
     let lead = 0
-    const pushback = []
-    const bytes = (b) => {
+    const decodeLead = (b) => {
       if (lead === 0x8e && b >= 0xa1 && b <= 0xdf) {
         lead = 0
-        return 0xfe_c0 + b
+        return String.fromCharCode(0xfe_c0 + b)
       }
       if (lead === 0x8f && b >= 0xa1 && b <= 0xfe) {
         j12 = true
         lead = b
-        return
+        return ''
       }
-      if (lead) {
-        let cp
-        if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
-          cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
+      let cp
+      if (lead >= 0xa1 && lead <= 0xfe && b >= 0xa1 && b <= 0xfe) {
+        cp = (j12 ? jis0212 : jis0208)[(lead - 0xa1) * 94 + b - 0xa1]
+      }
+      lead = 0
+      j12 = false
+      if (cp !== undefined && cp !== REP) return String.fromCharCode(cp)
+      return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
+    }
+    const decode = (arr, start, end, stream) => {
+      let res = ''
+      let i = start
+      if (lead && i < end) res += decodeLead(arr[i++])
+      if (lead && i < end) res += decodeLead(arr[i++]) // could be two leads, but no more
+      while (i < end) {
+        const b = arr[i++]
+        if (b < 128) {
+          res += String.fromCharCode(b)
+        } else if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) {
+          res += String.fromCharCode(err())
+        } else {
+          lead = b
+          if (i < end) res += decodeLead(arr[i++])
+          if (lead && i < end) res += decodeLead(arr[i++]) // could be two leads
         }
+      }
+      if (lead && !stream) {
         lead = 0
-        j12 = false
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
+        j12 = false // can be true only when lead is non-zero
+        res += String.fromCharCode(err())
       }
-      if (b < 128) return b
-      if ((b < 0xa1 && b !== 0x8e && b !== 0x8f) || b === 0xff) return err()
-      lead = b
+      return res
     }
-    // eslint-disable-next-line sonarjs/no-identical-functions
-    const eof = () => {
-      if (!lead) return null
-      lead = 0
-      return err()
-    }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => lead === 0 } // j12 can be true only when lead is non-zero
   },
   // https://encoding.spec.whatwg.org/#iso-2022-jp-decoder
-  // Per-letter of the spec, don't shortcut on state changes on EOF. Some code is regrouped but preserving the logic
   'iso-2022-jp': (err) => {
     const jis0208 = getTable('jis0208')
-    const EOF = -1
     let dState = 1
     let oState = 1
-    let lead = 0
+    let lead = 0 // 0 or 0x21-0x7e
     let out = false
-    const pushback = []
-    const bytes = (b) => {
-      if (dState < 5) {
-        if (b === EOF) return null
-        if (b === 0x1b) {
-          dState = 6 // escape start
-          return
-        }
+    const bytes = (pushback, b) => {
+      if (dState < 5 && b === 0x1b) {
+        dState = 6 // escape start
+        return
       }
       switch (dState) {
@@ -156,7 +183,7 @@ const mappers = {
           out = false
           dState = oState
-          if (b !== EOF) pushback.push(b)
+          pushback.push(b)
           return err()
         case 7: {
           // Escape
@@ -185,52 +212,117 @@ const mappers = {
           out = false
           dState = oState
-          if (b !== EOF) pushback.push(b)
-          pushback.push(l)
+          pushback.push(b, l)
+          return err()
+        }
+      }
+    }
+    const eof = (pushback) => {
+      if (dState < 5) return null
+      out = false
+      switch (dState) {
+        case 5:
+          dState = 4
+          return err()
+        case 6:
+          dState = oState
+          return err()
+        case 7: {
+          dState = oState
+          pushback.push(lead)
+          lead = 0
           return err()
         }
       }
     }
-    const eof = () => bytes(EOF)
+    const decode = (arr, start, end, stream) => {
+      let res = ''
+      let i = start
+      const pushback = [] // local and auto-cleared
+      // First, dump everything until EOF
+      // Same as the full loop, but without EOF handling
+      while (i < end || pushback.length > 0) {
+        const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
+        if (c !== undefined) res += String.fromCodePoint(c)
+      }
+      // Then, dump EOF. This needs the same loop as the characters can be pushed back
+      if (!stream) {
+        while (i <= end || pushback.length > 0) {
+          if (i < end || pushback.length > 0) {
+            const c = bytes(pushback, pushback.length > 0 ? pushback.pop() : arr[i++])
+            if (c !== undefined) res += String.fromCodePoint(c)
+          } else {
+            const c = eof(pushback)
+            if (c === null) break // clean exit
+            res += String.fromCodePoint(c)
+          }
+        }
+      }
+      // Chrome and WebKit fail on this, we don't: completely destroy the old decoder state when finished streaming
+      // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
+      // > Set this’s do not flush to options["stream"]
+      if (!stream) {
+        dState = oState = 1
+        lead = 0
+        out = false
+      }
+      return res
+    }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => false }
   },
   // https://encoding.spec.whatwg.org/#shift_jis-decoder
   shift_jis: (err) => {
     const jis0208 = getTable('jis0208')
     let lead = 0
-    const pushback = []
-    const bytes = (b) => {
-      if (lead) {
-        const l = lead
-        lead = 0
-        if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
-          const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
-          if (p >= 8836 && p <= 10_715) return 0xe0_00 - 8836 + p // 16-bit
-          const cp = jis0208[p]
-          if (cp !== undefined && cp !== REP) return cp
-        }
-        if (b < 128) pushback.push(b)
-        return err()
+    const decodeLead = (b) => {
+      const l = lead
+      lead = 0
+      if (b >= 0x40 && b <= 0xfc && b !== 0x7f) {
+        const p = (l - (l < 0xa0 ? 0x81 : 0xc1)) * 188 + b - (b < 0x7f ? 0x40 : 0x41)
+        if (p >= 8836 && p <= 10_715) return String.fromCharCode(0xe0_00 - 8836 + p)
+        const cp = jis0208[p]
+        if (cp !== undefined && cp !== REP) return String.fromCharCode(cp)
       }
-      if (b <= 0x80) return b // 0x80 is allowed
-      if (b >= 0xa1 && b <= 0xdf) return 0xff_61 - 0xa1 + b
-      if (b < 0x81 || (b > 0x9f && b < 0xe0) || b > 0xfc) return err()
-      lead = b
+      return b < 128 ? String.fromCharCode(err(), b) : String.fromCharCode(err())
     }
-    // eslint-disable-next-line sonarjs/no-identical-functions
-    const eof = () => {
-      if (!lead) return null
-      lead = 0 // this clears state completely on EOF
-      return err()
+    const decode = (arr, start, end, stream) => {
+      let res = ''
+      let i = start
+      if (lead && i < end) res += decodeLead(arr[i++])
+      while (i < end) {
+        const b = arr[i++]
+        if (b <= 0x80) {
+          res += String.fromCharCode(b) // 0x80 is allowed
+        } else if (b >= 0xa1 && b <= 0xdf) {
+          res += String.fromCharCode(0xfe_c0 + b)
+        } else if (b === 0xa0 || b > 0xfc) {
+          res += String.fromCharCode(err())
+        } else {
+          lead = b
+          if (i < end) res += decodeLead(arr[i++])
+        }
+      }
+      if (lead && !stream) {
+        lead = 0
+        res += String.fromCharCode(err())
+      }
+      return res
     }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => lead === 0 }
   },
   // https://encoding.spec.whatwg.org/#gbk-decoder
   gbk: (err) => mappers.gb18030(err), // 10.1.1. GBK’s decoder is gb18030’s decoder
@@ -252,179 +344,120 @@ const mappers = {
       return b + p - a
     }
-    const pushback = []
-    const bytes = (b) => {
-      if (g3) {
-        if (b < 0x30 || b > 0x39) {
-          pushback.push(b, g3, g2)
-          g1 = g2 = g3 = 0
-          return err()
-        }
-        const cp = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
-        g1 = g2 = g3 = 0
-        if (cp !== undefined) return cp // Can validly return replacement
-        return err()
-      }
+    // g1 is 0 or 0x81-0xfe
+    // g2 is 0 or 0x30-0x39
+    // g3 is 0 or 0x81-0xfe
+    const decode = (arr, start, end, stream) => {
+      let res = ''
+      let i = start
+      const pushback = [] // local and auto-cleared
+      // First, dump everything until EOF
+      // Same as the full loop, but without EOF handling
+      while (i < end || pushback.length > 0) {
+        const b = pushback.length > 0 ? pushback.pop() : arr[i++]
+        if (g3) {
+          if (b < 0x30 || b > 0x39) {
+            pushback.push(b, g3, g2)
+            g1 = g2 = g3 = 0
+            res += String.fromCharCode(err())
+          } else {
+            const p = index((g1 - 0x81) * 12_600 + (g2 - 0x30) * 1260 + (g3 - 0x81) * 10 + b - 0x30)
+            g1 = g2 = g3 = 0
+            if (p === undefined) {
+              res += String.fromCharCode(err())
+            } else {
+              res += String.fromCodePoint(p) // Can validly return replacement
+            }
+          }
+        } else if (g2) {
+          if (b >= 0x81 && b <= 0xfe) {
+            g3 = b
+          } else {
+            pushback.push(b, g2)
+            g1 = g2 = 0
+            res += String.fromCharCode(err())
+          }
+        } else if (g1) {
+          if (b >= 0x30 && b <= 0x39) {
+            g2 = b
+          } else {
+            let cp
+            if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
+              cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
+            }
-      if (g2) {
-        if (b >= 0x81 && b <= 0xfe) {
-          g3 = b
-          return
+            g1 = 0
+            if (cp !== undefined && cp !== REP) {
+              res += String.fromCodePoint(cp)
+            } else {
+              res += String.fromCharCode(err())
+              if (b < 128) res += String.fromCharCode(b) // can be processed immediately
+            }
+          }
+        } else if (b < 128) {
+          res += String.fromCharCode(b)
+        } else if (b === 0x80) {
+          res += '\u20AC'
+        } else if (b === 0xff) {
+          res += String.fromCharCode(err())
+        } else {
+          g1 = b
         }
-        pushback.push(b, g2)
-        g1 = g2 = 0
-        return err()
       }
-      if (g1) {
-        if (b >= 0x30 && b <= 0x39) {
-          g2 = b
-          return
-        }
-        let cp
-        if (b >= 0x40 && b <= 0xfe && b !== 0x7f) {
-          cp = gb18030[(g1 - 0x81) * 190 + b - (b < 0x7f ? 0x40 : 0x41)]
-        }
-        g1 = 0
-        if (cp !== undefined && cp !== REP) return cp
-        if (b < 128) pushback.push(b)
-        return err()
+      // if g1 = 0 then g2 = g3 = 0
+      if (g1 && !stream) {
+        g1 = g2 = g3 = 0
+        res += String.fromCharCode(err())
       }
-      if (b < 128) return b
-      if (b === 0x80) return 0x20_ac
-      if (b === 0xff) return err()
-      g1 = b
+      return res
     }
-    const eof = () => {
-      if (!g1 && !g2 && !g3) return null
-      g1 = g2 = g3 = 0
-      return err()
-    }
-    return { bytes, eof, pushback }
+    return { decode, isAscii: () => g1 === 0 } // if g1 = 0 then g2 = g3 = 0
+  },
+  // https://encoding.spec.whatwg.org/#big5
+  big5: (err) => {
+    // The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
+    // We store that as strings
+    const big5 = getTable('big5')
+    return bigDecoder(err, (l, b) => {
+      if (b < 0x40 || (b > 0x7e && b < 0xa1) || b === 0xff) return
+      return big5[(l - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)] // strings
+    })
   },
 }
 export const isAsciiSuperset = (enc) => enc !== 'iso-2022-jp' // all others are ASCII supersets and can use fast path
 export function multibyteDecoder(enc, loose = false) {
-  if (enc === 'big5') return big5decoder(loose)
   if (!Object.hasOwn(mappers, enc)) throw new RangeError('Unsupported encoding')
   // Input is assumed to be typechecked already
   let mapper
   const asciiSuperset = isAsciiSuperset(enc)
-  return (arr, stream = false) => {
-    const onErr = loose
-      ? () => REP
-      : () => {
-          mapper.pushback.length = 0 // the queue is cleared on returning an error
-          // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
-          // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
-          // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
-          if (!stream) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
-          throw new TypeError(E_STRICT)
-        }
-    let res = ''
-    const length = arr.length
-    if (asciiSuperset && !mapper) {
-      res = decodeLatin1(arr, 0, asciiPrefix(arr))
-      if (res.length === arr.length) return res // ascii
-    }
-    if (!mapper) mapper = mappers[enc](onErr)
-    const { bytes, eof, pushback } = mapper
-    let i = res.length
-    // First, dump everything until EOF
-    // Same as the full loop, but without EOF handling
-    while (i < length || pushback.length > 0) {
-      const c = bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
-      if (c === undefined) continue // consuming
-      res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
-    }
-    // Then, dump EOF. This needs the same loop as the characters can be pushed back
-    // TODO: only some encodings need this, most can be optimized
-    if (!stream) {
-      while (i <= length || pushback.length > 0) {
-        const isEOF = i === length && pushback.length === 0
-        const c = isEOF ? eof() : bytes(pushback.length > 0 ? pushback.pop() : arr[i++])
-        if (isEOF && c === null) break // clean exit
-        if (c === undefined) continue // consuming
-        res += String.fromCodePoint(c) // gb18030 returns codepoints above 0xFFFF from ranges
+  let streaming // because onErr is cached in mapper
+  const onErr = loose
+    ? () => REP
+    : () => {
+        // The correct way per spec seems to be not destroying the decoder state in stream mode, even when fatal
+        // Decoders big5, euc-jp, euc-kr, shift_jis, gb18030 / gbk - all clear state before throwing unless EOF, so not affected
+        // iso-2022-jp is the only tricky one where this !stream check matters in non-stream mode
+        if (!streaming) mapper = null // destroy state, effectively the same as 'do not flush' = false, but early
+        throw new TypeError(E_STRICT)
       }
-    }
-    // Chrome and WebKit fail on this, we don't: completely destroy the old decoder instance when finished streaming
-    // > If this’s do not flush is false, then set this’s decoder to a new instance of this’s encoding’s decoder,
-    // > Set this’s do not flush to options["stream"]
-    if (!stream) mapper = null
-    return res
-  }
-}
-// The only decoder which returns multiple codepoints per byte, also has non-charcode codepoints
-// We store that as strings
-function big5decoder(loose) {
-  // Input is assumed to be typechecked already
-  let lead = 0
-  let big5
   return (arr, stream = false) => {
-    const onErr = loose
-      ? () => '\uFFFD'
-      : () => {
-          // Lead is always already cleared before throwing
-          throw new TypeError(E_STRICT)
-        }
     let res = ''
-    const length = arr.length
-    if (!lead) {
+    if (asciiSuperset && (!mapper || mapper.isAscii?.())) {
       res = decodeLatin1(arr, 0, asciiPrefix(arr))
       if (res.length === arr.length) return res // ascii
     }
-    if (!big5) big5 = getTable('big5')
-    for (let i = res.length; i < length; i++) {
-      const b = arr[i]
-      if (lead) {
-        let cp
-        if ((b >= 0x40 && b <= 0x7e) || (b >= 0xa1 && b !== 0xff)) {
-          cp = big5[(lead - 0x81) * 157 + b - (b < 0x7f ? 0x40 : 0x62)]
-        }
-        lead = 0
-        if (cp) {
-          res += cp // strings
-        } else {
-          res += onErr()
-          // same as pushing it back: lead is cleared, pushed back can't contain more than 1 byte
-          if (b < 128) res += String.fromCharCode(b)
-        }
-      } else if (b < 128) {
-        res += String.fromCharCode(b)
-      } else if (b < 0x81 || b === 0xff) {
-        res += onErr()
-      } else {
-        lead = b
-      }
-    }
-    if (!stream && lead) {
-      // Destroy decoder state
-      lead = 0
-      res += onErr()
-    }
-    return res
+    streaming = stream // affects onErr
+    if (!mapper) mapper = mappers[enc](onErr)
+    return res + mapper.decode(arr, res.length, arr.length, stream)
   }
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@exodus/bytes",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "description": "Various operations on Uint8Array data",
   "scripts": {
     "lint": "eslint .",