npm - @exodus/bytes - Versions diffs - 1.0.0 → 1.2.0 - Mend

@exodus/bytes 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md +8 -14
package/fallback/encoding.js +3 -3
package/fallback/encoding.labels.js +10 -10
package/package.json +1 -1
package/utf16.node.js +1 -0
package/utf8.node.js +16 -3

package/README.md CHANGED Viewed

@@ -41,6 +41,7 @@ Spec compliant, passing WPT and covered with extra tests.
 Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
 [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
+Runs (and passes WPT) on Node.js built without ICU.
 ### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
@@ -205,16 +206,10 @@ as an ASCII-lowercased string.
 If an encoding with that label does not exist, returns `null`.
 This is the same as [`decoder.encoding` getter](https://encoding.spec.whatwg.org/#dom-textdecoder-encoding),
-except that it does not throw for invalid labels and instead returns `null`, and is identical to
-the following code:
-```js
-try {
-  if (!label) return null // does not default to 'utf-8'
-  return new TextDecoder(label).encoding
-} catch {
-  return null
-}
-```
+except that it:
+ 1. Supports [`replacement` encoding](https://encoding.spec.whatwg.org/#replacement) and its
+    [labels](https://encoding.spec.whatwg.org/#ref-for-replacement%E2%91%A1)
+ 2. Does not throw for invalid labels and instead returns `null`
 All encoding names are also valid labels for corresponding encodings.
@@ -233,15 +228,14 @@ Given a `TypedArray` or an `ArrayBuffer` instance `input`, returns either of:
 Implements [decode](https://encoding.spec.whatwg.org/#decode) legacy hook.
 Given a `TypedArray` or an `ArrayBuffer` instance `input` and an optional `fallbackEncoding`
-normalized encoding name, sniffs encoding from BOM with `fallbackEncoding` fallback and then
+encoding [label](https://encoding.spec.whatwg.org/#names-and-labels),
+sniffs encoding from BOM with `fallbackEncoding` fallback and then
 decodes the `input` using that encoding, skipping BOM if it was present.
 Notes:
  * BOM-sniffed encoding takes precedence over `fallbackEncoding` option per spec.
    Use with care.
- * `fallbackEncoding` must be ASCII-lowercased encoding name,
-   e.g. a result of `normalizeEncoding(label)` call.
  * Always operates in non-fatal [mode](https://encoding.spec.whatwg.org/#textdecoder-error-mode),
    aka replacement. It can convert different byte sequences to equal strings.
@@ -249,7 +243,7 @@ This method is similar to the following code, except that it doesn't support enc
 only expects lowercased encoding name:
 ```js
-new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding ?? 'utf-8').decode(input)
+new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
 ```
 ### `@exodus/bytes/encoding-lite.js`

package/fallback/encoding.js CHANGED Viewed

@@ -256,13 +256,13 @@ export function getBOMEncoding(input) {
 // https://encoding.spec.whatwg.org/#decode
 // Warning: encoding sniffed from BOM takes precedence over the supplied one
 // Warning: lossy, performs replacement, no option of throwing
-// Expects normalized (lower-case) encoding as input. Completely ignores it and even skips validation when BOM is found
+// Completely ignores encoding and even skips validation when BOM is found
 // Unlike TextDecoder public API, additionally supports 'replacement' encoding
-export function legacyHookDecode(input, fallbackEncoding) {
+export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
   let u8 = fromSource(input)
   const bomEncoding = getBOMEncoding(u8)
   if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
-  const enc = bomEncoding ?? fallbackEncoding ?? 'utf-8' // "the byte order mark is more authoritative than anything else"
+  const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
   if (enc === 'utf-8') return utf8toStringLoose(u8)
   if (enc === 'utf-16le' || enc === 'utf-16be') {

package/fallback/encoding.labels.js CHANGED Viewed

@@ -2,7 +2,7 @@
 /* eslint-disable @exodus/export-default/named */
 // prettier-ignore
-export default {
+const labels = {
   'utf-8': ['unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'utf8', 'x-unicode20utf8'],
   ibm866: ['866', 'cp866', 'csibm866'],
   'iso-8859-2': ['csisolatin2', 'iso-ir-101', 'iso8859-2', 'iso88592', 'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2'],
@@ -22,15 +22,6 @@ export default {
   'koi8-u': ['koi8-ru'],
   macintosh: ['csmacintosh', 'mac', 'x-mac-roman'],
   'windows-874': ['dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620'],
-  'windows-1250': ['cp1250', 'x-cp1250'],
-  'windows-1251': ['cp1251', 'x-cp1251'],
-  'windows-1252': ['ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'x-cp1252'],
-  'windows-1253': ['cp1253', 'x-cp1253'],
-  'windows-1254': ['cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'x-cp1254'],
-  'windows-1255': ['cp1255', 'x-cp1255'],
-  'windows-1256': ['cp1256', 'x-cp1256'],
-  'windows-1257': ['cp1257', 'x-cp1257'],
-  'windows-1258': ['cp1258', 'x-cp1258'],
   'x-mac-cyrillic': ['x-mac-ukrainian'],
   gbk: ['chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312', 'gb_2312-80', 'iso-ir-58', 'x-gbk'],
   gb18030: [],
@@ -44,3 +35,12 @@ export default {
   'utf-16le': ['csunicode', 'iso-10646-ucs-2', 'ucs-2', 'unicode', 'unicodefeff', 'utf-16'],
   'x-user-defined': [],
 }
+for (let i = 0; i < 9; i++) labels[`windows-125${i}`] = [`cp125${i}`, `x-cp125${i}`]
+// prettier-ignore
+labels['windows-1252'].push('ansi_x3.4-1968', 'ascii', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii')
+// prettier-ignore
+labels['windows-1254'].push('csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5')
+export default labels

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@exodus/bytes",
-  "version": "1.0.0",
+  "version": "1.2.0",
   "description": "Various operations on Uint8Array data",
   "scripts": {
     "lint": "eslint .",

package/utf16.node.js CHANGED Viewed

@@ -36,6 +36,7 @@ const swapped = (x, swap) =>
   swap ? Buffer.from(x).swap16() : Buffer.from(x.buffer, x.byteOffset, x.byteLength)
 // We skip TextDecoder on Node.js, as it is, for some reason, significantly slower than Buffer for utf16
+// Also, it incorrectly misses replacements when Node.js is built without ICU; we fix that
 function decodeNode(input, loose = false, format = 'uint16') {
   let ble
   if (format === 'uint16') {

package/utf8.node.js CHANGED Viewed

@@ -1,15 +1,21 @@
 import { assertUint8 } from './assert.js'
 import { typedView } from './array.js'
-import { E_STRICT_UNICODE } from './fallback/utf8.js'
+import { E_STRICT, E_STRICT_UNICODE } from './fallback/utf8.js'
 import { isAscii } from 'node:buffer'
 if (Buffer.TYPED_ARRAY_SUPPORT) throw new Error('Unexpected Buffer polyfill')
-const decoderFatal = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
+let decoderFatal
 const decoderLoose = new TextDecoder('utf-8', { ignoreBOM: true })
 const { isWellFormed } = String.prototype
 const isDeno = Boolean(globalThis.Deno)
+try {
+  decoderFatal = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
+} catch {
+  // Without ICU, Node.js doesn't support fatal option for utf-8
+}
 function encode(str, loose = false) {
   if (typeof str !== 'string') throw new TypeError('Input is not a string')
   const strLength = str.length
@@ -45,7 +51,14 @@ function decode(arr, loose = false) {
     return buf.latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
   }
-  return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr)
+  if (loose) return decoderLoose.decode(arr)
+  if (decoderFatal) return decoderFatal.decode(arr)
+  // We are in an environment without native fatal decoder support (non-fixed Node.js built without ICU)
+  // Decode loosely; if the result contains a replacement character, re-encode it and compare with the input. This is still faster than the JS implementation
+  const str = decoderLoose.decode(arr)
+  if (str.includes('\uFFFD') && !Buffer.from(str).equals(arr)) throw new TypeError(E_STRICT)
+  return str
 }
 export const utf8fromString = (str, format = 'uint8') => typedView(encode(str, false), format)