@exodus/bytes 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -41,6 +41,7 @@ Spec compliant, passing WPT and covered with extra tests.
41
41
  Moreover, tests for this library uncovered [bugs in all major implementations](https://docs.google.com/spreadsheets/d/1pdEefRG6r9fZy61WHGz0TKSt8cO4ISWqlpBN5KntIvQ/edit).
42
42
 
43
43
  [Faster than Node.js native implementation on Node.js](https://github.com/nodejs/node/issues/61041#issuecomment-3649242024).
44
+ Runs (and passes WPT) on Node.js built without ICU.
44
45
 
45
46
  ### Caveat: `TextDecoder` / `TextEncoder` APIs are lossy by default per spec
46
47
 
@@ -205,16 +206,10 @@ as an ASCII-lowercased string.
205
206
  If an encoding with that label does not exist, returns `null`.
206
207
 
207
208
  This is the same as [`decoder.encoding` getter](https://encoding.spec.whatwg.org/#dom-textdecoder-encoding),
208
- except that it does not throw for invalid labels and instead returns `null`, and is identical to
209
- the following code:
210
- ```js
211
- try {
212
- if (!label) return null // does not default to 'utf-8'
213
- return new TextDecoder(label).encoding
214
- } catch {
215
- return null
216
- }
217
- ```
209
+ except that it:
210
+ 1. Supports [`replacement` encoding](https://encoding.spec.whatwg.org/#replacement) and its
211
+ [labels](https://encoding.spec.whatwg.org/#ref-for-replacement%E2%91%A1)
212
+ 2. Does not throw for invalid labels and instead returns `null`
218
213
 
219
214
  All encoding names are also valid labels for corresponding encodings.
220
215
 
@@ -233,15 +228,14 @@ Given a `TypedArray` or an `ArrayBuffer` instance `input`, returns either of:
233
228
  Implements [decode](https://encoding.spec.whatwg.org/#decode) legacy hook.
234
229
 
235
230
  Given a `TypedArray` or an `ArrayBuffer` instance `input` and an optional `fallbackEncoding`
236
- normalized encoding name, sniffs encoding from BOM with `fallbackEncoding` fallback and then
231
+ encoding [label](https://encoding.spec.whatwg.org/#names-and-labels),
232
+ sniffs encoding from BOM with `fallbackEncoding` fallback and then
237
233
  decodes the `input` using that encoding, skipping BOM if it was present.
238
234
 
239
235
  Notes:
240
236
 
241
237
  * BOM-sniffed encoding takes precedence over `fallbackEncoding` option per spec.
242
238
  Use with care.
243
- * `fallbackEncoding` must be ASCII-lowercased encoding name,
244
- e.g. a result of `normalizeEncoding(label)` call.
245
239
  * Always operates in non-fatal [mode](https://encoding.spec.whatwg.org/#textdecoder-error-mode),
246
240
  aka replacement. It can convert different byte sequences to equal strings.
247
241
 
@@ -249,7 +243,7 @@ This method is similar to the following code, except that it doesn't support enc
249
243
  only expects lowercased encoding name:
250
244
 
251
245
  ```js
252
- new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding ?? 'utf-8').decode(input)
246
+ new TextDecoder(getBOMEncoding(input) ?? fallbackEncoding).decode(input)
253
247
  ```
254
248
 
255
249
  ### `@exodus/bytes/encoding-lite.js`
@@ -256,13 +256,13 @@ export function getBOMEncoding(input) {
256
256
  // https://encoding.spec.whatwg.org/#decode
257
257
  // Warning: encoding sniffed from BOM takes precedence over the supplied one
258
258
  // Warning: lossy, performs replacement, no option of throwing
259
- // Expects normalized (lower-case) encoding as input. Completely ignores it and even skips validation when BOM is found
259
+ // Completely ignores encoding and even skips validation when BOM is found
260
260
  // Unlike TextDecoder public API, additionally supports 'replacement' encoding
261
- export function legacyHookDecode(input, fallbackEncoding) {
261
+ export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
262
262
  let u8 = fromSource(input)
263
263
  const bomEncoding = getBOMEncoding(u8)
264
264
  if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2)
265
- const enc = bomEncoding ?? fallbackEncoding ?? 'utf-8' // "the byte order mark is more authoritative than anything else"
265
+ const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
266
266
 
267
267
  if (enc === 'utf-8') return utf8toStringLoose(u8)
268
268
  if (enc === 'utf-16le' || enc === 'utf-16be') {
@@ -2,7 +2,7 @@
2
2
 
3
3
  /* eslint-disable @exodus/export-default/named */
4
4
  // prettier-ignore
5
- export default {
5
+ const labels = {
6
6
  'utf-8': ['unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'utf8', 'x-unicode20utf8'],
7
7
  ibm866: ['866', 'cp866', 'csibm866'],
8
8
  'iso-8859-2': ['csisolatin2', 'iso-ir-101', 'iso8859-2', 'iso88592', 'iso_8859-2', 'iso_8859-2:1987', 'l2', 'latin2'],
@@ -22,15 +22,6 @@ export default {
22
22
  'koi8-u': ['koi8-ru'],
23
23
  macintosh: ['csmacintosh', 'mac', 'x-mac-roman'],
24
24
  'windows-874': ['dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911', 'tis-620'],
25
- 'windows-1250': ['cp1250', 'x-cp1250'],
26
- 'windows-1251': ['cp1251', 'x-cp1251'],
27
- 'windows-1252': ['ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii', 'x-cp1252'],
28
- 'windows-1253': ['cp1253', 'x-cp1253'],
29
- 'windows-1254': ['cp1254', 'csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5', 'x-cp1254'],
30
- 'windows-1255': ['cp1255', 'x-cp1255'],
31
- 'windows-1256': ['cp1256', 'x-cp1256'],
32
- 'windows-1257': ['cp1257', 'x-cp1257'],
33
- 'windows-1258': ['cp1258', 'x-cp1258'],
34
25
  'x-mac-cyrillic': ['x-mac-ukrainian'],
35
26
  gbk: ['chinese', 'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312', 'gb_2312-80', 'iso-ir-58', 'x-gbk'],
36
27
  gb18030: [],
@@ -44,3 +35,12 @@ export default {
44
35
  'utf-16le': ['csunicode', 'iso-10646-ucs-2', 'ucs-2', 'unicode', 'unicodefeff', 'utf-16'],
45
36
  'x-user-defined': [],
46
37
  }
38
+
39
+ for (let i = 0; i < 9; i++) labels[`windows-125${i}`] = [`cp125${i}`, `x-cp125${i}`]
40
+
41
+ // prettier-ignore
42
+ labels['windows-1252'].push('ansi_x3.4-1968', 'ascii', 'cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1', 'us-ascii')
43
+ // prettier-ignore
44
+ labels['windows-1254'].push('csisolatin5', 'iso-8859-9', 'iso-ir-148', 'iso8859-9', 'iso88599', 'iso_8859-9', 'iso_8859-9:1989', 'l5', 'latin5')
45
+
46
+ export default labels
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@exodus/bytes",
3
- "version": "1.0.0",
3
+ "version": "1.2.0",
4
4
  "description": "Various operations on Uint8Array data",
5
5
  "scripts": {
6
6
  "lint": "eslint .",
package/utf16.node.js CHANGED
@@ -36,6 +36,7 @@ const swapped = (x, swap) =>
36
36
  swap ? Buffer.from(x).swap16() : Buffer.from(x.buffer, x.byteOffset, x.byteLength)
37
37
 
38
38
  // We skip TextDecoder on Node.js, as it is for some reason significantly slower than Buffer for utf16
39
+ // Also, it incorrectly misses replacements when Node.js is built without ICU; we fix that
39
40
  function decodeNode(input, loose = false, format = 'uint16') {
40
41
  let ble
41
42
  if (format === 'uint16') {
package/utf8.node.js CHANGED
@@ -1,15 +1,21 @@
1
1
  import { assertUint8 } from './assert.js'
2
2
  import { typedView } from './array.js'
3
- import { E_STRICT_UNICODE } from './fallback/utf8.js'
3
+ import { E_STRICT, E_STRICT_UNICODE } from './fallback/utf8.js'
4
4
  import { isAscii } from 'node:buffer'
5
5
 
6
6
  if (Buffer.TYPED_ARRAY_SUPPORT) throw new Error('Unexpected Buffer polyfill')
7
7
 
8
- const decoderFatal = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
8
+ let decoderFatal
9
9
  const decoderLoose = new TextDecoder('utf-8', { ignoreBOM: true })
10
10
  const { isWellFormed } = String.prototype
11
11
  const isDeno = Boolean(globalThis.Deno)
12
12
 
13
+ try {
14
+ decoderFatal = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true })
15
+ } catch {
16
+ // Without ICU, Node.js doesn't support fatal option for utf-8
17
+ }
18
+
13
19
  function encode(str, loose = false) {
14
20
  if (typeof str !== 'string') throw new TypeError('Input is not a string')
15
21
  const strLength = str.length
@@ -45,7 +51,14 @@ function decode(arr, loose = false) {
45
51
  return buf.latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
46
52
  }
47
53
 
48
- return loose ? decoderLoose.decode(arr) : decoderFatal.decode(arr)
54
+ if (loose) return decoderLoose.decode(arr)
55
+ if (decoderFatal) return decoderFatal.decode(arr)
56
+
57
+ // We are in an env without native fatal decoder support (non-fixed Node.js without ICU)
58
+ // So decode loosely and, if the result contains a replacement character, re-encode it and compare with the input; this is still faster than the JS implementation
59
+ const str = decoderLoose.decode(arr)
60
+ if (str.includes('\uFFFD') && !Buffer.from(str).equals(arr)) throw new TypeError(E_STRICT)
61
+ return str
49
62
  }
50
63
 
51
64
  export const utf8fromString = (str, format = 'uint8') => typedView(encode(str, false), format)