npm - @atproto/lex-data - Versions diffs - 0.0.0 - Mend

@atproto/lex-data 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

package/dist/blob.d.ts +16 -0
package/dist/blob.d.ts.map +1 -0
package/dist/blob.js +73 -0
package/dist/blob.js.map +1 -0
package/dist/cid.d.ts +12 -0
package/dist/cid.d.ts.map +1 -0
package/dist/cid.js +47 -0
package/dist/cid.js.map +1 -0
package/dist/index.d.ts +9 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +12 -0
package/dist/index.js.map +1 -0
package/dist/language.d.ts +18 -0
package/dist/language.d.ts.map +1 -0
package/dist/language.js +30 -0
package/dist/language.js.map +1 -0
package/dist/lex-equals.d.ts +3 -0
package/dist/lex-equals.d.ts.map +1 -0
package/dist/lex-equals.js +78 -0
package/dist/lex-equals.js.map +1 -0
package/dist/lex.d.ts +18 -0
package/dist/lex.d.ts.map +1 -0
package/dist/lex.js +83 -0
package/dist/lex.js.map +1 -0
package/dist/lib/nodejs-buffer.d.ts +15 -0
package/dist/lib/nodejs-buffer.d.ts.map +1 -0
package/dist/lib/nodejs-buffer.js +12 -0
package/dist/lib/nodejs-buffer.js.map +1 -0
package/dist/object.d.ts +3 -0
package/dist/object.d.ts.map +1 -0
package/dist/object.js +22 -0
package/dist/object.js.map +1 -0
package/dist/uint8array-from-base64.d.ts +16 -0
package/dist/uint8array-from-base64.d.ts.map +1 -0
package/dist/uint8array-from-base64.js +60 -0
package/dist/uint8array-from-base64.js.map +1 -0
package/dist/uint8array-to-base64.d.ts +16 -0
package/dist/uint8array-to-base64.d.ts.map +1 -0
package/dist/uint8array-to-base64.js +30 -0
package/dist/uint8array-to-base64.js.map +1 -0
package/dist/uint8array.d.ts +21 -0
package/dist/uint8array.d.ts.map +1 -0
package/dist/uint8array.js +57 -0
package/dist/uint8array.js.map +1 -0
package/dist/utf8-grapheme-len.d.ts +3 -0
package/dist/utf8-grapheme-len.d.ts.map +1 -0
package/dist/utf8-grapheme-len.js +23 -0
package/dist/utf8-grapheme-len.js.map +1 -0
package/dist/utf8-len.d.ts +3 -0
package/dist/utf8-len.d.ts.map +1 -0
package/dist/utf8-len.js +50 -0
package/dist/utf8-len.js.map +1 -0
package/dist/utf8.d.ts +3 -0
package/dist/utf8.d.ts.map +1 -0
package/dist/utf8.js +12 -0
package/dist/utf8.js.map +1 -0
package/package.json +51 -0
package/src/blob.test.ts +186 -0
package/src/blob.ts +99 -0
package/src/cid.ts +50 -0
package/src/index.ts +8 -0
package/src/language.test.ts +87 -0
package/src/language.ts +39 -0
package/src/lex-equals.test.ts +153 -0
package/src/lex-equals.ts +85 -0
package/src/lex.test.ts +124 -0
package/src/lex.ts +78 -0
package/src/lib/nodejs-buffer.ts +27 -0
package/src/object.test.ts +78 -0
package/src/object.ts +21 -0
package/src/uint8array-from-base64.test.ts +113 -0
package/src/uint8array-from-base64.ts +85 -0
package/src/uint8array-to-base64.ts +45 -0
package/src/uint8array.ts +78 -0
package/src/utf8-grapheme-len.test.ts +37 -0
package/src/utf8-grapheme-len.ts +21 -0
package/src/utf8-len.test.ts +31 -0
package/src/utf8-len.ts +51 -0
package/src/utf8.ts +14 -0
package/tsconfig.build.json +12 -0
package/tsconfig.json +7 -0
package/tsconfig.tests.json +9 -0

package/src/lex.ts ADDED Viewed

@@ -0,0 +1,78 @@
+import { CID, isCid } from './cid.js'
+import { isPlainObject } from './object.js'
+// @NOTE BlobRef is just a special case of LexMap.
+/** A leaf value: a primitive, `null`, a {@link CID} or raw bytes. */
+export type LexScalar = number | string | boolean | null | CID | Uint8Array
+/** Any value: a scalar, an array of values, or a string-keyed map of values. */
+export type LexValue = LexScalar | LexValue[] | { [_ in string]?: LexValue }
+/** A string-keyed map whose entries, when present, are {@link LexValue}s. */
+export type LexMap = { [_ in string]?: LexValue }
+/** An array of {@link LexValue}s. */
+export type LexArray = LexValue[]
+/**
+ * Type guard for {@link LexMap}: a plain object in which every enumerable
+ * property holds a valid {@link LexValue}.
+ */
+export function isLexMap(value: unknown): value is LexMap {
+  if (isPlainObject(value)) {
+    for (const prop in value) {
+      const entry = value[prop]
+      if (!isLexValue(entry)) return false
+    }
+    return true
+  }
+  return false
+}
+/**
+ * Type guard for {@link LexArray}: an array in which every slot (holes
+ * included, since they read as `undefined`) holds a valid {@link LexValue}.
+ */
+export function isLexArray(value: unknown): value is LexArray {
+  if (Array.isArray(value)) {
+    // Array iteration visits every index, including holes of sparse arrays.
+    for (const item of value) {
+      if (!isLexValue(item)) return false
+    }
+    return true
+  }
+  return false
+}
+/**
+ * Type guard for {@link LexScalar}: a string, a boolean, an integer, `null`,
+ * a `Uint8Array` or a CID.
+ *
+ * Like {@link isLexValue}, this predicate reports unsupported input
+ * (non-integer numbers, `undefined`, functions, symbols, bigints, ...) by
+ * returning `false`; it never throws.
+ */
+export function isLexScalar(value: unknown): value is LexScalar {
+  switch (typeof value) {
+    case 'object':
+      // `typeof null` is 'object', so null has to be handled here.
+      return value === null || value instanceof Uint8Array || isCid(value)
+    case 'string':
+    case 'boolean':
+      return true
+    case 'number':
+      // Only integers are valid; NaN, Infinity and fractions are rejected.
+      return Number.isInteger(value)
+    default:
+      return false
+  }
+}
+/**
+ * Recursive type guard for {@link LexValue}.
+ *
+ * Accepts strings, booleans, integers, `null`, `Uint8Array`s, CIDs, and
+ * arrays / plain objects whose contents are themselves valid values.
+ * Everything else yields `false`.
+ */
+export function isLexValue(value: unknown): value is LexValue {
+  const kind = typeof value
+  if (kind === 'string' || kind === 'boolean') return true
+  if (kind === 'number') return Number.isInteger(value)
+  if (kind !== 'object') return false
+  if (value === null) return true
+  if (Array.isArray(value)) {
+    // Holes in sparse arrays are visited as `undefined` and thus rejected.
+    for (const item of value) {
+      if (!isLexValue(item)) return false
+    }
+    return true
+  }
+  if (isPlainObject(value)) {
+    for (const prop in value) {
+      if (!isLexValue(value[prop])) return false
+    }
+    return true
+  }
+  if (value instanceof Uint8Array) return true
+  if (isCid(value)) return true
+  return false
+}
+/** A {@link LexMap} that carries a string `$type` discriminator. */
+export type TypedLexMap = LexMap & { $type: string }
+/**
+ * Type guard for {@link TypedLexMap}: a valid {@link LexMap} whose `$type`
+ * property is a non-empty string.
+ */
+export function isTypedLexMap(value: LexValue): value is TypedLexMap {
+  if (!isLexMap(value)) return false
+  const type = value.$type
+  return typeof type === 'string' && type.length > 0
+}

package/src/lib/nodejs-buffer.ts ADDED Viewed

@@ -0,0 +1,27 @@
+// String encodings accepted by the Buffer signatures declared below.
+type Encoding = 'utf8' | 'base64' | 'base64url'
+// Locally declared shape of a Buffer instance: a Uint8Array that can also be
+// rendered as a string in one of the encodings above.
+interface NodeJSBuffer<TArrayBuffer extends ArrayBufferLike = ArrayBufferLike>
+  extends Uint8Array<TArrayBuffer> {
+  byteLength: number
+  toString(encoding?: Encoding): string
+}
+// Locally declared shape of the Buffer constructor, limited to the members
+// listed here.
+interface NodeJSBufferConstructor {
+  new (input: string, encoding?: Encoding): NodeJSBuffer
+  // Creates a Buffer from binary data.
+  from(
+    input: Uint8Array | ArrayBuffer | ArrayBufferView,
+  ): NodeJSBuffer<ArrayBuffer>
+  // Creates a Buffer by decoding a string.
+  from(input: string, encoding?: Encoding): NodeJSBuffer<ArrayBuffer>
+  // Byte length of a string once encoded.
+  byteLength(input: string, encoding?: Encoding): number
+  prototype: NodeJSBuffer
+}
+// Avoids a direct reference to Node.js Buffer, which might not exist in some
+// environments (e.g. browsers, Deno, Bun) to prevent bundlers from trying to
+// include polyfills.
+// The global's name is assembled at runtime ('Bu' + 'ff' + 'er') so that the
+// literal identifier never appears in the source.
+const BUFFER = /*#__PURE__*/ (() => 'Bu' + 'f'.repeat(2) + 'er')() as 'Buffer'
+// Resolves to the global Buffer constructor only when it looks like the real
+// thing (its prototype derives from Uint8Array and it exposes a `byteLength`
+// member); resolves to `null` otherwise.
+export const NodeJSBuffer: NodeJSBufferConstructor | null =
+  (globalThis as any)?.[BUFFER]?.prototype instanceof Uint8Array &&
+  'byteLength' in (globalThis as any)[BUFFER]
+    ? ((globalThis as any)[BUFFER] as NodeJSBufferConstructor)
+    : null

package/src/object.test.ts ADDED Viewed

@@ -0,0 +1,78 @@
+import { CID } from './cid.js'
+import { isObject, isPlainObject } from './object.js'
+// isObject accepts any non-null value whose typeof is 'object': plain objects,
+// class instances, CIDs and arrays alike.
+describe('isObject', () => {
+  it('returns true for plain objects', () => {
+    expect(isObject({})).toBe(true)
+    expect(isObject({ a: 1 })).toBe(true)
+  })
+  it('returns true for CIDs', () => {
+    const cid = CID.parse(
+      'bafyreidfayvfuwqa7qlnopdjiqrxzs6blmoeu4rujcjtnci5beludirz2a',
+    )
+    expect(isObject(cid)).toBe(true)
+  })
+  it('returns true for class instances', () => {
+    class MyClass {}
+    expect(isObject(new MyClass())).toBe(true)
+  })
+  it('returns true for arrays', () => {
+    expect(isObject([])).toBe(true)
+    expect(isObject([1, 2, 3])).toBe(true)
+  })
+  it('returns false for null', () => {
+    expect(isObject(null)).toBe(false)
+  })
+  it('returns false for non-objects', () => {
+    expect(isObject(42)).toBe(false)
+    expect(isObject('string')).toBe(false)
+    expect(isObject(undefined)).toBe(false)
+    expect(isObject(true)).toBe(false)
+  })
+})
+// isPlainObject is stricter than isObject: only object literals and
+// null-prototype objects qualify; class instances, CIDs and arrays do not.
+describe('isPlainObject', () => {
+  it('returns true for plain objects', () => {
+    expect(isPlainObject({})).toBe(true)
+    expect(isPlainObject({ a: 1 })).toBe(true)
+  })
+  it('returns true for objects with null prototype', () => {
+    const obj = Object.create(null)
+    obj.a = 1
+    expect(isPlainObject(obj)).toBe(true)
+    expect(isPlainObject({ __proto__: null, foo: 'bar' })).toBe(true)
+  })
+  it('returns false for class instances', () => {
+    class MyClass {}
+    expect(isPlainObject(new MyClass())).toBe(false)
+  })
+  it('returns false for CIDs', () => {
+    const cid = CID.parse(
+      'bafyreidfayvfuwqa7qlnopdjiqrxzs6blmoeu4rujcjtnci5beludirz2a',
+    )
+    expect(isPlainObject(cid)).toBe(false)
+  })
+  it('returns false for arrays', () => {
+    expect(isPlainObject([])).toBe(false)
+    expect(isPlainObject([1, 2, 3])).toBe(false)
+  })
+  it('returns false for null', () => {
+    expect(isPlainObject(null)).toBe(false)
+  })
+  it('returns false for non-objects', () => {
+    expect(isPlainObject(42)).toBe(false)
+    expect(isPlainObject('string')).toBe(false)
+    expect(isPlainObject(undefined)).toBe(false)
+    expect(isPlainObject(true)).toBe(false)
+  })
+})

package/src/object.ts ADDED Viewed

@@ -0,0 +1,21 @@
+/**
+ * Tells whether the input is a non-null value of type `object` (this includes
+ * arrays and class instances).
+ */
+export function isObject(input: unknown): input is object {
+  if (typeof input !== 'object') return false
+  return input !== null
+}
+const ObjectProto = Object.prototype
+const ObjectToString = Object.prototype.toString
+/**
+ * Tells whether the input is a "plain" object: one created with a null
+ * prototype, or one whose prototype is a root prototype and whose string tag
+ * is `[object Object]`.
+ */
+export function isPlainObject(
+  input: unknown,
+): input is object & Record<string, unknown> {
+  if (typeof input !== 'object' || input === null) return false
+  const prototype = Object.getPrototypeOf(input)
+  // Objects created through Object.create(null) / { __proto__: null }
+  if (prototype === null) return true
+  // The second test is what supports NodeJS's `runInNewContext`, which
+  // produces objects whose prototype is a different (but still root) object.
+  const hasRootPrototype =
+    prototype === ObjectProto || Object.getPrototypeOf(prototype) === null
+  return hasRootPrototype && ObjectToString.call(input) === '[object Object]'
+}

package/src/uint8array-from-base64.test.ts ADDED Viewed

@@ -0,0 +1,113 @@
+import 'core-js/modules/es.uint8-array.from-base64.js'
+import 'core-js/modules/es.uint8-array.to-base64.js'
+import assert from 'node:assert'
+import {
+  fromBase64Native,
+  fromBase64Node,
+  fromBase64Ponyfill,
+} from './uint8array-from-base64.js'
+import { ui8Equals } from './uint8array.js'
+// @NOTE This test suite relies on the NodeJS Buffer implementation to generate
+// valid base64 strings for testing.
+// @NOTE b64 needs a test suite because fromBase64 implementations differ in
+// their behavior when encountering invalid base64 strings. This is not the case
+// for toBase64, which is straightforward and has no edge cases.
+// Runs one identical suite per implementation so that all of them are held to
+// the same accept / reject behavior.
+for (const fromBase64 of [
+  fromBase64Native,
+  fromBase64Node,
+  fromBase64Ponyfill,
+] as const) {
+  // Tests should run in NodeJS where implementations are either available or
+  // polyfilled (see core-js imports above).
+  assert(fromBase64 !== null, 'fromBase64 implementation should not be null')
+  describe(fromBase64.name, () => {
+    describe('valid base64 strings', () => {
+      it('decodes empty string', () => {
+        const decoded = fromBase64('')
+        expect(decoded).toBeInstanceOf(Uint8Array)
+        expect(decoded.length).toBe(0)
+      })
+      it('decodes 10MB', () => {
+        const bytes = Buffer.allocUnsafe(10_000_000).fill('🐩')
+        const encoded = bytes.toString('base64')
+        const decoded = fromBase64(encoded)
+        expect(decoded).toBeInstanceOf(Uint8Array)
+        expect(ui8Equals(decoded, bytes)).toBe(true)
+      })
+      // Each sample is encoded with Buffer, then decoded both with and
+      // (when it differs) without its trailing padding.
+      for (const string of [
+        '',
+        '\0\0',
+        '\0\0\0',
+        '\0\0\0\0',
+        '__',
+        'é',
+        'àç',
+        '\0éàç',
+        '```',
+        'aaa',
+        'Hello, World!',
+        '😀😃😄😁😆😅😂🤣😊😇',
+        '👩‍💻👨‍💻👩‍🔬👨‍🔬👩‍🚀👨‍🚀',
+        '🌍🌎🌏🌐🪐🌟✨⚡🔥💧',
+      ] as const) {
+        const buffer = Buffer.from(string, 'utf8')
+        const base64 = buffer.toString('base64')
+        const base64Unpadded = base64.replace(/=+$/, '')
+        it(`decodes ${JSON.stringify(string)}`, () => {
+          const decoded = fromBase64(base64)
+          expect(decoded).toBeInstanceOf(Uint8Array)
+          expect(ui8Equals(decoded, buffer)).toBe(true)
+        })
+        if (base64 !== base64Unpadded) {
+          it(`decodes ${JSON.stringify(string)} (unpadded)`, () => {
+            const decoded = fromBase64(base64Unpadded)
+            expect(decoded).toBeInstanceOf(Uint8Array)
+            expect(ui8Equals(decoded, buffer)).toBe(true)
+          })
+        }
+      }
+    })
+    describe('invalid base64 strings', () => {
+      // Every entry below must make the implementation throw.
+      for (const invalidB64 of [
+        'çç',
+        'é',
+        'YWJjZGU$$$',
+        '@@@@',
+        'abcd!',
+        'ab=cd',
+        // "YWFh" is "aaa" in base64
+        'YWFh' + 'é',
+        'YWFh' + 'éé',
+        'YWFh' + 'ééé',
+        'YWFh' + 'éééé',
+        // Invalid padding
+        'YWFh' + '=',
+        'YWFh' + '==',
+        'YWFh' + '===',
+        'YWFh' + '====',
+        'YWFh' + '=====',
+        'YWFh' + '======',
+        // More invalid padding
+        // 'TWE=', // 'Ma'
+        'TWE=' + '=',
+        'TWE=' + '==',
+        // 'TQ==', // 'M'
+        'TQ==' + '=',
+        'TQ==' + '==',
+      ] as const) {
+        it(`throws on invalid base64 string "${invalidB64}"`, () => {
+          expect(() => fromBase64(invalidB64)).toThrow()
+        })
+      }
+    })
+  })
+}

package/src/uint8array-from-base64.ts ADDED Viewed

@@ -0,0 +1,85 @@
+import { fromString } from 'uint8arrays/from-string'
+import { NodeJSBuffer } from './lib/nodejs-buffer.js'
+const Buffer = NodeJSBuffer
+// Declares `Uint8Array.fromBase64` as an optional static member so that its
+// presence has to be checked before it is called.
+declare global {
+  interface Uint8ArrayConstructor {
+    /**
+     * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array/fromBase64 Uint8Array.fromBase64()}
+     */
+    fromBase64?: (
+      b64: string,
+      options?: {
+        /** @default 'base64' */
+        alphabet?: 'base64' | 'base64url'
+        lastChunkHandling?: 'loose' | 'strict' | 'stop-before-partial'
+      },
+    ) => Uint8Array
+  }
+}
+/**
+ * Decoder backed by the runtime's own `Uint8Array.fromBase64` (called with
+ * `lastChunkHandling: 'loose'`), or `null` when the runtime lacks it.
+ */
+export const fromBase64Native =
+  typeof Uint8Array.fromBase64 !== 'function'
+    ? null
+    : function fromBase64Native(b64: string): Uint8Array {
+        const decode = Uint8Array.fromBase64!
+        return decode.call(Uint8Array, b64, { lastChunkHandling: 'loose' })
+      }
+/**
+ * Decoder backed by the Node.js Buffer, or `null` when Buffer is unavailable.
+ * The decoded bytes are validated against the input before being returned.
+ */
+export const fromBase64Node = !Buffer
+  ? null
+  : function fromBase64Node(b64: string): Uint8Array {
+      const decoded = Buffer.from(b64, 'base64')
+      verifyBase64ForBytes(b64, decoded)
+      // Return a plain Uint8Array view over the same memory: although Buffer
+      // is a sub class of Uint8Array, it serializes differently (e.g. in
+      // JSON), which results in unexpected behavior downstream (e.g. in
+      // tests).
+      const { buffer, byteOffset, byteLength } = decoded
+      return new Uint8Array(buffer, byteOffset, byteLength)
+    }
+/**
+ * Decoder built on `fromString` from the "uint8arrays" package. The decoded
+ * bytes are checked against the input with `verifyBase64ForBytes`, which
+ * throws when they are inconsistent.
+ */
+export function fromBase64Ponyfill(b64: string): Uint8Array {
+  const bytes = fromString(b64, 'base64')
+  verifyBase64ForBytes(b64, bytes)
+  return bytes
+}
+// @NOTE NodeJS will silently stop decoding at the first invalid character,
+// while "uint8arrays/from-string" will not validate that the padding is
+// correct. The following function performs basic validation to ensure that the
+// input was a valid base64 string. The availability of the "bytes" allows
+// to perform checks with O(1) complexity.
+/**
+ * Validates a base64 string against the bytes a lenient decoder produced from
+ * it.
+ *
+ * Padding is optional, but when present it must be complete: "YQ" and "YQ=="
+ * are accepted for one byte, "YQ=" is not. This matches what
+ * `Uint8Array.fromBase64` does with `lastChunkHandling: 'loose'`.
+ *
+ * @param b64 The string that was decoded
+ * @param bytes The decoder's output for `b64`
+ * @throws {Error} If the string length, its padding or its trailing
+ * characters are inconsistent with `bytes`
+ */
+function verifyBase64ForBytes(b64: string, bytes: Uint8Array): void {
+  const paddingCount = b64.endsWith('==') ? 2 : b64.endsWith('=') ? 1 : 0
+  const trimmedLength = b64.length - paddingCount
+  // Every base64 char carries 6 bits, hence floor(chars * 6 / 8) bytes. A
+  // decoder that stopped early produced fewer bytes than that.
+  const expectedByteLength = Math.floor((trimmedLength * 3) / 4)
+  if (bytes.length !== expectedByteLength) {
+    throw new Error('Invalid base64 string')
+  }
+  // Integer arithmetic only, so that results stay exact for any input size.
+  const unpaddedB64Length = Math.ceil((bytes.length * 4) / 3)
+  const paddedB64Length = Math.ceil(bytes.length / 3) * 4
+  if (b64.length > paddedB64Length) {
+    throw new Error('Invalid base64 string')
+  }
+  // Partial padding (e.g. "YQ=") is not valid base64.
+  if (paddingCount !== 0 && b64.length !== paddedB64Length) {
+    throw new Error('Invalid base64 string')
+  }
+  // The previous might still allow false positive if only the last few
+  // chars are invalid.
+  // NOTE(review): given the length checks above, this range looks empty for
+  // every input that reaches it - kept as a safety net, confirm before
+  // removing.
+  for (let i = unpaddedB64Length; i < trimmedLength; i++) {
+    const code = b64.charCodeAt(i)
+    if (
+      !(code >= 65 && code <= 90) && // A-Z
+      !(code >= 97 && code <= 122) && // a-z
+      !(code >= 48 && code <= 57) && // 0-9
+      code !== 43 && // +
+      code !== 47 // /
+    ) {
+      throw new Error('Invalid base64 string')
+    }
+  }
+}

package/src/uint8array-to-base64.ts ADDED Viewed

@@ -0,0 +1,45 @@
+import { toString } from 'uint8arrays/to-string'
+import { NodeJSBuffer } from './lib/nodejs-buffer.js'
+const Buffer = NodeJSBuffer
+// Declares `Uint8Array.prototype.toBase64` as an optional member so that its
+// presence has to be checked before it is called.
+declare global {
+  interface Uint8Array {
+    /**
+     * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array/toBase64 Uint8Array.prototype.toBase64()}
+     */
+    toBase64?: (options?: {
+      /** @default 'base64' */
+      alphabet?: 'base64' | 'base64url'
+      omitPadding?: boolean
+    }) => string
+  }
+}
+/**
+ * Encoder backed by the runtime's own `Uint8Array.prototype.toBase64` (called
+ * with `omitPadding: true`), or `null` when the runtime lacks it.
+ */
+export const toBase64Native =
+  typeof Uint8Array.prototype.toBase64 !== 'function'
+    ? null
+    : function toBase64Native(bytes: Uint8Array): string {
+        const options = { omitPadding: true }
+        return bytes.toBase64!(options)
+      }
+/**
+ * Encoder backed by the Node.js Buffer, or `null` when Buffer is unavailable.
+ * The result never carries trailing `=` padding.
+ */
+export const toBase64Node = !Buffer
+  ? null
+  : function toBase64Node(bytes: Uint8Array): string {
+      const source = bytes instanceof Buffer ? bytes : Buffer.from(bytes)
+      const encoded = source.toString('base64')
+      // @NOTE Padding is removed for strict compatibility with
+      // uint8arrays.toString. A test failing because padding is present does
+      // not really point at an actual error, and we might (should?) want to
+      // keep the padding at some point.
+      const EQUALS = 0x3d // '='
+      const last = encoded.length - 1
+      if (encoded.charCodeAt(last) !== EQUALS) return encoded
+      return encoded.charCodeAt(last - 1) === EQUALS
+        ? encoded.slice(0, -2) // '=='
+        : encoded.slice(0, -1) // '='
+    }
+/**
+ * Encoder built on `toString` from the "uint8arrays" package, using its
+ * 'base64' encoding.
+ */
+export function toBase64Ponyfill(bytes: Uint8Array): string {
+  return toString(bytes, 'base64')
+}

package/src/uint8array.ts ADDED Viewed

@@ -0,0 +1,78 @@
+import {
+  fromBase64Native,
+  fromBase64Node,
+  fromBase64Ponyfill,
+} from './uint8array-from-base64.js'
+import {
+  toBase64Native,
+  toBase64Node,
+  toBase64Ponyfill,
+} from './uint8array-to-base64.js'
+// @TODO drop dependency on uint8arrays package once Uint8Array.fromBase64 /
+// Uint8Array.prototype.toBase64 is widely supported, and mark fromBase64 /
+// toBase64 as deprecated. We can also drop NodeJS specific implementations
+// once NodeJS <24 is no longer supported.
+/**
+ * Encodes a Uint8Array into a base64 string.
+ *
+ * The first available implementation is used, in this order: native, NodeJS
+ * Buffer, ponyfill.
+ *
+ * @returns The base64 encoded string
+ */
+export const toBase64: (bytes: Uint8Array) => string =
+  toBase64Native ?? toBase64Node ?? toBase64Ponyfill
+/**
+ * Decodes a base64 string into a Uint8Array.
+ *
+ * The first available implementation is used, in this order: native, NodeJS
+ * Buffer, ponyfill.
+ *
+ * @returns The decoded {@link Uint8Array}
+ * @throws If the input is not a valid base64 string
+ */
+export const fromBase64: (b64: string) => Uint8Array =
+  fromBase64Native ?? fromBase64Node ?? fromBase64Ponyfill
+// Emitted once, when this module is evaluated, if either direction ended up
+// on the ponyfill.
+// NOTE(review): the PURE annotation below may allow a minifier to drop this
+// call altogether - confirm that this is intended.
+if (toBase64 === toBase64Ponyfill || fromBase64 === fromBase64Ponyfill) {
+  /*#__PURE__*/
+  console.warn(
+    '[@atproto/lex-data]: Uint8Array.fromBase64 / Uint8Array.prototype.toBase64 not available in this environment. Falling back to ponyfill implementation.',
+  )
+}
+/**
+ * Turns a binary value into a Uint8Array when possible.
+ *
+ * - a Uint8Array is returned as is;
+ * - any other ArrayBuffer view yields a Uint8Array over the same memory
+ *   region (no copy);
+ * - an ArrayBuffer yields a Uint8Array spanning the whole buffer.
+ *
+ * @return `undefined` if the input could not be coerced into a {@link Uint8Array}.
+ */
+export function asUint8Array(input: unknown): Uint8Array | undefined {
+  if (input instanceof Uint8Array) return input
+  if (ArrayBuffer.isView(input)) {
+    const { buffer, byteOffset, byteLength } = input
+    const length = byteLength / Uint8Array.BYTES_PER_ELEMENT
+    return new Uint8Array(buffer, byteOffset, length)
+  }
+  return input instanceof ArrayBuffer ? new Uint8Array(input) : undefined
+}
+/**
+ * Compares two byte arrays.
+ *
+ * @returns `true` when both have the same byte length and the same value at
+ * every index
+ */
+export function ui8Equals(a: Uint8Array, b: Uint8Array): boolean {
+  const size = a.byteLength
+  if (size !== b.byteLength) return false
+  let index = 0
+  // Advance while the bytes match; stopping early means a difference.
+  while (index < size && a[index] === b[index]) index++
+  return index === size
+}

package/src/utf8-grapheme-len.test.ts ADDED Viewed

@@ -0,0 +1,37 @@
+import { graphemeLenNative, graphemeLenPonyfill } from './utf8-grapheme-len.js'
+// Exercises the Intl.Segmenter based implementation. The suite is named after
+// the function under test.
+describe('graphemeLenNative', () => {
+  it('computes grapheme length', () => {
+    expect(graphemeLenNative!('a')).toBe(1)
+    expect(graphemeLenNative!('~')).toBe(1)
+    expect(graphemeLenNative!('ö')).toBe(1)
+    expect(graphemeLenNative!('ñ')).toBe(1)
+    expect(graphemeLenNative!('©')).toBe(1)
+    expect(graphemeLenNative!('⽘')).toBe(1)
+    expect(graphemeLenNative!('☎')).toBe(1)
+    expect(graphemeLenNative!('𓋓')).toBe(1)
+    expect(graphemeLenNative!('😀')).toBe(1)
+    expect(graphemeLenNative!('👨‍👩‍👧‍👧')).toBe(1)
+    expect(graphemeLenNative!('a~öñ©⽘☎𓋓😀👨‍👩‍👧‍👧')).toBe(10)
+    // https://github.com/bluesky-social/atproto/issues/4321
+    expect(graphemeLenNative!('नमस्ते')).toBe(3)
+  })
+})
+// Exercises the "unicode-segmenter" based implementation. The suite is named
+// after the function under test.
+describe('graphemeLenPonyfill', () => {
+  it('computes grapheme length', () => {
+    expect(graphemeLenPonyfill('a')).toBe(1)
+    expect(graphemeLenPonyfill('~')).toBe(1)
+    expect(graphemeLenPonyfill('ö')).toBe(1)
+    expect(graphemeLenPonyfill('ñ')).toBe(1)
+    expect(graphemeLenPonyfill('©')).toBe(1)
+    expect(graphemeLenPonyfill('⽘')).toBe(1)
+    expect(graphemeLenPonyfill('☎')).toBe(1)
+    expect(graphemeLenPonyfill('𓋓')).toBe(1)
+    expect(graphemeLenPonyfill('😀')).toBe(1)
+    expect(graphemeLenPonyfill('👨‍👩‍👧‍👧')).toBe(1)
+    expect(graphemeLenPonyfill('a~öñ©⽘☎𓋓😀👨‍👩‍👧‍👧')).toBe(10)
+    // https://github.com/bluesky-social/atproto/issues/4321
+    expect(graphemeLenPonyfill('नमस्ते')).toBe(3)
+  })
+})

package/src/utf8-grapheme-len.ts ADDED Viewed

@@ -0,0 +1,21 @@
+import { countGraphemes } from 'unicode-segmenter/grapheme'
+// @TODO: Drop usage of "unicode-segmenter" package when Intl.Segmenter is
+// widely supported.
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter
+// Shared Intl.Segmenter instance, or null when the runtime has none.
+const segmenter =
+  'Segmenter' in Intl && typeof Intl.Segmenter === 'function'
+    ? /*#__PURE__*/ new Intl.Segmenter()
+    : null
+/**
+ * Counts the segments that Intl.Segmenter finds in a string, or `null` when
+ * Intl.Segmenter is not available.
+ */
+export const graphemeLenNative = !segmenter
+  ? null
+  : function graphemeLenNative(str: string): number {
+      const segments = segmenter.segment(str)[Symbol.iterator]()
+      let count = 0
+      // Only the number of segments matters, their content is not needed.
+      while (!segments.next().done) count += 1
+      return count
+    }
+/**
+ * Counts graphemes with `countGraphemes` from the "unicode-segmenter"
+ * package. Unlike `graphemeLenNative`, this function is always defined.
+ */
+export function graphemeLenPonyfill(str: string): number {
+  return countGraphemes(str)
+}

package/src/utf8-len.test.ts ADDED Viewed

@@ -0,0 +1,31 @@
+import { utf8LenCompute, utf8LenNode } from './utf8-len.js'
+// Exercises the NodeJS Buffer based implementation with characters taking
+// 1, 2, 3 and 4 bytes, plus one multi code point emoji sequence.
+describe('utf8LenNode', () => {
+  it('computes utf8 string length', () => {
+    expect(utf8LenNode!('a')).toBe(1)
+    expect(utf8LenNode!('~')).toBe(1)
+    expect(utf8LenNode!('ö')).toBe(2)
+    expect(utf8LenNode!('ñ')).toBe(2)
+    expect(utf8LenNode!('©')).toBe(2)
+    expect(utf8LenNode!('⽘')).toBe(3)
+    expect(utf8LenNode!('☎')).toBe(3)
+    expect(utf8LenNode!('𓋓')).toBe(4)
+    expect(utf8LenNode!('😀')).toBe(4)
+    expect(utf8LenNode!('👨‍👩‍👧‍👧')).toBe(25)
+  })
+})
+// Exercises the hand-written implementation with the same samples as the
+// NodeJS one. The suite is named after the function under test.
+describe('utf8LenCompute', () => {
+  it('computes utf8 string length', () => {
+    expect(utf8LenCompute('a')).toBe(1)
+    expect(utf8LenCompute('~')).toBe(1)
+    expect(utf8LenCompute('ö')).toBe(2)
+    expect(utf8LenCompute('ñ')).toBe(2)
+    expect(utf8LenCompute('©')).toBe(2)
+    expect(utf8LenCompute('⽘')).toBe(3)
+    expect(utf8LenCompute('☎')).toBe(3)
+    expect(utf8LenCompute('𓋓')).toBe(4)
+    expect(utf8LenCompute('😀')).toBe(4)
+    expect(utf8LenCompute('👨‍👩‍👧‍👧')).toBe(25)
+  })
+})

package/src/utf8-len.ts ADDED Viewed

@@ -0,0 +1,51 @@
+import { NodeJSBuffer } from './lib/nodejs-buffer.js'
+// @NOTE This file is not meant to be exported directly. Instead, we re-export
+// public functions from ./utf8.ts. The reason for this separation is that this
+// file allows to test both the NodeJS-optimized and ponyfill implementations.
+/**
+ * UTF-8 byte length computed through the NodeJS Buffer's `byteLength`, or
+ * `null` when Buffer is unavailable.
+ */
+export const utf8LenNode = !NodeJSBuffer
+  ? null
+  : function utf8LenNode(string: string): number {
+      const encoding = 'utf8'
+      return NodeJSBuffer!.byteLength(string, encoding)
+    }
+/**
+ * Computes the number of bytes a string takes once encoded as UTF-8, without
+ * actually encoding it.
+ *
+ * This gives the same result as `new TextEncoder().encode(string).byteLength`
+ * but avoids allocating a Uint8Array and copying data into it.
+ */
+export function utf8LenCompute(string: string): number {
+  const unitCount = string.length
+  let bytes = 0
+  let index = 0
+  while (index < unitCount) {
+    const unit = string.charCodeAt(index)
+    index += 1
+    if (unit <= 0x7f) {
+      // ASCII
+      bytes += 1
+      continue
+    }
+    if (unit <= 0x7ff) {
+      bytes += 2
+      continue
+    }
+    // A high surrogate directly followed by a low surrogate forms a single
+    // code point encoded on 4 bytes: consume both code units at once.
+    if (unit >= 0xd800 && unit <= 0xdbff && index < unitCount) {
+      const next = string.charCodeAt(index)
+      if (next >= 0xdc00 && next <= 0xdfff) {
+        index += 1
+        bytes += 4
+        continue
+      }
+    }
+    // Any other code unit (lone surrogates included) counts for 3 bytes.
+    bytes += 3
+  }
+  return bytes
+}

package/src/utf8.ts ADDED Viewed

@@ -0,0 +1,14 @@
+import { graphemeLenNative, graphemeLenPonyfill } from './utf8-grapheme-len.js'
+import { utf8LenCompute, utf8LenNode } from './utf8-len.js'
+// Grapheme counter: the Intl.Segmenter based implementation when available,
+// the ponyfill otherwise.
+export const graphemeLen: (str: string) => number =
+  graphemeLenNative ?? graphemeLenPonyfill
+// Emitted once, when this module is evaluated, if the ponyfill was selected.
+// NOTE(review): the PURE annotation below may allow a minifier to drop this
+// call altogether - confirm that this is intended.
+if (graphemeLen === graphemeLenPonyfill) {
+  /*#__PURE__*/
+  console.warn(
+    '[@atproto/lex-data]: Intl.Segmenter is not available in this environment. Falling back to ponyfill implementation.',
+  )
+}
+// UTF-8 byte length: the NodeJS Buffer based implementation when available,
+// the hand-written computation otherwise.
+export const utf8Len: (string: string) => number = utf8LenNode ?? utf8LenCompute