npm - @atproto/lex-data - Versions diffs - 0.0.13 → 0.0.15 - Mend

@atproto/lex-data 0.0.13 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/CHANGELOG.md +20 -0
package/dist/blob.d.ts +118 -39
package/dist/blob.d.ts.map +1 -1
package/dist/blob.js +73 -10
package/dist/blob.js.map +1 -1
package/dist/lib/nodejs-buffer.d.ts +1 -0
package/dist/lib/nodejs-buffer.d.ts.map +1 -1
package/dist/lib/nodejs-buffer.js.map +1 -1
package/dist/utf8-from-bytes.d.ts +3 -0
package/dist/utf8-from-bytes.d.ts.map +1 -0
package/dist/utf8-from-bytes.js +19 -0
package/dist/utf8-from-bytes.js.map +1 -0
package/dist/utf8.d.ts +18 -0
package/dist/utf8.d.ts.map +1 -1
package/dist/utf8.js +20 -1
package/dist/utf8.js.map +1 -1
package/package.json +1 -1
package/src/blob.test.ts +38 -25
package/src/blob.ts +198 -53
package/src/lib/nodejs-buffer.ts +5 -0
package/src/utf8-from-bytes.test.ts +43 -0
package/src/utf8-from-bytes.ts +21 -0
package/src/utf8.ts +20 -0

package/dist/utf8.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"utf8.js","sourceRoot":"","sources":["../src/utf8.ts"],"names":[],"mappings":";;;AACA,+DAG8B;AAC9B,iEAA+E;AAC/E,+CAA2D;AAC3D,2DAA4E;AAE5E;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACU,QAAA,WAAW;AACtB,iCAAiC,CAAC,wCAAiB,IAAI,0CAAmB,CAAA;AAE5E,iCAAiC;AACjC,IAAI,mBAAW,KAAK,0CAAmB,EAAE,CAAC;IACxC,aAAa;IACb,OAAO,CAAC,IAAI,CACV,oHAAoH,CACrH,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACU,QAAA,OAAO;AAClB,iCAAiC,CAAC,yBAAW,IAAI,4BAAc,CAAA;AAEjE;;;;;;;;;;;;;;;GAeG;AACU,QAAA,YAAY;AACvB,iCAAiC,CAAC,oCAAgB,IAAI,wCAAoB,CAAA;AAE5E;;;;;;;;;;;;;;;GAeG;AACU,QAAA,cAAc;AAIzB,iCAAiC,CAAC,wCAAkB,IAAI,4CAAsB,CAAA","sourcesContent":["import { Base64Alphabet } from './uint8array.js'\nimport {\n utf8FromBase64Node,\n utf8FromBase64Ponyfill,\n} from './utf8-from-base64.js'\nimport { graphemeLenNative, graphemeLenPonyfill } from './utf8-grapheme-len.js'\nimport { utf8LenCompute, utf8LenNode } from './utf8-len.js'\nimport { utf8ToBase64Node, utf8ToBase64Ponyfill } from './utf8-to-base64.js'\n\n/*\n Counts the number of grapheme clusters (user-perceived characters) in a string.\n \n Grapheme clusters represent what users typically think of as \"characters\",\n * handling complex cases like:\n * - Emoji with skin tones and ZWJ sequences (e.g., family emoji)\n * - Combined characters (e.g., 'e' + combining accent)\n * - Regional indicator pairs (flag emoji)\n \n Uses native {@link Intl.Segmenter} when available, falling back to a ponyfill.\n \n @param str - The string to measure\n * @returns The number of grapheme clusters\n \n @example\n * ```typescript\n * import { graphemeLen } from '@atproto/lex-data'\n \n graphemeLen('hello') // 5\n * graphemeLen('cafe\\u0301') // 4 (cafe with combining accent)\n * graphemeLen('\\u{1F468}\\u{200D}\\u{1F469}\\u{200D}\\u{1F467}\\u{200D}\\u{1F466}') // 1 (family emoji)\n * ```\n /\nexport const graphemeLen: (str: string) => number =\n / v8 ignore next -- @preserve / graphemeLenNative ?? 
graphemeLenPonyfill\n\n/ v8 ignore next -- @preserve /\nif (graphemeLen === graphemeLenPonyfill) {\n /#__PURE__/\n console.warn(\n '[@atproto/lex-data]: Intl.Segmenter is not available in this environment. Falling back to ponyfill implementation.',\n )\n}\n\n/\n Calculates the UTF-8 byte length of a string.\n \n Returns the number of bytes the string would occupy when encoded as UTF-8.\n * This is important for Lexicon validation where schemas specify byte limits.\n \n Uses Node.js Buffer.byteLength when available for performance,\n * falling back to a computed implementation.\n \n @param str - The string to measure\n * @returns The UTF-8 byte length\n \n @example\n * ```typescript\n * import { utf8Len } from '@atproto/lex-data'\n \n utf8Len('hello') // 5 (ASCII: 1 byte per char)\n * utf8Len('\\u00e9') // 2 (e with accent: 2 bytes)\n * utf8Len('\\u{1F600}') // 4 (emoji: 4 bytes)\n * utf8Len('\\u{1F468}\\u{200D}\\u{1F469}\\u{200D}\\u{1F467}\\u{200D}\\u{1F466}') // 25 (family emoji)\n * ```\n /\nexport const utf8Len: (string: string) => number =\n / v8 ignore next -- @preserve / utf8LenNode ?? utf8LenCompute\n\n/\n Encodes a UTF-8 string to base64.\n \n First encodes the string as UTF-8 bytes, then encodes those bytes as base64.\n \n @param str - The string to encode\n * @param alphabet - The base64 alphabet to use ('base64' or 'base64url')\n * @returns The base64-encoded string\n \n @example\n * ```typescript\n * import { utf8ToBase64 } from '@atproto/lex-data'\n \n utf8ToBase64('Hello') // 'SGVsbG8='\n * ```\n /\nexport const utf8ToBase64: (str: string, alphabet?: Base64Alphabet) => string =\n / v8 ignore next -- @preserve / utf8ToBase64Node ?? 
utf8ToBase64Ponyfill\n\n/\n Decodes a base64 string to UTF-8.\n \n Decodes the base64 to bytes, then interprets those bytes as UTF-8 text.\n \n @param b64 - The base64 string to decode\n * @param alphabet - The base64 alphabet to use ('base64' or 'base64url')\n * @returns The decoded UTF-8 string\n \n @example\n * ```typescript\n * import { utf8FromBase64 } from '@atproto/lex-data'\n \n utf8FromBase64('SGVsbG8=') // 'Hello'\n * ```\n /\nexport const utf8FromBase64: (\n b64: string,\n alphabet?: Base64Alphabet,\n) => string =\n / v8 ignore next -- @preserve */ utf8FromBase64Node ?? utf8FromBase64Ponyfill\n"]}
1	+ {"version":3,"file":"utf8.js","sourceRoot":"","sources":["../src/utf8.ts"],"names":[],"mappings":";;;AACA,+DAG8B;AAC9B,6DAA6E;AAC7E,iEAA+E;AAC/E,+CAA2D;AAC3D,2DAA4E;AAE5E;;;;;;;;;;;;;;;;GAgBG;AACU,QAAA,aAAa,GAAG,sCAAiB,IAAI,wCAAmB,CAAA;AAErE;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACU,QAAA,WAAW;AACtB,iCAAiC,CAAC,wCAAiB,IAAI,0CAAmB,CAAA;AAE5E,iCAAiC;AACjC,IAAI,mBAAW,KAAK,0CAAmB,EAAE,CAAC;IACxC,aAAa;IACb,OAAO,CAAC,IAAI,CACV,oHAAoH,CACrH,CAAA;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACU,QAAA,OAAO;AAClB,iCAAiC,CAAC,yBAAW,IAAI,4BAAc,CAAA;AAEjE;;;;;;;;;;;;;;;GAeG;AACU,QAAA,YAAY;AACvB,iCAAiC,CAAC,oCAAgB,IAAI,wCAAoB,CAAA;AAE5E;;;;;;;;;;;;;;;GAeG;AACU,QAAA,cAAc;AAIzB,iCAAiC,CAAC,wCAAkB,IAAI,4CAAsB,CAAA","sourcesContent":["import { Base64Alphabet } from './uint8array.js'\nimport {\n utf8FromBase64Node,\n utf8FromBase64Ponyfill,\n} from './utf8-from-base64.js'\nimport { utf8FromBytesNative, utf8FromBytesNode } from './utf8-from-bytes.js'\nimport { graphemeLenNative, graphemeLenPonyfill } from './utf8-grapheme-len.js'\nimport { utf8LenCompute, utf8LenNode } from './utf8-len.js'\nimport { utf8ToBase64Node, utf8ToBase64Ponyfill } from './utf8-to-base64.js'\n\n/*\n Converts a Uint8Array to a UTF-8 string.\n \n Uses Node.js Buffer when available for performance, falling back to\n * TextDecoder in environments without Buffer support.\n \n @param bytes - The binary data to decode\n * @returns The decoded string (as UTF-16 JavaScript string)\n \n @example\n * ```typescript\n * import { utf8FromBytes } from '@atproto/lex-data'\n \n const bytes = new Uint8Array([72, 101, 108, 108, 111])\n * utf8FromBytes(bytes) // 'Hello'\n * ```\n /\nexport const utf8FromBytes = utf8FromBytesNode ?? 
utf8FromBytesNative\n\n/\n Counts the number of grapheme clusters (user-perceived characters) in a string.\n \n Grapheme clusters represent what users typically think of as \"characters\",\n * handling complex cases like:\n * - Emoji with skin tones and ZWJ sequences (e.g., family emoji)\n * - Combined characters (e.g., 'e' + combining accent)\n * - Regional indicator pairs (flag emoji)\n \n Uses native {@link Intl.Segmenter} when available, falling back to a ponyfill.\n \n @param str - The string to measure\n * @returns The number of grapheme clusters\n \n @example\n * ```typescript\n * import { graphemeLen } from '@atproto/lex-data'\n \n graphemeLen('hello') // 5\n * graphemeLen('cafe\\u0301') // 4 (cafe with combining accent)\n * graphemeLen('\\u{1F468}\\u{200D}\\u{1F469}\\u{200D}\\u{1F467}\\u{200D}\\u{1F466}') // 1 (family emoji)\n * ```\n /\nexport const graphemeLen: (str: string) => number =\n / v8 ignore next -- @preserve / graphemeLenNative ?? graphemeLenPonyfill\n\n/ v8 ignore next -- @preserve /\nif (graphemeLen === graphemeLenPonyfill) {\n /#__PURE__/\n console.warn(\n '[@atproto/lex-data]: Intl.Segmenter is not available in this environment. 
Falling back to ponyfill implementation.',\n )\n}\n\n/\n Calculates the UTF-8 byte length of a string.\n \n Returns the number of bytes the string would occupy when encoded as UTF-8.\n * This is important for Lexicon validation where schemas specify byte limits.\n \n Uses Node.js Buffer.byteLength when available for performance,\n * falling back to a computed implementation.\n \n @param str - The string to measure\n * @returns The UTF-8 byte length\n \n @example\n * ```typescript\n * import { utf8Len } from '@atproto/lex-data'\n \n utf8Len('hello') // 5 (ASCII: 1 byte per char)\n * utf8Len('\\u00e9') // 2 (e with accent: 2 bytes)\n * utf8Len('\\u{1F600}') // 4 (emoji: 4 bytes)\n * utf8Len('\\u{1F468}\\u{200D}\\u{1F469}\\u{200D}\\u{1F467}\\u{200D}\\u{1F466}') // 25 (family emoji)\n * ```\n /\nexport const utf8Len: (string: string) => number =\n / v8 ignore next -- @preserve / utf8LenNode ?? utf8LenCompute\n\n/\n Encodes a UTF-8 string to base64.\n \n First encodes the string as UTF-8 bytes, then encodes those bytes as base64.\n \n @param str - The string to encode\n * @param alphabet - The base64 alphabet to use ('base64' or 'base64url')\n * @returns The base64-encoded string\n \n @example\n * ```typescript\n * import { utf8ToBase64 } from '@atproto/lex-data'\n \n utf8ToBase64('Hello') // 'SGVsbG8='\n * ```\n /\nexport const utf8ToBase64: (str: string, alphabet?: Base64Alphabet) => string =\n / v8 ignore next -- @preserve / utf8ToBase64Node ?? 
utf8ToBase64Ponyfill\n\n/\n Decodes a base64 string to UTF-8.\n \n Decodes the base64 to bytes, then interprets those bytes as UTF-8 text.\n \n @param b64 - The base64 string to decode\n * @param alphabet - The base64 alphabet to use ('base64' or 'base64url')\n * @returns The decoded UTF-8 string\n \n @example\n * ```typescript\n * import { utf8FromBase64 } from '@atproto/lex-data'\n \n utf8FromBase64('SGVsbG8=') // 'Hello'\n * ```\n /\nexport const utf8FromBase64: (\n b64: string,\n alphabet?: Base64Alphabet,\n) => string =\n / v8 ignore next -- @preserve */ utf8FromBase64Node ?? utf8FromBase64Ponyfill\n"]}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@atproto/lex-data",
-  "version": "0.0.13",
+  "version": "0.0.15",
   "license": "MIT",
   "description": "Core utilities for AT Lexicons",
   "keywords": [

package/src/blob.test.ts CHANGED Viewed

@@ -3,8 +3,8 @@ import {
   BlobRef,
   LegacyBlobRef,
   enumBlobRefs,
-  isBlobRef,
   isLegacyBlobRef,
+  isTypedBlobRef,
 } from './blob.js'
 import { RawCid, parseCid } from './cid.js'
 import { LexArray, LexMap, LexValue } from './lex.js'
@@ -21,7 +21,7 @@ const invalidBlobCid = parseCid(
   { flavor: 'cbor' },
 )
-describe(isBlobRef, () => {
+describe(isTypedBlobRef, () => {
   it('tests valid blobCid and lexCid', () => {
     expect(validBlobCid.code).toBe(0x55) // raw
     expect(validBlobCid.multihash.code).toBe(0x12) // sha2-256
@@ -31,7 +31,7 @@ describe(isBlobRef, () => {
   it('parses valid blob', () => {
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: validBlobCid,
         mimeType: 'image/jpeg',
@@ -40,7 +40,7 @@ describe(isBlobRef, () => {
     ).toBe(true)
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           ref: invalidBlobCid,
@@ -55,7 +55,7 @@ describe(isBlobRef, () => {
   it('performs strict validation by default', () => {
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: invalidBlobCid,
         mimeType: 'image/jpeg',
@@ -66,7 +66,7 @@ describe(isBlobRef, () => {
   it('rejects invalid inputs', () => {
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: { $link: validBlobCid.toString() },
         mimeType: 'image/jpeg',
@@ -75,7 +75,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         // $type: 'blob',
         ref: validBlobCid,
         mimeType: 'image/jpeg',
@@ -84,7 +84,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: validBlobCid,
         mimeType: { toString: () => 'image/jpeg' },
@@ -93,7 +93,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           ref: { $link: validBlobCid.toString() },
@@ -105,7 +105,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         mimeType: 'image/jpeg',
         size: 10000,
@@ -113,7 +113,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           mimeType: 'image/jpeg',
@@ -123,15 +123,15 @@ describe(isBlobRef, () => {
       ),
     ).toBe(false)
-    expect(isBlobRef('not an object')).toBe(false)
-    expect(isBlobRef([])).toBe(false)
-    expect(isBlobRef(new Date())).toBe(false)
-    expect(isBlobRef(new Map())).toBe(false)
+    expect(isTypedBlobRef('not an object')).toBe(false)
+    expect(isTypedBlobRef([])).toBe(false)
+    expect(isTypedBlobRef(new Date())).toBe(false)
+    expect(isTypedBlobRef(new Map())).toBe(false)
   })
   it('rejects non-integer size', () => {
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: validBlobCid,
         mimeType: 'image/jpeg',
@@ -142,7 +142,7 @@ describe(isBlobRef, () => {
   it('rejects invalid CID/multihash code', () => {
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           ref: validBlobCid,
@@ -154,7 +154,7 @@ describe(isBlobRef, () => {
     ).toBe(true)
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           ref: invalidBlobCid,
@@ -168,7 +168,7 @@ describe(isBlobRef, () => {
   it('rejects extra keys', () => {
     expect(
-      isBlobRef({
+      isTypedBlobRef({
         $type: 'blob',
         ref: validBlobCid,
         mimeType: 'image/jpeg',
@@ -178,7 +178,7 @@ describe(isBlobRef, () => {
     ).toBe(false)
     expect(
-      isBlobRef(
+      isTypedBlobRef(
         {
           $type: 'blob',
           ref: validBlobCid,
@@ -197,7 +197,7 @@ describe(isBlobRef, () => {
         'QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG', // CID v0
       )
       expect(
-        isBlobRef(
+        isTypedBlobRef(
           {
             $type: 'blob',
             ref: cidV0,
@@ -221,14 +221,27 @@ describe(isLegacyBlobRef, () => {
     ).toBe(true)
     expect(
-      isLegacyBlobRef({
-        cid: invalidBlobCid.toString(),
-        mimeType: 'image/jpeg',
-      }),
+      isLegacyBlobRef(
+        {
+          cid: invalidBlobCid.toString(),
+          mimeType: 'image/jpeg',
+        },
+        { strict: false },
+      ),
     ).toBe(true)
   })
   it('rejects invalid inputs', () => {
+    expect(
+      isLegacyBlobRef(
+        {
+          cid: invalidBlobCid.toString(),
+          mimeType: 'image/jpeg',
+        },
+        { strict: true },
+      ),
+    ).toBe(false)
     expect(
       isLegacyBlobRef({
         cid: 'babbaaa',

package/src/blob.ts CHANGED Viewed

@@ -1,21 +1,172 @@
-import { Cid, RawCid, ifCid, validateCidString } from './cid.js'
+import {
+  CheckCidOptions,
+  Cid,
+  RawCid,
+  ifCid,
+  parseCid,
+  validateCidString,
+} from './cid.js'
 import { LexValue } from './lex.js'
 import { isPlainObject, isPlainProto } from './object.js'
+/**
+ * Options to use with {@link ifCid}, {@link validateCidString}, and related CID
+ * validation functions when validating CIDs in BlobRefs, in strict mode. This
+ * ensures that the CID is a {@link RawCid} (CID v1, raw multicodec, sha256
+ * multihash), which is the expected format for blob references in the AT
+ * Protocol data model.
+ */
+const STRICT_CID_CHECK_OPTIONS: CheckCidOptions = { flavor: 'raw' }
+// Number.isSafeInteger is actually safe to use with non-number values, so we
+// can use it as a type guard.
+const isSafeInteger = Number.isSafeInteger as (v: unknown) => v is number
+/**
+ * Reference to binary data (like images, videos, etc.) in the AT Protocol data
+ * model.
+ *
+ * This type represents a reference to a blob of binary data, identified by its
+ * content hash (CID) and accompanied by metadata such as MIME type and size.
+ *
+ * The {@link BlobRef} type is a union of the current {@link TypedBlobRef}
+ * format and the legacy {@link LegacyBlobRef} format.
+ */
+export type BlobRef<Ref extends Cid = Cid> = TypedBlobRef<Ref> | LegacyBlobRef
+/**
+ * Options for validating a {@link BlobRef}.
+ */
+export type BlobRefCheckOptions = {
+  /**
+   * If `false`, skips strict CID validation of {@link TypedBlobRef.ref} (or
+   * {@link LegacyBlobRef.cid}), allowing any valid CID. Otherwise, validates
+   * that the CID is v1, uses the raw multicodec, and has a sha256 multihash.
+   *
+   * @default true
+   */
+  strict?: boolean
+}
+/**
+ * Type guard to check if a value is a valid {@link BlobRef}, which can be
+ * either a {@link TypedBlobRef} or a {@link LegacyBlobRef}. By default, strict
+ * CID validation is applied to ensure that the CID in the blob reference is in
+ * the expected format for the AT Protocol, but this can be relaxed with the
+ * `strict: false` option.
+ */
+export function isBlobRef(input: unknown): input is BlobRef<RawCid>
+export function isBlobRef<TOptions extends BlobRefCheckOptions>(
+  input: unknown,
+  options: TOptions,
+): input is LegacyBlobRef | InferTypedBlobRef<TOptions>
+export function isBlobRef(
+  input: unknown,
+  options?: BlobRefCheckOptions,
+): input is BlobRef<RawCid>
+export function isBlobRef(
+  input: unknown,
+  options?: BlobRefCheckOptions,
+): input is BlobRef {
+  return (input as any)?.$type === 'blob'
+    ? isTypedBlobRef(input, options)
+    : isLegacyBlobRef(input, options)
+}
+/**
+ * Extracts the MIME type from a {@link BlobRef}.
+ *
+ * @example
+ * ```ts
+ * const mimeType = getBlobMime(blobRef)
+ * console.log(mimeType)  // e.g., 'image/jpeg'
+ * ```
+ */
+export function getBlobMime(blob: BlobRef): string
+export function getBlobMime(blob?: BlobRef): string | undefined
+export function getBlobMime(blob?: BlobRef): string | undefined {
+  return blob?.mimeType
+}
+/**
+ * Extracts the size (in bytes) from a {@link TypedBlobRef}. For
+ * {@link LegacyBlobRef}, size information is not available, so this function
+ * returns `undefined` for legacy refs.
+ *
+ * @note The size property, in blob refs, cannot be 100% trusted since the PDS
+ * might not have a local copy of the blob (to check the size against) and might
+ * just be passing through the blob ref from the client without validating it.
+ * So, while this function can be useful for getting size information when
+ * available, it should not be solely relied upon for critical functionality
+ * without additional validation.
+ *
+ * @example
+ * ```ts
+ * const size = getBlobSize(blobRef)
+ * if (size !== undefined) {
+ *   console.log(`Blob size: ${size} bytes`)
+ * } else {
+ *   console.log('Size information not available for legacy blob ref')
+ * }
+ * ```
+ */
+export function getBlobSize(blob: BlobRef): number | undefined {
+  if ('$type' in blob && blob.size >= 0) return blob.size
+  // LegacyBlobRef doesn't have size information
+  return undefined
+}
+/**
+ * Extracts the {@link Cid} from a {@link BlobRef}.
+ *
+ * @throws If the input is a {@link LegacyBlobRef} with an invalid CID string
+ * @example
+ * ```ts
+ * const cid = getBlobCid(blobRef)
+ * console.log(cid.bytes)
+ * ```
+ */
+export function getBlobCid(blob: BlobRef): Cid
+export function getBlobCid(blob?: BlobRef): Cid | undefined
+export function getBlobCid(blob?: BlobRef): Cid | undefined {
+  if (!blob) return undefined
+  return '$type' in blob ? blob.ref : parseCid(blob.cid)
+}
+/**
+ * Extracts the CID string from a {@link BlobRef}.
+ *
+ * This is similar to `getBlobCid(blob).toString()` but is more optimized since
+ * the CID string is already available in the legacy format and we can avoid
+ * parsing it into a CID object just to convert it back to a string.
+ *
+ * @example
+ * ```ts
+ * const cidString = getBlobCidString(blobRef)
+ * console.log(cidString)
+ * ```
+ */
+export function getBlobCidString(blob: BlobRef): string
+export function getBlobCidString(blob?: BlobRef): string | undefined
+export function getBlobCidString(blob?: BlobRef): string | undefined {
+  if (!blob) return undefined
+  return '$type' in blob ? blob.ref.toString() : blob.cid
+}
 /**
  * Reference to binary data (like images, videos, etc.) in the AT Protocol data model.
  *
- * A BlobRef is a {@link LexMap} with a specific structure that identifies binary
- * content by its content hash (CID), along with metadata about the content type
- * and size.
+ * A {@link TypedBlobRef} is a {@link LexMap} with a specific structure that
+ * identifies binary content by its content hash (CID), along with metadata
+ * about the content type and size.
  *
  * @typeParam Ref - The type of CID reference, defaults to any {@link Cid}
  *
  * @example
  * ```typescript
- * import type { BlobRef } from '@atproto/lex-data'
+ * import type { TypedBlobRef } from '@atproto/lex-data'
  *
- * const imageRef: BlobRef = {
+ * const imageRef: TypedBlobRef = {
  *   $type: 'blob',
  *   mimeType: 'image/jpeg',
  *   ref: cid,  // CID of the blob content
@@ -23,41 +174,27 @@ import { isPlainObject, isPlainProto } from './object.js'
  * }
  * ```
  *
- * @see {@link isBlobRef} to check if a value is a valid BlobRef
+ * @see {@link isTypedBlobRef} to check if a value is a valid {@link TypedBlobRef}
  * @see {@link LegacyBlobRef} for the older blob reference format
  */
-export type BlobRef<Ref extends Cid = Cid> = {
+export type TypedBlobRef<Ref extends Cid = Cid> = {
   $type: 'blob'
   mimeType: string
   ref: Ref
   size: number
 }
-/**
- * Options for validating a {@link BlobRef}.
- */
-export type BlobRefCheckOptions = {
-  /**
-   * If `false`, skips strict CID validation of {@link BlobRef.ref}, allowing
-   * any valid CID. Otherwise, validates that the CID is v1, uses the raw
-   * multicodec, and has a sha256 multihash.
-   *
-   * @default true
-   */
-  strict?: boolean
-}
 /**
  * Infers the {@link TypedBlobRef} type based on the check options.
  *
  * @typeParam TOptions - The options used for checking
  */
-export type InferCheckedBlobRef<TOptions extends BlobRefCheckOptions> =
+export type InferTypedBlobRef<TOptions extends BlobRefCheckOptions> =
   TOptions extends { strict: false }
-    ? BlobRef
+    ? TypedBlobRef
     : { strict: boolean } extends TOptions
-      ? BlobRef
-      : BlobRef<RawCid>
+      ? TypedBlobRef
+      : TypedBlobRef<RawCid>
 /**
  * Type guard to check if a value is a valid {@link TypedBlobRef}.
@@ -74,32 +211,32 @@ export type InferCheckedBlobRef<TOptions extends BlobRefCheckOptions> =
  *
  * @example
  * ```typescript
- * import { isBlobRef } from '@atproto/lex-data'
+ * import { isTypedBlobRef } from '@atproto/lex-data'
  *
- * if (isBlobRef(data)) {
+ * if (isTypedBlobRef(data)) {
  *   console.log(data.mimeType)  // e.g., 'image/jpeg'
  *   console.log(data.size)      // e.g., 12345
  * }
  *
  * // Allow any valid CID (not just raw CIDs)
- * if (isBlobRef(data, { strict: false })) {
+ * if (isTypedBlobRef(data, { strict: false })) {
  *   // ...
  * }
  * ```
  */
-export function isBlobRef(input: unknown): input is BlobRef<RawCid>
-export function isBlobRef<TOptions extends BlobRefCheckOptions>(
+export function isTypedBlobRef(input: unknown): input is TypedBlobRef<RawCid>
+export function isTypedBlobRef<TOptions extends BlobRefCheckOptions>(
   input: unknown,
   options: TOptions,
-): input is InferCheckedBlobRef<TOptions>
-export function isBlobRef(
+): input is InferTypedBlobRef<TOptions>
+export function isTypedBlobRef(
   input: unknown,
   options?: BlobRefCheckOptions,
-): input is BlobRef
-export function isBlobRef(
+): input is TypedBlobRef<RawCid>
+export function isTypedBlobRef(
   input: unknown,
   options?: BlobRefCheckOptions,
-): input is BlobRef {
+): input is TypedBlobRef {
   if (!isPlainObject(input)) {
     return false
   }
@@ -114,7 +251,10 @@ export function isBlobRef(
     return false
   }
-  if (typeof size !== 'number' || size < 0 || !Number.isSafeInteger(size)) {
+  if (size === -1 && options?.strict === false) {
+    // In non-strict mode, allow size to be -1 to accommodate legacy blob refs
+    // that don't include size information.
+  } else if (!isSafeInteger(size) || size < 0) {
     return false
   }
@@ -136,7 +276,7 @@ export function isBlobRef(
   const cid = ifCid(
     ref,
     // Strict unless explicitly disabled
-    options?.strict === false ? undefined : { flavor: 'raw' },
+    options?.strict === false ? undefined : STRICT_CID_CHECK_OPTIONS,
   )
   if (!cid) {
     return false
@@ -178,9 +318,6 @@ export type LegacyBlobRef = {
  * - `mimeType` must be a non-empty string
  * - No additional properties allowed
  *
- * @param input - The value to check
- * @returns `true` if the input is a valid LegacyBlobRef
- *
  * @example
  * ```typescript
  * import { isLegacyBlobRef } from '@atproto/lex-data'
@@ -191,9 +328,12 @@ export type LegacyBlobRef = {
  * }
  * ```
  *
- * @see {@link isBlobRef} for checking the current blob reference format
+ * @see {@link isTypedBlobRef} for checking the current blob reference format
  */
-export function isLegacyBlobRef(input: unknown): input is LegacyBlobRef {
+export function isLegacyBlobRef(
+  input: unknown,
+  options?: BlobRefCheckOptions,
+): input is LegacyBlobRef {
   if (!isPlainObject(input)) {
     return false
   }
@@ -213,7 +353,12 @@ export function isLegacyBlobRef(input: unknown): input is LegacyBlobRef {
     }
   }
-  if (!validateCidString(cid)) {
+  if (
+    !validateCidString(
+      cid,
+      options?.strict === false ? undefined : STRICT_CID_CHECK_OPTIONS,
+    )
+  ) {
     return false
   }
@@ -240,10 +385,10 @@ export type EnumBlobRefsOptions = BlobRefCheckOptions & {
  */
 export type InferEnumBlobRefs<TOptions extends EnumBlobRefsOptions> =
   TOptions extends { allowLegacy: true }
-    ? InferCheckedBlobRef<TOptions> | LegacyBlobRef
+    ? InferTypedBlobRef<TOptions> | LegacyBlobRef
     : { allowLegacy: boolean } extends TOptions
-      ? InferCheckedBlobRef<TOptions> | LegacyBlobRef
-      : InferCheckedBlobRef<TOptions>
+      ? InferTypedBlobRef<TOptions> | LegacyBlobRef
+      : InferTypedBlobRef<TOptions>
 /**
  * Generator that enumerates all {@link BlobRef}s (and, optionally,
@@ -273,8 +418,8 @@ export type InferEnumBlobRefs<TOptions extends EnumBlobRefsOptions> =
  * }
  *
  * // Include legacy blob references
- * for (const ref of enumBlobRefs(record, { allowLegacy: true })) {
- *   // ref may be BlobRef or LegacyBlobRef
+ * for (const ref of enumBlobRefs(record, { allowLegacy: true, strict: false })) {
+ *   // ref may be TypedBlobRef or LegacyBlobRef, with relaxed CID validation
  * }
  * ```
  */
@@ -288,11 +433,11 @@ export function enumBlobRefs<TOptions extends EnumBlobRefsOptions>(
 export function enumBlobRefs(
   input: LexValue,
   options?: EnumBlobRefsOptions,
-): Generator<BlobRef | LegacyBlobRef, void, unknown>
+): Generator<BlobRef, void, unknown>
 export function* enumBlobRefs(
   input: LexValue,
   options?: EnumBlobRefsOptions,
-): Generator<BlobRef | LegacyBlobRef, void, unknown> {
+): Generator<BlobRef, void, unknown> {
   // LegacyBlobRef not included by default
   const includeLegacy = options?.allowLegacy === true
@@ -315,9 +460,9 @@ export function* enumBlobRefs(
       } else if (isPlainProto(value)) {
         if (visited.has(value)) continue
         visited.add(value)
-        if (isBlobRef(value, options)) {
+        if (isTypedBlobRef(value, options)) {
           yield value
-        } else if (includeLegacy && isLegacyBlobRef(value)) {
+        } else if (includeLegacy && isLegacyBlobRef(value, options)) {
           yield value
         } else {
           for (const v of Object.values(value)) {

package/src/lib/nodejs-buffer.ts CHANGED Viewed

@@ -12,6 +12,11 @@ interface NodeJSBufferConstructor {
     input: Uint8Array | ArrayBuffer | ArrayBufferView,
   ): NodeJSBuffer<ArrayBuffer>
   from(input: string, encoding?: Encoding): NodeJSBuffer<ArrayBuffer>
+  from<TArrayBuffer extends ArrayBufferLike>(
+    arrayBuffer: WithImplicitCoercion<TArrayBuffer>,
+    byteOffset?: number,
+    length?: number,
+  ): Buffer<TArrayBuffer>
   concat(list: readonly Uint8Array[], totalLength?: number): NodeJSBuffer
   byteLength(input: string, encoding?: Encoding): number
   prototype: NodeJSBuffer

package/src/utf8-from-bytes.test.ts ADDED Viewed

@@ -0,0 +1,43 @@
+import { assert, describe, expect, it } from 'vitest'
+import { utf8FromBytesNative, utf8FromBytesNode } from './utf8-from-bytes.js'
+for (const utf8FromBytes of [utf8FromBytesNode, utf8FromBytesNative] as const) {
+  assert(utf8FromBytes, 'utf8FromBytes implementation should not be null')
+  describe(utf8FromBytes, () => {
+    it('decodes empty Uint8Array', () => {
+      const decoded = utf8FromBytes(new Uint8Array(0))
+      expect(typeof decoded).toBe('string')
+      expect(decoded).toBe('')
+    })
+    it('decodes 10MB', () => {
+      const bytes = Buffer.allocUnsafe(10_000_000).fill('🐩')
+      const decoded = utf8FromBytes(bytes)
+      expect(decoded).toBe('🐩'.repeat(10_000_000 / 4))
+    })
+    for (const string of [
+      '',
+      '\0\0',
+      '\0\0\0',
+      '\0\0\0\0',
+      '__',
+      'é',
+      'àç',
+      '\0éàç',
+      '```\x1b',
+      'aaa',
+      'Hello, World!',
+      '😀😃😄😁😆😅😂🤣😊😇',
+      '👩‍💻👨‍💻👩‍🔬👨‍🔬👩‍🚀👨‍🚀',
+      '🌍🌎🌏🌐🪐🌟✨⚡🔥💧',
+    ] as const) {
+      const buffer = Buffer.from(string, 'utf8')
+      it(`decodes ${JSON.stringify(string)}`, () => {
+        const decoded = utf8FromBytes(buffer)
+        expect(decoded).toBe(string)
+      })
+    }
+  })
+}

package/src/utf8-from-bytes.ts ADDED Viewed

@@ -0,0 +1,21 @@
+import { NodeJSBuffer } from './lib/nodejs-buffer.js'
+const Buffer = NodeJSBuffer
+export const utf8FromBytesNode = Buffer
+  ? function utf8FromBytesNode(bytes: Uint8Array): string {
+      // @NOTE Buffer.from(bytes) creates a copy of the ArrayBuffer. The following
+      // allows us to avoid the copy by creating a Buffer that shares the same
+      // memory as the input Uint8Array.
+      const buffer = Buffer.from(
+        bytes.buffer,
+        bytes.byteOffset,
+        bytes.byteLength,
+      )
+      return buffer.toString('utf8')
+    }
+  : /* v8 ignore next -- @preserve */ null
+export function utf8FromBytesNative(bytes: Uint8Array): string {
+  return new TextDecoder('utf-8').decode(bytes)
+}