npm - unrag - Versions diffs - 0.2.1 → 0.2.3 - Mend

unrag 0.2.1 → 0.2.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31)

package/README.md +2 -2
package/dist/cli/index.js +251 -42
package/package.json +2 -1
package/registry/config/unrag.config.ts +140 -7
package/registry/connectors/notion/render.ts +78 -0
package/registry/connectors/notion/sync.ts +12 -3
package/registry/connectors/notion/types.ts +3 -1
package/registry/core/assets.ts +54 -0
package/registry/core/config.ts +150 -0
package/registry/core/context-engine.ts +69 -1
package/registry/core/index.ts +15 -2
package/registry/core/ingest.ts +743 -17
package/registry/core/types.ts +606 -0
package/registry/docs/unrag.md +6 -0
package/registry/embedding/ai.ts +89 -8
package/registry/extractors/_shared/fetch.ts +113 -0
package/registry/extractors/_shared/media.ts +14 -0
package/registry/extractors/_shared/text.ts +11 -0
package/registry/extractors/audio-transcribe/index.ts +75 -0
package/registry/extractors/file-docx/index.ts +53 -0
package/registry/extractors/file-pptx/index.ts +92 -0
package/registry/extractors/file-text/index.ts +85 -0
package/registry/extractors/file-xlsx/index.ts +58 -0
package/registry/extractors/image-caption-llm/index.ts +60 -0
package/registry/extractors/image-ocr/index.ts +60 -0
package/registry/extractors/pdf-llm/index.ts +84 -0
package/registry/extractors/pdf-ocr/index.ts +125 -0
package/registry/extractors/pdf-text-layer/index.ts +76 -0
package/registry/extractors/video-frames/index.ts +126 -0
package/registry/extractors/video-transcribe/index.ts +78 -0
package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1

package/README.md CHANGED Viewed

@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
 ## Usage
 ```bash
-bunx unrag init
+bunx unrag@latest init
 ```
 ### Common flags
 ```bash
-bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
+bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
 ```
 - `--store`: `drizzle` | `prisma` | `raw-sql`

package/dist/cli/index.js CHANGED Viewed

@@ -74,8 +74,7 @@ var writeText = async (filePath, content) => {
 var renderUnragConfig = (content, selection) => {
   const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
   const baseImports = [
-    `import { createContextEngine, defineConfig } from "${installImportBase}/core";`,
-    `import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
+    `import { defineUnragConfig } from "${installImportBase}/core";`
   ];
   const storeImports = [];
   const storeCreateLines = [];
@@ -93,24 +92,14 @@ var renderUnragConfig = (content, selection) => {
 `);
   const createEngineBlock = [
     `export function createUnragEngine() {`,
-    `  const embedding = createAiEmbeddingProvider({`,
-    `    model: unragConfig.embedding.model,`,
-    `    timeoutMs: unragConfig.embedding.timeoutMs,`,
-    `  });`,
     ...storeCreateLines,
     ``,
-    `  return createContextEngine(`,
-    `    defineConfig({`,
-    `      embedding,`,
-    `      store,`,
-    `      defaults: unragConfig.chunking,`,
-    `    })`,
-    `  );`,
+    `  return unrag.createEngine({ store });`,
     `}`,
     ``,
     `export async function retrieve(query: string) {`,
     `  const engine = createUnragEngine();`,
-    `  return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
+    `  return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
     `}`
   ].join(`
 `);
@@ -147,6 +136,10 @@ async function copyRegistryFiles(selection) {
       src: path2.join(selection.registryRoot, "core/index.ts"),
       dest: path2.join(installBaseAbs, "core/index.ts")
     },
+    {
+      src: path2.join(selection.registryRoot, "core/assets.ts"),
+      dest: path2.join(installBaseAbs, "core/assets.ts")
+    },
     {
       src: path2.join(selection.registryRoot, "core/types.ts"),
       dest: path2.join(installBaseAbs, "core/types.ts")
@@ -163,6 +156,10 @@ async function copyRegistryFiles(selection) {
       src: path2.join(selection.registryRoot, "core/context-engine.ts"),
       dest: path2.join(installBaseAbs, "core/context-engine.ts")
     },
+    {
+      src: path2.join(selection.registryRoot, "core/delete.ts"),
+      dest: path2.join(installBaseAbs, "core/delete.ts")
+    },
     {
       src: path2.join(selection.registryRoot, "core/ingest.ts"),
       dest: path2.join(installBaseAbs, "core/ingest.ts")
@@ -262,6 +259,70 @@ async function copyConnectorFiles(selection) {
     await writeText(dest, raw);
   }
 }
+async function copyExtractorFiles(selection) {
+  const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
+  const installBaseAbs = toAbs(selection.installDir);
+  const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
+  const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
+  if (!await exists(extractorRegistryAbs)) {
+    throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
+  }
+  const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
+  const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
+  const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
+  const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
+  const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
+  for (const src of extractorFiles) {
+    if (!await exists(src)) {
+      throw new Error(`Registry file missing: ${src}`);
+    }
+    const rel = path2.relative(extractorRegistryAbs, src);
+    const dest = path2.join(destRootAbs, rel);
+    if (await exists(dest)) {
+      if (nonInteractive) {
+        continue;
+      }
+      const answer = await confirm({
+        message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
+        initialValue: false
+      });
+      if (isCancel(answer)) {
+        cancel("Cancelled.");
+        return;
+      }
+      if (!answer) {
+        continue;
+      }
+    }
+    const raw = await readText(src);
+    await writeText(dest, raw);
+  }
+  for (const src of sharedFiles) {
+    if (!await exists(src)) {
+      throw new Error(`Registry file missing: ${src}`);
+    }
+    const rel = path2.relative(sharedRegistryAbs, src);
+    const dest = path2.join(sharedDestRootAbs, rel);
+    if (await exists(dest)) {
+      if (nonInteractive) {
+        continue;
+      }
+      const answer = await confirm({
+        message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
+        initialValue: false
+      });
+      if (isCancel(answer)) {
+        cancel("Cancelled.");
+        return;
+      }
+      if (!answer) {
+        continue;
+      }
+    }
+    const raw = await readText(src);
+    await writeText(dest, raw);
+  }
+}
 // cli/lib/json.ts
 import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
@@ -347,6 +408,37 @@ function depsForConnector(connector) {
   }
   return { deps, devDeps };
 }
+function depsForExtractor(extractor) {
+  const deps = {};
+  const devDeps = {};
+  if (extractor === "pdf-llm") {
+    deps["ai"] = "^5.0.113";
+  }
+  if (extractor === "pdf-text-layer") {
+    deps["pdfjs-dist"] = "^5.4.149";
+  }
+  if (extractor === "pdf-ocr") {}
+  if (extractor === "image-ocr" || extractor === "image-caption-llm") {
+    deps["ai"] = "^5.0.113";
+  }
+  if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
+    deps["ai"] = "^5.0.113";
+  }
+  if (extractor === "video-frames") {
+    deps["ai"] = "^5.0.113";
+  }
+  if (extractor === "file-text") {}
+  if (extractor === "file-docx") {
+    deps["mammoth"] = "^1.10.0";
+  }
+  if (extractor === "file-pptx") {
+    deps["jszip"] = "^3.10.1";
+  }
+  if (extractor === "file-xlsx") {
+    deps["xlsx"] = "^0.18.5";
+  }
+  return { deps, devDeps };
+}
 function installCmd(pm) {
   if (pm === "bun")
     return "bun install";
@@ -550,7 +642,8 @@ async function initCommand(args) {
     storeAdapter: storeAdapterAnswer,
     aliasBase,
     version: CONFIG_VERSION,
-    connectors: existing?.connectors ?? []
+    connectors: existing?.connectors ?? [],
+    extractors: existing?.extractors ?? []
   };
   await writeJsonFile(path5.join(root, CONFIG_FILE), config);
   const pm = await detectPackageManager(root);
@@ -578,9 +671,34 @@ async function initCommand(args) {
 import { outro as outro2 } from "@clack/prompts";
 import path6 from "node:path";
 import { fileURLToPath as fileURLToPath2 } from "node:url";
+// cli/lib/constants.ts
+var UNRAG_SITE_URL = (process.env.UNRAG_SITE_URL ?? process.env.UNRAG_DOCS_BASE_URL)?.trim() || "https://unrag.dev";
+var UNRAG_GITHUB_REPO_URL = "https://github.com/BetterStacks/unrag";
+function docsUrl(siteRelativePath) {
+  const p = siteRelativePath.startsWith("/") ? siteRelativePath : `/${siteRelativePath}`;
+  const base = UNRAG_SITE_URL.endsWith("/") ? UNRAG_SITE_URL : `${UNRAG_SITE_URL}/`;
+  return new URL(p.replace(/^\/+/, "/"), base).toString();
+}
+// cli/commands/add.ts
 var CONFIG_FILE2 = "unrag.json";
 var __filename3 = fileURLToPath2(import.meta.url);
 var __dirname3 = path6.dirname(__filename3);
+var AVAILABLE_EXTRACTORS = [
+  "pdf-llm",
+  "pdf-text-layer",
+  "pdf-ocr",
+  "image-ocr",
+  "image-caption-llm",
+  "audio-transcribe",
+  "video-transcribe",
+  "video-frames",
+  "file-text",
+  "file-docx",
+  "file-pptx",
+  "file-xlsx"
+];
 var parseAddArgs = (args) => {
   const out = {};
   for (let i = 0;i < args.length; i++) {
@@ -589,8 +707,17 @@ var parseAddArgs = (args) => {
       out.yes = true;
       continue;
     }
-    if (!out.connector && !a.startsWith("-")) {
-      out.connector = a;
+    if (!out.kind && a && !a.startsWith("-")) {
+      if (a === "extractor") {
+        out.kind = "extractor";
+        continue;
+      }
+      out.kind = "connector";
+      out.name = a;
+      continue;
+    }
+    if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
+      out.name = a;
       continue;
     }
   }
@@ -602,23 +729,24 @@ async function addCommand(args) {
     throw new Error("Could not find a project root (no package.json found).");
   }
   const parsed = parseAddArgs(args);
-  const connector = parsed.connector;
-  if (!connector) {
-    outro2(`Usage: unrag add <connector>
-Available connectors: notion`);
-    return;
-  }
-  if (connector !== "notion") {
-    outro2(`Unknown connector: ${connector}
-Available connectors: notion`);
+  const kind = parsed.kind ?? "connector";
+  const name = parsed.name;
+  if (!name) {
+    outro2([
+      "Usage:",
+      "  unrag add <connector>",
+      "  unrag add extractor <name>",
+      "",
+      "Available connectors: notion",
+      `Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
+    ].join(`
+`));
     return;
   }
   const configPath = path6.join(root, CONFIG_FILE2);
   const config = await readJsonFile(configPath);
   if (!config?.installDir) {
-    throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
+    throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
   }
   const cliPackageRoot = await findUp(__dirname3, "package.json");
   if (!cliPackageRoot) {
@@ -626,40 +754,120 @@ Available connectors: notion`);
   }
   const registryRoot = path6.join(cliPackageRoot, "registry");
   const nonInteractive = parsed.yes || !process.stdin.isTTY;
-  await copyConnectorFiles({
+  const pkg = await readPackageJson(root);
+  if (kind === "connector") {
+    const connector = name;
+    if (connector !== "notion") {
+      outro2(`Unknown connector: ${name}
+Available connectors: notion`);
+      return;
+    }
+    await copyConnectorFiles({
+      projectRoot: root,
+      registryRoot,
+      installDir: config.installDir,
+      connector,
+      yes: nonInteractive
+    });
+    const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
+    const merged2 = mergeDeps(pkg, deps2, devDeps2);
+    if (merged2.changes.length > 0) {
+      await writePackageJson(root, merged2.pkg);
+    }
+    const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
+    await writeJsonFile(configPath, { ...config, connectors });
+    outro2([
+      `Installed connector: ${connector}.`,
+      "",
+      `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
+      `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
+      "",
+      merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
+      nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
+    ].filter(Boolean).join(`
+`));
+    return;
+  }
+  const extractor = name;
+  if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
+    outro2(`Unknown extractor: ${name}
+Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
+    return;
+  }
+  await copyExtractorFiles({
     projectRoot: root,
     registryRoot,
     installDir: config.installDir,
-    connector,
+    extractor,
     yes: nonInteractive
   });
-  const pkg = await readPackageJson(root);
-  const { deps, devDeps } = depsForConnector(connector);
+  const { deps, devDeps } = depsForExtractor(extractor);
   const merged = mergeDeps(pkg, deps, devDeps);
   if (merged.changes.length > 0) {
     await writePackageJson(root, merged.pkg);
   }
-  const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
-  await writeJsonFile(configPath, { ...config, connectors });
+  const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
+  await writeJsonFile(configPath, { ...config, extractors });
   outro2([
-    `Installed connector: ${connector}.`,
+    `Installed extractor: ${extractor}.`,
     "",
-    `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
-    `- Docs: /docs/connectors/${connector}`,
+    `- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
     "",
     merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
-    nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
+    "",
+    `Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
   ].filter(Boolean).join(`
 `));
 }
 // cli/run.ts
+function renderHelp() {
+  return [
+    "unrag — vendor-in RAG primitives (ingest/retrieve + adapters) into your repo.",
+    "",
+    "Usage:",
+    "  bunx unrag <command> [options]",
+    "  npx  unrag <command> [options]",
+    "",
+    "Commands:",
+    "  init                Install core files (config + store adapter templates)",
+    "  add <connector>     Install a connector (currently: notion)",
+    "  help                Show this help",
+    "",
+    "Global options:",
+    "  -h, --help           Show help",
+    "  -y, --yes            Non-interactive; accept defaults",
+    "",
+    "init options:",
+    "  --store <adapter>    drizzle | prisma | raw-sql",
+    "  --dir <path>         Install directory (alias: --install-dir)",
+    "  --alias <@name>      Import alias base (e.g. @unrag)",
+    "",
+    "Examples:",
+    "  bunx unrag@latest init",
+    "  bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
+    "  bunx unrag add notion --yes",
+    "",
+    "Docs:",
+    `  - Quickstart: ${docsUrl("/docs/getting-started/quickstart")}`,
+    `  - CLI:       ${docsUrl("/docs/reference/cli")}`,
+    `  - Notion:    ${docsUrl("/docs/connectors/notion")}`,
+    "",
+    "Repo:",
+    `  ${UNRAG_GITHUB_REPO_URL}`,
+    "",
+    "Tip:",
+    "  After `init`, open the generated unrag.md for schema + env vars (DATABASE_URL)."
+  ].join(`
+`);
+}
 async function run(argv) {
   const [, , command, ...rest] = argv;
   intro("unrag");
   if (!command || command === "help" || command === "--help" || command === "-h") {
-    outro3(["Usage:", "", "- unrag init", "- unrag add <connector>"].join(`
-`));
+    outro3(renderHelp());
     return;
   }
   if (command === "init") {
@@ -670,7 +878,8 @@ async function run(argv) {
     await addCommand(rest);
     return;
   }
-  outro3(`Unknown command: ${command}`);
+  outro3([`Unknown command: ${command}`, "", renderHelp()].join(`
+`));
   process.exitCode = 1;
 }

package/package.json CHANGED Viewed

@@ -1,10 +1,11 @@
 {
   "name": "unrag",
   "type": "module",
+  "repository": "https://github.com/BetterStacks/unrag",
   "bin": {
     "unrag": "./dist/cli/index.js"
   },
-  "version": "0.2.1",
+  "version": "0.2.3",
   "private": false,
   "license": "Apache-2.0",
   "devDependencies": {

package/registry/config/unrag.config.ts CHANGED Viewed

@@ -2,10 +2,10 @@
  * Root Unrag config (generated).
  *
  * This file is meant to be the single place you tweak:
+ * - Defaults (chunking + retrieval)
+ * - Engine settings (storage, asset processing, extractors)
  * - Embedding provider/model/timeouts
- * - Chunking defaults
- * - Retrieval defaults
- * - How you construct your DB client (Pool/Prisma/etc)
+ * - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
  *
  * The files under your install dir (e.g. `lib/unrag/**`) are intended to be
  * treated like vendored source code.
@@ -13,7 +13,8 @@
 // __UNRAG_IMPORTS__
-export const unragConfig = {
+export const unrag = defineUnragConfig({
+  defaults: {
   chunking: {
     chunkSize: 200,
     chunkOverlap: 40,
@@ -21,11 +22,143 @@ export const unragConfig = {
   retrieval: {
     topK: 8,
   },
+  },
   embedding: {
-    model: "openai/text-embedding-3-small",
-    timeoutMs: 15_000,
+    provider: "ai",
+    config: {
+      type: "text",
+      model: "openai/text-embedding-3-small",
+      timeoutMs: 15_000,
+    },
+  },
+  engine: {
+  /**
+   * Storage controls.
+   *
+   * - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
+   * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
+   */
+  storage: {
+    storeChunkContent: true,
+    storeDocumentContent: true,
+  },
+    /**
+     * Optional extractor modules that can process non-text assets into text outputs.
+     *
+     * To install:
+     * - `unrag add extractor pdf-llm`
+     *
+     * Then import it in this file and add it here, for example:
+     * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
+     * - `extractors: [createPdfLlmExtractor()]`
+     */
+    extractors: [],
+  /**
+   * Rich media processing controls.
+   *
+   * Notes:
+   * - The library defaults are cost-safe (PDF LLM extraction is off).
+   * - This generated config opts you into PDF extraction for convenience.
+   * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
+   */
+  assetProcessing: {
+    onUnsupportedAsset: "skip",
+    onError: "skip",
+    concurrency: 4,
+    fetch: {
+      enabled: true,
+      maxBytes: 15 * 1024 * 1024,
+      timeoutMs: 20_000,
+      // allowedHosts: ["..."], // recommended to mitigate SSRF
+    },
+    pdf: {
+      // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
+      textLayer: {
+        enabled: false,
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 200,
+        // maxPages: 200,
+      },
+      llmExtraction: {
+        enabled: true,
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+      },
+      // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
+      ocr: {
+        enabled: false,
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 200,
+        // maxPages: 200,
+        // pdftoppmPath: "/usr/bin/pdftoppm",
+        // tesseractPath: "/usr/bin/tesseract",
+        // dpi: 200,
+        // lang: "eng",
+      },
+    },
+    image: {
+      ocr: {
+        enabled: false,
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxBytes: 10 * 1024 * 1024,
+        maxOutputChars: 50_000,
+      },
+      captionLlm: {
+        enabled: false,
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
+        timeoutMs: 60_000,
+        maxBytes: 10 * 1024 * 1024,
+        maxOutputChars: 10_000,
+      },
+    },
+    audio: {
+      transcription: {
+        enabled: false,
+        model: "openai/whisper-1",
+        timeoutMs: 120_000,
+        maxBytes: 25 * 1024 * 1024,
+      },
+    },
+    video: {
+      transcription: {
+        enabled: false,
+        model: "openai/whisper-1",
+        timeoutMs: 120_000,
+        maxBytes: 50 * 1024 * 1024,
+      },
+      frames: {
+        enabled: false,
+        sampleFps: 0.2,
+        maxFrames: 50,
+        // ffmpegPath: "/usr/bin/ffmpeg",
+        maxBytes: 50 * 1024 * 1024,
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxOutputChars: 50_000,
+      },
+    },
+    file: {
+      text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+      docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+      pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+      xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+    },
+  },
   },
-} as const;
+} as const);
 // __UNRAG_CREATE_ENGINE__

package/registry/connectors/notion/render.ts CHANGED Viewed

@@ -1,3 +1,5 @@
+import type { AssetInput, AssetKind, Metadata } from "../../core";
 type RichText = { plain_text?: string };
 export type NotionBlock = {
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
 const indent = (n: number) => (n > 0 ? "  ".repeat(n) : "");
+const asString = (v: unknown) => String(v ?? "").trim();
+const supportedAssetKinds = new Set<AssetKind>([
+  "image",
+  "pdf",
+  "audio",
+  "video",
+  "file",
+]);
+const toAssetKind = (notionType: string): AssetKind | null => {
+  const t = notionType as AssetKind;
+  return supportedAssetKinds.has(t) ? t : null;
+};
+const pickUrl = (payload: any): string | undefined => {
+  const type = String(payload?.type ?? "");
+  if (type === "external") return asString(payload?.external?.url);
+  if (type === "file") return asString(payload?.file?.url);
+  return undefined;
+};
+const pickCaption = (payload: any): string => {
+  // Notion captions are typically an array of rich text items.
+  return rt(payload?.caption);
+};
+const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
+  if (assetKind === "pdf") return "application/pdf";
+  // Notion does not consistently include media types; keep it optional.
+  return asString(payload?.media_type) || undefined;
+};
+const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
+export function extractNotionAssets(
+  nodes: NotionBlockNode[],
+  opts: { maxDepth?: number } = {}
+): AssetInput[] {
+  const maxDepth = opts.maxDepth ?? 6;
+  const out: AssetInput[] = [];
+  const walk = (node: NotionBlockNode, depth: number) => {
+    if (depth > maxDepth) return;
+    const b = node.block as any;
+    const kind = toAssetKind(String(b.type ?? ""));
+    if (kind) {
+      const payload = b[kind];
+      const url = pickUrl(payload);
+      if (url) {
+        const caption = pickCaption(payload).trim();
+        const mediaType = inferMediaType(kind, payload);
+        out.push({
+          assetId: String(b.id),
+          kind,
+          data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
+          uri: url,
+          ...(caption ? { text: caption } : {}),
+          metadata: asMetadata({
+            connector: "notion",
+            notionBlockId: String(b.id),
+            notionBlockType: String(b.type),
+          }),
+        });
+      }
+    }
+    for (const child of node.children) {
+      walk(child, depth + 1);
+    }
+  };
+  for (const n of nodes) walk(n, 0);
+  return out;
+}
 export function renderNotionBlocksToText(
   nodes: NotionBlockNode[],
   opts: { maxDepth?: number } = {}