npm - unrag - Versions diffs - 0.2.2 → 0.2.4 - Mend

unrag 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/README.md +2 -2
package/dist/cli/index.js +408 -50
package/package.json +3 -1
package/registry/config/unrag.config.ts +164 -7
package/registry/connectors/notion/render.ts +78 -0
package/registry/connectors/notion/sync.ts +12 -3
package/registry/connectors/notion/types.ts +3 -1
package/registry/core/assets.ts +54 -0
package/registry/core/config.ts +150 -0
package/registry/core/context-engine.ts +69 -1
package/registry/core/index.ts +15 -2
package/registry/core/ingest.ts +743 -17
package/registry/core/types.ts +606 -0
package/registry/docs/unrag.md +6 -0
package/registry/embedding/ai.ts +89 -8
package/registry/extractors/_shared/fetch.ts +113 -0
package/registry/extractors/_shared/media.ts +14 -0
package/registry/extractors/_shared/text.ts +11 -0
package/registry/extractors/audio-transcribe/index.ts +75 -0
package/registry/extractors/file-docx/index.ts +53 -0
package/registry/extractors/file-pptx/index.ts +92 -0
package/registry/extractors/file-text/index.ts +85 -0
package/registry/extractors/file-xlsx/index.ts +58 -0
package/registry/extractors/image-caption-llm/index.ts +60 -0
package/registry/extractors/image-ocr/index.ts +60 -0
package/registry/extractors/pdf-llm/index.ts +84 -0
package/registry/extractors/pdf-ocr/index.ts +125 -0
package/registry/extractors/pdf-text-layer/index.ts +76 -0
package/registry/extractors/video-frames/index.ts +126 -0
package/registry/extractors/video-transcribe/index.ts +78 -0
package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1

package/registry/config/unrag.config.ts CHANGED

@@ -2,18 +2,21 @@
  * Root Unrag config (generated).
  *
  * This file is meant to be the single place you tweak:
+ * - Defaults (chunking + retrieval)
+ * - Engine settings (storage, asset processing, extractors)
  * - Embedding provider/model/timeouts
- * - Chunking defaults
- * - Retrieval defaults
- * - How you construct your DB client (Pool/Prisma/etc)
+ * - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
  *
  * The files under your install dir (e.g. `lib/unrag/**`) are intended to be
  * treated like vendored source code.
  */
+// @ts-nocheck
 // __UNRAG_IMPORTS__
-export const unragConfig = {
+export const unrag = defineUnragConfig({
+  defaults: {
   chunking: {
     chunkSize: 200,
     chunkOverlap: 40,
@@ -21,11 +24,165 @@ export const unragConfig = {
   retrieval: {
     topK: 8,
   },
+  },
   embedding: {
-    model: "openai/text-embedding-3-small",
-    timeoutMs: 15_000,
+    provider: "ai",
+    config: {
+      type: "text", // __UNRAG_EMBEDDING_TYPE__
+      model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
+      timeoutMs: 15_000,
+    },
+  },
+  engine: {
+  /**
+   * Storage controls.
+   *
+   * - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
+   * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
+   */
+  storage: {
+    storeChunkContent: true,
+    storeDocumentContent: true,
+  },
+    /**
+     * Optional extractor modules that can process non-text assets into text outputs.
+     *
+     * To install:
+     * - `unrag add extractor pdf-llm`
+     *
+     * Then import it in this file and add it here, for example:
+     * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
+     * - `extractors: [createPdfLlmExtractor()]`
+     */
+    extractors: [
+      // __UNRAG_EXTRACTORS__
+    ],
+  /**
+   * Rich media processing controls.
+   *
+   * Notes:
+   * - This generated config is cost-safe by default (all extraction is off).
+   * - `unrag init` can enable rich media + multimodal embeddings for you.
+   * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
+   */
+  assetProcessing: {
+    onUnsupportedAsset: "skip",
+    onError: "skip",
+    concurrency: 4,
+    fetch: {
+      enabled: true,
+      maxBytes: 15 * 1024 * 1024,
+      timeoutMs: 20_000,
+      // allowedHosts: ["..."], // recommended to mitigate SSRF
+    },
+    pdf: {
+      // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
+      textLayer: {
+        enabled: false, // __UNRAG_FLAG_pdf_textLayer__
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 200,
+        // maxPages: 200,
+      },
+      llmExtraction: {
+        enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+      },
+      // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
+      ocr: {
+        enabled: false, // __UNRAG_FLAG_pdf_ocr__
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 200,
+        // maxPages: 200,
+        // pdftoppmPath: "/usr/bin/pdftoppm",
+        // tesseractPath: "/usr/bin/tesseract",
+        // dpi: 200,
+        // lang: "eng",
+      },
+    },
+    image: {
+      ocr: {
+        enabled: false, // __UNRAG_FLAG_image_ocr__
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxBytes: 10 * 1024 * 1024,
+        maxOutputChars: 50_000,
+      },
+      captionLlm: {
+        enabled: false, // __UNRAG_FLAG_image_captionLlm__
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
+        timeoutMs: 60_000,
+        maxBytes: 10 * 1024 * 1024,
+        maxOutputChars: 10_000,
+      },
+    },
+    audio: {
+      transcription: {
+        enabled: false, // __UNRAG_FLAG_audio_transcription__
+        model: "openai/whisper-1",
+        timeoutMs: 120_000,
+        maxBytes: 25 * 1024 * 1024,
+      },
+    },
+    video: {
+      transcription: {
+        enabled: false, // __UNRAG_FLAG_video_transcription__
+        model: "openai/whisper-1",
+        timeoutMs: 120_000,
+        maxBytes: 50 * 1024 * 1024,
+      },
+      frames: {
+        enabled: false, // __UNRAG_FLAG_video_frames__
+        sampleFps: 0.2,
+        maxFrames: 50,
+        // ffmpegPath: "/usr/bin/ffmpeg",
+        maxBytes: 50 * 1024 * 1024,
+        model: "google/gemini-2.0-flash",
+        prompt:
+          "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
+        timeoutMs: 60_000,
+        maxOutputChars: 50_000,
+      },
+    },
+    file: {
+      text: {
+        enabled: false, // __UNRAG_FLAG_file_text__
+        maxBytes: 5 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 50,
+      },
+      docx: {
+        enabled: false, // __UNRAG_FLAG_file_docx__
+        maxBytes: 15 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 50,
+      },
+      pptx: {
+        enabled: false, // __UNRAG_FLAG_file_pptx__
+        maxBytes: 30 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 50,
+      },
+      xlsx: {
+        enabled: false, // __UNRAG_FLAG_file_xlsx__
+        maxBytes: 30 * 1024 * 1024,
+        maxOutputChars: 200_000,
+        minChars: 50,
+      },
+    },
+  },
   },
-} as const;
+} as const);
 // __UNRAG_CREATE_ENGINE__

package/registry/connectors/notion/render.ts CHANGED

@@ -1,3 +1,5 @@
+import type { AssetInput, AssetKind, Metadata } from "../../core";
 type RichText = { plain_text?: string };
 export type NotionBlock = {
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
 const indent = (n: number) => (n > 0 ? "  ".repeat(n) : "");
+// Coerces any value to a trimmed string; null and undefined become "".
+const asString = (v: unknown) => String(v ?? "").trim();
+// Block type names that this module treats as assets.
+const supportedAssetKinds = new Set<AssetKind>([
+  "image",
+  "pdf",
+  "audio",
+  "video",
+  "file",
+]);
+// Returns the given type name as an AssetKind when it is listed in
+// supportedAssetKinds, otherwise null.
+const toAssetKind = (notionType: string): AssetKind | null => {
+  const t = notionType as AssetKind;
+  return supportedAssetKinds.has(t) ? t : null;
+};
+// Reads the asset URL from a block payload, keyed on `payload.type`:
+// "external" -> payload.external.url, "file" -> payload.file.url.
+// Any other type yields undefined. A recognised type with a missing URL yields ""
+// (asString of undefined), which callers can treat as falsy.
+const pickUrl = (payload: any): string | undefined => {
+  const type = String(payload?.type ?? "");
+  if (type === "external") return asString(payload?.external?.url);
+  if (type === "file") return asString(payload?.file?.url);
+  return undefined;
+};
+// Converts `payload.caption` to a string by passing it through `rt`.
+const pickCaption = (payload: any): string => {
+  // Notion captions are typically an array of rich text items.
+  return rt(payload?.caption);
+};
+// Picks a media type for an asset: always "application/pdf" for the "pdf" kind;
+// otherwise the trimmed `payload.media_type`, or undefined when it is missing or empty.
+const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
+  if (assetKind === "pdf") return "application/pdf";
+  // Notion does not consistently include media types; keep it optional.
+  return asString(payload?.media_type) || undefined;
+};
+// Unchecked cast of a plain record to Metadata (goes through `any`; nothing is validated).
+const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
+/**
+ * Collects asset inputs from a tree of Notion block nodes.
+ *
+ * Each root node is walked depth-first starting at depth 0; a node is handled before
+ * its children. A block contributes an asset when its `type` is one of the supported
+ * asset kinds and `pickUrl` returns a non-empty URL for the payload stored under
+ * `block[type]`. Blocks that do not qualify are skipped, but their children are
+ * still visited.
+ *
+ * @param nodes Root nodes to walk.
+ * @param opts.maxDepth Deepest level that is still visited (default 6). Nodes deeper
+ *   than this are ignored together with their subtrees.
+ * @returns The collected assets in visit order.
+ */
+export function extractNotionAssets(
+  nodes: NotionBlockNode[],
+  opts: { maxDepth?: number } = {}
+): AssetInput[] {
+  const maxDepth = opts.maxDepth ?? 6;
+  const out: AssetInput[] = [];
+  const walk = (node: NotionBlockNode, depth: number) => {
+    // Strictly greater-than: nodes at exactly maxDepth are still processed.
+    if (depth > maxDepth) return;
+    const b = node.block as any;
+    const kind = toAssetKind(String(b.type ?? ""));
+    if (kind) {
+      // The payload lives under the key named after the block type (e.g. b.image).
+      const payload = b[kind];
+      const url = pickUrl(payload);
+      if (url) {
+        const caption = pickCaption(payload).trim();
+        const mediaType = inferMediaType(kind, payload);
+        out.push({
+          // The block id doubles as the asset id.
+          assetId: String(b.id),
+          kind,
+          // mediaType and text are only included when non-empty.
+          data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
+          uri: url,
+          ...(caption ? { text: caption } : {}),
+          metadata: asMetadata({
+            connector: "notion",
+            notionBlockId: String(b.id),
+            notionBlockType: String(b.type),
+          }),
+        });
+      }
+    }
+    for (const child of node.children) {
+      walk(child, depth + 1);
+    }
+  };
+  for (const n of nodes) walk(n, 0);
+  return out;
+}
 export function renderNotionBlocksToText(
   nodes: NotionBlockNode[],
   opts: { maxDepth?: number } = {}

package/registry/connectors/notion/sync.ts CHANGED

@@ -1,8 +1,12 @@
-import type { ContextEngine } from "../../core";
-import type { IngestResult } from "../../core/types";
+import type { IngestResult } from "../../core";
 import { createNotionClient, type NotionClient } from "./client";
 import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
-import { renderNotionBlocksToText, type NotionBlock, type NotionBlockNode } from "./render";
+import {
+  extractNotionAssets,
+  renderNotionBlocksToText,
+  type NotionBlock,
+  type NotionBlockNode,
+} from "./render";
 import type {
   BuildNotionPageIngestInputArgs,
   NotionPageDocument,
@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
     sourceId,
     content: args.content,
     metadata: args.metadata ?? {},
+    assets: args.assets ?? [],
   };
 }
@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
   const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
   const body = renderNotionBlocksToText(tree);
   const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
+  const assets = extractNotionAssets(tree);
   const metadata = {
     connector: "notion",
@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
   const ingest = buildNotionPageIngestInput({
     pageId,
     content,
+    assets,
     metadata: metadata as any,
     sourceIdPrefix: args.sourceIdPrefix,
   });
@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
     sourceId: ingest.sourceId,
     content: ingest.content,
     metadata: ingest.metadata ?? {},
+    assets: ingest.assets ?? [],
   };
 }
@@ -178,6 +186,7 @@ export async function syncNotionPages(
       const result: IngestResult = await input.engine.ingest({
         sourceId: doc.sourceId,
         content: doc.content,
+        assets: doc.assets,
         metadata: doc.metadata as any,
       });

package/registry/connectors/notion/types.ts CHANGED

@@ -1,5 +1,5 @@
 import type { ContextEngine } from "../../core";
-import type { IngestInput } from "../../core/types";
+import type { AssetInput, IngestInput } from "../../core";
 export type NotionSyncProgressEvent =
   | { type: "page:start"; pageId: string; sourceId: string }
@@ -42,11 +42,13 @@ export type NotionPageDocument = {
   sourceId: string;
   content: string;
   metadata: Record<string, unknown>;
+  assets: AssetInput[];
 };
 export type BuildNotionPageIngestInputArgs = {
   pageId: string; // normalized 32-hex (no dashes)
   content: string;
+  assets?: AssetInput[];
   metadata?: Record<string, unknown>;
   sourceIdPrefix?: string;
 };

package/registry/core/assets.ts ADDED

@@ -0,0 +1,54 @@
+import type { AssetKind, Chunk } from "./types";
+/**
+ * Asset reference read from a chunk's metadata (see `getChunkAssetRef`).
+ * `assetId` and `assetKind` are always present; the optional fields are only set
+ * when the metadata holds a non-empty string for them.
+ */
+export type ChunkAssetRef = {
+  assetId: string;
+  assetKind: AssetKind;
+  assetUri?: string;
+  assetMediaType?: string;
+  extractor?: string;
+};
+// Asset kinds accepted in `metadata.assetKind`.
+const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]);
+/**
+ * Convenience helper to extract an asset reference from a retrieved chunk.
+ *
+ * Asset chunks are represented as standard text chunks whose `metadata` contains:
+ * - `assetKind`: "image" | "pdf" | "audio" | "video" | "file"
+ * - `assetId`: stable identifier emitted by the connector/ingester
+ * - optional `assetUri`, `assetMediaType`, and `extractor`
+ *
+ * @param chunk Any object carrying chunk `metadata`; only that field is read.
+ * @returns The asset reference, or `null` when `assetKind` is not one of the known
+ *   kinds or `assetId` is not a non-empty string. Optional fields are omitted unless
+ *   the metadata value is a non-empty string.
+ */
+export function getChunkAssetRef(
+  chunk: Pick<Chunk, "metadata">
+): ChunkAssetRef | null {
+  const meta = chunk.metadata as any;
+  const kind = meta?.assetKind;
+  const id = meta?.assetId;
+  if (typeof kind !== "string" || !assetKinds.has(kind as AssetKind)) {
+    return null;
+  }
+  if (typeof id !== "string" || !id) {
+    return null;
+  }
+  const assetUri = typeof meta?.assetUri === "string" ? meta.assetUri : undefined;
+  const assetMediaType =
+    typeof meta?.assetMediaType === "string" ? meta.assetMediaType : undefined;
+  const extractor =
+    typeof meta?.extractor === "string" ? meta.extractor : undefined;
+  return {
+    assetId: id,
+    assetKind: kind as AssetKind,
+    ...(assetUri ? { assetUri } : {}),
+    ...(assetMediaType ? { assetMediaType } : {}),
+    ...(extractor ? { extractor } : {}),
+  };
+}
+/** Returns true when `getChunkAssetRef` finds a valid asset reference in the chunk's metadata. */
+export function isAssetChunk(chunk: Pick<Chunk, "metadata">): boolean {
+  return getChunkAssetRef(chunk) !== null;
+}

package/registry/core/config.ts CHANGED

@@ -2,6 +2,9 @@ import type {
   Chunker,
   ContextEngineConfig,
   ResolvedContextEngineConfig,
+  AssetProcessingConfig,
+  DeepPartial,
+  ContentStorageConfig,
 } from "./types";
 import { defaultChunker, resolveChunkingOptions } from "./chunking";
@@ -10,6 +13,150 @@ export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =
 const defaultIdGenerator = () => crypto.randomUUID();
+// Default model identifiers referenced by defaultAssetProcessingConfig below.
+const DEFAULT_PDF_LLM_MODEL = "google/gemini-2.0-flash";
+const DEFAULT_IMAGE_OCR_MODEL = "google/gemini-2.0-flash";
+const DEFAULT_IMAGE_CAPTION_MODEL = "google/gemini-2.0-flash";
+const DEFAULT_AUDIO_TRANSCRIBE_MODEL = "openai/whisper-1";
+const DEFAULT_VIDEO_TRANSCRIBE_MODEL = "openai/whisper-1";
+/**
+ * Library defaults for asset processing, used as the merge base in
+ * `resolveAssetProcessingConfig`.
+ *
+ * Every extractor section ships with `enabled: false`; only `fetch.enabled` is true.
+ * Unsupported assets and extraction errors both default to "skip".
+ */
+export const defaultAssetProcessingConfig: AssetProcessingConfig = {
+  onUnsupportedAsset: "skip",
+  onError: "skip",
+  concurrency: 4,
+  hooks: {
+    onEvent: undefined,
+  },
+  fetch: {
+    enabled: true,
+    allowedHosts: undefined,
+    maxBytes: 15 * 1024 * 1024, // 15MB
+    timeoutMs: 20_000,
+    headers: undefined,
+  },
+  pdf: {
+    textLayer: {
+      enabled: false,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+      minChars: 200,
+      maxPages: undefined,
+    },
+    llmExtraction: {
+      enabled: false, // library default (cost-safe)
+      model: DEFAULT_PDF_LLM_MODEL,
+      prompt:
+        "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+    },
+    ocr: {
+      enabled: false,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+      minChars: 200,
+      maxPages: undefined,
+      pdftoppmPath: undefined,
+      tesseractPath: undefined,
+      dpi: 200,
+      lang: "eng",
+    },
+  },
+  image: {
+    ocr: {
+      enabled: false,
+      model: DEFAULT_IMAGE_OCR_MODEL,
+      prompt:
+        "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxBytes: 10 * 1024 * 1024, // 10MB
+      maxOutputChars: 50_000,
+    },
+    captionLlm: {
+      enabled: false,
+      model: DEFAULT_IMAGE_CAPTION_MODEL,
+      prompt:
+        "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
+      timeoutMs: 60_000,
+      maxBytes: 10 * 1024 * 1024, // 10MB
+      maxOutputChars: 10_000,
+    },
+  },
+  audio: {
+    transcription: {
+      enabled: false,
+      model: DEFAULT_AUDIO_TRANSCRIBE_MODEL,
+      timeoutMs: 120_000,
+      maxBytes: 25 * 1024 * 1024, // 25MB
+    },
+  },
+  video: {
+    transcription: {
+      enabled: false,
+      model: DEFAULT_VIDEO_TRANSCRIBE_MODEL,
+      timeoutMs: 120_000,
+      maxBytes: 50 * 1024 * 1024, // 50MB
+    },
+    frames: {
+      enabled: false,
+      sampleFps: 0.2,
+      maxFrames: 50,
+      ffmpegPath: undefined,
+      maxBytes: 50 * 1024 * 1024, // 50MB
+      // Written as a literal here rather than via one of the DEFAULT_* constants.
+      model: "google/gemini-2.0-flash",
+      prompt:
+        "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxOutputChars: 50_000,
+    },
+  },
+  file: {
+    text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+    docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+    pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+    xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+  },
+};
+/** Library defaults for content storage: both chunk and document content flags are on. */
+export const defaultContentStorageConfig: ContentStorageConfig = {
+  storeChunkContent: true,
+  storeDocumentContent: true,
+};
+/**
+ * Recursively merges `overrides` onto `base` and returns the result.
+ *
+ * - When `overrides` is falsy, `base` itself is returned (same reference, not a copy).
+ * - Otherwise a shallow copy of `base` is made and each own enumerable key of
+ *   `overrides` is applied: `undefined` values are skipped; when both sides hold
+ *   non-array objects the merge recurses; anything else (primitives, arrays, null)
+ *   replaces the base value wholesale.
+ *
+ * NOTE(review): because of the early return and the shallow copy, a merged result can
+ * share nested objects with `base`, so mutating the result may mutate `base`.
+ * Confirm that callers treat the result as read-only.
+ */
+const mergeDeep = <T extends Record<string, any>>(
+  base: T,
+  overrides: DeepPartial<T> | undefined
+): T => {
+  if (!overrides) return base;
+  const out: any = Array.isArray(base) ? [...base] : { ...base };
+  for (const key of Object.keys(overrides) as Array<keyof T>) {
+    const nextVal = overrides[key];
+    // An explicit `undefined` override keeps the base value.
+    if (nextVal === undefined) continue;
+    const baseVal = base[key];
+    if (
+      baseVal &&
+      typeof baseVal === "object" &&
+      !Array.isArray(baseVal) &&
+      nextVal &&
+      typeof nextVal === "object" &&
+      !Array.isArray(nextVal)
+    ) {
+      out[key] = mergeDeep(baseVal, nextVal as any);
+    } else {
+      out[key] = nextVal as any;
+    }
+  }
+  return out as T;
+};
+/** Deep-merges optional overrides onto `defaultAssetProcessingConfig` via `mergeDeep`. */
+export const resolveAssetProcessingConfig = (
+  overrides?: DeepPartial<AssetProcessingConfig>
+): AssetProcessingConfig => mergeDeep(defaultAssetProcessingConfig, overrides);
+/** Deep-merges optional overrides onto `defaultContentStorageConfig` via `mergeDeep`. */
+export const resolveContentStorageConfig = (
+  overrides?: DeepPartial<ContentStorageConfig>
+): ContentStorageConfig => mergeDeep(defaultContentStorageConfig, overrides);
 export const resolveConfig = (
   config: ContextEngineConfig
 ): ResolvedContextEngineConfig => {
@@ -21,6 +168,9 @@ export const resolveConfig = (
     defaults: resolveChunkingOptions(config.defaults),
     chunker,
     idGenerator: config.idGenerator ?? defaultIdGenerator,
+    extractors: config.extractors ?? [],
+    storage: resolveContentStorageConfig(config.storage),
+    assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
   };
 };

package/registry/core/context-engine.ts CHANGED

@@ -1,15 +1,21 @@
 import { deleteDocuments } from "./delete";
-import { ingest } from "./ingest";
+import { ingest, planIngest } from "./ingest";
 import { retrieve } from "./retrieve";
 import { defineConfig, resolveConfig } from "./config";
+import { createAiEmbeddingProvider } from "../embedding/ai";
 import type {
+  AssetExtractor,
   ContextEngineConfig,
   DeleteInput,
+  DefineUnragConfigInput,
+  EmbeddingProvider,
   IngestInput,
   IngestResult,
+  IngestPlanResult,
   ResolvedContextEngineConfig,
   RetrieveInput,
   RetrieveResult,
+  UnragCreateEngineRuntime,
 } from "./types";
 export class ContextEngine {
@@ -23,6 +29,16 @@ export class ContextEngine {
     return ingest(this.config, input);
   }
+  /**
+   * Dry-run for ingestion. Returns which assets would be processed and by which extractors,
+   * without calling external services.
+   *
+   * Note: chunk counts/embeddings are not produced in dry-run.
+   *
+   * Delegates to `planIngest` from "./ingest", passing `this.config` and the input unchanged.
+   *
+   * @param input Ingest input, typed the same as for `ingest()`.
+   * @returns The plan produced by `planIngest`.
+   */
+  async planIngest(input: IngestInput): Promise<IngestPlanResult> {
+    return planIngest(this.config, input);
+  }
   async retrieve(input: RetrieveInput): Promise<RetrieveResult> {
     return retrieve(this.config, input);
   }
@@ -37,4 +53,56 @@ export const createContextEngine = (config: ContextEngineConfig) =>
 export { defineConfig };
+/**
+ * Ergonomic, higher-level config wrapper.
+ *
+ * This helps keep `unrag.config.ts` as a single source of truth while still
+ * allowing runtime wiring (DB client/store, optional extractors).
+ *
+ * Behavior visible in this function:
+ * - The embedding provider is built lazily on the first `createEngineConfig` call and
+ *   then cached. Provider "ai" uses `createAiEmbeddingProvider(config.embedding.config)`;
+ *   any other provider calls `config.embedding.create()`.
+ * - `defaults.chunking` falls back to `{}` and `defaults.retrieval.topK` to 8.
+ * - `runtime.extractors` may be a function (receives the config's extractors and returns
+ *   the final list), an array (replaces them), or omitted (the config's list is used).
+ * - `config.engine` is spread first, so `defaults`, `embedding`, `store` and `extractors`
+ *   set here take precedence over same-named keys in it.
+ *
+ * @param config The declarative Unrag config.
+ * @returns `{ defaults, createEngineConfig, createEngine }`.
+ */
+export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) => {
+  let embeddingProvider: EmbeddingProvider | undefined;
+  const getEmbeddingProvider = () => {
+    if (embeddingProvider) return embeddingProvider;
+    if (config.embedding.provider === "ai") {
+      embeddingProvider = createAiEmbeddingProvider(config.embedding.config);
+      return embeddingProvider;
+    }
+    embeddingProvider = config.embedding.create();
+    return embeddingProvider;
+  };
+  const defaults = {
+    chunking: config.defaults?.chunking ?? {},
+    retrieval: {
+      topK: config.defaults?.retrieval?.topK ?? 8,
+    },
+  } as const;
+  const createEngineConfig = (runtime: UnragCreateEngineRuntime): ContextEngineConfig => {
+    const baseExtractors = (config.engine?.extractors ?? []) as AssetExtractor[];
+    const extractors =
+      typeof runtime.extractors === "function"
+        ? runtime.extractors(baseExtractors)
+        : runtime.extractors ?? baseExtractors;
+    return defineConfig({
+      ...(config.engine ?? {}),
+      // Only the chunking defaults go into the engine config; retrieval defaults are
+      // exposed through the returned `defaults` object instead.
+      defaults: defaults.chunking,
+      embedding: getEmbeddingProvider(),
+      store: runtime.store,
+      extractors,
+    });
+  };
+  return {
+    defaults,
+    createEngineConfig,
+    createEngine: (runtime: UnragCreateEngineRuntime) =>
+      new ContextEngine(createEngineConfig(runtime)),
+  };
+};

package/registry/core/index.ts CHANGED

@@ -1,8 +1,21 @@
-export { ContextEngine, createContextEngine, defineConfig } from "./context-engine";
+export {
+  ContextEngine,
+  createContextEngine,
+  defineConfig,
+  defineUnragConfig,
+} from "./context-engine";
 export { deleteDocuments } from "./delete";
-export { ingest } from "./ingest";
+export { ingest, planIngest } from "./ingest";
 export { retrieve } from "./retrieve";
 export { defaultChunker, resolveChunkingOptions } from "./chunking";
+export {
+  defaultAssetProcessingConfig,
+  defaultContentStorageConfig,
+  resolveAssetProcessingConfig,
+  resolveContentStorageConfig,
+} from "./config";
+export { getChunkAssetRef, isAssetChunk } from "./assets";
+export type { ChunkAssetRef } from "./assets";
 export * from "./types";