npm - unrag - Versions diffs - 0.2.2 → 0.2.3 - Mend

unrag 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/README.md +2 -2
package/dist/cli/index.js +199 -41
package/package.json +2 -1
package/registry/config/unrag.config.ts +140 -7
package/registry/connectors/notion/render.ts +78 -0
package/registry/connectors/notion/sync.ts +12 -3
package/registry/connectors/notion/types.ts +3 -1
package/registry/core/assets.ts +54 -0
package/registry/core/config.ts +150 -0
package/registry/core/context-engine.ts +69 -1
package/registry/core/index.ts +15 -2
package/registry/core/ingest.ts +743 -17
package/registry/core/types.ts +606 -0
package/registry/docs/unrag.md +6 -0
package/registry/embedding/ai.ts +89 -8
package/registry/extractors/_shared/fetch.ts +113 -0
package/registry/extractors/_shared/media.ts +14 -0
package/registry/extractors/_shared/text.ts +11 -0
package/registry/extractors/audio-transcribe/index.ts +75 -0
package/registry/extractors/file-docx/index.ts +53 -0
package/registry/extractors/file-pptx/index.ts +92 -0
package/registry/extractors/file-text/index.ts +85 -0
package/registry/extractors/file-xlsx/index.ts +58 -0
package/registry/extractors/image-caption-llm/index.ts +60 -0
package/registry/extractors/image-ocr/index.ts +60 -0
package/registry/extractors/pdf-llm/index.ts +84 -0
package/registry/extractors/pdf-ocr/index.ts +125 -0
package/registry/extractors/pdf-text-layer/index.ts +76 -0
package/registry/extractors/video-frames/index.ts +126 -0
package/registry/extractors/video-transcribe/index.ts +78 -0
package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1

package/registry/connectors/notion/types.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import type { ContextEngine } from "../../core";
-import type { IngestInput } from "../../core/types";
+import type { AssetInput, IngestInput } from "../../core";
 export type NotionSyncProgressEvent =
   | { type: "page:start"; pageId: string; sourceId: string }
@@ -42,11 +42,13 @@ export type NotionPageDocument = {
   sourceId: string;
   content: string;
   metadata: Record<string, unknown>;
+  assets: AssetInput[];
 };
 export type BuildNotionPageIngestInputArgs = { // arguments describing one Notion page to build an ingest input from (per the type name; the consuming builder is not shown here)
   pageId: string; // normalized 32-hex (no dashes)
   content: string;
+  assets?: AssetInput[]; // optional assets accompanying the page; field added in this version
   metadata?: Record<string, unknown>; // optional free-form metadata
   sourceIdPrefix?: string; // NOTE(review): presumably prefixed onto the derived sourceId — confirm in the builder
 };

package/registry/core/assets.ts ADDED Viewed

@@ -0,0 +1,54 @@
+import type { AssetKind, Chunk } from "./types";
+export type ChunkAssetRef = { // asset reference that getChunkAssetRef derives from a chunk's metadata
+  assetId: string; // taken from metadata.assetId; always a non-empty string
+  assetKind: AssetKind; // taken from metadata.assetKind after validation against the known kinds
+  assetUri?: string; // set only when metadata.assetUri is a non-empty string
+  assetMediaType?: string; // set only when metadata.assetMediaType is a non-empty string
+  extractor?: string; // set only when metadata.extractor is a non-empty string
+};
+const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]); // allow-list used by getChunkAssetRef to validate metadata.assetKind
+/**
+ * Convenience helper to extract an asset reference from a retrieved chunk.
+ * Returns `null` when `assetKind` is not a known kind or `assetId` is missing or empty.
+ * Asset chunks are represented as standard text chunks whose `metadata` contains:
+ * - `assetKind`: "image" | "pdf" | "audio" | "video" | "file"
+ * - `assetId`: stable identifier emitted by the connector/ingester
+ * - optional `assetUri`, `assetMediaType`, and `extractor`
+ */
+export function getChunkAssetRef(
+  chunk: Pick<Chunk, "metadata">
+): ChunkAssetRef | null {
+  const meta = chunk.metadata as any; // untyped view; each field read below is type-checked by hand
+  const kind = meta?.assetKind;
+  const id = meta?.assetId;
+  if (typeof kind !== "string" || !assetKinds.has(kind as AssetKind)) { // not a string, or not one of the known kinds
+    return null;
+  }
+  if (typeof id !== "string" || !id) { // not a string, or the empty string
+    return null;
+  }
+  const assetUri = typeof meta?.assetUri === "string" ? meta.assetUri : undefined;
+  const assetMediaType =
+    typeof meta?.assetMediaType === "string" ? meta.assetMediaType : undefined;
+  const extractor =
+    typeof meta?.extractor === "string" ? meta.extractor : undefined;
+  return {
+    assetId: id,
+    assetKind: kind as AssetKind,
+    ...(assetUri ? { assetUri } : {}), // optional keys are omitted entirely when absent or empty, never set to undefined
+    ...(assetMediaType ? { assetMediaType } : {}),
+    ...(extractor ? { extractor } : {}),
+  };
+}
+export function isAssetChunk(chunk: Pick<Chunk, "metadata">): boolean { // true when the chunk's metadata yields a valid asset reference
+  return getChunkAssetRef(chunk) !== null;
+}

package/registry/core/config.ts CHANGED Viewed

@@ -2,6 +2,9 @@ import type {
   Chunker,
   ContextEngineConfig,
   ResolvedContextEngineConfig,
+  AssetProcessingConfig,
+  DeepPartial,
+  ContentStorageConfig,
 } from "./types";
 import { defaultChunker, resolveChunkingOptions } from "./chunking";
@@ -10,6 +13,150 @@ export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =
 const defaultIdGenerator = () => crypto.randomUUID();
+const DEFAULT_PDF_LLM_MODEL = "google/gemini-2.0-flash"; // default model ids referenced by defaultAssetProcessingConfig
+const DEFAULT_IMAGE_OCR_MODEL = "google/gemini-2.0-flash";
+const DEFAULT_IMAGE_CAPTION_MODEL = "google/gemini-2.0-flash";
+const DEFAULT_AUDIO_TRANSCRIBE_MODEL = "openai/whisper-1";
+const DEFAULT_VIDEO_TRANSCRIBE_MODEL = "openai/whisper-1"; // same value as the audio default
+export const defaultAssetProcessingConfig: AssetProcessingConfig = { // library defaults; resolveAssetProcessingConfig merges user overrides onto this object
+  onUnsupportedAsset: "skip",
+  onError: "skip",
+  concurrency: 4,
+  hooks: {
+    onEvent: undefined,
+  },
+  fetch: {
+    enabled: true, // the only `enabled` flag that defaults to true; every flag under pdf/image/audio/video/file below is false
+    allowedHosts: undefined,
+    maxBytes: 15 * 1024 * 1024, // 15MB
+    timeoutMs: 20_000,
+    headers: undefined,
+  },
+  pdf: {
+    textLayer: {
+      enabled: false,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+      minChars: 200,
+      maxPages: undefined,
+    },
+    llmExtraction: {
+      enabled: false, // library default (cost-safe)
+      model: DEFAULT_PDF_LLM_MODEL,
+      prompt:
+        "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+    },
+    ocr: {
+      enabled: false,
+      maxBytes: 15 * 1024 * 1024, // 15MB
+      maxOutputChars: 200_000,
+      minChars: 200,
+      maxPages: undefined,
+      pdftoppmPath: undefined,
+      tesseractPath: undefined,
+      dpi: 200,
+      lang: "eng",
+    },
+  },
+  image: {
+    ocr: {
+      enabled: false,
+      model: DEFAULT_IMAGE_OCR_MODEL,
+      prompt:
+        "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxBytes: 10 * 1024 * 1024, // 10MB
+      maxOutputChars: 50_000,
+    },
+    captionLlm: {
+      enabled: false,
+      model: DEFAULT_IMAGE_CAPTION_MODEL,
+      prompt:
+        "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
+      timeoutMs: 60_000,
+      maxBytes: 10 * 1024 * 1024, // 10MB
+      maxOutputChars: 10_000,
+    },
+  },
+  audio: {
+    transcription: {
+      enabled: false,
+      model: DEFAULT_AUDIO_TRANSCRIBE_MODEL,
+      timeoutMs: 120_000,
+      maxBytes: 25 * 1024 * 1024, // 25MB
+    },
+  },
+  video: {
+    transcription: {
+      enabled: false,
+      model: DEFAULT_VIDEO_TRANSCRIBE_MODEL,
+      timeoutMs: 120_000,
+      maxBytes: 50 * 1024 * 1024, // 50MB
+    },
+    frames: {
+      enabled: false,
+      sampleFps: 0.2, // if this is frames per second, 0.2 means one frame every 5 seconds — TODO confirm unit in the extractor
+      maxFrames: 50,
+      ffmpegPath: undefined,
+      maxBytes: 50 * 1024 * 1024, // 50MB
+      model: "google/gemini-2.0-flash", // NOTE(review): the only model here given as a bare literal instead of a DEFAULT_* constant
+      prompt:
+        "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
+      timeoutMs: 60_000,
+      maxOutputChars: 50_000,
+    },
+  },
+  file: {
+    text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 }, // 5MB
+    docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 }, // 15MB
+    pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 }, // 30MB
+    xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 }, // 30MB
+  },
+};
+export const defaultContentStorageConfig: ContentStorageConfig = { // library defaults; resolveContentStorageConfig merges user overrides onto this object
+  storeChunkContent: true, // both storage flags default to true
+  storeDocumentContent: true,
+};
+const mergeDeep = <T extends Record<string, any>>( // recursively overlays `overrides` onto `base` and returns the merged result
+  base: T,
+  overrides: DeepPartial<T> | undefined
+): T => {
+  if (!overrides) return base; // NOTE(review): returns `base` itself, not a copy — callers then share the object passed in
+  const out: any = Array.isArray(base) ? [...base] : { ...base }; // shallow copy; nested values not named in `overrides` stay shared with `base`
+  for (const key of Object.keys(overrides) as Array<keyof T>) {
+    const nextVal = overrides[key];
+    if (nextVal === undefined) continue; // an explicit undefined never replaces a base value
+    const baseVal = base[key];
+    if ( // recurse only when both sides are non-null, non-array objects
+      baseVal &&
+      typeof baseVal === "object" &&
+      !Array.isArray(baseVal) &&
+      nextVal &&
+      typeof nextVal === "object" &&
+      !Array.isArray(nextVal)
+    ) {
+      out[key] = mergeDeep(baseVal, nextVal as any);
+    } else {
+      out[key] = nextVal as any; // arrays, primitives, functions and null replace the base value wholesale
+    }
+  }
+  return out as T;
+};
+export const resolveAssetProcessingConfig = ( // overlays optional overrides onto defaultAssetProcessingConfig via mergeDeep
+  overrides?: DeepPartial<AssetProcessingConfig>
+): AssetProcessingConfig => mergeDeep(defaultAssetProcessingConfig, overrides);
+export const resolveContentStorageConfig = ( // overlays optional overrides onto defaultContentStorageConfig via mergeDeep
+  overrides?: DeepPartial<ContentStorageConfig>
+): ContentStorageConfig => mergeDeep(defaultContentStorageConfig, overrides);
 export const resolveConfig = (
   config: ContextEngineConfig
 ): ResolvedContextEngineConfig => {
@@ -21,6 +168,9 @@ export const resolveConfig = (
     defaults: resolveChunkingOptions(config.defaults),
     chunker,
     idGenerator: config.idGenerator ?? defaultIdGenerator,
+    extractors: config.extractors ?? [],
+    storage: resolveContentStorageConfig(config.storage),
+    assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
   };
 };

package/registry/core/context-engine.ts CHANGED Viewed

@@ -1,15 +1,21 @@
 import { deleteDocuments } from "./delete";
-import { ingest } from "./ingest";
+import { ingest, planIngest } from "./ingest";
 import { retrieve } from "./retrieve";
 import { defineConfig, resolveConfig } from "./config";
+import { createAiEmbeddingProvider } from "../embedding/ai";
 import type {
+  AssetExtractor,
   ContextEngineConfig,
   DeleteInput,
+  DefineUnragConfigInput,
+  EmbeddingProvider,
   IngestInput,
   IngestResult,
+  IngestPlanResult,
   ResolvedContextEngineConfig,
   RetrieveInput,
   RetrieveResult,
+  UnragCreateEngineRuntime,
 } from "./types";
 export class ContextEngine {
@@ -23,6 +29,16 @@ export class ContextEngine {
     return ingest(this.config, input);
   }
+  /**
+   * Dry-run for ingestion. Returns which assets would be processed and by which extractors,
+   * without calling external services.
+   *
+   * Note: chunk counts/embeddings are not produced in dry-run.
+   */
+  async planIngest(input: IngestInput): Promise<IngestPlanResult> {
+    return planIngest(this.config, input); // delegates to the planIngest function imported from "./ingest", passing this engine's config
+  }
   async retrieve(input: RetrieveInput): Promise<RetrieveResult> {
     return retrieve(this.config, input);
   }
@@ -37,4 +53,56 @@ export const createContextEngine = (config: ContextEngineConfig) =>
 export { defineConfig };
+/**
+ * Ergonomic, higher-level config wrapper.
+ *
+ * This helps keep `unrag.config.ts` as a single source of truth while still
+ * allowing runtime wiring (DB client/store, optional extractors).
+ */
+export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) => {
+  let embeddingProvider: EmbeddingProvider | undefined; // created on first use, then reused by every later call
+  const getEmbeddingProvider = () => {
+    if (embeddingProvider) return embeddingProvider;
+    if (config.embedding.provider === "ai") { // built-in "ai" provider; any other value uses the caller-supplied create() factory below
+      embeddingProvider = createAiEmbeddingProvider(config.embedding.config);
+      return embeddingProvider;
+    }
+    embeddingProvider = config.embedding.create();
+    return embeddingProvider;
+  };
+  const defaults = {
+    chunking: config.defaults?.chunking ?? {},
+    retrieval: {
+      topK: config.defaults?.retrieval?.topK ?? 8, // falls back to 8 when no topK is configured
+    },
+  } as const;
+  const createEngineConfig = (runtime: UnragCreateEngineRuntime): ContextEngineConfig => {
+    const baseExtractors = (config.engine?.extractors ?? []) as AssetExtractor[];
+    const extractors = // runtime.extractors may be a function of the base list, a replacement list, or absent (base list is used)
+      typeof runtime.extractors === "function"
+        ? runtime.extractors(baseExtractors)
+        : runtime.extractors ?? baseExtractors;
+    return defineConfig({
+      ...(config.engine ?? {}), // keys listed after this spread win over same-named keys in config.engine
+      defaults: defaults.chunking, // only the chunking defaults go to the engine config; defaults.retrieval is exposed on the returned object only
+      embedding: getEmbeddingProvider(),
+      store: runtime.store,
+      extractors,
+    });
+  };
+  return {
+    defaults,
+    createEngineConfig,
+    createEngine: (runtime: UnragCreateEngineRuntime) =>
+      new ContextEngine(createEngineConfig(runtime)),
+  };
+};

package/registry/core/index.ts CHANGED Viewed

@@ -1,8 +1,21 @@
-export { ContextEngine, createContextEngine, defineConfig } from "./context-engine";
+export {
+  ContextEngine,
+  createContextEngine,
+  defineConfig,
+  defineUnragConfig,
+} from "./context-engine";
 export { deleteDocuments } from "./delete";
-export { ingest } from "./ingest";
+export { ingest, planIngest } from "./ingest";
 export { retrieve } from "./retrieve";
 export { defaultChunker, resolveChunkingOptions } from "./chunking";
+export {
+  defaultAssetProcessingConfig,
+  defaultContentStorageConfig,
+  resolveAssetProcessingConfig,
+  resolveContentStorageConfig,
+} from "./config";
+export { getChunkAssetRef, isAssetChunk } from "./assets";
+export type { ChunkAssetRef } from "./assets";
 export * from "./types";