unrag 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +408 -50
  3. package/package.json +3 -1
  4. package/registry/config/unrag.config.ts +164 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
@@ -17,6 +17,20 @@ export type Chunk = {
17
17
  documentContent?: string;
18
18
  };
19
19
 
20
+ /**
21
+ * Controls what text Unrag persists to the backing store.
22
+ *
23
+ * - `storeChunkContent`: whether to persist `chunks.content` (what you get back as `chunk.content` in retrieval).
24
+ * - `storeDocumentContent`: whether to persist the full original document text (`documents.content`).
25
+ *
26
+ * Disabling these can be useful for privacy/compliance or when you have an external
27
+ * content store and want Unrag to keep only embeddings + identifiers/metadata.
28
+ */
29
+ export type ContentStorageConfig = {
30
+ storeChunkContent: boolean;
31
+ storeDocumentContent: boolean;
32
+ };
33
+
20
34
  export type ChunkText = {
21
35
  index: number;
22
36
  content: string;
@@ -30,6 +44,429 @@ export type ChunkingOptions = {
30
44
 
31
45
  export type Chunker = (content: string, options: ChunkingOptions) => ChunkText[];
32
46
 
47
+ /**
48
+ * Data reference for an ingested asset.
49
+ *
50
+ * Prefer `bytes` when possible (most reliable). URLs are convenient for connectors
51
+ * but require network fetch at ingest time (see assetProcessing.fetch safety settings).
52
+ */
53
+ export type AssetData =
54
+ | {
55
+ kind: "url";
56
+ /** HTTPS URL to fetch the bytes from. */
57
+ url: string;
58
+ /** Optional request headers (e.g. signed URLs). */
59
+ headers?: Record<string, string>;
60
+ /**
61
+ * Optional media type hint.
62
+ * Useful when the URL doesn't have a stable extension (e.g. signed URLs).
63
+ */
64
+ mediaType?: string;
65
+ /** Optional filename hint. */
66
+ filename?: string;
67
+ }
68
+ | {
69
+ kind: "bytes";
70
+ /** Raw bytes of the asset. */
71
+ bytes: Uint8Array;
72
+ /** IANA media type (e.g. "application/pdf", "image/png"). */
73
+ mediaType: string;
74
+ /** Optional filename hint. */
75
+ filename?: string;
76
+ };
77
+
78
+ export type AssetKind = "image" | "pdf" | "audio" | "video" | "file";
79
+
80
+ /**
81
+ * Non-text input attached to an ingested document.
82
+ *
83
+ * Connectors should emit stable `assetId`s (e.g. Notion block id) so downstream
84
+ * systems can associate chunks back to their originating rich media.
85
+ */
86
+ export type AssetInput = {
87
+ /** Stable identifier within the document/source (e.g. block id). */
88
+ assetId: string;
89
+ kind: AssetKind;
90
+ data: AssetData;
91
+ /**
92
+ * Optional stable-ish URI for debugging/display (may be the same as data.url).
93
+ * This value is stored in metadata; do not assume it will be fetchable later.
94
+ */
95
+ uri?: string;
96
+ /**
97
+ * Optional text already known for the asset (caption/alt text).
98
+ * This can be embedded as normal text chunks and can also be passed to extractors.
99
+ */
100
+ text?: string;
101
+ /** Optional per-asset metadata (merged into chunk metadata). */
102
+ metadata?: Metadata;
103
+ };
104
+
105
+ export type AssetPolicy = "skip" | "fail";
106
+
107
+ export type AssetFetchConfig = {
108
+ /**
109
+ * When true, the engine may fetch asset bytes from URLs during ingest.
110
+ * Disable in high-security environments; provide `bytes` instead.
111
+ */
112
+ enabled: boolean;
113
+ /**
114
+ * Optional allowlist of hostnames. When set, only these hosts can be fetched.
115
+ * Recommended to mitigate SSRF.
116
+ */
117
+ allowedHosts?: string[];
118
+ /** Hard cap on fetched bytes. */
119
+ maxBytes: number;
120
+ /** Fetch timeout in milliseconds. */
121
+ timeoutMs: number;
122
+ /** Extra headers to attach to all fetches (merged with per-asset headers). */
123
+ headers?: Record<string, string>;
124
+ };
125
+
126
+ export type PdfLlmExtractionConfig = {
127
+ /**
128
+ * When enabled, PDFs are sent to an LLM to extract text, which is then chunked
129
+ * and embedded as normal text.
130
+ *
131
+ * Library default: false (cost-safe).
132
+ * Generated config template may set this to true for convenience.
133
+ */
134
+ enabled: boolean;
135
+ /**
136
+ * AI Gateway model id (Vercel AI SDK), e.g. "google/gemini-2.0-flash".
137
+ * This must be a model that supports file inputs for PDF extraction.
138
+ */
139
+ model: string;
140
+ /**
141
+ * Prompt used for extraction. Keep it deterministic: \"extract faithfully\".
142
+ * The output is later chunked and embedded.
143
+ */
144
+ prompt: string;
145
+ /** LLM call timeout in milliseconds. */
146
+ timeoutMs: number;
147
+ /** Hard cap on input PDF bytes. */
148
+ maxBytes: number;
149
+ /** Hard cap on extracted text length (characters). */
150
+ maxOutputChars: number;
151
+ };
152
+
153
+ export type PdfTextLayerConfig = {
154
+ /**
155
+ * When enabled, PDFs are processed by extracting the built-in text layer (when present).
156
+ * This is fast/cheap but won't work well for scanned/image-only PDFs.
157
+ */
158
+ enabled: boolean;
159
+ /** Max PDF bytes to attempt text-layer extraction on. */
160
+ maxBytes: number;
161
+ /** Hard cap on extracted text length (characters). */
162
+ maxOutputChars: number;
163
+ /**
164
+ * Minimum extracted characters required to accept the result. If fewer chars are extracted,
165
+ * the extractor should return empty output so the pipeline can fall back to another extractor.
166
+ */
167
+ minChars: number;
168
+ /**
169
+ * Optional cap on pages to read (defense-in-depth for huge PDFs).
170
+ * Extractors may ignore this when they can't reliably compute page count.
171
+ */
172
+ maxPages?: number;
173
+ };
174
+
175
+ export type PdfOcrConfig = {
176
+ /**
177
+ * When enabled, PDFs are rendered to images and OCR'd.
178
+ * This is typically worker-only (needs binaries like poppler/tesseract or external services).
179
+ */
180
+ enabled: boolean;
181
+ /** Max PDF bytes to attempt OCR on. */
182
+ maxBytes: number;
183
+ /** Hard cap on extracted text length (characters). */
184
+ maxOutputChars: number;
185
+ /** Minimum extracted characters required to accept the OCR output. */
186
+ minChars: number;
187
+ /** Optional max pages to OCR (defense-in-depth). */
188
+ maxPages?: number;
189
+ /** Optional path to `pdftoppm` (Poppler). */
190
+ pdftoppmPath?: string;
191
+ /** Optional path to `tesseract`. */
192
+ tesseractPath?: string;
193
+ /** DPI for rasterization (higher = better OCR, slower/larger). */
194
+ dpi?: number;
195
+ /** Tesseract language code (e.g. "eng"). */
196
+ lang?: string;
197
+ };
198
+
199
+ export type ImageOcrConfig = {
200
+ /** When enabled, images can be OCR'd into text chunks. */
201
+ enabled: boolean;
202
+ /** Model id (AI Gateway) for vision OCR. */
203
+ model: string;
204
+ /** Prompt used for deterministic OCR extraction. */
205
+ prompt: string;
206
+ timeoutMs: number;
207
+ /** Hard cap on input bytes (enforced by fetch + extractor). */
208
+ maxBytes: number;
209
+ /** Hard cap on extracted text length (characters). */
210
+ maxOutputChars: number;
211
+ };
212
+
213
+ export type ImageCaptionLlmConfig = {
214
+ /** When enabled, images can have captions generated via a vision-capable LLM. */
215
+ enabled: boolean;
216
+ model: string;
217
+ prompt: string;
218
+ timeoutMs: number;
219
+ maxBytes: number;
220
+ maxOutputChars: number;
221
+ };
222
+
223
+ export type AudioTranscriptionConfig = {
224
+ /** When enabled, audio assets can be transcribed into text chunks. */
225
+ enabled: boolean;
226
+ /** Provider/model id (AI Gateway) for transcription. */
227
+ model: string;
228
+ timeoutMs: number;
229
+ maxBytes: number;
230
+ };
231
+
232
+ export type VideoTranscriptionConfig = {
233
+ /** When enabled, video assets can be transcribed (audio track) into text chunks. */
234
+ enabled: boolean;
235
+ model: string;
236
+ timeoutMs: number;
237
+ maxBytes: number;
238
+ };
239
+
240
+ export type VideoFramesConfig = {
241
+ /**
242
+ * When enabled, video frames can be sampled and processed (OCR/caption).
243
+ * This is typically worker-only (requires ffmpeg and significant runtime).
244
+ */
245
+ enabled: boolean;
246
+ sampleFps: number;
247
+ maxFrames: number;
248
+ /** Optional path to ffmpeg binary (worker environments). */
249
+ ffmpegPath?: string;
250
+ /** Hard cap on video bytes for frame sampling. */
251
+ maxBytes: number;
252
+ /** Vision-capable model id (AI Gateway) for per-frame processing. */
253
+ model: string;
254
+ /** Prompt to apply to each sampled frame. */
255
+ prompt: string;
256
+ /** Timeout per frame analysis call. */
257
+ timeoutMs: number;
258
+ /** Hard cap on total extracted text length (characters). */
259
+ maxOutputChars: number;
260
+ };
261
+
262
+ export type FileTextConfig = {
263
+ /** When enabled, text-ish files (txt/md/html) can be extracted into chunks. */
264
+ enabled: boolean;
265
+ maxBytes: number;
266
+ maxOutputChars: number;
267
+ minChars: number;
268
+ };
269
+
270
+ export type FileDocxConfig = {
271
+ enabled: boolean;
272
+ maxBytes: number;
273
+ maxOutputChars: number;
274
+ minChars: number;
275
+ };
276
+
277
+ export type FilePptxConfig = {
278
+ enabled: boolean;
279
+ maxBytes: number;
280
+ maxOutputChars: number;
281
+ minChars: number;
282
+ };
283
+
284
+ export type FileXlsxConfig = {
285
+ enabled: boolean;
286
+ maxBytes: number;
287
+ maxOutputChars: number;
288
+ minChars: number;
289
+ };
290
+
291
+ export type AssetProcessingConfig = {
292
+ /**
293
+ * What to do when an asset kind is present but unsupported (e.g. audio in v1).
294
+ * Recommended default: \"skip\".
295
+ */
296
+ onUnsupportedAsset: AssetPolicy;
297
+ /** What to do when processing an asset fails (fetch/LLM errors). */
298
+ onError: AssetPolicy;
299
+ /**
300
+ * Bounded concurrency for asset processing (extraction + any I/O).
301
+ * This does not affect text chunking/embedding batching.
302
+ */
303
+ concurrency: number;
304
+ /**
305
+ * Optional hooks for observability (structured events).
306
+ * Prefer this over ad-hoc logging inside extractors.
307
+ */
308
+ hooks?: {
309
+ onEvent?: (event: AssetProcessingEvent) => void;
310
+ };
311
+ /** Network fetch settings for URL-based assets. */
312
+ fetch: AssetFetchConfig;
313
+ pdf: {
314
+ textLayer: PdfTextLayerConfig;
315
+ llmExtraction: PdfLlmExtractionConfig;
316
+ ocr: PdfOcrConfig;
317
+ };
318
+ image: {
319
+ ocr: ImageOcrConfig;
320
+ captionLlm: ImageCaptionLlmConfig;
321
+ };
322
+ audio: {
323
+ transcription: AudioTranscriptionConfig;
324
+ };
325
+ video: {
326
+ transcription: VideoTranscriptionConfig;
327
+ frames: VideoFramesConfig;
328
+ };
329
+ file: {
330
+ text: FileTextConfig;
331
+ docx: FileDocxConfig;
332
+ pptx: FilePptxConfig;
333
+ xlsx: FileXlsxConfig;
334
+ };
335
+ };
336
+
337
+ export type AssetProcessingEvent =
338
+ | {
339
+ type: "asset:start";
340
+ sourceId: string;
341
+ documentId: string;
342
+ assetId: string;
343
+ assetKind: AssetKind;
344
+ assetUri?: string;
345
+ assetMediaType?: string;
346
+ }
347
+ | ({
348
+ type: "asset:skipped";
349
+ sourceId: string;
350
+ documentId: string;
351
+ } & IngestWarning)
352
+ | {
353
+ type: "extractor:start";
354
+ sourceId: string;
355
+ documentId: string;
356
+ assetId: string;
357
+ assetKind: AssetKind;
358
+ extractor: string;
359
+ }
360
+ | {
361
+ type: "extractor:success";
362
+ sourceId: string;
363
+ documentId: string;
364
+ assetId: string;
365
+ assetKind: AssetKind;
366
+ extractor: string;
367
+ durationMs: number;
368
+ textItemCount: number;
369
+ }
370
+ | {
371
+ type: "extractor:error";
372
+ sourceId: string;
373
+ documentId: string;
374
+ assetId: string;
375
+ assetKind: AssetKind;
376
+ extractor: string;
377
+ durationMs: number;
378
+ errorMessage: string;
379
+ };
380
+
381
+ export type ExtractedTextItem = {
382
+ /**
383
+ * A label describing the extraction output (e.g. \"fulltext\", \"ocr\", \"transcript\").
384
+ * Used only for metadata/debugging.
385
+ */
386
+ label: string;
387
+ /** Extracted text content. This will be chunked and embedded as normal text. */
388
+ content: string;
389
+ confidence?: number;
390
+ /**
391
+ * Optional range metadata produced by the extractor.
392
+ * This is stored in chunk metadata (if provided) for traceability.
393
+ */
394
+ pageRange?: [number, number];
395
+ timeRangeSec?: [number, number];
396
+ };
397
+
398
+ export type AssetExtractorResult = {
399
+ texts: ExtractedTextItem[];
400
+ /**
401
+ * Optional structured skip reason. Prefer returning `texts: []` + `skipped` when the
402
+ * extractor is configured off or cannot operate under current limits, without treating
403
+ * it as an error (so the pipeline can fall back to other extractors).
404
+ */
405
+ skipped?: {
406
+ code: string;
407
+ message: string;
408
+ };
409
+ /**
410
+ * Extractor-produced metadata merged into chunk metadata.
411
+ * Useful for things like detected language, page count, etc.
412
+ */
413
+ metadata?: Metadata;
414
+ diagnostics?: {
415
+ model?: string;
416
+ tokens?: number;
417
+ seconds?: number;
418
+ };
419
+ };
420
+
421
+ export type AssetExtractorContext = {
422
+ sourceId: string;
423
+ documentId: string;
424
+ documentMetadata: Metadata;
425
+ /** Engine-resolved asset processing config (defaults + overrides). */
426
+ assetProcessing: AssetProcessingConfig;
427
+ };
428
+
429
+ export type AssetExtractor = {
430
+ /** Stable name used in metadata and routing (e.g. \"pdf:llm\"). */
431
+ name: string;
432
+ /** Whether this extractor can handle a given asset input. */
433
+ supports: (args: { asset: AssetInput; ctx: AssetExtractorContext }) => boolean;
434
+ /** Extract text outputs from the asset. */
435
+ extract: (args: {
436
+ asset: AssetInput;
437
+ ctx: AssetExtractorContext;
438
+ }) => Promise<AssetExtractorResult>;
439
+ };
440
+
441
+ export type AssetProcessingPlanItem =
442
+ | ({
443
+ status: "will_process";
444
+ extractors: string[];
445
+ } & Pick<AssetInput, "assetId" | "kind" | "uri">)
446
+ | ({
447
+ status: "will_skip";
448
+ reason: IngestWarning["code"];
449
+ } & Pick<AssetInput, "assetId" | "kind" | "uri">);
450
+
451
+ export type IngestPlanResult = {
452
+ documentId: string;
453
+ sourceId: string;
454
+ assets: AssetProcessingPlanItem[];
455
+ warnings: IngestWarning[];
456
+ };
457
+
458
+ /**
459
+ * Deep partial for ergonomic overrides.
460
+ * Used for engine defaults and per-ingest overrides.
461
+ */
462
+ export type DeepPartial<T> = {
463
+ [K in keyof T]?: T[K] extends Array<infer U>
464
+ ? Array<U>
465
+ : T[K] extends object
466
+ ? DeepPartial<T[K]>
467
+ : T[K];
468
+ };
469
+
33
470
  export type EmbeddingInput = {
34
471
  text: string;
35
472
  metadata: Metadata;
@@ -38,10 +475,32 @@ export type EmbeddingInput = {
38
475
  documentId: string;
39
476
  };
40
477
 
478
+ export type ImageEmbeddingInput = {
479
+ /** Image bytes or URL. */
480
+ data: Uint8Array | string;
481
+ /** IANA media type (recommended when data is bytes). */
482
+ mediaType?: string;
483
+ metadata: Metadata;
484
+ position: number;
485
+ sourceId: string;
486
+ documentId: string;
487
+ assetId?: string;
488
+ };
489
+
41
490
  export type EmbeddingProvider = {
42
491
  name: string;
43
492
  dimensions?: number;
44
493
  embed: (input: EmbeddingInput) => Promise<number[]>;
494
+ /**
495
+ * Optional batch embedding for performance.
496
+ * When present, the engine may embed text chunks in a single call.
497
+ */
498
+ embedMany?: (inputs: EmbeddingInput[]) => Promise<number[][]>;
499
+ /**
500
+ * Optional image embedding for unified multimodal retrieval.
501
+ * Only used when the configured provider supports it.
502
+ */
503
+ embedImage?: (input: ImageEmbeddingInput) => Promise<number[]>;
45
504
  };
46
505
 
47
506
  export type DeleteInput =
@@ -83,12 +542,83 @@ export type IngestInput = {
83
542
  content: string;
84
543
  metadata?: Metadata;
85
544
  chunking?: Partial<ChunkingOptions>;
545
+ /** Optional rich media attached to the document. */
546
+ assets?: AssetInput[];
547
+ /**
548
+ * Per-ingest overrides for asset processing. Merged with engine defaults.
549
+ * Use this to toggle expensive features (like PDF LLM extraction) per run.
550
+ */
551
+ assetProcessing?: DeepPartial<AssetProcessingConfig>;
86
552
  };
87
553
 
554
+ type IngestWarningBase<K extends AssetKind> = {
555
+ message: string;
556
+ assetId: string;
557
+ assetKind: K;
558
+ assetUri?: string;
559
+ assetMediaType?: string;
560
+ };
561
+
562
+ export type IngestWarning =
563
+ | (IngestWarningBase<AssetKind> & {
564
+ /**
565
+ * A rich media asset was encountered but no extractor exists for its kind.
566
+ * (Example: audio/video in v1.)
567
+ */
568
+ code: "asset_skipped_unsupported_kind";
569
+ })
570
+ | (IngestWarningBase<AssetKind> & {
571
+ /**
572
+ * An asset kind was encountered, but extraction for that kind is disabled by config.
573
+ * (Example: audio transcription disabled.)
574
+ */
575
+ code: "asset_skipped_extraction_disabled";
576
+ })
577
+ | (IngestWarningBase<"pdf"> & {
578
+ /**
579
+ * A PDF was encountered but PDF LLM extraction is disabled.
580
+ * Enable `assetProcessing.pdf.llmExtraction.enabled` to process PDFs.
581
+ */
582
+ code: "asset_skipped_pdf_llm_extraction_disabled";
583
+ })
584
+ | (IngestWarningBase<"image"> & {
585
+ /**
586
+ * An image was encountered but the embedding provider does not support image embedding
587
+ * AND the asset did not include a non-empty caption/alt text (`assets[].text`).
588
+ */
589
+ code: "asset_skipped_image_no_multimodal_and_no_caption";
590
+ })
591
+ | (IngestWarningBase<"pdf"> & {
592
+ /**
593
+ * PDF LLM extraction ran but produced no usable text.
594
+ * This is typically due to empty/scanned PDFs or model limitations.
595
+ */
596
+ code: "asset_skipped_pdf_empty_extraction";
597
+ })
598
+ | (IngestWarningBase<AssetKind> & {
599
+ /**
600
+ * Extraction ran but produced no usable text for the asset (non-PDF kinds).
601
+ * For PDFs, use `asset_skipped_pdf_empty_extraction`.
602
+ */
603
+ code: "asset_skipped_extraction_empty";
604
+ })
605
+ | (IngestWarningBase<AssetKind> & {
606
+ /**
607
+ * Asset processing failed, but policy allowed continuing (`assetProcessing.onError: "skip"`).
608
+ */
609
+ code: "asset_processing_error";
610
+ stage: "fetch" | "extract" | "embed" | "unknown";
611
+ });
612
+
88
613
  export type IngestResult = {
89
614
  documentId: string;
90
615
  chunkCount: number;
91
616
  embeddingModel: string;
617
+ /**
618
+ * Structured warnings emitted during ingestion.
619
+ * Use this to detect skipped rich media (unsupported kinds, disabled extraction, best-effort failures).
620
+ */
621
+ warnings: IngestWarning[];
92
622
  durations: {
93
623
  totalMs: number;
94
624
  chunkingMs: number;
@@ -115,12 +645,85 @@ export type RetrieveResult = {
115
645
  };
116
646
  };
117
647
 
648
+ /**
649
+ * Higher-level (ergonomic) Unrag config wrapper.
650
+ *
651
+ * This is intentionally separate from `ContextEngineConfig`:
652
+ * - `defaults.retrieval` is not part of the engine config; it's a convenience default for callers.
653
+ * - `defaults.chunking` maps to the engine's `defaults` field.
654
+ * - `embedding` is configured declaratively and can be turned into an `EmbeddingProvider`.
655
+ */
656
+ export type UnragDefaultsConfig = {
657
+ chunking?: Partial<ChunkingOptions>;
658
+ retrieval?: {
659
+ topK?: number;
660
+ };
661
+ };
662
+
663
+ export type UnragEngineConfig = Omit<
664
+ ContextEngineConfig,
665
+ "embedding" | "store" | "defaults"
666
+ >;
667
+
668
+ export type UnragEmbeddingConfig =
669
+ | {
670
+ provider: "ai";
671
+ config?: import("../embedding/ai").AiEmbeddingConfig;
672
+ }
673
+ | {
674
+ provider: "custom";
675
+ /**
676
+ * Escape hatch for bringing your own embedding provider.
677
+ * Use this when you need a provider that is not backed by the AI SDK.
678
+ */
679
+ create: () => EmbeddingProvider;
680
+ };
681
+
682
+ export type DefineUnragConfigInput = {
683
+ defaults?: UnragDefaultsConfig;
684
+ /**
685
+ * Engine configuration (everything except embedding/store/defaults).
686
+ * This is where you configure storage, asset processing, chunker/idGenerator, etc.
687
+ */
688
+ engine?: UnragEngineConfig;
689
+ /**
690
+ * Embedding configuration. The engine's embedding provider is derived from this.
691
+ */
692
+ embedding: UnragEmbeddingConfig;
693
+ };
694
+
695
+ export type UnragCreateEngineRuntime = {
696
+ store: VectorStore;
697
+ /**
698
+ * Optional runtime override/extension of extractors.
699
+ * - If you pass an array, it replaces the base extractors from `engine.extractors`.
700
+ * - If you pass a function, it receives the base extractors and should return the final array.
701
+ */
702
+ extractors?: AssetExtractor[] | ((base: AssetExtractor[]) => AssetExtractor[]);
703
+ };
704
+
118
705
  export type ContextEngineConfig = {
119
706
  embedding: EmbeddingProvider;
120
707
  store: VectorStore;
121
708
  defaults?: Partial<ChunkingOptions>;
122
709
  chunker?: Chunker;
123
710
  idGenerator?: () => string;
711
+ /**
712
+ * Optional extractor modules that can process non-text assets into text outputs.
713
+ * These are typically installed via `unrag add extractor <name>` and imported
714
+ * from your vendored module directory.
715
+ */
716
+ extractors?: AssetExtractor[];
717
+ /**
718
+ * Controls whether Unrag persists chunk/document text into the database.
719
+ * Defaults to storing both.
720
+ */
721
+ storage?: Partial<ContentStorageConfig>;
722
+ /**
723
+ * Asset processing defaults. If omitted, rich media is ignored (except image
724
+ * captions, which can still be ingested via `assets[].text` if you choose).
725
+ */
726
+ assetProcessing?: DeepPartial<AssetProcessingConfig>;
124
727
  };
125
728
 
126
729
  export type ResolvedContextEngineConfig = {
@@ -129,6 +732,9 @@ export type ResolvedContextEngineConfig = {
129
732
  defaults: ChunkingOptions;
130
733
  chunker: Chunker;
131
734
  idGenerator: () => string;
735
+ extractors: AssetExtractor[];
736
+ storage: ContentStorageConfig;
737
+ assetProcessing: AssetProcessingConfig;
132
738
  };
133
739
 
134
740
 
@@ -27,6 +27,7 @@ You are responsible for migrations. Create these tables:
27
27
  create table documents (
28
28
  id uuid primary key,
29
29
  source_id text not null,
30
+ content text not null,
30
31
  metadata jsonb,
31
32
  created_at timestamp default now()
32
33
  );
@@ -50,6 +51,11 @@ create table embeddings (
50
51
  );
51
52
  ```
52
53
 
54
+ Notes:
55
+ - `documents.content` stores the full original document text (used for debugging/re-chunking).
56
+ - `chunks.content` stores the chunk text returned by retrieval (`chunk.content`).
57
+ - You can disable persisting either/both via the engine config (`storage.storeDocumentContent` / `storage.storeChunkContent`). The schema still requires `text not null`, so Unrag stores empty strings when disabled.
58
+
53
59
  Recommended indexes:
54
60
 
55
61
  ```sql