unrag 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +251 -42
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +140 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/registry/extractors/image-ocr/index.ts

@@ -0,0 +1,60 @@
+import { generateText } from "ai";
+import type { AssetExtractor } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+import { normalizeMediaType } from "../_shared/media";
+import { capText } from "../_shared/text";
+
+/**
+ * Image OCR via a vision-capable LLM.
+ *
+ * This extractor is intended for screenshots, charts, diagrams, and any image with embedded text.
+ */
+export function createImageOcrExtractor(): AssetExtractor {
+  return {
+    name: "image:ocr",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "image" && ctx.assetProcessing.image.ocr.enabled,
+    extract: async ({ asset, ctx }) => {
+      const cfg = ctx.assetProcessing.image.ocr;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
+      const { bytes, mediaType } = await getAssetBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+        defaultMediaType: "image/jpeg",
+      });
+
+      const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
+
+      const result = await generateText({
+        model: cfg.model as any,
+        abortSignal,
+        messages: [
+          {
+            role: "user",
+            content: [
+              { type: "text", text: cfg.prompt },
+              {
+                type: "image",
+                image: bytes,
+                mediaType: normalizeMediaType(mediaType),
+              },
+            ],
+          },
+        ],
+      });
+
+      const text = String((result as any)?.text ?? "").trim();
+      if (!text) return { texts: [], diagnostics: { model: cfg.model } };
+
+      return {
+        texts: [{ label: "ocr", content: capText(text, cfg.maxOutputChars) }],
+        diagnostics: { model: cfg.model },
+      };
+    },
+  };
+}
+
+
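The extractor above (and the other extractors added in this release) follows the same `AssetExtractor` shape: a `name`, a synchronous `supports({ asset, ctx })` predicate, and an async `extract({ asset, ctx })` that returns labeled `texts` plus optional `diagnostics`. A minimal driver sketch, not taken from the package (the real selection logic lives in `core/ingest.ts`), of how extractors with this shape can be run in order until one produces text; the local `Extractor` type mirrors what the code above implements:

// Local structural type; the package's own type is `AssetExtractor` in core/types.
type Extractor = {
  name: string;
  supports: (args: { asset: any; ctx: any }) => boolean;
  extract: (args: { asset: any; ctx: any }) => Promise<{
    texts: Array<{ label: string; content: string }>;
    diagnostics?: Record<string, unknown>;
  }>;
};

// Run extractors in order and return the first non-empty result.
// The `asset` and `ctx` shapes are assumed to match what the extractors above destructure.
async function runFirstMatching(extractors: Extractor[], asset: any, ctx: any) {
  for (const extractor of extractors) {
    if (!extractor.supports({ asset, ctx })) continue;
    const result = await extractor.extract({ asset, ctx });
    if (result.texts.length > 0) {
      return { extractorName: extractor.name, ...result };
    }
  }
  return { extractorName: null, texts: [] };
}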
package/registry/extractors/pdf-llm/index.ts

@@ -0,0 +1,84 @@
+import { generateText } from "ai";
+import type { AssetData, AssetExtractor, AssetFetchConfig } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+import { normalizeMediaType } from "../_shared/media";
+import { capText } from "../_shared/text";
+
+async function getPdfBytes(args: {
+  data: AssetData;
+  fetchConfig: AssetFetchConfig;
+  maxBytes: number;
+}): Promise<{ bytes: Uint8Array; mediaType: string; filename?: string }> {
+  return await getAssetBytes({
+    data: args.data,
+    fetchConfig: args.fetchConfig,
+    maxBytes: args.maxBytes,
+    defaultMediaType: "application/pdf",
+  });
+}
+
+/**
+ * PDF text extraction via LLM (default model: Gemini via AI Gateway).
+ *
+ * This extractor reads its configuration from `assetProcessing.pdf.llmExtraction`.
+ */
+export function createPdfLlmExtractor(): AssetExtractor {
+  return {
+    name: "pdf:llm",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "pdf" && ctx.assetProcessing.pdf.llmExtraction.enabled,
+    extract: async ({ asset, ctx }) => {
+      const llm = ctx.assetProcessing.pdf.llmExtraction;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      if (!llm.enabled) {
+        return { texts: [] };
+      }
+
+      const maxBytes = Math.min(llm.maxBytes, fetchConfig.maxBytes);
+      const { bytes, mediaType, filename } = await getPdfBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+      });
+
+      if (bytes.byteLength > maxBytes) {
+        throw new Error(`PDF too large (${bytes.byteLength} > ${maxBytes})`);
+      }
+
+      const abortSignal = AbortSignal.timeout(llm.timeoutMs);
+
+      const result = await generateText({
+        // Intentionally allow string model ids for AI Gateway usage.
+        model: llm.model as any,
+        abortSignal,
+        messages: [
+          {
+            role: "user",
+            content: [
+              { type: "text", text: llm.prompt },
+              {
+                type: "file",
+                data: bytes,
+                mediaType: normalizeMediaType(mediaType) ?? "application/pdf",
+                ...(filename ? { filename } : {}),
+              },
+            ],
+          },
+        ],
+      });
+
+      const text = String((result as any)?.text ?? "").trim();
+      if (!text) return { texts: [], diagnostics: { model: llm.model } };
+
+      const capped = capText(text, llm.maxOutputChars);
+
+      return {
+        texts: [{ label: "fulltext", content: capped }],
+        diagnostics: { model: llm.model },
+      };
+    },
+  };
+}
+
+
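For reference, the fields `pdf:llm` reads from `assetProcessing.pdf.llmExtraction` are `enabled`, `model`, `prompt`, `maxBytes`, `timeoutMs`, and `maxOutputChars`. A sketch of what such a config object might look like; the values below are placeholders, not the package defaults, and the gateway-style model id is only an assumption consistent with the doc comment above:

const llmExtraction = {
  enabled: true,
  // Placeholder gateway-style model id; the doc comment above mentions Gemini via AI Gateway.
  model: "google/gemini-2.5-flash",
  // Placeholder prompt.
  prompt: "Extract all readable text from this PDF, preserving headings and tables.",
  maxBytes: 20 * 1024 * 1024, // also capped by assetProcessing.fetch.maxBytes
  timeoutMs: 60_000,
  maxOutputChars: 200_000,
};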
package/registry/extractors/pdf-ocr/index.ts

@@ -0,0 +1,125 @@
+import { spawn } from "node:child_process";
+import { mkdir, readdir, rm, writeFile } from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import type { AssetExtractor } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+import { capText } from "../_shared/text";
+
+const run = async (cmd: string, args: string[], opts: { cwd: string }) => {
+  return await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
+    const child = spawn(cmd, args, { cwd: opts.cwd, stdio: ["ignore", "pipe", "pipe"] });
+    let stdout = "";
+    let stderr = "";
+    child.stdout.on("data", (d) => (stdout += d.toString()));
+    child.stderr.on("data", (d) => (stderr += d.toString()));
+    child.on("error", reject);
+    child.on("close", (code) => {
+      if (code === 0) return resolve({ stdout, stderr });
+      reject(new Error(`${cmd} exited with code ${code}\n${stderr}`.trim()));
+    });
+  });
+};
+
+/**
+ * Worker-only PDF OCR extractor.
+ *
+ * This extractor expects external binaries to be available:
+ * - `pdftoppm` (Poppler) to rasterize pages
+ * - `tesseract` to OCR rasterized images
+ *
+ * It is intentionally not serverless-friendly.
+ */
+export function createPdfOcrExtractor(): AssetExtractor {
+  return {
+    name: "pdf:ocr",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "pdf" && ctx.assetProcessing.pdf.ocr.enabled,
+    extract: async ({ asset, ctx }) => {
+      const cfg = ctx.assetProcessing.pdf.ocr;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
+      const { bytes } = await getAssetBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+        defaultMediaType: "application/pdf",
+      });
+
+      const tmpDir = path.join(os.tmpdir(), `unrag-pdf-ocr-${crypto.randomUUID()}`);
+      await mkdir(tmpDir, { recursive: true });
+
+      try {
+        const pdfPath = path.join(tmpDir, "input.pdf");
+        await writeFile(pdfPath, bytes);
+
+        const prefix = path.join(tmpDir, "page");
+        const pdftoppm = cfg.pdftoppmPath ?? "pdftoppm";
+        const dpi = cfg.dpi ?? 200;
+
+        const pdftoppmArgs = [
+          "-png",
+          "-r",
+          String(dpi),
+          "-f",
+          "1",
+          ...(cfg.maxPages ? ["-l", String(cfg.maxPages)] : []),
+          pdfPath,
+          prefix,
+        ];
+
+        await run(pdftoppm, pdftoppmArgs, { cwd: tmpDir });
+
+        const files = (await readdir(tmpDir)).filter((f) =>
+          /^page-\d+\.png$/.test(f)
+        );
+
+        // Sort by page number: page-1.png, page-2.png, ...
+        files.sort((a, b) => {
+          const na = Number(a.match(/^page-(\d+)\.png$/)?.[1] ?? 0);
+          const nb = Number(b.match(/^page-(\d+)\.png$/)?.[1] ?? 0);
+          return na - nb;
+        });
+
+        const tesseract = cfg.tesseractPath ?? "tesseract";
+        const lang = cfg.lang ?? "eng";
+
+        let out = "";
+        for (const f of files) {
+          const imgPath = path.join(tmpDir, f);
+          const { stdout } = await run(
+            tesseract,
+            [imgPath, "stdout", "-l", lang],
+            { cwd: tmpDir }
+          );
+          const text = String(stdout ?? "").trim();
+          if (text) {
+            out += (out ? "\n\n" : "") + text;
+          }
+          if (out.length >= cfg.maxOutputChars) {
+            break;
+          }
+        }
+
+        out = capText(out.trim(), cfg.maxOutputChars);
+        if (out.length < cfg.minChars) {
+          return { texts: [] };
+        }
+
+        return {
+          texts: [
+            {
+              label: "ocr",
+              content: out,
+            },
+          ],
+        };
+      } finally {
+        await rm(tmpDir, { recursive: true, force: true });
+      }
+    },
+  };
+}
+
+
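`pdf:ocr` shells out to Poppler's `pdftoppm` and to `tesseract`, so it is meant for worker environments where those binaries exist. A sketch of the fields it reads from `assetProcessing.pdf.ocr`, with placeholder values rather than the package defaults:

const pdfOcr = {
  enabled: true,
  maxBytes: 50 * 1024 * 1024,          // also capped by assetProcessing.fetch.maxBytes
  dpi: 200,                            // pdftoppm -r; the extractor falls back to 200 when omitted
  maxPages: 20,                        // pdftoppm -l; omit to rasterize every page
  lang: "eng",                         // tesseract -l; falls back to "eng" when omitted
  minChars: 50,                        // below this, the extractor returns no texts
  maxOutputChars: 200_000,
  pdftoppmPath: "/usr/bin/pdftoppm",   // omit to resolve from PATH
  tesseractPath: "/usr/bin/tesseract", // omit to resolve from PATH
};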
package/registry/extractors/pdf-text-layer/index.ts

@@ -0,0 +1,76 @@
+import type { AssetExtractor } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+import { capText } from "../_shared/text";
+
+/**
+ * Fast/cheap PDF extraction using the PDF's built-in text layer.
+ *
+ * This extractor is best-effort: if the PDF has little/no embedded text (scanned PDFs),
+ * it returns empty output so the pipeline can fall back to another extractor (e.g. `pdf:llm`).
+ *
+ * Dependencies (installed by CLI):
+ * - `pdfjs-dist`
+ */
+export function createPdfTextLayerExtractor(): AssetExtractor {
+  return {
+    name: "pdf:text-layer",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "pdf" && ctx.assetProcessing.pdf.textLayer.enabled,
+    extract: async ({ asset, ctx }) => {
+      const cfg = ctx.assetProcessing.pdf.textLayer;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
+      const { bytes } = await getAssetBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+        defaultMediaType: "application/pdf",
+      });
+
+      // Dynamic import so the core package can be used without pdfjs unless this extractor is installed.
+      const pdfjs: any = await import("pdfjs-dist/legacy/build/pdf.mjs");
+
+      const doc = await pdfjs.getDocument({ data: bytes }).promise;
+      const totalPages: number = Number(doc?.numPages ?? 0);
+      const maxPages = Math.max(
+        0,
+        Math.min(totalPages, cfg.maxPages ?? totalPages)
+      );
+
+      let out = "";
+      for (let pageNum = 1; pageNum <= maxPages; pageNum++) {
+        const page = await doc.getPage(pageNum);
+        const textContent = await page.getTextContent();
+        const items: any[] = Array.isArray(textContent?.items)
+          ? textContent.items
+          : [];
+        const pageText = items
+          .map((it) => (typeof it?.str === "string" ? it.str : ""))
+          .join(" ")
+          .replace(/\s+/g, " ")
+          .trim();
+        if (pageText) {
+          out += (out ? "\n\n" : "") + pageText;
+        }
+      }
+
+      out = out.trim();
+      if (out.length < cfg.minChars) {
+        return { texts: [] };
+      }
+
+      return {
+        texts: [
+          {
+            label: "text-layer",
+            content: capText(out, cfg.maxOutputChars),
+            pageRange: totalPages ? [1, maxPages || totalPages] : undefined,
+          },
+        ],
+      };
+    },
+  };
+}
+
+
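As the doc comment notes, `pdf:text-layer` is the cheap first pass and returns no texts for scanned PDFs, which lets a pipeline fall back to `pdf:llm`. A minimal fallback sketch, not the package's ingest code; the import paths are illustrative and depend on where the CLI installs the registry files, and `asset`/`ctx` are assumed to come from the surrounding pipeline:

import { createPdfLlmExtractor } from "./extractors/pdf-llm";               // illustrative path
import { createPdfTextLayerExtractor } from "./extractors/pdf-text-layer";  // illustrative path

// Try the text layer first; only pay for the LLM when it yields nothing.
async function extractPdfTexts(asset: any, ctx: any) {
  const chain = [createPdfTextLayerExtractor(), createPdfLlmExtractor()];
  for (const extractor of chain) {
    if (!extractor.supports({ asset, ctx })) continue;
    const result = await extractor.extract({ asset, ctx });
    if (result.texts.length > 0) return result;
  }
  return { texts: [] };
}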
package/registry/extractors/video-frames/index.ts

@@ -0,0 +1,126 @@
+import { generateText } from "ai";
+import { spawn } from "node:child_process";
+import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import type { AssetExtractor } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+import { capText } from "../_shared/text";
+
+const run = async (cmd: string, args: string[], opts: { cwd: string }) => {
+  return await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
+    const child = spawn(cmd, args, { cwd: opts.cwd, stdio: ["ignore", "pipe", "pipe"] });
+    let stdout = "";
+    let stderr = "";
+    child.stdout.on("data", (d) => (stdout += d.toString()));
+    child.stderr.on("data", (d) => (stderr += d.toString()));
+    child.on("error", reject);
+    child.on("close", (code) => {
+      if (code === 0) return resolve({ stdout, stderr });
+      reject(new Error(`${cmd} exited with code ${code}\n${stderr}`.trim()));
+    });
+  });
+};
+
+/**
+ * Worker-only frame sampling + per-frame vision extraction.
+ *
+ * This extractor requires `ffmpeg` and is not suitable for serverless runtimes.
+ */
+export function createVideoFramesExtractor(): AssetExtractor {
+  return {
+    name: "video:frames",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "video" && ctx.assetProcessing.video.frames.enabled,
+    extract: async ({ asset, ctx }) => {
+      const cfg = ctx.assetProcessing.video.frames;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
+      const { bytes } = await getAssetBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+        defaultMediaType: "video/mp4",
+      });
+
+      const tmpDir = path.join(os.tmpdir(), `unrag-video-frames-${crypto.randomUUID()}`);
+      await mkdir(tmpDir, { recursive: true });
+
+      try {
+        const videoPath = path.join(tmpDir, "input.mp4");
+        await writeFile(videoPath, bytes);
+
+        const ffmpeg = cfg.ffmpegPath ?? "ffmpeg";
+        const outPattern = path.join(tmpDir, "frame-%03d.jpg");
+        const fps = Math.max(0.001, cfg.sampleFps);
+        const maxFrames = Math.max(1, Math.floor(cfg.maxFrames));
+
+        await run(
+          ffmpeg,
+          [
+            "-hide_banner",
+            "-loglevel",
+            "error",
+            "-i",
+            videoPath,
+            "-vf",
+            `fps=${fps}`,
+            "-vframes",
+            String(maxFrames),
+            outPattern,
+          ],
+          { cwd: tmpDir }
+        );
+
+        const frames = (await readdir(tmpDir))
+          .filter((f) => /^frame-\d+\.jpg$/.test(f))
+          .sort();
+
+        const abortPerFrame = (ms: number) => AbortSignal.timeout(ms);
+        const texts: Array<{ label: string; content: string }> = [];
+        let totalChars = 0;
+
+        for (const f of frames) {
+          if (texts.length >= maxFrames) break;
+          if (totalChars >= cfg.maxOutputChars) break;
+
+          const imgBytes = await readFile(path.join(tmpDir, f));
+          const result = await generateText({
+            model: cfg.model as any,
+            abortSignal: abortPerFrame(cfg.timeoutMs),
+            messages: [
+              {
+                role: "user",
+                content: [
+                  { type: "text", text: cfg.prompt },
+                  { type: "image", image: new Uint8Array(imgBytes), mediaType: "image/jpeg" },
+                ],
+              },
+            ],
+          });
+
+          const t = String((result as any)?.text ?? "").trim();
+          if (!t) continue;
+
+          const capped = capText(t, cfg.maxOutputChars - totalChars);
+          if (!capped) continue;
+
+          texts.push({ label: f, content: capped });
+          totalChars += capped.length;
+        }
+
+        if (texts.length === 0) return { texts: [] };
+
+        return {
+          texts: texts.map((t) => ({ label: t.label, content: t.content })),
+          diagnostics: { model: cfg.model },
+        };
+      } finally {
+        await rm(tmpDir, { recursive: true, force: true });
+      }
+    },
+  };
+}
+
+
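`video:frames` samples frames with `ffmpeg` at `sampleFps`, caps them at `maxFrames`, and captions each frame with a vision model. A sketch of the fields it reads from `assetProcessing.video.frames`; the values are placeholders, not the package defaults:

const videoFrames = {
  enabled: true,
  sampleFps: 0.2,                // ffmpeg -vf fps=...; roughly one frame every 5 seconds
  maxFrames: 12,                 // ffmpeg -vframes and the per-run caption cap
  model: "openai/gpt-4o-mini",   // placeholder vision model id
  prompt: "Describe the visible content of this video frame.", // placeholder
  timeoutMs: 30_000,             // per-frame generateText timeout
  maxBytes: 200 * 1024 * 1024,   // also capped by assetProcessing.fetch.maxBytes
  maxOutputChars: 100_000,       // total caption budget across frames
  ffmpegPath: "/usr/bin/ffmpeg", // omit to resolve from PATH
};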
package/registry/extractors/video-transcribe/index.ts

@@ -0,0 +1,78 @@
+import { experimental_transcribe as transcribe } from "ai";
+import type { AssetExtractor } from "../../core/types";
+import { getAssetBytes } from "../_shared/fetch";
+
+/**
+ * Video transcription by sending the video file to the AI SDK transcription API.
+ *
+ * Note: provider support varies; many transcription providers accept audio formats only.
+ * If your provider does not accept video files, use a worker pipeline to extract audio first.
+ */
+export function createVideoTranscribeExtractor(): AssetExtractor {
+  return {
+    name: "video:transcribe",
+    supports: ({ asset, ctx }) =>
+      asset.kind === "video" && ctx.assetProcessing.video.transcription.enabled,
+    extract: async ({ asset, ctx }) => {
+      const cfg = ctx.assetProcessing.video.transcription;
+      const fetchConfig = ctx.assetProcessing.fetch;
+
+      const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
+      const { bytes } = await getAssetBytes({
+        data: asset.data,
+        fetchConfig,
+        maxBytes,
+        defaultMediaType: "video/mp4",
+      });
+
+      const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
+
+      const result = await transcribe({
+        model: cfg.model as any,
+        audio: bytes as any,
+        abortSignal,
+      });
+
+      const segments: any[] = Array.isArray((result as any)?.segments)
+        ? (result as any).segments
+        : [];
+
+      if (segments.length > 0) {
+        return {
+          texts: segments
+            .map((s, i) => {
+              const t = String(s?.text ?? "").trim();
+              if (!t) return null;
+              const start = Number(s?.startSecond ?? NaN);
+              const end = Number(s?.endSecond ?? NaN);
+              return {
+                label: `segment-${i + 1}`,
+                content: t,
+                ...(Number.isFinite(start) && Number.isFinite(end)
+                  ? { timeRangeSec: [start, end] as [number, number] }
+                  : {}),
+              };
+            })
+            .filter(Boolean) as any,
+          diagnostics: {
+            model: cfg.model,
+            seconds:
+              typeof (result as any)?.durationInSeconds === "number"
+                ? (result as any).durationInSeconds
+                : undefined,
+          },
+        };
+      }
+
+      const text = String((result as any)?.text ?? "").trim();
+      if (!text) return { texts: [], diagnostics: { model: cfg.model } };
+
+      return {
+        texts: [{ label: "transcript", content: text }],
+        diagnostics: { model: cfg.model },
+      };
+    },
+  };
+}
+
+
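`video:transcribe` returns one labeled text per provider segment, carrying `timeRangeSec` when the provider reports start and end seconds, or a single `transcript` text otherwise. A small consumer-side sketch (the `ExtractedText` type is local to this example, not the package's type) that joins the output into a readable transcript:

// Local shape matching what the extractor above returns in `texts`.
type ExtractedText = {
  label: string;
  content: string;
  timeRangeSec?: [number, number];
};

// Prefix each segment with its time range when available.
function formatTranscript(texts: ExtractedText[]): string {
  return texts
    .map((t) =>
      t.timeRangeSec
        ? `[${t.timeRangeSec[0]}s-${t.timeRangeSec[1]}s] ${t.content}`
        : t.content
    )
    .join("\n");
}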
package/registry/store/drizzle-postgres-pgvector/store.ts

@@ -146,7 +146,7 @@ export const createDrizzleVectorStore = (db: DrizzleDb): VectorStore => ({
   },
 
   delete: async (input) => {
-    if (
+    if (input.sourceId !== undefined) {
       await db.delete(documents).where(eq(documents.sourceId, input.sourceId));
       return;
     }