npm - unrag - Versions diffs - 0.2.5 → 0.2.7 - Mend

unrag 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/dist/cli/index.js +611 -174
package/package.json +12 -6
package/registry/config/unrag.config.ts +9 -8
package/registry/connectors/google-drive/_api-types.ts +60 -0
package/registry/connectors/google-drive/client.ts +99 -38
package/registry/connectors/google-drive/sync.ts +97 -69
package/registry/connectors/google-drive/types.ts +76 -37
package/registry/connectors/notion/client.ts +12 -3
package/registry/connectors/notion/render.ts +62 -23
package/registry/connectors/notion/sync.ts +30 -23
package/registry/core/assets.ts +11 -10
package/registry/core/config.ts +10 -25
package/registry/core/context-engine.ts +71 -2
package/registry/core/deep-merge.ts +45 -0
package/registry/core/ingest.ts +117 -44
package/registry/core/types.ts +96 -2
package/registry/docs/unrag.md +6 -1
package/registry/embedding/_shared.ts +25 -0
package/registry/embedding/ai.ts +8 -68
package/registry/embedding/azure.ts +88 -0
package/registry/embedding/bedrock.ts +88 -0
package/registry/embedding/cohere.ts +88 -0
package/registry/embedding/google.ts +102 -0
package/registry/embedding/mistral.ts +71 -0
package/registry/embedding/ollama.ts +90 -0
package/registry/embedding/openai.ts +88 -0
package/registry/embedding/openrouter.ts +127 -0
package/registry/embedding/together.ts +77 -0
package/registry/embedding/vertex.ts +111 -0
package/registry/embedding/voyage.ts +169 -0
package/registry/extractors/audio-transcribe/index.ts +39 -23
package/registry/extractors/file-docx/index.ts +8 -1
package/registry/extractors/file-pptx/index.ts +22 -1
package/registry/extractors/file-xlsx/index.ts +24 -1
package/registry/extractors/image-caption-llm/index.ts +8 -3
package/registry/extractors/image-ocr/index.ts +9 -4
package/registry/extractors/pdf-llm/index.ts +9 -4
package/registry/extractors/pdf-text-layer/index.ts +23 -2
package/registry/extractors/video-frames/index.ts +8 -3
package/registry/extractors/video-transcribe/index.ts +40 -24
package/registry/manifest.json +346 -0
package/registry/store/drizzle-postgres-pgvector/store.ts +26 -6

package/registry/embedding/cohere.ts ADDED Viewed

@@ -0,0 +1,88 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * Cohere provider module interface.
+ *
+ * Describes the only part of the `@ai-sdk/cohere` module this file touches:
+ * `cohere.embedding(model)`, whose result is passed as `model` to the AI SDK
+ * `embed` / `embedMany` helpers.
+ */
+interface CohereModule {
+  cohere: {
+    embedding: (model: string) => EmbeddingModel<string>;
+  };
+}
+export type CohereEmbeddingConfig = { // options for createCohereEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise COHERE_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+  inputType?: "search_document" | "search_query" | "classification" | "clustering"; // forwarded as providerOptions.cohere.inputType for both embed and embedMany
+  truncate?: "NONE" | "START" | "END"; // forwarded as providerOptions.cohere.truncate when set
+};
+const DEFAULT_TEXT_MODEL = "embed-english-v3.0"; // used when neither config.model nor the env var is set
+/**
+ * Translates the Cohere-specific config fields into the `providerOptions`
+ * object passed to `embed` / `embedMany`.
+ *
+ * @param config Provider config; only `inputType` and `truncate` are read.
+ * @returns `{ cohere: { ... } }` containing just the fields that are set, or
+ *   `undefined` when neither field is set.
+ */
+const buildProviderOptions = (config: CohereEmbeddingConfig) => {
+  const { inputType, truncate } = config;
+  if (inputType || truncate) {
+    return {
+      cohere: {
+        ...(inputType ? { inputType } : {}),
+        ...(truncate ? { truncate } : {}),
+      },
+    };
+  }
+  return undefined;
+};
+/**
+ * Creates an `EmbeddingProvider` that embeds text through `@ai-sdk/cohere`.
+ *
+ * The provider module is loaded with `requireOptional`. The model id is
+ * resolved once, in order: `config.model`, the `COHERE_EMBEDDING_MODEL`
+ * environment variable, then `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model, timeout and Cohere request options.
+ * @returns A provider named `cohere:<model>` whose `dimensions` is undefined.
+ * @throws Error from `embed` when the SDK result carries no embedding, and
+ *   from `embedMany` when the SDK result's `embeddings` is not an array.
+ */
+export const createCohereEmbeddingProvider = (
+  config: CohereEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { cohere: cohereSdk } = requireOptional<CohereModule>({
+    id: "@ai-sdk/cohere",
+    installHint: "bun add @ai-sdk/cohere",
+    providerName: "cohere",
+  });
+  const modelId =
+    config.model ?? process.env.COHERE_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
+  const { timeoutMs } = config;
+  const providerOptions = buildProviderOptions(config);
+  const embeddingModel = cohereSdk.embedding(modelId);
+  // A new signal is created per request so every call gets its own timeout.
+  const timeoutSignal = () =>
+    timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+  return {
+    name: `cohere:${modelId}`,
+    dimensions: undefined,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutSignal();
+      const { embedding } = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (embedding) {
+        return embedding;
+      }
+      throw new Error("Embedding missing from Cohere response");
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((input) => input.text);
+      const abortSignal = timeoutSignal();
+      const { embeddings } = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (Array.isArray(embeddings)) {
+        return embeddings;
+      }
+      throw new Error("Embeddings missing from Cohere embedMany response");
+    },
+  };
+};

package/registry/embedding/google.ts ADDED Viewed

@@ -0,0 +1,102 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * Google AI provider module interface.
+ *
+ * Describes the only part of the `@ai-sdk/google` module this file touches:
+ * `google.embedding(model)`, whose result is passed as `model` to the AI SDK
+ * `embed` / `embedMany` helpers.
+ */
+interface GoogleModule {
+  google: {
+    embedding: (model: string) => EmbeddingModel<string>;
+  };
+}
+export type GoogleEmbeddingTaskType = // values accepted for GoogleEmbeddingConfig.taskType
+  | "SEMANTIC_SIMILARITY"
+  | "CLASSIFICATION"
+  | "CLUSTERING"
+  | "RETRIEVAL_DOCUMENT"
+  | "RETRIEVAL_QUERY"
+  | "QUESTION_ANSWERING"
+  | "FACT_VERIFICATION"
+  | "CODE_RETRIEVAL_QUERY";
+export type GoogleEmbeddingConfig = { // options for createGoogleEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise GOOGLE_GENERATIVE_AI_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+  outputDimensionality?: number; // forwarded as providerOptions.google.outputDimensionality and reported as the provider's `dimensions`
+  taskType?: GoogleEmbeddingTaskType; // forwarded as providerOptions.google.taskType for both embed and embedMany
+};
+const DEFAULT_TEXT_MODEL = "gemini-embedding-001"; // used when neither config.model nor the env var is set
+/**
+ * Translates the Google-specific config fields into the `providerOptions`
+ * object passed to `embed` / `embedMany`.
+ *
+ * @param config Provider config; only `outputDimensionality` and `taskType`
+ *   are read.
+ * @returns `{ google: { ... } }` containing just the fields that are set, or
+ *   `undefined` when both fields are `undefined`.
+ */
+const buildProviderOptions = (config: GoogleEmbeddingConfig) => {
+  const { outputDimensionality, taskType } = config;
+  if (outputDimensionality !== undefined || taskType !== undefined) {
+    return {
+      google: {
+        ...(outputDimensionality !== undefined ? { outputDimensionality } : {}),
+        ...(taskType ? { taskType } : {}),
+      },
+    };
+  }
+  return undefined;
+};
+/**
+ * Creates an `EmbeddingProvider` that embeds text through `@ai-sdk/google`.
+ *
+ * The provider module is loaded with `requireOptional`. The model id is
+ * resolved once, in order: `config.model`, the
+ * `GOOGLE_GENERATIVE_AI_EMBEDDING_MODEL` environment variable, then
+ * `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model, timeout, output size and task type.
+ * @returns A provider named `google:<model>` whose `dimensions` mirrors
+ *   `config.outputDimensionality`.
+ * @throws Error from `embed` when the SDK result carries no embedding, and
+ *   from `embedMany` when the SDK result's `embeddings` is not an array.
+ */
+export const createGoogleEmbeddingProvider = (
+  config: GoogleEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { google: googleSdk } = requireOptional<GoogleModule>({
+    id: "@ai-sdk/google",
+    installHint: "bun add @ai-sdk/google",
+    providerName: "google",
+  });
+  const modelId =
+    config.model ??
+    process.env.GOOGLE_GENERATIVE_AI_EMBEDDING_MODEL ??
+    DEFAULT_TEXT_MODEL;
+  const { timeoutMs } = config;
+  const providerOptions = buildProviderOptions(config);
+  const embeddingModel = googleSdk.embedding(modelId);
+  // A new signal is created per request so every call gets its own timeout.
+  const timeoutSignal = () =>
+    timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+  return {
+    name: `google:${modelId}`,
+    dimensions: config.outputDimensionality,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutSignal();
+      const { embedding } = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (embedding) {
+        return embedding;
+      }
+      throw new Error("Embedding missing from Google response");
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((input) => input.text);
+      const abortSignal = timeoutSignal();
+      const { embeddings } = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (Array.isArray(embeddings)) {
+        return embeddings;
+      }
+      throw new Error("Embeddings missing from Google embedMany response");
+    },
+  };
+};

package/registry/embedding/mistral.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * Mistral provider module interface.
+ *
+ * Describes the only part of the `@ai-sdk/mistral` module this file touches:
+ * `mistral.embedding(model)`, whose result is passed as `model` to the AI SDK
+ * `embed` / `embedMany` helpers.
+ */
+interface MistralModule {
+  mistral: {
+    embedding: (model: string) => EmbeddingModel<string>;
+  };
+}
+export type MistralEmbeddingConfig = { // options for createMistralEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise MISTRAL_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+};
+const DEFAULT_TEXT_MODEL = "mistral-embed"; // used when neither config.model nor the env var is set
+/**
+ * Creates an `EmbeddingProvider` that embeds text through `@ai-sdk/mistral`.
+ *
+ * The provider module is loaded with `requireOptional`. The model id is
+ * resolved once, in order: `config.model`, the `MISTRAL_EMBEDDING_MODEL`
+ * environment variable, then `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model id and per-request timeout.
+ * @returns A provider named `mistral:<model>` whose `dimensions` is undefined.
+ * @throws Error from `embed` when the SDK result carries no embedding, and
+ *   from `embedMany` when the SDK result's `embeddings` is not an array.
+ */
+export const createMistralEmbeddingProvider = (
+  config: MistralEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { mistral: mistralSdk } = requireOptional<MistralModule>({
+    id: "@ai-sdk/mistral",
+    installHint: "bun add @ai-sdk/mistral",
+    providerName: "mistral",
+  });
+  const modelId =
+    config.model ?? process.env.MISTRAL_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
+  const { timeoutMs } = config;
+  const embeddingModel = mistralSdk.embedding(modelId);
+  // A new signal is created per request so every call gets its own timeout.
+  const timeoutSignal = () =>
+    timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+  return {
+    name: `mistral:${modelId}`,
+    dimensions: undefined,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutSignal();
+      const { embedding } = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (embedding) {
+        return embedding;
+      }
+      throw new Error("Embedding missing from Mistral response");
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((input) => input.text);
+      const abortSignal = timeoutSignal();
+      const { embeddings } = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (Array.isArray(embeddings)) {
+        return embeddings;
+      }
+      throw new Error("Embeddings missing from Mistral embedMany response");
+    },
+  };
+};

package/registry/embedding/ollama.ts ADDED Viewed

@@ -0,0 +1,90 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * Ollama provider instance interface.
+ *
+ * Only `textEmbeddingModel(model)` is used; its result is passed as `model`
+ * to the AI SDK `embed` / `embedMany` helpers.
+ */
+interface OllamaProvider {
+  textEmbeddingModel: (model: string) => EmbeddingModel<string>;
+}
+/**
+ * Ollama provider module interface.
+ *
+ * Shape of the `ollama-ai-provider-v2` module as consumed here:
+ * `createOllama` builds an instance from a custom `baseURL` / `headers`,
+ * while `ollama` is the instance used when neither is configured.
+ */
+interface OllamaModule {
+  createOllama: (config: { baseURL?: string; headers?: Record<string, string> }) => OllamaProvider;
+  ollama: OllamaProvider;
+}
+export type OllamaEmbeddingConfig = { // options for createOllamaEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise OLLAMA_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+  baseURL?: string; // when truthy, a dedicated instance is built with createOllama
+  headers?: Record<string, string>; // when set, a dedicated instance is built with createOllama
+};
+const DEFAULT_TEXT_MODEL = "nomic-embed-text"; // used when neither config.model nor the env var is set
+/**
+ * Picks the Ollama provider instance to use for this config.
+ *
+ * @param config Provider config; only `baseURL` and `headers` are read.
+ * @returns The module's shared `ollama` instance when neither `baseURL` nor
+ *   `headers` is set; otherwise a new instance from `createOllama` that
+ *   receives only the fields that are set.
+ */
+const resolveProvider = (config: OllamaEmbeddingConfig): OllamaProvider => {
+  const { createOllama: makeOllama, ollama: sharedOllama } =
+    requireOptional<OllamaModule>({
+      id: "ollama-ai-provider-v2",
+      installHint: "bun add ollama-ai-provider-v2",
+      providerName: "ollama",
+    });
+  const { baseURL, headers } = config;
+  if (!baseURL && !headers) {
+    return sharedOllama;
+  }
+  return makeOllama({
+    ...(baseURL ? { baseURL } : {}),
+    ...(headers ? { headers } : {}),
+  });
+};
+/**
+ * Creates an `EmbeddingProvider` that embeds text through an Ollama provider
+ * instance chosen by `resolveProvider`.
+ *
+ * The model id is resolved once, in order: `config.model`, the
+ * `OLLAMA_EMBEDDING_MODEL` environment variable, then `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model id, timeout, base URL and headers.
+ * @returns A provider named `ollama:<model>` whose `dimensions` is undefined.
+ * @throws Error from `embed` when the SDK result carries no embedding, and
+ *   from `embedMany` when the SDK result's `embeddings` is not an array.
+ */
+export const createOllamaEmbeddingProvider = (
+  config: OllamaEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const modelId =
+    config.model ?? process.env.OLLAMA_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
+  const { timeoutMs } = config;
+  const ollamaProvider = resolveProvider(config);
+  const embeddingModel = ollamaProvider.textEmbeddingModel(modelId);
+  // A new signal is created per request so every call gets its own timeout.
+  const timeoutSignal = () =>
+    timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+  return {
+    name: `ollama:${modelId}`,
+    dimensions: undefined,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutSignal();
+      const { embedding } = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (embedding) {
+        return embedding;
+      }
+      throw new Error("Embedding missing from Ollama response");
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((input) => input.text);
+      const abortSignal = timeoutSignal();
+      const { embeddings } = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (Array.isArray(embeddings)) {
+        return embeddings;
+      }
+      throw new Error("Embeddings missing from Ollama embedMany response");
+    },
+  };
+};

package/registry/embedding/openai.ts ADDED Viewed

@@ -0,0 +1,88 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * OpenAI provider module interface.
+ *
+ * Describes the only part of the `@ai-sdk/openai` module this file touches:
+ * `openai.embedding(model)`, whose result is passed as `model` to the AI SDK
+ * `embed` / `embedMany` helpers.
+ */
+interface OpenAiModule {
+  openai: {
+    embedding: (model: string) => EmbeddingModel<string>;
+  };
+}
+export type OpenAiEmbeddingConfig = { // options for createOpenAiEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise OPENAI_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+  dimensions?: number; // forwarded as providerOptions.openai.dimensions and reported as the provider's `dimensions`
+  user?: string; // forwarded as providerOptions.openai.user when non-empty
+};
+const DEFAULT_TEXT_MODEL = "text-embedding-3-small"; // used when neither config.model nor the env var is set
+/**
+ * Translates the OpenAI-specific config fields into the `providerOptions`
+ * object passed to `embed` / `embedMany`.
+ *
+ * The "is anything set?" guard uses the same predicates as the field
+ * spreads, so an empty `user` string with no `dimensions` yields `undefined`
+ * rather than an empty `{ openai: {} }` options object.
+ *
+ * @param config Provider config; only `dimensions` and `user` are read.
+ * @returns `{ openai: { ... } }` containing just the fields that are set, or
+ *   `undefined` when there is nothing to forward.
+ */
+const buildProviderOptions = (config: OpenAiEmbeddingConfig) => {
+  const { dimensions, user } = config;
+  // `dimensions` is checked against undefined so that 0 is still forwarded;
+  // `user` is checked for truthiness so an empty string counts as unset.
+  if (dimensions === undefined && !user) {
+    return undefined;
+  }
+  return {
+    openai: {
+      ...(dimensions !== undefined ? { dimensions } : {}),
+      ...(user ? { user } : {}),
+    },
+  };
+};
+/**
+ * Creates an `EmbeddingProvider` that embeds text through `@ai-sdk/openai`.
+ *
+ * The provider module is loaded with `requireOptional`. The model id is
+ * resolved once, in order: `config.model`, the `OPENAI_EMBEDDING_MODEL`
+ * environment variable, then `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model, timeout, dimensions and user id.
+ * @returns A provider named `openai:<model>` whose `dimensions` mirrors
+ *   `config.dimensions`.
+ * @throws Error from `embed` when the SDK result carries no embedding, and
+ *   from `embedMany` when the SDK result's `embeddings` is not an array.
+ */
+export const createOpenAiEmbeddingProvider = (
+  config: OpenAiEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { openai: openaiSdk } = requireOptional<OpenAiModule>({
+    id: "@ai-sdk/openai",
+    installHint: "bun add @ai-sdk/openai",
+    providerName: "openai",
+  });
+  const modelId =
+    config.model ?? process.env.OPENAI_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
+  const { timeoutMs } = config;
+  const providerOptions = buildProviderOptions(config);
+  const embeddingModel = openaiSdk.embedding(modelId);
+  // A new signal is created per request so every call gets its own timeout.
+  const timeoutSignal = () =>
+    timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+  return {
+    name: `openai:${modelId}`,
+    dimensions: config.dimensions,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutSignal();
+      const { embedding } = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (embedding) {
+        return embedding;
+      }
+      throw new Error("Embedding missing from OpenAI response");
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((input) => input.text);
+      const abortSignal = timeoutSignal();
+      const { embeddings } = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(providerOptions ? { providerOptions } : {}),
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (Array.isArray(embeddings)) {
+        return embeddings;
+      }
+      throw new Error("Embeddings missing from OpenAI embedMany response");
+    },
+  };
+};

package/registry/embedding/openrouter.ts ADDED Viewed

@@ -0,0 +1,127 @@
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+export type OpenRouterEmbeddingConfig = { // options for createOpenRouterEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise OPENROUTER_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs) via fetchOptions.signal
+  apiKey?: string; // otherwise OPENROUTER_API_KEY env var, then an empty string
+  baseURL?: string; // passed to the client constructor only when truthy
+  headers?: Record<string, string>; // extra headers; copied first, so referer/title below override same-named keys
+  referer?: string; // sent as the "HTTP-Referer" header when set
+  title?: string; // sent as the "X-Title" header when set
+};
+/**
+ * OpenRouter embedding result item.
+ *
+ * `embedding` is optional because the response is not validated before use;
+ * callers check for its presence.
+ */
+interface EmbeddingDataItem {
+  embedding?: number[];
+}
+/**
+ * OpenRouter embedding response.
+ *
+ * `data` is read for both single and batch requests. The top-level
+ * `embedding` is only consulted as a fallback for single requests.
+ */
+interface EmbeddingResponse {
+  data?: EmbeddingDataItem[];
+  embedding?: number[];
+}
+/**
+ * OpenRouter client embeddings interface.
+ *
+ * `generate` takes one string or a batch of strings as `input`; the optional
+ * second argument carries an abort signal under `fetchOptions.signal`.
+ */
+interface EmbeddingsClient {
+  generate(
+    params: { input: string | string[]; model: string },
+    options?: { fetchOptions?: { signal?: AbortSignal } }
+  ): Promise<EmbeddingResponse>;
+}
+/**
+ * OpenRouter client interface.
+ *
+ * Only the `embeddings` sub-client is used by this file.
+ */
+interface OpenRouterClient {
+  embeddings: EmbeddingsClient;
+}
+/**
+ * OpenRouter SDK module interface.
+ *
+ * Shape of the `@openrouter/sdk` module as consumed here: a constructable
+ * `OpenRouter` class taking an API key plus optional base URL and headers.
+ */
+interface OpenRouterModule {
+  OpenRouter: new (config: {
+    apiKey: string;
+    baseURL?: string;
+    headers?: Record<string, string>;
+  }) => OpenRouterClient;
+}
+const DEFAULT_TEXT_MODEL = "text-embedding-3-small"; // used when neither config.model nor the env var is set
+/**
+ * Assembles the extra HTTP headers for the OpenRouter client.
+ *
+ * @param config Provider config; only `headers`, `referer` and `title` are
+ *   read.
+ * @returns A new object holding a copy of `config.headers`, with
+ *   `HTTP-Referer` and `X-Title` set from `referer` / `title` when those are
+ *   non-empty. The result is empty when none of the three is provided.
+ */
+const buildHeaders = (
+  config: OpenRouterEmbeddingConfig
+): Record<string, string> => {
+  const { headers: customHeaders, referer, title } = config;
+  return {
+    ...(customHeaders ?? {}),
+    ...(referer ? { "HTTP-Referer": referer } : {}),
+    ...(title ? { "X-Title": title } : {}),
+  };
+};
+/**
+ * Creates an `EmbeddingProvider` that calls the embeddings endpoint of an
+ * `@openrouter/sdk` client directly (the AI SDK helpers are not used here).
+ *
+ * The model id is resolved once, in order: `config.model`, the
+ * `OPENROUTER_EMBEDDING_MODEL` environment variable, then
+ * `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model, timeout, credentials, base URL and headers.
+ * @returns A provider named `openrouter:<model>` whose `dimensions` is
+ *   undefined.
+ * @throws Error from `embed` when the response has no embedding; from
+ *   `embedMany` when `data` is absent, any item lacks an embedding array, or
+ *   the number of returned embeddings differs from the number of inputs.
+ */
+export const createOpenRouterEmbeddingProvider = (
+  config: OpenRouterEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { OpenRouter } = requireOptional<OpenRouterModule>({
+    id: "@openrouter/sdk",
+    installHint: "bun add @openrouter/sdk",
+    providerName: "openrouter",
+  });
+  const model =
+    config.model ?? process.env.OPENROUTER_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
+  const timeoutMs = config.timeoutMs;
+  const headers = buildHeaders(config);
+  const client = new OpenRouter({
+    // NOTE(review): an empty string is passed when no key is configured;
+    // presumably requests then fail with an auth error - confirm whether
+    // failing fast here would be preferable.
+    apiKey: config.apiKey ?? process.env.OPENROUTER_API_KEY ?? "",
+    ...(config.baseURL ? { baseURL: config.baseURL } : {}),
+    // Only pass `headers` when there is at least one to send.
+    ...(Object.keys(headers).length ? { headers } : {}),
+  });
+  return {
+    name: `openrouter:${model}`,
+    dimensions: undefined,
+    embed: async ({ text }) => {
+      const abortSignal = timeoutMs
+        ? AbortSignal.timeout(timeoutMs)
+        : undefined;
+      const result = await client.embeddings.generate(
+        { input: text, model },
+        abortSignal ? { fetchOptions: { signal: abortSignal } } : undefined
+      );
+      // Prefer the list form; fall back to a top-level `embedding` field.
+      const embedding =
+        result.data?.[0]?.embedding ??
+        result.embedding;
+      if (!embedding) {
+        throw new Error("Embedding missing from OpenRouter response");
+      }
+      return embedding;
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((i) => i.text);
+      const abortSignal = timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+      const result = await client.embeddings.generate(
+        { input: values, model },
+        abortSignal ? { fetchOptions: { signal: abortSignal } } : undefined
+      );
+      const items = result.data;
+      if (!items) {
+        throw new Error("Embeddings missing from OpenRouter response");
+      }
+      // Collect with a runtime check per item so no type assertion is needed.
+      const embeddings: number[][] = [];
+      for (const item of items) {
+        if (!Array.isArray(item.embedding)) {
+          throw new Error("Embeddings missing from OpenRouter response");
+        }
+        embeddings.push(item.embedding);
+      }
+      // A short or long response would pair vectors with the wrong inputs.
+      if (embeddings.length !== values.length) {
+        throw new Error(
+          `OpenRouter returned ${embeddings.length} embeddings for ${values.length} inputs`
+        );
+      }
+      return embeddings;
+    },
+  };
+};

package/registry/embedding/together.ts ADDED Viewed

@@ -0,0 +1,77 @@
+import { embed, embedMany, type EmbeddingModel } from "ai";
+import type { EmbeddingProvider } from "../core/types";
+import { requireOptional } from "./_shared";
+/**
+ * Together AI provider module interface.
+ *
+ * Shape of the `@ai-sdk/togetherai` module as consumed here. Both factory
+ * names are optional: the provider factory uses `embeddingModel` when it is
+ * a function and otherwise falls back to `textEmbeddingModel`.
+ */
+interface TogetherAiModule {
+  togetherai: {
+    embeddingModel?: (model: string) => EmbeddingModel<string>;
+    textEmbeddingModel?: (model: string) => EmbeddingModel<string>;
+  };
+}
+export type TogetherEmbeddingConfig = { // options for createTogetherEmbeddingProvider; every field is optional
+  model?: string; // model id; otherwise TOGETHER_AI_EMBEDDING_MODEL env var, then DEFAULT_TEXT_MODEL
+  timeoutMs?: number; // when truthy, each request is given AbortSignal.timeout(timeoutMs)
+};
+const DEFAULT_TEXT_MODEL = "togethercomputer/m2-bert-80M-2k-retrieval"; // used when neither config.model nor the env var is set
+/**
+ * Creates an `EmbeddingProvider` that embeds text through
+ * `@ai-sdk/togetherai`.
+ *
+ * The provider module is loaded with `requireOptional`. The model id is
+ * resolved once, in order: `config.model`, the `TOGETHER_AI_EMBEDDING_MODEL`
+ * environment variable, then `DEFAULT_TEXT_MODEL`.
+ *
+ * @param config Optional model id and per-request timeout.
+ * @returns A provider named `together:<model>` whose `dimensions` is
+ *   undefined.
+ * @throws Error at construction when the loaded module exposes neither
+ *   `embeddingModel` nor `textEmbeddingModel`; from `embed` when the SDK
+ *   result carries no embedding; from `embedMany` when the SDK result's
+ *   `embeddings` is not an array.
+ */
+export const createTogetherEmbeddingProvider = (
+  config: TogetherEmbeddingConfig = {}
+): EmbeddingProvider => {
+  const { togetherai } = requireOptional<TogetherAiModule>({
+    id: "@ai-sdk/togetherai",
+    installHint: "bun add @ai-sdk/togetherai",
+    providerName: "together",
+  });
+  const model =
+    config.model ??
+    process.env.TOGETHER_AI_EMBEDDING_MODEL ??
+    DEFAULT_TEXT_MODEL;
+  const timeoutMs = config.timeoutMs;
+  // Both factory names are optional on the module type; prefer
+  // `embeddingModel` and fall back to `textEmbeddingModel`. The factories are
+  // invoked as methods so their `this` binding is preserved.
+  const embeddingModel =
+    typeof togetherai.embeddingModel === "function"
+      ? togetherai.embeddingModel(model)
+      : togetherai.textEmbeddingModel?.(model);
+  // Without this guard an undefined model would be handed to embed/embedMany
+  // and only fail later with an unrelated-looking error.
+  if (embeddingModel === undefined) {
+    throw new Error(
+      "Together.ai provider exposes neither embeddingModel nor textEmbeddingModel"
+    );
+  }
+  return {
+    name: `together:${model}`,
+    dimensions: undefined,
+    embed: async ({ text }) => {
+      // A new signal per request so every call gets its own timeout.
+      const abortSignal = timeoutMs
+        ? AbortSignal.timeout(timeoutMs)
+        : undefined;
+      const result = await embed({
+        model: embeddingModel,
+        value: text,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      if (!result.embedding) {
+        throw new Error("Embedding missing from Together.ai response");
+      }
+      return result.embedding;
+    },
+    embedMany: async (inputs) => {
+      const values = inputs.map((i) => i.text);
+      const abortSignal = timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
+      const result = await embedMany({
+        model: embeddingModel,
+        values,
+        ...(abortSignal ? { abortSignal } : {}),
+      });
+      const { embeddings } = result;
+      if (!Array.isArray(embeddings)) {
+        throw new Error("Embeddings missing from Together.ai embedMany response");
+      }
+      return embeddings;
+    },
+  };
+};