unrag 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +199 -41
  3. package/package.json +2 -1
  4. package/registry/config/unrag.config.ts +140 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
@@ -1,27 +1,68 @@
1
- import { embed } from "ai";
2
- import type { EmbeddingProvider } from "../core/types";
1
+ import { embed, embedMany } from "ai";
2
+ import type { EmbeddingProvider, ImageEmbeddingInput } from "../core/types";
3
3
 
4
- export type AiEmbeddingConfig = {
4
+ type BaseConfig = {
5
5
  /**
6
- * AI Gateway model id, e.g. "openai/text-embedding-3-small"
6
+ * AI Gateway model id, e.g. "openai/text-embedding-3-small" or "google/gemini-...".
7
7
  */
8
8
  model?: string;
9
9
  timeoutMs?: number;
10
10
  };
11
11
 
12
- const DEFAULT_MODEL = "openai/text-embedding-3-small";
12
+ export type AiEmbeddingConfig =
13
+ | (BaseConfig & {
14
+ /**
15
+ * Defaults to "text" for backwards compatibility.
16
+ * - "text": only supports embedding strings
17
+ * - "multimodal": additionally enables `embedImage()` for image assets (when the model supports it)
18
+ */
19
+ type?: "text";
20
+ })
21
+ | (BaseConfig & {
22
+ type: "multimodal";
23
+ /**
24
+ * Controls how images are translated into AI SDK `embed()` values.
25
+ * Different providers use different shapes; this is the escape hatch.
26
+ */
27
+ image?: {
28
+ value?: (input: ImageEmbeddingInput) => unknown;
29
+ };
30
+ });
31
+
32
+ const DEFAULT_TEXT_MODEL = "openai/text-embedding-3-small";
33
+ const DEFAULT_MULTIMODAL_MODEL = "cohere/embed-v4.0";
34
+
35
+ const bytesToDataUrl = (bytes: Uint8Array, mediaType: string) => {
36
+ const base64 = Buffer.from(bytes).toString("base64");
37
+ return `data:${mediaType};base64,${base64}`;
38
+ };
39
+
40
+ const defaultImageValue = (input: ImageEmbeddingInput) => {
41
+ const v =
42
+ typeof input.data === "string"
43
+ ? input.data
44
+ : bytesToDataUrl(input.data, input.mediaType ?? "image/jpeg");
45
+ // This matches common AI Gateway multimodal embedding inputs where
46
+ // the embedding value is an object containing one or more images.
47
+ return { image: [v] };
48
+ };
13
49
 
14
50
  export const createAiEmbeddingProvider = (
15
51
  config: AiEmbeddingConfig = {}
16
52
  ): EmbeddingProvider => {
17
- const model = config.model ?? process.env.AI_GATEWAY_MODEL ?? DEFAULT_MODEL;
53
+ const type = (config as any).type ?? "text";
54
+ const model =
55
+ config.model ??
56
+ process.env.AI_GATEWAY_MODEL ??
57
+ (type === "multimodal" ? DEFAULT_MULTIMODAL_MODEL : DEFAULT_TEXT_MODEL);
58
+ const timeoutMs = config.timeoutMs;
18
59
 
19
60
  return {
20
61
  name: `ai-sdk:${model}`,
21
62
  dimensions: undefined,
22
63
  embed: async ({ text }) => {
23
- const abortSignal = config.timeoutMs
24
- ? AbortSignal.timeout(config.timeoutMs)
64
+ const abortSignal = timeoutMs
65
+ ? AbortSignal.timeout(timeoutMs)
25
66
  : undefined;
26
67
 
27
68
  const result = await embed({
@@ -36,6 +77,46 @@ export const createAiEmbeddingProvider = (
36
77
 
37
78
  return result.embedding;
38
79
  },
80
+ embedMany: async (inputs) => {
81
+ const values = inputs.map((i) => i.text);
82
+ const abortSignal = timeoutMs ? AbortSignal.timeout(timeoutMs) : undefined;
83
+
84
+ const result = await embedMany({
85
+ model,
86
+ values,
87
+ ...(abortSignal ? { abortSignal } : {}),
88
+ });
89
+
90
+ const embeddings = (result as any)?.embeddings as number[][] | undefined;
91
+ if (!embeddings) {
92
+ throw new Error("Embeddings missing from AI SDK embedMany response");
93
+ }
94
+ return embeddings;
95
+ },
96
+ ...(type === "multimodal"
97
+ ? {
98
+ embedImage: async (input: ImageEmbeddingInput) => {
99
+ const abortSignal = timeoutMs
100
+ ? AbortSignal.timeout(timeoutMs)
101
+ : undefined;
102
+
103
+ const imageValue =
104
+ (config as any)?.image?.value?.(input) ?? defaultImageValue(input);
105
+
106
+ const result = await embed({
107
+ model,
108
+ value: imageValue,
109
+ ...(abortSignal ? { abortSignal } : {}),
110
+ });
111
+
112
+ if (!result.embedding) {
113
+ throw new Error("Embedding missing from AI SDK response");
114
+ }
115
+
116
+ return result.embedding;
117
+ },
118
+ }
119
+ : {}),
39
120
  };
40
121
  };
41
122
 
@@ -0,0 +1,113 @@
1
+ import type { AssetData, AssetFetchConfig } from "../../core/types";
2
+
3
+ const DEFAULT_UA = "unrag/asset-fetch";
4
+
5
+ const isProbablyIpLiteral = (host: string) =>
6
+ /^\d{1,3}(\.\d{1,3}){3}$/.test(host) || host.includes(":");
7
+
8
+ const isDisallowedHost = (host: string) => {
9
+ const h = host.toLowerCase();
10
+ if (h === "localhost" || h.endsWith(".localhost")) return true;
11
+ if (h === "0.0.0.0") return true;
12
+ if (h === "127.0.0.1" || h.startsWith("127.")) return true;
13
+ if (h === "::1") return true;
14
+
15
+ // If host is an IP literal, block common private ranges.
16
+ if (isProbablyIpLiteral(h)) {
17
+ if (h.startsWith("10.")) return true;
18
+ if (h.startsWith("192.168.")) return true;
19
+ if (/^172\.(1[6-9]|2\d|3[0-1])\./.test(h)) return true;
20
+ }
21
+
22
+ return false;
23
+ };
24
+
25
+ export async function fetchBytesFromUrl(args: {
26
+ url: string;
27
+ fetchConfig: AssetFetchConfig;
28
+ headers?: Record<string, string>;
29
+ maxBytes: number;
30
+ }): Promise<{ bytes: Uint8Array; mediaType?: string }> {
31
+ if (!args.fetchConfig.enabled) {
32
+ throw new Error("Asset fetch disabled (assetProcessing.fetch.enabled=false)");
33
+ }
34
+
35
+ const u = new URL(args.url);
36
+ if (u.protocol !== "https:") {
37
+ throw new Error("Only https:// URLs are allowed for asset fetching");
38
+ }
39
+
40
+ if (isDisallowedHost(u.hostname)) {
41
+ throw new Error(`Disallowed host for asset fetch: ${u.hostname}`);
42
+ }
43
+
44
+ const allow = args.fetchConfig.allowedHosts;
45
+ if (Array.isArray(allow) && allow.length > 0) {
46
+ const ok = allow.some((h) => h.toLowerCase() === u.hostname.toLowerCase());
47
+ if (!ok) {
48
+ throw new Error(`Host not allowlisted for asset fetch: ${u.hostname}`);
49
+ }
50
+ }
51
+
52
+ const abortSignal = AbortSignal.timeout(args.fetchConfig.timeoutMs);
53
+ const headers = {
54
+ "user-agent": DEFAULT_UA,
55
+ ...(args.fetchConfig.headers ?? {}),
56
+ ...(args.headers ?? {}),
57
+ };
58
+
59
+ const res = await fetch(args.url, { headers, signal: abortSignal });
60
+ if (!res.ok) {
61
+ throw new Error(`Asset fetch failed (${res.status} ${res.statusText})`);
62
+ }
63
+
64
+ const contentLength = Number(res.headers.get("content-length") ?? NaN);
65
+ if (Number.isFinite(contentLength) && contentLength > args.maxBytes) {
66
+ throw new Error(
67
+ `Asset too large (content-length ${contentLength} > ${args.maxBytes})`
68
+ );
69
+ }
70
+
71
+ const buf = new Uint8Array(await res.arrayBuffer());
72
+ if (buf.byteLength > args.maxBytes) {
73
+ throw new Error(`Asset too large (${buf.byteLength} > ${args.maxBytes})`);
74
+ }
75
+
76
+ const mediaType = res.headers.get("content-type")?.split(";")[0]?.trim();
77
+ return { bytes: buf, mediaType: mediaType || undefined };
78
+ }
79
+
80
+ export async function getAssetBytes(args: {
81
+ data: AssetData;
82
+ fetchConfig: AssetFetchConfig;
83
+ maxBytes: number;
84
+ /** Optional fallback when data does not provide a mediaType and the response lacks one. */
85
+ defaultMediaType?: string;
86
+ }): Promise<{ bytes: Uint8Array; mediaType: string; filename?: string }> {
87
+ if (args.data.kind === "bytes") {
88
+ return {
89
+ bytes: args.data.bytes,
90
+ mediaType: args.data.mediaType,
91
+ ...(args.data.filename ? { filename: args.data.filename } : {}),
92
+ };
93
+ }
94
+
95
+ const fetched = await fetchBytesFromUrl({
96
+ url: args.data.url,
97
+ fetchConfig: args.fetchConfig,
98
+ headers: args.data.headers,
99
+ maxBytes: args.maxBytes,
100
+ });
101
+
102
+ return {
103
+ bytes: fetched.bytes,
104
+ mediaType:
105
+ args.data.mediaType ??
106
+ fetched.mediaType ??
107
+ args.defaultMediaType ??
108
+ "application/octet-stream",
109
+ ...(args.data.filename ? { filename: args.data.filename } : {}),
110
+ };
111
+ }
112
+
113
+
@@ -0,0 +1,14 @@
1
+ export function normalizeMediaType(mediaType: string | undefined): string | undefined {
2
+ if (!mediaType) return undefined;
3
+ return mediaType.split(";")[0]?.trim().toLowerCase() || undefined;
4
+ }
5
+
6
+ export function extFromFilename(filename: string | undefined): string | undefined {
7
+ if (!filename) return undefined;
8
+ const idx = filename.lastIndexOf(".");
9
+ if (idx < 0) return undefined;
10
+ const ext = filename.slice(idx + 1).trim().toLowerCase();
11
+ return ext || undefined;
12
+ }
13
+
14
+
@@ -0,0 +1,11 @@
1
+ export function capText(text: string, maxChars: number): string {
2
+ const t = String(text ?? "");
3
+ if (!Number.isFinite(maxChars) || maxChars <= 0) return t;
4
+ return t.length <= maxChars ? t : t.slice(0, maxChars).trimEnd();
5
+ }
6
+
7
+ export function toUtf8String(bytes: Uint8Array): string {
8
+ return new TextDecoder("utf-8", { fatal: false }).decode(bytes);
9
+ }
10
+
11
+
@@ -0,0 +1,75 @@
1
+ import { experimental_transcribe as transcribe } from "ai";
2
+ import type { AssetExtractor } from "../../core/types";
3
+ import { getAssetBytes } from "../_shared/fetch";
4
+
5
+ /**
6
+ * Audio transcription via the AI SDK `transcribe()` API.
7
+ */
8
+ export function createAudioTranscribeExtractor(): AssetExtractor {
9
+ return {
10
+ name: "audio:transcribe",
11
+ supports: ({ asset, ctx }) =>
12
+ asset.kind === "audio" && ctx.assetProcessing.audio.transcription.enabled,
13
+ extract: async ({ asset, ctx }) => {
14
+ const cfg = ctx.assetProcessing.audio.transcription;
15
+ const fetchConfig = ctx.assetProcessing.fetch;
16
+
17
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
18
+ const { bytes } = await getAssetBytes({
19
+ data: asset.data,
20
+ fetchConfig,
21
+ maxBytes,
22
+ defaultMediaType: "audio/mpeg",
23
+ });
24
+
25
+ const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
26
+
27
+ const result = await transcribe({
28
+ model: cfg.model as any,
29
+ audio: bytes,
30
+ abortSignal,
31
+ });
32
+
33
+ const segments: any[] = Array.isArray((result as any)?.segments)
34
+ ? (result as any).segments
35
+ : [];
36
+
37
+ if (segments.length > 0) {
38
+ return {
39
+ texts: segments
40
+ .map((s, i) => {
41
+ const t = String(s?.text ?? "").trim();
42
+ if (!t) return null;
43
+ const start = Number(s?.startSecond ?? NaN);
44
+ const end = Number(s?.endSecond ?? NaN);
45
+ return {
46
+ label: `segment-${i + 1}`,
47
+ content: t,
48
+ ...(Number.isFinite(start) && Number.isFinite(end)
49
+ ? { timeRangeSec: [start, end] as [number, number] }
50
+ : {}),
51
+ };
52
+ })
53
+ .filter(Boolean) as any,
54
+ diagnostics: {
55
+ model: cfg.model,
56
+ seconds:
57
+ typeof (result as any)?.durationInSeconds === "number"
58
+ ? (result as any).durationInSeconds
59
+ : undefined,
60
+ },
61
+ };
62
+ }
63
+
64
+ const text = String((result as any)?.text ?? "").trim();
65
+ if (!text) return { texts: [], diagnostics: { model: cfg.model } };
66
+
67
+ return {
68
+ texts: [{ label: "transcript", content: text }],
69
+ diagnostics: { model: cfg.model },
70
+ };
71
+ },
72
+ };
73
+ }
74
+
75
+
@@ -0,0 +1,53 @@
1
+ import type { AssetExtractor } from "../../core/types";
2
+ import { getAssetBytes } from "../_shared/fetch";
3
+ import { extFromFilename, normalizeMediaType } from "../_shared/media";
4
+ import { capText } from "../_shared/text";
5
+
6
+ const DOCX_MEDIA =
7
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
8
+
9
+ export function createFileDocxExtractor(): AssetExtractor {
10
+ return {
11
+ name: "file:docx",
12
+ supports: ({ asset, ctx }) => {
13
+ if (asset.kind !== "file") return false;
14
+ if (!ctx.assetProcessing.file.docx.enabled) return false;
15
+ const filename = asset.data.kind === "bytes" ? asset.data.filename : asset.data.filename;
16
+ const ext = extFromFilename(filename);
17
+ const mt =
18
+ asset.data.kind === "bytes"
19
+ ? normalizeMediaType(asset.data.mediaType)
20
+ : normalizeMediaType(asset.data.mediaType);
21
+ return ext === "docx" || mt === DOCX_MEDIA;
22
+ },
23
+ extract: async ({ asset, ctx }) => {
24
+ const cfg = ctx.assetProcessing.file.docx;
25
+ const fetchConfig = ctx.assetProcessing.fetch;
26
+
27
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
28
+ const { bytes } = await getAssetBytes({
29
+ data: asset.data,
30
+ fetchConfig,
31
+ maxBytes,
32
+ defaultMediaType: DOCX_MEDIA,
33
+ });
34
+
35
+ // Dynamic import so the core package can be used without mammoth unless this extractor is installed.
36
+ const mammoth: any = await import("mammoth");
37
+ const arrayBuffer = bytes.buffer.slice(
38
+ bytes.byteOffset,
39
+ bytes.byteOffset + bytes.byteLength
40
+ );
41
+ const res = await mammoth.extractRawText({ arrayBuffer });
42
+
43
+ const text = String(res?.value ?? "").trim();
44
+ if (text.length < cfg.minChars) return { texts: [] };
45
+
46
+ return {
47
+ texts: [{ label: "docx", content: capText(text, cfg.maxOutputChars) }],
48
+ };
49
+ },
50
+ };
51
+ }
52
+
53
+
@@ -0,0 +1,92 @@
1
+ import type { AssetExtractor } from "../../core/types";
2
+ import { getAssetBytes } from "../_shared/fetch";
3
+ import { extFromFilename, normalizeMediaType } from "../_shared/media";
4
+ import { capText } from "../_shared/text";
5
+
6
+ const PPTX_MEDIA =
7
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation";
8
+
9
+ const decodeXmlEntities = (s: string) =>
10
+ s
11
+ .replace(/&lt;/g, "<")
12
+ .replace(/&gt;/g, ">")
13
+ .replace(/&quot;/g, '"')
14
+ .replace(/&apos;/g, "'")
15
+ .replace(/&amp;/g, "&");
16
+
17
+ export function createFilePptxExtractor(): AssetExtractor {
18
+ return {
19
+ name: "file:pptx",
20
+ supports: ({ asset, ctx }) => {
21
+ if (asset.kind !== "file") return false;
22
+ if (!ctx.assetProcessing.file.pptx.enabled) return false;
23
+ const filename = asset.data.kind === "bytes" ? asset.data.filename : asset.data.filename;
24
+ const ext = extFromFilename(filename);
25
+ const mt =
26
+ asset.data.kind === "bytes"
27
+ ? normalizeMediaType(asset.data.mediaType)
28
+ : normalizeMediaType(asset.data.mediaType);
29
+ return ext === "pptx" || mt === PPTX_MEDIA;
30
+ },
31
+ extract: async ({ asset, ctx }) => {
32
+ const cfg = ctx.assetProcessing.file.pptx;
33
+ const fetchConfig = ctx.assetProcessing.fetch;
34
+
35
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
36
+ const { bytes } = await getAssetBytes({
37
+ data: asset.data,
38
+ fetchConfig,
39
+ maxBytes,
40
+ defaultMediaType: PPTX_MEDIA,
41
+ });
42
+
43
+ // Dynamic import to avoid hard dependency unless installed.
44
+ const JSZip: any = (await import("jszip")).default;
45
+ const zip = await JSZip.loadAsync(bytes);
46
+
47
+ const slidePaths = Object.keys(zip.files).filter((p) =>
48
+ /^ppt\/slides\/slide\d+\.xml$/.test(p)
49
+ );
50
+ slidePaths.sort((a, b) => {
51
+ const na = Number(a.match(/slide(\d+)\.xml$/)?.[1] ?? 0);
52
+ const nb = Number(b.match(/slide(\d+)\.xml$/)?.[1] ?? 0);
53
+ return na - nb;
54
+ });
55
+
56
+ const texts: Array<{ label: string; content: string }> = [];
57
+ let totalChars = 0;
58
+
59
+ for (const slidePath of slidePaths) {
60
+ if (totalChars >= cfg.maxOutputChars) break;
61
+
62
+ const xml = await zip.files[slidePath]!.async("string");
63
+ const parts: string[] = [];
64
+ const re = /<a:t[^>]*>([\s\S]*?)<\/a:t>/g;
65
+ let m: RegExpExecArray | null;
66
+ while ((m = re.exec(xml))) {
67
+ const t = decodeXmlEntities(String(m[1] ?? "")).replace(/\s+/g, " ").trim();
68
+ if (t) parts.push(t);
69
+ }
70
+
71
+ const slideText = parts.join(" ").trim();
72
+ if (!slideText) continue;
73
+
74
+ const slideNum = Number(slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? 0);
75
+ const capped = capText(slideText, cfg.maxOutputChars - totalChars);
76
+ if (!capped) continue;
77
+
78
+ texts.push({ label: `slide-${slideNum || texts.length + 1}`, content: capped });
79
+ totalChars += capped.length;
80
+ }
81
+
82
+ const joinedChars = texts.reduce((n, t) => n + t.content.length, 0);
83
+ if (joinedChars < cfg.minChars) return { texts: [] };
84
+
85
+ return {
86
+ texts: texts.map((t) => ({ label: t.label, content: t.content })),
87
+ };
88
+ },
89
+ };
90
+ }
91
+
92
+
@@ -0,0 +1,85 @@
1
+ import type { AssetExtractor } from "../../core/types";
2
+ import { getAssetBytes } from "../_shared/fetch";
3
+ import { extFromFilename, normalizeMediaType } from "../_shared/media";
4
+ import { capText, toUtf8String } from "../_shared/text";
5
+
6
+ const stripHtml = (html: string) => {
7
+ const withoutScripts = html
8
+ .replace(/<script[\s\S]*?<\/script>/gi, " ")
9
+ .replace(/<style[\s\S]*?<\/style>/gi, " ");
10
+ const withBreaks = withoutScripts
11
+ .replace(/<br\s*\/?>/gi, "\n")
12
+ .replace(/<\/p>/gi, "\n")
13
+ .replace(/<\/div>/gi, "\n");
14
+ const withoutTags = withBreaks.replace(/<[^>]+>/g, " ");
15
+ return withoutTags.replace(/\s+\n/g, "\n").replace(/[ \t]{2,}/g, " ").trim();
16
+ };
17
+
18
+ const isTextish = (mediaType: string | undefined, filename: string | undefined) => {
19
+ const mt = normalizeMediaType(mediaType);
20
+ if (mt?.startsWith("text/")) return true;
21
+ if (mt === "application/json") return true;
22
+ if (mt === "application/xml") return true;
23
+ if (mt === "application/xhtml+xml") return true;
24
+
25
+ const ext = extFromFilename(filename);
26
+ if (!ext) return false;
27
+ return (
28
+ ext === "txt" ||
29
+ ext === "md" ||
30
+ ext === "markdown" ||
31
+ ext === "html" ||
32
+ ext === "htm" ||
33
+ ext === "json" ||
34
+ ext === "csv" ||
35
+ ext === "log" ||
36
+ ext === "xml"
37
+ );
38
+ };
39
+
40
+ export function createFileTextExtractor(): AssetExtractor {
41
+ return {
42
+ name: "file:text",
43
+ supports: ({ asset, ctx }) => {
44
+ if (asset.kind !== "file") return false;
45
+ if (!ctx.assetProcessing.file.text.enabled) return false;
46
+ const filename = asset.data.kind === "bytes" ? asset.data.filename : asset.data.filename;
47
+ const mediaType = asset.data.kind === "bytes" ? asset.data.mediaType : asset.data.mediaType;
48
+ return isTextish(mediaType, filename);
49
+ },
50
+ extract: async ({ asset, ctx }) => {
51
+ const cfg = ctx.assetProcessing.file.text;
52
+ const fetchConfig = ctx.assetProcessing.fetch;
53
+
54
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
55
+ const { bytes, mediaType, filename } = await getAssetBytes({
56
+ data: asset.data,
57
+ fetchConfig,
58
+ maxBytes,
59
+ defaultMediaType: "text/plain",
60
+ });
61
+
62
+ const ext = extFromFilename(filename);
63
+ const mt = normalizeMediaType(mediaType);
64
+
65
+ let text = toUtf8String(bytes);
66
+
67
+ if (mt === "text/html" || mt === "application/xhtml+xml" || ext === "html" || ext === "htm") {
68
+ text = stripHtml(text);
69
+ }
70
+
71
+ text = text.trim();
72
+ if (text.length < cfg.minChars) return { texts: [] };
73
+
74
+ return {
75
+ texts: [{ label: "text", content: capText(text, cfg.maxOutputChars) }],
76
+ metadata: {
77
+ ...(mt ? { mediaType: mt } : {}),
78
+ ...(ext ? { ext } : {}),
79
+ },
80
+ };
81
+ },
82
+ };
83
+ }
84
+
85
+
@@ -0,0 +1,58 @@
1
+ import type { AssetExtractor } from "../../core/types";
2
+ import { getAssetBytes } from "../_shared/fetch";
3
+ import { extFromFilename, normalizeMediaType } from "../_shared/media";
4
+ import { capText } from "../_shared/text";
5
+
6
+ const XLSX_MEDIA =
7
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
8
+
9
+ export function createFileXlsxExtractor(): AssetExtractor {
10
+ return {
11
+ name: "file:xlsx",
12
+ supports: ({ asset, ctx }) => {
13
+ if (asset.kind !== "file") return false;
14
+ if (!ctx.assetProcessing.file.xlsx.enabled) return false;
15
+ const filename = asset.data.kind === "bytes" ? asset.data.filename : asset.data.filename;
16
+ const ext = extFromFilename(filename);
17
+ const mt =
18
+ asset.data.kind === "bytes"
19
+ ? normalizeMediaType(asset.data.mediaType)
20
+ : normalizeMediaType(asset.data.mediaType);
21
+ return ext === "xlsx" || mt === XLSX_MEDIA;
22
+ },
23
+ extract: async ({ asset, ctx }) => {
24
+ const cfg = ctx.assetProcessing.file.xlsx;
25
+ const fetchConfig = ctx.assetProcessing.fetch;
26
+
27
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
28
+ const { bytes } = await getAssetBytes({
29
+ data: asset.data,
30
+ fetchConfig,
31
+ maxBytes,
32
+ defaultMediaType: XLSX_MEDIA,
33
+ });
34
+
35
+ const xlsx: any = await import("xlsx");
36
+ const wb = xlsx.read(Buffer.from(bytes), { type: "buffer" });
37
+
38
+ const parts: string[] = [];
39
+ for (const sheetName of wb.SheetNames ?? []) {
40
+ if (parts.join("\n\n").length >= cfg.maxOutputChars) break;
41
+ const sheet = wb.Sheets?.[sheetName];
42
+ if (!sheet) continue;
43
+ const csv = String(xlsx.utils.sheet_to_csv(sheet) ?? "").trim();
44
+ if (!csv) continue;
45
+ parts.push(`# Sheet: ${sheetName}\n\n${csv}`);
46
+ }
47
+
48
+ const text = capText(parts.join("\n\n"), cfg.maxOutputChars).trim();
49
+ if (text.length < cfg.minChars) return { texts: [] };
50
+
51
+ return {
52
+ texts: [{ label: "xlsx", content: text }],
53
+ };
54
+ },
55
+ };
56
+ }
57
+
58
+
@@ -0,0 +1,60 @@
1
+ import { generateText } from "ai";
2
+ import type { AssetExtractor } from "../../core/types";
3
+ import { getAssetBytes } from "../_shared/fetch";
4
+ import { normalizeMediaType } from "../_shared/media";
5
+ import { capText } from "../_shared/text";
6
+
7
+ /**
8
+ * Caption generation for images via a vision-capable LLM.
9
+ *
10
+ * Useful when you want text-based retrieval for images but the source system does not provide captions/alt text.
11
+ */
12
+ export function createImageCaptionLlmExtractor(): AssetExtractor {
13
+ return {
14
+ name: "image:caption-llm",
15
+ supports: ({ asset, ctx }) =>
16
+ asset.kind === "image" && ctx.assetProcessing.image.captionLlm.enabled,
17
+ extract: async ({ asset, ctx }) => {
18
+ const cfg = ctx.assetProcessing.image.captionLlm;
19
+ const fetchConfig = ctx.assetProcessing.fetch;
20
+
21
+ const maxBytes = Math.min(cfg.maxBytes, fetchConfig.maxBytes);
22
+ const { bytes, mediaType } = await getAssetBytes({
23
+ data: asset.data,
24
+ fetchConfig,
25
+ maxBytes,
26
+ defaultMediaType: "image/jpeg",
27
+ });
28
+
29
+ const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
30
+
31
+ const result = await generateText({
32
+ model: cfg.model as any,
33
+ abortSignal,
34
+ messages: [
35
+ {
36
+ role: "user",
37
+ content: [
38
+ { type: "text", text: cfg.prompt },
39
+ {
40
+ type: "image",
41
+ image: bytes,
42
+ mediaType: normalizeMediaType(mediaType),
43
+ },
44
+ ],
45
+ },
46
+ ],
47
+ });
48
+
49
+ const caption = String((result as any)?.text ?? "").trim();
50
+ if (!caption) return { texts: [], diagnostics: { model: cfg.model } };
51
+
52
+ return {
53
+ texts: [{ label: "caption", content: capText(caption, cfg.maxOutputChars) }],
54
+ diagnostics: { model: cfg.model },
55
+ };
56
+ },
57
+ };
58
+ }
59
+
60
+