unrag 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +611 -174
- package/package.json +12 -6
- package/registry/config/unrag.config.ts +9 -8
- package/registry/connectors/google-drive/_api-types.ts +60 -0
- package/registry/connectors/google-drive/client.ts +99 -38
- package/registry/connectors/google-drive/sync.ts +97 -69
- package/registry/connectors/google-drive/types.ts +76 -37
- package/registry/connectors/notion/client.ts +12 -3
- package/registry/connectors/notion/render.ts +62 -23
- package/registry/connectors/notion/sync.ts +30 -23
- package/registry/core/assets.ts +11 -10
- package/registry/core/config.ts +10 -25
- package/registry/core/context-engine.ts +71 -2
- package/registry/core/deep-merge.ts +45 -0
- package/registry/core/ingest.ts +117 -44
- package/registry/core/types.ts +96 -2
- package/registry/docs/unrag.md +6 -1
- package/registry/embedding/_shared.ts +25 -0
- package/registry/embedding/ai.ts +8 -68
- package/registry/embedding/azure.ts +88 -0
- package/registry/embedding/bedrock.ts +88 -0
- package/registry/embedding/cohere.ts +88 -0
- package/registry/embedding/google.ts +102 -0
- package/registry/embedding/mistral.ts +71 -0
- package/registry/embedding/ollama.ts +90 -0
- package/registry/embedding/openai.ts +88 -0
- package/registry/embedding/openrouter.ts +127 -0
- package/registry/embedding/together.ts +77 -0
- package/registry/embedding/vertex.ts +111 -0
- package/registry/embedding/voyage.ts +169 -0
- package/registry/extractors/audio-transcribe/index.ts +39 -23
- package/registry/extractors/file-docx/index.ts +8 -1
- package/registry/extractors/file-pptx/index.ts +22 -1
- package/registry/extractors/file-xlsx/index.ts +24 -1
- package/registry/extractors/image-caption-llm/index.ts +8 -3
- package/registry/extractors/image-ocr/index.ts +9 -4
- package/registry/extractors/pdf-llm/index.ts +9 -4
- package/registry/extractors/pdf-text-layer/index.ts +23 -2
- package/registry/extractors/video-frames/index.ts +8 -3
- package/registry/extractors/video-transcribe/index.ts +40 -24
- package/registry/manifest.json +346 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +26 -6
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { embed, embedMany, type EmbeddingModel } from "ai";
|
|
2
|
+
import type { EmbeddingProvider } from "../core/types";
|
|
3
|
+
import { requireOptional } from "./_shared";
|
|
4
|
+
|
|
5
|
+
/**
 * Factory signature used to obtain a Vertex embedding model handle by model id.
 */
type VertexEmbeddingModelFactory = (model: string) => EmbeddingModel<string>;

/**
 * Structural description of the part of the `@ai-sdk/google-vertex` module
 * that this file relies on: the `vertex` provider object and its
 * `embeddingModel` factory.
 */
interface VertexModule {
  vertex: {
    embeddingModel: VertexEmbeddingModelFactory;
  };
}
|
|
13
|
+
|
|
14
|
+
/**
 * Values accepted for the `taskType` option of `VertexEmbeddingConfig`.
 * The chosen value is forwarded unchanged in the provider options.
 */
export type VertexEmbeddingTaskType =
  | "SEMANTIC_SIMILARITY"
  | "CLASSIFICATION"
  | "CLUSTERING"
  | "RETRIEVAL_DOCUMENT"
  | "RETRIEVAL_QUERY"
  | "QUESTION_ANSWERING"
  | "FACT_VERIFICATION"
  | "CODE_RETRIEVAL_QUERY";
|
|
23
|
+
|
|
24
|
+
/**
 * Configuration accepted by `createVertexEmbeddingProvider`. Every field is optional.
 */
export type VertexEmbeddingConfig = {
  /** Model id; when omitted, `GOOGLE_VERTEX_EMBEDDING_MODEL` and then the built-in default are used. */
  model?: string;
  /** When truthy, each embed call is given an `AbortSignal.timeout` of this many milliseconds. */
  timeoutMs?: number;
  /** Forwarded as a provider option and also reported as the provider's `dimensions`. */
  outputDimensionality?: number;
  /** Forwarded as a provider option when set. */
  taskType?: VertexEmbeddingTaskType;
  /** Forwarded as a provider option when non-empty. */
  title?: string;
  /** Forwarded as a provider option when not `undefined` (so `false` is sent). */
  autoTruncate?: boolean;
};
|
|
32
|
+
|
|
33
|
+
/** Model id used when neither `config.model` nor `GOOGLE_VERTEX_EMBEDDING_MODEL` is provided. */
const DEFAULT_TEXT_MODEL = "text-embedding-004";
|
|
34
|
+
|
|
35
|
+
/**
 * Builds the `providerOptions` payload handed to the AI SDK embed calls.
 *
 * Only options that are actually set are copied under the `google` key:
 * `outputDimensionality` and `autoTruncate` when they are not `undefined`,
 * `taskType` and `title` when they are truthy.
 *
 * @param config - Vertex embedding configuration to read the options from.
 * @returns `{ google: { ... } }` holding the set options, or `undefined` when none is set.
 */
const buildProviderOptions = (config: VertexEmbeddingConfig) => {
  const { outputDimensionality, taskType, autoTruncate, title } = config;
  const google: Record<string, string | number | boolean> = {};

  // Keys are inserted in a fixed order so the resulting object is stable.
  if (outputDimensionality !== undefined) {
    google.outputDimensionality = outputDimensionality;
  }
  if (taskType) {
    google.taskType = taskType;
  }
  if (autoTruncate !== undefined) {
    google.autoTruncate = autoTruncate;
  }
  if (title) {
    google.title = title;
  }

  return Object.keys(google).length > 0 ? { google } : undefined;
};
|
|
55
|
+
|
|
56
|
+
/**
 * Creates an embedding provider backed by the optional `@ai-sdk/google-vertex` package.
 *
 * The package is loaded through `requireOptional`. The model id is resolved from
 * `config.model`, then the `GOOGLE_VERTEX_EMBEDDING_MODEL` environment variable,
 * then `DEFAULT_TEXT_MODEL`.
 *
 * @param config - Optional model, timeout and Vertex-specific embedding options.
 * @returns A provider named `vertex:<model>` exposing `embed` and `embedMany`;
 *   `dimensions` mirrors `config.outputDimensionality`.
 * @throws Error from `embed` / `embedMany` when the SDK result carries no embedding(s).
 */
export const createVertexEmbeddingProvider = (
  config: VertexEmbeddingConfig = {}
): EmbeddingProvider => {
  const vertexModule = requireOptional<VertexModule>({
    id: "@ai-sdk/google-vertex",
    installHint: "bun add @ai-sdk/google-vertex",
    providerName: "vertex",
  });
  const modelId =
    config.model ?? process.env.GOOGLE_VERTEX_EMBEDDING_MODEL ?? DEFAULT_TEXT_MODEL;
  const { timeoutMs } = config;
  const providerOptions = buildProviderOptions(config);
  const embeddingModel = vertexModule.vertex.embeddingModel(modelId);

  // Options shared by both SDK calls. The abort signal is created per call so
  // that the timeout starts counting when the request is issued.
  const perCallOptions = () => ({
    ...(providerOptions ? { providerOptions } : {}),
    ...(timeoutMs ? { abortSignal: AbortSignal.timeout(timeoutMs) } : {}),
  });

  return {
    name: `vertex:${modelId}`,
    dimensions: config.outputDimensionality,
    embed: async ({ text }) => {
      const { embedding } = await embed({
        model: embeddingModel,
        value: text,
        ...perCallOptions(),
      });

      if (!embedding) {
        throw new Error("Embedding missing from Vertex response");
      }
      return embedding;
    },
    embedMany: async (inputs) => {
      const texts = inputs.map((input) => input.text);

      const response = await embedMany({
        model: embeddingModel,
        values: texts,
        ...perCallOptions(),
      });

      if (!Array.isArray(response.embeddings)) {
        throw new Error("Embeddings missing from Vertex embedMany response");
      }
      return response.embeddings;
    },
  };
};
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { embed, embedMany, type EmbeddingModel } from "ai";
|
|
2
|
+
import type { EmbeddingProvider, ImageEmbeddingInput } from "../core/types";
|
|
3
|
+
import { requireOptional } from "./_shared";
|
|
4
|
+
|
|
5
|
+
/**
 * Factory signature used to obtain a Voyage embedding model handle by model id.
 */
type VoyageModelFactory<TValue> = (model: string) => EmbeddingModel<TValue>;

/**
 * Structural description of the part of the `voyage-ai-provider` module that
 * this file relies on. Every factory is optional because the code probes for
 * each one before calling it.
 */
interface VoyageModule {
  voyage: {
    /** Preferred text factory; used whenever it is a function. */
    embeddingModel?: VoyageModelFactory<string>;
    /** Text factory used when `embeddingModel` is not available. */
    textEmbeddingModel?: VoyageModelFactory<string>;
    /** Factory used when the provider is configured as multimodal. */
    multimodalEmbeddingModel?: VoyageModelFactory<unknown>;
  };
}
|
|
15
|
+
|
|
16
|
+
/**
 * Options shared by the text and multimodal Voyage configurations.
 */
type BaseConfig = {
  /** Model id; when omitted, `VOYAGE_MODEL` and then a per-mode default are used. */
  model?: string;
  /** When truthy, each embed call is given an `AbortSignal.timeout` of this many milliseconds. */
  timeoutMs?: number;
};
|
|
20
|
+
|
|
21
|
+
/**
 * Text-mode configuration; `type` may be omitted because text is the default mode.
 */
type VoyageTextConfig = BaseConfig & {
  type?: "text";
};

/**
 * Multimodal configuration. The optional `value` mappers replace the default
 * `{ text: [...] }` / `{ image: [...] }` payloads sent to the model.
 */
type VoyageMultimodalConfig = BaseConfig & {
  type: "multimodal";
  text?: {
    value?: (text: string) => unknown;
  };
  image?: {
    value?: (input: ImageEmbeddingInput) => unknown;
  };
};

/**
 * Configuration accepted by `createVoyageEmbeddingProvider`, discriminated by `type`.
 */
export type VoyageEmbeddingConfig = VoyageTextConfig | VoyageMultimodalConfig;
|
|
34
|
+
|
|
35
|
+
/** Text-mode model id used when neither `config.model` nor `VOYAGE_MODEL` is provided. */
const DEFAULT_TEXT_MODEL = "voyage-3.5-lite";
|
|
36
|
+
/** Multimodal-mode model id used when neither `config.model` nor `VOYAGE_MODEL` is provided. */
const DEFAULT_MULTIMODAL_MODEL = "voyage-multimodal-3";
|
|
37
|
+
|
|
38
|
+
/**
 * Encodes raw bytes as a base64 `data:` URL.
 *
 * @param bytes - Binary content to encode.
 * @param mediaType - Media type placed in the URL header.
 * @returns A string of the form `data:<mediaType>;base64,<payload>`.
 */
const bytesToDataUrl = (bytes: Uint8Array, mediaType: string) => {
  const payload = Buffer.from(bytes).toString("base64");
  return ["data:", mediaType, ";base64,", payload].join("");
};
|
|
42
|
+
|
|
43
|
+
/**
 * Default multimodal payload for a piece of text: the text wrapped in a
 * single-element `text` array.
 *
 * @param text - Text to embed.
 * @returns `{ text: [text] }`.
 */
const defaultTextValue = (text: string) => {
  return { text: [text] };
};
|
|
46
|
+
|
|
47
|
+
/**
 * Default multimodal payload for an image.
 *
 * String data is passed through unchanged; binary data is converted to a
 * base64 data URL, using `image/jpeg` when no media type is given.
 *
 * @param input - Image data (string or bytes) with an optional media type.
 * @returns `{ image: [value] }` with a single string entry.
 */
const defaultImageValue = (input: ImageEmbeddingInput) => {
  if (typeof input.data === "string") {
    return { image: [input.data] };
  }

  const dataUrl = bytesToDataUrl(input.data, input.mediaType ?? "image/jpeg");
  return { image: [dataUrl] };
};
|
|
54
|
+
|
|
55
|
+
/**
 * Creates an embedding provider backed by the optional `voyage-ai-provider` package.
 *
 * Text mode (the default) embeds plain strings with `voyage.embeddingModel`,
 * falling back to `voyage.textEmbeddingModel` when the former is not a function.
 * Multimodal mode (`config.type === "multimodal"`) uses
 * `voyage.multimodalEmbeddingModel`, wraps text as `{ text: [...] }` and images
 * as `{ image: [...] }` unless custom `text.value` / `image.value` mappers are
 * configured, and additionally exposes `embedImage`.
 *
 * The package is loaded through `requireOptional`. The model id is resolved from
 * `config.model`, then the `VOYAGE_MODEL` environment variable, then the
 * per-mode default.
 *
 * @param config - Optional model, timeout and mode-specific options.
 * @returns A provider named `voyage:<model>`; `dimensions` is left `undefined`.
 * @throws Error at call time when the installed package does not expose the
 *   model factory required for the configured mode, or when the SDK result
 *   carries no embedding(s).
 */
export const createVoyageEmbeddingProvider = (
  config: VoyageEmbeddingConfig = {}
): EmbeddingProvider => {
  const { voyage } = requireOptional<VoyageModule>({
    id: "voyage-ai-provider",
    installHint: "bun add voyage-ai-provider",
    providerName: "voyage",
  });
  const isMultimodal = config.type === "multimodal";
  const model =
    config.model ??
    process.env.VOYAGE_MODEL ??
    (isMultimodal ? DEFAULT_MULTIMODAL_MODEL : DEFAULT_TEXT_MODEL);
  const timeoutMs = config.timeoutMs;

  // Only the factory matching the configured mode is invoked. Both factories
  // are optional on the module, so either result may be undefined.
  const textEmbeddingModel = isMultimodal
    ? undefined
    : typeof voyage.embeddingModel === "function"
      ? voyage.embeddingModel(model)
      : voyage.textEmbeddingModel?.(model);
  const multimodalEmbeddingModel = isMultimodal
    ? voyage.multimodalEmbeddingModel?.(model)
    : undefined;

  // Fail with an actionable message instead of handing `undefined` to the SDK.
  const requireTextModel = () => {
    if (!textEmbeddingModel) {
      throw new Error(
        "Voyage text embeddings are unavailable: the installed voyage-ai-provider exposes neither embeddingModel() nor textEmbeddingModel()"
      );
    }
    return textEmbeddingModel;
  };

  const requireMultimodalModel = () => {
    if (!multimodalEmbeddingModel) {
      throw new Error(
        "Voyage multimodal embeddings are unavailable: the installed voyage-ai-provider does not expose multimodalEmbeddingModel()"
      );
    }
    // AI SDK 6 types only accept string inputs; cast multimodal models/values.
    return multimodalEmbeddingModel as unknown as EmbeddingModel;
  };

  // A fresh signal per call so the timeout starts when the request is issued.
  const abortOptions = () =>
    timeoutMs ? { abortSignal: AbortSignal.timeout(timeoutMs) } : {};

  const resolveTextValue = (text: string): unknown =>
    config.type === "multimodal" && config.text?.value
      ? config.text.value(text)
      : defaultTextValue(text);

  const resolveImageValue = (input: ImageEmbeddingInput): unknown =>
    config.type === "multimodal" && config.image?.value
      ? config.image.value(input)
      : defaultImageValue(input);

  return {
    name: `voyage:${model}`,
    dimensions: undefined,
    embed: async ({ text }) => {
      const result = isMultimodal
        ? await embed({
            model: requireMultimodalModel(),
            value: resolveTextValue(text) as unknown as string,
            ...abortOptions(),
          })
        : await embed({
            model: requireTextModel(),
            value: text,
            ...abortOptions(),
          });

      if (!result.embedding) {
        throw new Error("Embedding missing from Voyage response");
      }

      return result.embedding;
    },
    embedMany: async (inputs) => {
      const result = isMultimodal
        ? await embedMany({
            model: requireMultimodalModel(),
            values: inputs.map((i) => resolveTextValue(i.text)) as unknown as string[],
            ...abortOptions(),
          })
        : await embedMany({
            model: requireTextModel(),
            values: inputs.map((i) => i.text),
            ...abortOptions(),
          });

      const { embeddings } = result;
      if (!Array.isArray(embeddings)) {
        throw new Error("Embeddings missing from Voyage embedMany response");
      }
      return embeddings;
    },
    // Image embedding is only offered when the provider runs in multimodal mode.
    ...(isMultimodal
      ? {
          embedImage: async (input: ImageEmbeddingInput) => {
            const result = await embed({
              model: requireMultimodalModel(),
              value: resolveImageValue(input) as unknown as string,
              ...abortOptions(),
            });

            if (!result.embedding) {
              throw new Error("Embedding missing from Voyage response");
            }

            return result.embedding;
          },
        }
      : {}),
  };
};
|
|
@@ -1,7 +1,21 @@
|
|
|
1
|
-
import { experimental_transcribe as transcribe } from "ai";
|
|
2
|
-
import type { AssetExtractor } from "../../core/types";
|
|
1
|
+
import { experimental_transcribe as transcribe, type TranscriptionModel } from "ai";
|
|
2
|
+
import type { AssetExtractor, ExtractedTextItem } from "../../core/types";
|
|
3
3
|
import { getAssetBytes } from "../_shared/fetch";
|
|
4
4
|
|
|
5
|
+
/**
|
|
6
|
+
* Model reference type that accepts both string gateway IDs and TranscriptionModel instances.
|
|
7
|
+
*/
|
|
8
|
+
type TranscriptionModelRef = string | TranscriptionModel;
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Transcription segment from the AI SDK.
|
|
12
|
+
*/
|
|
13
|
+
interface TranscriptionSegment {
|
|
14
|
+
text?: string;
|
|
15
|
+
startSecond?: number;
|
|
16
|
+
endSecond?: number;
|
|
17
|
+
}
|
|
18
|
+
|
|
5
19
|
/**
|
|
6
20
|
* Audio transcription via the AI SDK `transcribe()` API.
|
|
7
21
|
*/
|
|
@@ -25,43 +39,45 @@ export function createAudioTranscribeExtractor(): AssetExtractor {
|
|
|
25
39
|
const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
|
|
26
40
|
|
|
27
41
|
const result = await transcribe({
|
|
28
|
-
model: cfg.model as
|
|
42
|
+
model: cfg.model as TranscriptionModelRef,
|
|
29
43
|
audio: bytes,
|
|
30
44
|
abortSignal,
|
|
31
45
|
});
|
|
32
46
|
|
|
33
|
-
const segments:
|
|
34
|
-
?
|
|
47
|
+
const segments: TranscriptionSegment[] = Array.isArray(result.segments)
|
|
48
|
+
? result.segments
|
|
35
49
|
: [];
|
|
36
50
|
|
|
37
51
|
if (segments.length > 0) {
|
|
52
|
+
const textItems: ExtractedTextItem[] = segments
|
|
53
|
+
.map((s, i) => {
|
|
54
|
+
const t = String(s?.text ?? "").trim();
|
|
55
|
+
if (!t) return null;
|
|
56
|
+
const start = Number(s?.startSecond ?? NaN);
|
|
57
|
+
const end = Number(s?.endSecond ?? NaN);
|
|
58
|
+
return {
|
|
59
|
+
label: `segment-${i + 1}`,
|
|
60
|
+
content: t,
|
|
61
|
+
...(Number.isFinite(start) && Number.isFinite(end)
|
|
62
|
+
? { timeRangeSec: [start, end] as [number, number] }
|
|
63
|
+
: {}),
|
|
64
|
+
};
|
|
65
|
+
})
|
|
66
|
+
.filter((item): item is ExtractedTextItem => item !== null);
|
|
67
|
+
|
|
38
68
|
return {
|
|
39
|
-
texts:
|
|
40
|
-
.map((s, i) => {
|
|
41
|
-
const t = String(s?.text ?? "").trim();
|
|
42
|
-
if (!t) return null;
|
|
43
|
-
const start = Number(s?.startSecond ?? NaN);
|
|
44
|
-
const end = Number(s?.endSecond ?? NaN);
|
|
45
|
-
return {
|
|
46
|
-
label: `segment-${i + 1}`,
|
|
47
|
-
content: t,
|
|
48
|
-
...(Number.isFinite(start) && Number.isFinite(end)
|
|
49
|
-
? { timeRangeSec: [start, end] as [number, number] }
|
|
50
|
-
: {}),
|
|
51
|
-
};
|
|
52
|
-
})
|
|
53
|
-
.filter(Boolean) as any,
|
|
69
|
+
texts: textItems,
|
|
54
70
|
diagnostics: {
|
|
55
71
|
model: cfg.model,
|
|
56
72
|
seconds:
|
|
57
|
-
typeof
|
|
58
|
-
?
|
|
73
|
+
typeof result.durationInSeconds === "number"
|
|
74
|
+
? result.durationInSeconds
|
|
59
75
|
: undefined,
|
|
60
76
|
},
|
|
61
77
|
};
|
|
62
78
|
}
|
|
63
79
|
|
|
64
|
-
const text =
|
|
80
|
+
const text = (result.text ?? "").trim();
|
|
65
81
|
if (!text) return { texts: [], diagnostics: { model: cfg.model } };
|
|
66
82
|
|
|
67
83
|
return {
|
|
@@ -3,6 +3,13 @@ import { getAssetBytes } from "../_shared/fetch";
|
|
|
3
3
|
import { extFromFilename, normalizeMediaType } from "../_shared/media";
|
|
4
4
|
import { capText } from "../_shared/text";
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Minimal mammoth module interface.
|
|
8
|
+
*/
|
|
9
|
+
interface MammothModule {
|
|
10
|
+
extractRawText(options: { arrayBuffer: ArrayBuffer }): Promise<{ value?: string }>;
|
|
11
|
+
}
|
|
12
|
+
|
|
6
13
|
const DOCX_MEDIA =
|
|
7
14
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document";
|
|
8
15
|
|
|
@@ -33,7 +40,7 @@ export function createFileDocxExtractor(): AssetExtractor {
|
|
|
33
40
|
});
|
|
34
41
|
|
|
35
42
|
// Dynamic import so the core package can be used without mammoth unless this extractor is installed.
|
|
36
|
-
const mammoth
|
|
43
|
+
const mammoth = (await import("mammoth")) as MammothModule;
|
|
37
44
|
const arrayBuffer = bytes.buffer.slice(
|
|
38
45
|
bytes.byteOffset,
|
|
39
46
|
bytes.byteOffset + bytes.byteLength
|
|
@@ -3,6 +3,27 @@ import { getAssetBytes } from "../_shared/fetch";
|
|
|
3
3
|
import { extFromFilename, normalizeMediaType } from "../_shared/media";
|
|
4
4
|
import { capText } from "../_shared/text";
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* Zip file entry interface.
|
|
8
|
+
*/
|
|
9
|
+
interface ZipFile {
|
|
10
|
+
async(type: "string"): Promise<string>;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* JSZip instance interface.
|
|
15
|
+
*/
|
|
16
|
+
interface JSZipInstance {
|
|
17
|
+
files: Record<string, ZipFile>;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* JSZip constructor interface.
|
|
22
|
+
*/
|
|
23
|
+
interface JSZipConstructor {
|
|
24
|
+
loadAsync(data: Uint8Array): Promise<JSZipInstance>;
|
|
25
|
+
}
|
|
26
|
+
|
|
6
27
|
const PPTX_MEDIA =
|
|
7
28
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
|
|
8
29
|
|
|
@@ -41,7 +62,7 @@ export function createFilePptxExtractor(): AssetExtractor {
|
|
|
41
62
|
});
|
|
42
63
|
|
|
43
64
|
// Dynamic import to avoid hard dependency unless installed.
|
|
44
|
-
const JSZip
|
|
65
|
+
const JSZip = (await import("jszip")).default as unknown as JSZipConstructor;
|
|
45
66
|
const zip = await JSZip.loadAsync(bytes);
|
|
46
67
|
|
|
47
68
|
const slidePaths = Object.keys(zip.files).filter((p) =>
|
|
@@ -3,6 +3,29 @@ import { getAssetBytes } from "../_shared/fetch";
|
|
|
3
3
|
import { extFromFilename, normalizeMediaType } from "../_shared/media";
|
|
4
4
|
import { capText } from "../_shared/text";
|
|
5
5
|
|
|
6
|
+
/**
|
|
7
|
+
* XLSX sheet interface.
|
|
8
|
+
*/
|
|
9
|
+
type XLSXSheet = unknown;
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* XLSX workbook interface.
|
|
13
|
+
*/
|
|
14
|
+
interface XLSXWorkbook {
|
|
15
|
+
SheetNames?: string[];
|
|
16
|
+
Sheets?: Record<string, XLSXSheet>;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* Minimal xlsx module interface.
|
|
21
|
+
*/
|
|
22
|
+
interface XLSXModule {
|
|
23
|
+
read(data: Buffer, options: { type: string }): XLSXWorkbook;
|
|
24
|
+
utils: {
|
|
25
|
+
sheet_to_csv(sheet: XLSXSheet): string;
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
|
|
6
29
|
const XLSX_MEDIA =
|
|
7
30
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
|
|
8
31
|
|
|
@@ -32,7 +55,7 @@ export function createFileXlsxExtractor(): AssetExtractor {
|
|
|
32
55
|
defaultMediaType: XLSX_MEDIA,
|
|
33
56
|
});
|
|
34
57
|
|
|
35
|
-
const xlsx
|
|
58
|
+
const xlsx = (await import("xlsx")) as XLSXModule;
|
|
36
59
|
const wb = xlsx.read(Buffer.from(bytes), { type: "buffer" });
|
|
37
60
|
|
|
38
61
|
const parts: string[] = [];
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
-
import { generateText } from "ai";
|
|
1
|
+
import { generateText, type LanguageModel } from "ai";
|
|
2
2
|
import type { AssetExtractor } from "../../core/types";
|
|
3
3
|
import { getAssetBytes } from "../_shared/fetch";
|
|
4
4
|
import { normalizeMediaType } from "../_shared/media";
|
|
5
5
|
import { capText } from "../_shared/text";
|
|
6
6
|
|
|
7
|
+
/**
|
|
8
|
+
* Model reference type that accepts both string gateway IDs and LanguageModel instances.
|
|
9
|
+
*/
|
|
10
|
+
type ModelRef = string | LanguageModel;
|
|
11
|
+
|
|
7
12
|
/**
|
|
8
13
|
* Caption generation for images via a vision-capable LLM.
|
|
9
14
|
*
|
|
@@ -29,7 +34,7 @@ export function createImageCaptionLlmExtractor(): AssetExtractor {
|
|
|
29
34
|
const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
|
|
30
35
|
|
|
31
36
|
const result = await generateText({
|
|
32
|
-
model: cfg.model as
|
|
37
|
+
model: cfg.model as ModelRef,
|
|
33
38
|
abortSignal,
|
|
34
39
|
messages: [
|
|
35
40
|
{
|
|
@@ -46,7 +51,7 @@ export function createImageCaptionLlmExtractor(): AssetExtractor {
|
|
|
46
51
|
],
|
|
47
52
|
});
|
|
48
53
|
|
|
49
|
-
const caption =
|
|
54
|
+
const caption = (result.text ?? "").trim();
|
|
50
55
|
if (!caption) return { texts: [], diagnostics: { model: cfg.model } };
|
|
51
56
|
|
|
52
57
|
return {
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
import { generateText } from "ai";
|
|
1
|
+
import { generateText, type LanguageModel } from "ai";
|
|
2
2
|
import type { AssetExtractor } from "../../core/types";
|
|
3
3
|
import { getAssetBytes } from "../_shared/fetch";
|
|
4
4
|
import { normalizeMediaType } from "../_shared/media";
|
|
5
5
|
import { capText } from "../_shared/text";
|
|
6
6
|
|
|
7
|
+
/**
|
|
8
|
+
* Model reference type that accepts both string gateway IDs and LanguageModel instances.
|
|
9
|
+
*/
|
|
10
|
+
type ModelRef = string | LanguageModel;
|
|
11
|
+
|
|
7
12
|
/**
|
|
8
13
|
* Image OCR via a vision-capable LLM.
|
|
9
14
|
*
|
|
10
|
-
* This extractor is intended for screenshots, charts, diagrams, and
|
|
15
|
+
* This extractor is intended for screenshots, charts, diagrams, and images with embedded text.
|
|
11
16
|
*/
|
|
12
17
|
export function createImageOcrExtractor(): AssetExtractor {
|
|
13
18
|
return {
|
|
@@ -29,7 +34,7 @@ export function createImageOcrExtractor(): AssetExtractor {
|
|
|
29
34
|
const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
|
|
30
35
|
|
|
31
36
|
const result = await generateText({
|
|
32
|
-
model: cfg.model as
|
|
37
|
+
model: cfg.model as ModelRef,
|
|
33
38
|
abortSignal,
|
|
34
39
|
messages: [
|
|
35
40
|
{
|
|
@@ -46,7 +51,7 @@ export function createImageOcrExtractor(): AssetExtractor {
|
|
|
46
51
|
],
|
|
47
52
|
});
|
|
48
53
|
|
|
49
|
-
const text =
|
|
54
|
+
const text = (result.text ?? "").trim();
|
|
50
55
|
if (!text) return { texts: [], diagnostics: { model: cfg.model } };
|
|
51
56
|
|
|
52
57
|
return {
|
|
@@ -1,9 +1,14 @@
|
|
|
1
|
-
import { generateText } from "ai";
|
|
1
|
+
import { generateText, type LanguageModel } from "ai";
|
|
2
2
|
import type { AssetData, AssetExtractor, AssetFetchConfig } from "../../core/types";
|
|
3
3
|
import { getAssetBytes } from "../_shared/fetch";
|
|
4
4
|
import { normalizeMediaType } from "../_shared/media";
|
|
5
5
|
import { capText } from "../_shared/text";
|
|
6
6
|
|
|
7
|
+
/**
|
|
8
|
+
* Model reference type that accepts both string gateway IDs and LanguageModel instances.
|
|
9
|
+
*/
|
|
10
|
+
type ModelRef = string | LanguageModel;
|
|
11
|
+
|
|
7
12
|
async function getPdfBytes(args: {
|
|
8
13
|
data: AssetData;
|
|
9
14
|
fetchConfig: AssetFetchConfig;
|
|
@@ -49,8 +54,8 @@ export function createPdfLlmExtractor(): AssetExtractor {
|
|
|
49
54
|
const abortSignal = AbortSignal.timeout(llm.timeoutMs);
|
|
50
55
|
|
|
51
56
|
const result = await generateText({
|
|
52
|
-
//
|
|
53
|
-
model: llm.model as
|
|
57
|
+
// String model IDs are supported for AI Gateway routing.
|
|
58
|
+
model: llm.model as ModelRef,
|
|
54
59
|
abortSignal,
|
|
55
60
|
messages: [
|
|
56
61
|
{
|
|
@@ -68,7 +73,7 @@ export function createPdfLlmExtractor(): AssetExtractor {
|
|
|
68
73
|
],
|
|
69
74
|
});
|
|
70
75
|
|
|
71
|
-
const text =
|
|
76
|
+
const text = (result.text ?? "").trim();
|
|
72
77
|
if (!text) return { texts: [], diagnostics: { model: llm.model } };
|
|
73
78
|
|
|
74
79
|
const capped = capText(text, llm.maxOutputChars);
|
|
@@ -2,6 +2,27 @@ import type { AssetExtractor } from "../../core/types";
|
|
|
2
2
|
import { getAssetBytes } from "../_shared/fetch";
|
|
3
3
|
import { capText } from "../_shared/text";
|
|
4
4
|
|
|
5
|
+
/**
|
|
6
|
+
* Text content item from pdfjs-dist.
|
|
7
|
+
*/
|
|
8
|
+
interface PdfTextItem {
|
|
9
|
+
str?: string;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Minimal pdfjs-dist module interface.
|
|
14
|
+
*/
|
|
15
|
+
interface PdfJsModule {
|
|
16
|
+
getDocument(params: { data: Uint8Array }): {
|
|
17
|
+
promise: Promise<{
|
|
18
|
+
numPages: number;
|
|
19
|
+
getPage(pageNum: number): Promise<{
|
|
20
|
+
getTextContent(): Promise<{ items?: PdfTextItem[] }>;
|
|
21
|
+
}>;
|
|
22
|
+
}>;
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
|
|
5
26
|
/**
|
|
6
27
|
* Fast/cheap PDF extraction using the PDF's built-in text layer.
|
|
7
28
|
*
|
|
@@ -29,7 +50,7 @@ export function createPdfTextLayerExtractor(): AssetExtractor {
|
|
|
29
50
|
});
|
|
30
51
|
|
|
31
52
|
// Dynamic import so the core package can be used without pdfjs unless this extractor is installed.
|
|
32
|
-
const pdfjs
|
|
53
|
+
const pdfjs = (await import("pdfjs-dist/legacy/build/pdf.mjs")) as PdfJsModule;
|
|
33
54
|
|
|
34
55
|
const doc = await pdfjs.getDocument({ data: bytes }).promise;
|
|
35
56
|
const totalPages: number = Number(doc?.numPages ?? 0);
|
|
@@ -42,7 +63,7 @@ export function createPdfTextLayerExtractor(): AssetExtractor {
|
|
|
42
63
|
for (let pageNum = 1; pageNum <= maxPages; pageNum++) {
|
|
43
64
|
const page = await doc.getPage(pageNum);
|
|
44
65
|
const textContent = await page.getTextContent();
|
|
45
|
-
const items:
|
|
66
|
+
const items: PdfTextItem[] = Array.isArray(textContent?.items)
|
|
46
67
|
? textContent.items
|
|
47
68
|
: [];
|
|
48
69
|
const pageText = items
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { generateText } from "ai";
|
|
1
|
+
import { generateText, type LanguageModel } from "ai";
|
|
2
2
|
import { spawn } from "node:child_process";
|
|
3
3
|
import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -7,6 +7,11 @@ import type { AssetExtractor } from "../../core/types";
|
|
|
7
7
|
import { getAssetBytes } from "../_shared/fetch";
|
|
8
8
|
import { capText } from "../_shared/text";
|
|
9
9
|
|
|
10
|
+
/**
|
|
11
|
+
* Model reference type that accepts both string gateway IDs and LanguageModel instances.
|
|
12
|
+
*/
|
|
13
|
+
type ModelRef = string | LanguageModel;
|
|
14
|
+
|
|
10
15
|
const run = async (cmd: string, args: string[], opts: { cwd: string }) => {
|
|
11
16
|
return await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
|
|
12
17
|
const child = spawn(cmd, args, { cwd: opts.cwd, stdio: ["ignore", "pipe", "pipe"] });
|
|
@@ -87,7 +92,7 @@ export function createVideoFramesExtractor(): AssetExtractor {
|
|
|
87
92
|
|
|
88
93
|
const imgBytes = await readFile(path.join(tmpDir, f));
|
|
89
94
|
const result = await generateText({
|
|
90
|
-
model: cfg.model as
|
|
95
|
+
model: cfg.model as ModelRef,
|
|
91
96
|
abortSignal: abortPerFrame(cfg.timeoutMs),
|
|
92
97
|
messages: [
|
|
93
98
|
{
|
|
@@ -100,7 +105,7 @@ export function createVideoFramesExtractor(): AssetExtractor {
|
|
|
100
105
|
],
|
|
101
106
|
});
|
|
102
107
|
|
|
103
|
-
const t =
|
|
108
|
+
const t = (result.text ?? "").trim();
|
|
104
109
|
if (!t) continue;
|
|
105
110
|
|
|
106
111
|
const capped = capText(t, cfg.maxOutputChars - totalChars);
|