unrag 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/registry/connectors/google-drive/_api-types.ts +60 -0
- package/registry/connectors/google-drive/client.ts +99 -38
- package/registry/connectors/google-drive/sync.ts +97 -69
- package/registry/connectors/google-drive/types.ts +76 -37
- package/registry/connectors/notion/client.ts +12 -3
- package/registry/connectors/notion/render.ts +62 -23
- package/registry/connectors/notion/sync.ts +30 -23
- package/registry/core/assets.ts +11 -10
- package/registry/core/config.ts +10 -25
- package/registry/core/context-engine.ts +5 -0
- package/registry/core/deep-merge.ts +45 -0
- package/registry/core/ingest.ts +117 -44
- package/registry/core/types.ts +52 -0
- package/registry/embedding/_shared.ts +6 -1
- package/registry/embedding/ai.ts +2 -3
- package/registry/embedding/azure.ts +11 -2
- package/registry/embedding/bedrock.ts +11 -2
- package/registry/embedding/cohere.ts +11 -2
- package/registry/embedding/google.ts +11 -2
- package/registry/embedding/mistral.ts +11 -2
- package/registry/embedding/ollama.ts +18 -3
- package/registry/embedding/openai.ts +11 -2
- package/registry/embedding/openrouter.ts +53 -11
- package/registry/embedding/together.ts +15 -5
- package/registry/embedding/vertex.ts +11 -2
- package/registry/embedding/voyage.ts +16 -6
- package/registry/extractors/audio-transcribe/index.ts +39 -23
- package/registry/extractors/file-docx/index.ts +8 -1
- package/registry/extractors/file-pptx/index.ts +22 -1
- package/registry/extractors/file-xlsx/index.ts +24 -1
- package/registry/extractors/image-caption-llm/index.ts +8 -3
- package/registry/extractors/image-ocr/index.ts +9 -4
- package/registry/extractors/pdf-llm/index.ts +9 -4
- package/registry/extractors/pdf-text-layer/index.ts +23 -2
- package/registry/extractors/video-frames/index.ts +8 -3
- package/registry/extractors/video-transcribe/index.ts +40 -24
- package/registry/manifest.json +6 -6
- package/registry/store/drizzle-postgres-pgvector/store.ts +24 -7

package/registry/embedding/vertex.ts
CHANGED

@@ -1,7 +1,16 @@
-import { embed, embedMany } from "ai";
+import { embed, embedMany, type EmbeddingModel } from "ai";
 import type { EmbeddingProvider } from "../core/types";
 import { requireOptional } from "./_shared";
 
+/**
+ * Vertex AI provider module interface.
+ */
+interface VertexModule {
+  vertex: {
+    embeddingModel: (model: string) => EmbeddingModel<string>;
+  };
+}
+
 export type VertexEmbeddingTaskType =
   | "SEMANTIC_SIMILARITY"
   | "CLASSIFICATION"
@@ -47,7 +56,7 @@ const buildProviderOptions = (config: VertexEmbeddingConfig) => {
 export const createVertexEmbeddingProvider = (
   config: VertexEmbeddingConfig = {}
 ): EmbeddingProvider => {
-  const { vertex } = requireOptional<
+  const { vertex } = requireOptional<VertexModule>({
     id: "@ai-sdk/google-vertex",
     installHint: "bun add @ai-sdk/google-vertex",
     providerName: "vertex",
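
Both embedding hunks follow the same pattern: declare a minimal interface for the optional provider package and pass it as the type argument to `requireOptional`, so the returned module is typed instead of being cast. A minimal sketch of that pattern, with a hypothetical `loadOptional` helper standing in for the registry's `requireOptional` (whose full signature is truncated above) and an example model ID:

```ts
import type { EmbeddingModel } from "ai";

// Mirrors the VertexModule interface added in vertex.ts.
interface VertexModule {
  vertex: {
    embeddingModel: (model: string) => EmbeddingModel<string>;
  };
}

// Hypothetical stand-in for the registry's requireOptional helper: resolve an
// optional dependency and surface a readable install hint if it is missing.
async function loadOptional<T>(id: string, installHint: string): Promise<T> {
  try {
    return (await import(id)) as T;
  } catch {
    throw new Error(`Missing optional dependency "${id}". ${installHint}`);
  }
}

// Because the module is typed, no `as any` cast is needed to reach embeddingModel.
async function makeVertexEmbeddingModel(modelId: string) {
  const { vertex } = await loadOptional<VertexModule>(
    "@ai-sdk/google-vertex",
    "bun add @ai-sdk/google-vertex"
  );
  // modelId is an example, e.g. "text-embedding-005".
  return vertex.embeddingModel(modelId);
}
```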

package/registry/embedding/voyage.ts
CHANGED

@@ -2,6 +2,17 @@ import { embed, embedMany, type EmbeddingModel } from "ai";
 import type { EmbeddingProvider, ImageEmbeddingInput } from "../core/types";
 import { requireOptional } from "./_shared";
 
+/**
+ * Voyage AI provider module interface.
+ */
+interface VoyageModule {
+  voyage: {
+    embeddingModel?: (model: string) => EmbeddingModel<string>;
+    textEmbeddingModel?: (model: string) => EmbeddingModel<string>;
+    multimodalEmbeddingModel?: (model: string) => EmbeddingModel<unknown>;
+  };
+}
+
 type BaseConfig = {
   model?: string;
   timeoutMs?: number;
@@ -44,7 +55,7 @@ const defaultImageValue = (input: ImageEmbeddingInput) => {
 export const createVoyageEmbeddingProvider = (
   config: VoyageEmbeddingConfig = {}
 ): EmbeddingProvider => {
-  const { voyage } = requireOptional<
+  const { voyage } = requireOptional<VoyageModule>({
     id: "voyage-ai-provider",
     installHint: "bun add voyage-ai-provider",
     providerName: "voyage",
@@ -57,15 +68,14 @@ export const createVoyageEmbeddingProvider = (
     (type === "multimodal" ? DEFAULT_MULTIMODAL_MODEL : DEFAULT_TEXT_MODEL);
   const timeoutMs = config.timeoutMs;
 
-  const voyageProvider = voyage as any;
   const textEmbeddingModel =
     type === "multimodal"
       ? undefined
-      : typeof
-        ?
-        :
+      : typeof voyage.embeddingModel === "function"
+        ? voyage.embeddingModel(model)
+        : voyage.textEmbeddingModel?.(model);
   const multimodalEmbeddingModel =
-    type === "multimodal" ?
+    type === "multimodal" ? voyage.multimodalEmbeddingModel?.(model) : undefined;
 
   // AI SDK 6 types only accept string inputs; cast multimodal models/values.
   const multimodalModel = multimodalEmbeddingModel as unknown as EmbeddingModel;
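
The last voyage hunk replaces the `voyage as any` escape hatch with optional-method lookups on the typed module. The same selection logic as a standalone sketch; `VoyageModule` is copied from the interface above and `pickVoyageModel` is an illustrative name:

```ts
import type { EmbeddingModel } from "ai";

// Mirrors the VoyageModule interface added in voyage.ts.
interface VoyageModule {
  voyage: {
    embeddingModel?: (model: string) => EmbeddingModel<string>;
    textEmbeddingModel?: (model: string) => EmbeddingModel<string>;
    multimodalEmbeddingModel?: (model: string) => EmbeddingModel<unknown>;
  };
}

// Same selection logic as the diff: prefer embeddingModel when present,
// fall back to textEmbeddingModel, and only build a multimodal model when
// a multimodal provider was requested.
function pickVoyageModel(
  { voyage }: VoyageModule,
  type: "text" | "multimodal",
  model: string
) {
  const textEmbeddingModel =
    type === "multimodal"
      ? undefined
      : typeof voyage.embeddingModel === "function"
        ? voyage.embeddingModel(model)
        : voyage.textEmbeddingModel?.(model);
  const multimodalEmbeddingModel =
    type === "multimodal" ? voyage.multimodalEmbeddingModel?.(model) : undefined;
  return { textEmbeddingModel, multimodalEmbeddingModel };
}
```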

package/registry/extractors/audio-transcribe/index.ts
CHANGED

@@ -1,7 +1,21 @@
-import { experimental_transcribe as transcribe } from "ai";
-import type { AssetExtractor } from "../../core/types";
+import { experimental_transcribe as transcribe, type TranscriptionModel } from "ai";
+import type { AssetExtractor, ExtractedTextItem } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 
+/**
+ * Model reference type that accepts both string gateway IDs and TranscriptionModel instances.
+ */
+type TranscriptionModelRef = string | TranscriptionModel;
+
+/**
+ * Transcription segment from the AI SDK.
+ */
+interface TranscriptionSegment {
+  text?: string;
+  startSecond?: number;
+  endSecond?: number;
+}
+
 /**
  * Audio transcription via the AI SDK `transcribe()` API.
  */
@@ -25,43 +39,45 @@ export function createAudioTranscribeExtractor(): AssetExtractor {
       const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
 
       const result = await transcribe({
-        model: cfg.model as
+        model: cfg.model as TranscriptionModelRef,
         audio: bytes,
         abortSignal,
       });
 
-      const segments:
-        ?
+      const segments: TranscriptionSegment[] = Array.isArray(result.segments)
+        ? result.segments
         : [];
 
       if (segments.length > 0) {
+        const textItems: ExtractedTextItem[] = segments
+          .map((s, i) => {
+            const t = String(s?.text ?? "").trim();
+            if (!t) return null;
+            const start = Number(s?.startSecond ?? NaN);
+            const end = Number(s?.endSecond ?? NaN);
+            return {
+              label: `segment-${i + 1}`,
+              content: t,
+              ...(Number.isFinite(start) && Number.isFinite(end)
+                ? { timeRangeSec: [start, end] as [number, number] }
+                : {}),
+            };
+          })
+          .filter((item): item is ExtractedTextItem => item !== null);
+
         return {
-          texts:
-            .map((s, i) => {
-              const t = String(s?.text ?? "").trim();
-              if (!t) return null;
-              const start = Number(s?.startSecond ?? NaN);
-              const end = Number(s?.endSecond ?? NaN);
-              return {
-                label: `segment-${i + 1}`,
-                content: t,
-                ...(Number.isFinite(start) && Number.isFinite(end)
-                  ? { timeRangeSec: [start, end] as [number, number] }
-                  : {}),
-              };
-            })
-            .filter(Boolean) as any,
+          texts: textItems,
           diagnostics: {
            model: cfg.model,
            seconds:
-              typeof
-                ?
+              typeof result.durationInSeconds === "number"
+                ? result.durationInSeconds
                : undefined,
          },
        };
      }
 
-      const text =
+      const text = (result.text ?? "").trim();
       if (!text) return { texts: [], diagnostics: { model: cfg.model } };
 
       return {
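
Both transcription extractors drop `.filter(Boolean) as any` in favor of a type-predicate filter, so the mapped segments narrow to `ExtractedTextItem[]` without a cast. A self-contained sketch of that narrowing; the `ExtractedTextItem` shape here is reduced to the three fields the mapping produces, while the real type lives in core/types:

```ts
// Reduced shape of the registry's ExtractedTextItem, limited to the fields used here.
interface ExtractedTextItem {
  label: string;
  content: string;
  timeRangeSec?: [number, number];
}

// Segment shape returned by the AI SDK's experimental transcribe() call.
interface TranscriptionSegment {
  text?: string;
  startSecond?: number;
  endSecond?: number;
}

function segmentsToTextItems(segments: TranscriptionSegment[]): ExtractedTextItem[] {
  return segments
    .map((s, i) => {
      const t = String(s?.text ?? "").trim();
      if (!t) return null;
      const start = Number(s?.startSecond ?? NaN);
      const end = Number(s?.endSecond ?? NaN);
      return {
        label: `segment-${i + 1}`,
        content: t,
        ...(Number.isFinite(start) && Number.isFinite(end)
          ? { timeRangeSec: [start, end] as [number, number] }
          : {}),
      };
    })
    // The type predicate removes the nulls and narrows the array type,
    // which is what lets the diff drop `.filter(Boolean) as any`.
    .filter((item): item is ExtractedTextItem => item !== null);
}
```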

package/registry/extractors/file-docx/index.ts
CHANGED

@@ -3,6 +3,13 @@ import { getAssetBytes } from "../_shared/fetch";
 import { extFromFilename, normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * Minimal mammoth module interface.
+ */
+interface MammothModule {
+  extractRawText(options: { arrayBuffer: ArrayBuffer }): Promise<{ value?: string }>;
+}
+
 const DOCX_MEDIA =
   "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
 
@@ -33,7 +40,7 @@ export function createFileDocxExtractor(): AssetExtractor {
       });
 
       // Dynamic import so the core package can be used without mammoth unless this extractor is installed.
-      const mammoth
+      const mammoth = (await import("mammoth")) as MammothModule;
       const arrayBuffer = bytes.buffer.slice(
         bytes.byteOffset,
         bytes.byteOffset + bytes.byteLength
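
The docx change types the dynamically imported mammoth module so `extractRawText` can be called without a cast. A runnable sketch of the same call path, assuming mammoth is installed; `docxToText` is an illustrative wrapper and the trailing `as ArrayBuffer` cast is an addition to satisfy stricter Buffer typings, not part of the original code:

```ts
import { readFile } from "node:fs/promises";

// Minimal mammoth surface, as typed in the diff.
interface MammothModule {
  extractRawText(options: { arrayBuffer: ArrayBuffer }): Promise<{ value?: string }>;
}

// Illustrative wrapper: read a .docx from disk and return its raw text.
async function docxToText(filePath: string): Promise<string> {
  const bytes = await readFile(filePath);
  // Dynamic import keeps mammoth optional, matching the extractor's approach.
  const mammoth = (await import("mammoth")) as MammothModule;
  // Slice so the view's offset/length are respected when producing an ArrayBuffer.
  const arrayBuffer = bytes.buffer.slice(
    bytes.byteOffset,
    bytes.byteOffset + bytes.byteLength
  ) as ArrayBuffer;
  const result = await mammoth.extractRawText({ arrayBuffer });
  return (result.value ?? "").trim();
}
```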

package/registry/extractors/file-pptx/index.ts
CHANGED

@@ -3,6 +3,27 @@ import { getAssetBytes } from "../_shared/fetch";
 import { extFromFilename, normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * Zip file entry interface.
+ */
+interface ZipFile {
+  async(type: "string"): Promise<string>;
+}
+
+/**
+ * JSZip instance interface.
+ */
+interface JSZipInstance {
+  files: Record<string, ZipFile>;
+}
+
+/**
+ * JSZip constructor interface.
+ */
+interface JSZipConstructor {
+  loadAsync(data: Uint8Array): Promise<JSZipInstance>;
+}
+
 const PPTX_MEDIA =
   "application/vnd.openxmlformats-officedocument.presentationml.presentation";
 
@@ -41,7 +62,7 @@ export function createFilePptxExtractor(): AssetExtractor {
       });
 
       // Dynamic import to avoid hard dependency unless installed.
-      const JSZip
+      const JSZip = (await import("jszip")).default as unknown as JSZipConstructor;
       const zip = await JSZip.loadAsync(bytes);
 
       const slidePaths = Object.keys(zip.files).filter((p) =>
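
The pptx hunk types JSZip just enough for `loadAsync` and per-entry `async("string")`. A sketch of pulling slide XML out of a .pptx with those two calls; the `ppt/slides/slideN.xml` filter is an assumption about what the elided predicate matches:

```ts
import { readFile } from "node:fs/promises";

// Minimal JSZip surface, as typed in the diff.
interface ZipFile {
  async(type: "string"): Promise<string>;
}
interface JSZipInstance {
  files: Record<string, ZipFile>;
}
interface JSZipConstructor {
  loadAsync(data: Uint8Array): Promise<JSZipInstance>;
}

async function pptxSlideXml(filePath: string): Promise<string[]> {
  const bytes = new Uint8Array(await readFile(filePath));
  const JSZip = (await import("jszip")).default as unknown as JSZipConstructor;
  const zip = await JSZip.loadAsync(bytes);

  // Assumption: slide parts live under ppt/slides/slideN.xml inside the archive.
  const slidePaths = Object.keys(zip.files)
    .filter((p) => /^ppt\/slides\/slide\d+\.xml$/.test(p))
    .sort();

  return Promise.all(slidePaths.map((p) => zip.files[p].async("string")));
}
```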

package/registry/extractors/file-xlsx/index.ts
CHANGED

@@ -3,6 +3,29 @@ import { getAssetBytes } from "../_shared/fetch";
 import { extFromFilename, normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * XLSX sheet interface.
+ */
+type XLSXSheet = unknown;
+
+/**
+ * XLSX workbook interface.
+ */
+interface XLSXWorkbook {
+  SheetNames?: string[];
+  Sheets?: Record<string, XLSXSheet>;
+}
+
+/**
+ * Minimal xlsx module interface.
+ */
+interface XLSXModule {
+  read(data: Buffer, options: { type: string }): XLSXWorkbook;
+  utils: {
+    sheet_to_csv(sheet: XLSXSheet): string;
+  };
+}
+
 const XLSX_MEDIA =
   "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
@@ -32,7 +55,7 @@ export function createFileXlsxExtractor(): AssetExtractor {
         defaultMediaType: XLSX_MEDIA,
       });
 
-      const xlsx
+      const xlsx = (await import("xlsx")) as XLSXModule;
       const wb = xlsx.read(Buffer.from(bytes), { type: "buffer" });
 
       const parts: string[] = [];
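
The xlsx change is the same typed dynamic import applied to SheetJS. A sketch that turns a workbook into CSV text using only the two calls the interface covers, `read` and `utils.sheet_to_csv`; the per-sheet heading format is illustrative:

```ts
import { readFile } from "node:fs/promises";

// Minimal SheetJS surface, as typed in the diff.
type XLSXSheet = unknown;
interface XLSXWorkbook {
  SheetNames?: string[];
  Sheets?: Record<string, XLSXSheet>;
}
interface XLSXModule {
  read(data: Buffer, options: { type: string }): XLSXWorkbook;
  utils: {
    sheet_to_csv(sheet: XLSXSheet): string;
  };
}

async function workbookToCsv(filePath: string): Promise<string> {
  const bytes = await readFile(filePath);
  const xlsx = (await import("xlsx")) as XLSXModule;
  const wb = xlsx.read(Buffer.from(bytes), { type: "buffer" });

  const parts: string[] = [];
  for (const name of wb.SheetNames ?? []) {
    const sheet = wb.Sheets?.[name];
    if (!sheet) continue;
    // Illustrative per-sheet formatting: a heading line followed by CSV rows.
    parts.push(`# ${name}\n${xlsx.utils.sheet_to_csv(sheet)}`);
  }
  return parts.join("\n\n");
}
```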

package/registry/extractors/image-caption-llm/index.ts
CHANGED

@@ -1,9 +1,14 @@
-import { generateText } from "ai";
+import { generateText, type LanguageModel } from "ai";
 import type { AssetExtractor } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 import { normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * Model reference type that accepts both string gateway IDs and LanguageModel instances.
+ */
+type ModelRef = string | LanguageModel;
+
 /**
  * Caption generation for images via a vision-capable LLM.
  *
@@ -29,7 +34,7 @@ export function createImageCaptionLlmExtractor(): AssetExtractor {
       const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
 
       const result = await generateText({
-        model: cfg.model as
+        model: cfg.model as ModelRef,
         abortSignal,
         messages: [
           {
@@ -46,7 +51,7 @@ export function createImageCaptionLlmExtractor(): AssetExtractor {
         ],
       });
 
-      const caption =
+      const caption = (result.text ?? "").trim();
       if (!caption) return { texts: [], diagnostics: { model: cfg.model } };
 
       return {
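
All of the LLM-backed extractors replace bare `cfg.model as ...` casts with a named `ModelRef = string | LanguageModel` alias, which keeps string model IDs usable for AI Gateway routing while still accepting concrete model instances. A sketch of calling `generateText` with an image part under that alias; the prompt text and timeout are illustrative, and the message shape follows the AI SDK's image content part:

```ts
import { generateText, type LanguageModel } from "ai";

// Same alias the diff introduces: a gateway model ID string or a concrete model instance.
type ModelRef = string | LanguageModel;

// Illustrative caption helper; the extractor's real prompt is not shown in the diff.
async function captionImage(model: ModelRef, imageBytes: Uint8Array): Promise<string> {
  const result = await generateText({
    model,
    abortSignal: AbortSignal.timeout(60_000),
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "Describe this image in one or two sentences." },
          { type: "image", image: imageBytes },
        ],
      },
    ],
  });
  return (result.text ?? "").trim();
}
```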

package/registry/extractors/image-ocr/index.ts
CHANGED

@@ -1,13 +1,18 @@
-import { generateText } from "ai";
+import { generateText, type LanguageModel } from "ai";
 import type { AssetExtractor } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 import { normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * Model reference type that accepts both string gateway IDs and LanguageModel instances.
+ */
+type ModelRef = string | LanguageModel;
+
 /**
  * Image OCR via a vision-capable LLM.
  *
- * This extractor is intended for screenshots, charts, diagrams, and
+ * This extractor is intended for screenshots, charts, diagrams, and images with embedded text.
  */
 export function createImageOcrExtractor(): AssetExtractor {
   return {
@@ -29,7 +34,7 @@ export function createImageOcrExtractor(): AssetExtractor {
       const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
 
       const result = await generateText({
-        model: cfg.model as
+        model: cfg.model as ModelRef,
         abortSignal,
         messages: [
           {
@@ -46,7 +51,7 @@ export function createImageOcrExtractor(): AssetExtractor {
         ],
       });
 
-      const text =
+      const text = (result.text ?? "").trim();
       if (!text) return { texts: [], diagnostics: { model: cfg.model } };
 
       return {

package/registry/extractors/pdf-llm/index.ts
CHANGED

@@ -1,9 +1,14 @@
-import { generateText } from "ai";
+import { generateText, type LanguageModel } from "ai";
 import type { AssetData, AssetExtractor, AssetFetchConfig } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 import { normalizeMediaType } from "../_shared/media";
 import { capText } from "../_shared/text";
 
+/**
+ * Model reference type that accepts both string gateway IDs and LanguageModel instances.
+ */
+type ModelRef = string | LanguageModel;
+
 async function getPdfBytes(args: {
   data: AssetData;
   fetchConfig: AssetFetchConfig;
@@ -49,8 +54,8 @@ export function createPdfLlmExtractor(): AssetExtractor {
       const abortSignal = AbortSignal.timeout(llm.timeoutMs);
 
       const result = await generateText({
-        //
-        model: llm.model as
+        // String model IDs are supported for AI Gateway routing.
+        model: llm.model as ModelRef,
         abortSignal,
         messages: [
           {
@@ -68,7 +73,7 @@ export function createPdfLlmExtractor(): AssetExtractor {
         ],
       });
 
-      const text =
+      const text = (result.text ?? "").trim();
       if (!text) return { texts: [], diagnostics: { model: llm.model } };
 
       const capped = capText(text, llm.maxOutputChars);

package/registry/extractors/pdf-text-layer/index.ts
CHANGED

@@ -2,6 +2,27 @@ import type { AssetExtractor } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 import { capText } from "../_shared/text";
 
+/**
+ * Text content item from pdfjs-dist.
+ */
+interface PdfTextItem {
+  str?: string;
+}
+
+/**
+ * Minimal pdfjs-dist module interface.
+ */
+interface PdfJsModule {
+  getDocument(params: { data: Uint8Array }): {
+    promise: Promise<{
+      numPages: number;
+      getPage(pageNum: number): Promise<{
+        getTextContent(): Promise<{ items?: PdfTextItem[] }>;
+      }>;
+    }>;
+  };
+}
+
 /**
  * Fast/cheap PDF extraction using the PDF's built-in text layer.
  *
@@ -29,7 +50,7 @@ export function createPdfTextLayerExtractor(): AssetExtractor {
       });
 
       // Dynamic import so the core package can be used without pdfjs unless this extractor is installed.
-      const pdfjs
+      const pdfjs = (await import("pdfjs-dist/legacy/build/pdf.mjs")) as PdfJsModule;
 
       const doc = await pdfjs.getDocument({ data: bytes }).promise;
       const totalPages: number = Number(doc?.numPages ?? 0);
@@ -42,7 +63,7 @@ export function createPdfTextLayerExtractor(): AssetExtractor {
       for (let pageNum = 1; pageNum <= maxPages; pageNum++) {
         const page = await doc.getPage(pageNum);
         const textContent = await page.getTextContent();
-        const items:
+        const items: PdfTextItem[] = Array.isArray(textContent?.items)
           ? textContent.items
           : [];
         const pageText = items
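
For the text-layer extractor the typed module narrows pdfjs-dist to `getDocument(...).promise`, `numPages`, `getPage`, and `getTextContent`. A sketch of the same traversal that joins the per-item `str` values into page text; the function name and page cap are illustrative, and the whitespace handling is intentionally simple:

```ts
import { readFile } from "node:fs/promises";

// Minimal pdfjs-dist surface, as typed in the diff.
interface PdfTextItem {
  str?: string;
}
interface PdfJsModule {
  getDocument(params: { data: Uint8Array }): {
    promise: Promise<{
      numPages: number;
      getPage(pageNum: number): Promise<{
        getTextContent(): Promise<{ items?: PdfTextItem[] }>;
      }>;
    }>;
  };
}

async function pdfTextLayer(filePath: string, maxPages = 50): Promise<string[]> {
  const bytes = new Uint8Array(await readFile(filePath));
  const pdfjs = (await import("pdfjs-dist/legacy/build/pdf.mjs")) as PdfJsModule;

  const doc = await pdfjs.getDocument({ data: bytes }).promise;
  const pages = Math.min(doc.numPages, maxPages);

  const out: string[] = [];
  for (let pageNum = 1; pageNum <= pages; pageNum++) {
    const page = await doc.getPage(pageNum);
    const textContent = await page.getTextContent();
    const items = textContent.items ?? [];
    out.push(items.map((it) => it.str ?? "").join(" ").trim());
  }
  return out;
}
```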

package/registry/extractors/video-frames/index.ts
CHANGED

@@ -1,4 +1,4 @@
-import { generateText } from "ai";
+import { generateText, type LanguageModel } from "ai";
 import { spawn } from "node:child_process";
 import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises";
 import os from "node:os";
@@ -7,6 +7,11 @@ import type { AssetExtractor } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 import { capText } from "../_shared/text";
 
+/**
+ * Model reference type that accepts both string gateway IDs and LanguageModel instances.
+ */
+type ModelRef = string | LanguageModel;
+
 const run = async (cmd: string, args: string[], opts: { cwd: string }) => {
   return await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
     const child = spawn(cmd, args, { cwd: opts.cwd, stdio: ["ignore", "pipe", "pipe"] });
@@ -87,7 +92,7 @@ export function createVideoFramesExtractor(): AssetExtractor {
 
         const imgBytes = await readFile(path.join(tmpDir, f));
         const result = await generateText({
-          model: cfg.model as
+          model: cfg.model as ModelRef,
           abortSignal: abortPerFrame(cfg.timeoutMs),
           messages: [
            {
@@ -100,7 +105,7 @@ export function createVideoFramesExtractor(): AssetExtractor {
          ],
        });
 
-        const t =
+        const t = (result.text ?? "").trim();
        if (!t) continue;
 
        const capped = capText(t, cfg.maxOutputChars - totalChars);
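
The frames extractor shells out through the `run` spawn helper shown in the context lines, then describes each sampled frame with `generateText` under the same `ModelRef` alias. A rough sketch of that loop; the ffmpeg arguments, the prompt, and the simplified `run` helper are assumptions, since the diff only shows the model and result lines:

```ts
import { generateText, type LanguageModel } from "ai";
import { spawn } from "node:child_process";
import { readdir, readFile } from "node:fs/promises";
import path from "node:path";

type ModelRef = string | LanguageModel;

// Simplified spawn helper in the spirit of the extractor's `run` (output capture omitted).
const run = (cmd: string, args: string[], cwd: string) =>
  new Promise<void>((resolve, reject) => {
    const child = spawn(cmd, args, { cwd, stdio: "ignore" });
    child.on("error", reject);
    child.on("close", (code) =>
      code === 0 ? resolve() : reject(new Error(`${cmd} exited with ${code}`))
    );
  });

async function describeFrames(model: ModelRef, videoPath: string, tmpDir: string) {
  // Assumption: sample one frame per second; the real extractor's ffmpeg flags are not shown.
  await run("ffmpeg", ["-i", videoPath, "-vf", "fps=1", "frame-%04d.jpg"], tmpDir);

  const descriptions: string[] = [];
  for (const f of (await readdir(tmpDir)).filter((name) => name.endsWith(".jpg")).sort()) {
    const imgBytes = await readFile(path.join(tmpDir, f));
    const result = await generateText({
      model,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: "Describe what is visible in this video frame." },
            { type: "image", image: imgBytes },
          ],
        },
      ],
    });
    const t = (result.text ?? "").trim();
    if (t) descriptions.push(`${f}: ${t}`);
  }
  return descriptions;
}
```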

package/registry/extractors/video-transcribe/index.ts
CHANGED

@@ -1,7 +1,21 @@
-import { experimental_transcribe as transcribe } from "ai";
-import type { AssetExtractor } from "../../core/types";
+import { experimental_transcribe as transcribe, type TranscriptionModel } from "ai";
+import type { AssetExtractor, ExtractedTextItem } from "../../core/types";
 import { getAssetBytes } from "../_shared/fetch";
 
+/**
+ * Model reference type that accepts both string gateway IDs and TranscriptionModel instances.
+ */
+type TranscriptionModelRef = string | TranscriptionModel;
+
+/**
+ * Transcription segment from the AI SDK.
+ */
+interface TranscriptionSegment {
+  text?: string;
+  startSecond?: number;
+  endSecond?: number;
+}
+
 /**
  * Video transcription by sending the video file to the AI SDK transcription API.
  *
@@ -28,43 +42,45 @@ export function createVideoTranscribeExtractor(): AssetExtractor {
       const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
 
       const result = await transcribe({
-        model: cfg.model as
-        audio: bytes
+        model: cfg.model as TranscriptionModelRef,
+        audio: bytes,
         abortSignal,
       });
 
-      const segments:
-        ?
+      const segments: TranscriptionSegment[] = Array.isArray(result.segments)
+        ? result.segments
         : [];
 
      if (segments.length > 0) {
+        const textItems: ExtractedTextItem[] = segments
+          .map((s, i) => {
+            const t = String(s?.text ?? "").trim();
+            if (!t) return null;
+            const start = Number(s?.startSecond ?? NaN);
+            const end = Number(s?.endSecond ?? NaN);
+            return {
+              label: `segment-${i + 1}`,
+              content: t,
+              ...(Number.isFinite(start) && Number.isFinite(end)
+                ? { timeRangeSec: [start, end] as [number, number] }
+                : {}),
+            };
+          })
+          .filter((item): item is ExtractedTextItem => item !== null);
+
        return {
-          texts:
-            .map((s, i) => {
-              const t = String(s?.text ?? "").trim();
-              if (!t) return null;
-              const start = Number(s?.startSecond ?? NaN);
-              const end = Number(s?.endSecond ?? NaN);
-              return {
-                label: `segment-${i + 1}`,
-                content: t,
-                ...(Number.isFinite(start) && Number.isFinite(end)
-                  ? { timeRangeSec: [start, end] as [number, number] }
-                  : {}),
-              };
-            })
-            .filter(Boolean) as any,
+          texts: textItems,
          diagnostics: {
            model: cfg.model,
            seconds:
-              typeof
-                ?
+              typeof result.durationInSeconds === "number"
+                ? result.durationInSeconds
                : undefined,
          },
        };
      }
 
-      const text =
+      const text = (result.text ?? "").trim();
       if (!text) return { texts: [], diagnostics: { model: cfg.model } };
 
       return {
package/registry/manifest.json
CHANGED

@@ -33,7 +33,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "text (markdown)",
       "docsPath": "/docs/extractors/pdf/llm",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.3" },
       "devDeps": {},
       "factory": "createPdfLlmExtractor",
       "assetProcessingFlagKeys": ["pdf_llmExtraction"]
@@ -70,7 +70,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "text",
       "docsPath": "/docs/extractors/image/ocr",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.0" },
       "devDeps": {},
       "factory": "createImageOcrExtractor",
       "assetProcessingFlagKeys": ["image_ocr"]
@@ -88,7 +88,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "caption",
       "docsPath": "/docs/extractors/image/caption-llm",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.0" },
       "devDeps": {},
       "factory": "createImageCaptionLlmExtractor",
       "assetProcessingFlagKeys": ["image_captionLlm"]
@@ -106,7 +106,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "transcript",
       "docsPath": "/docs/extractors/audio/transcribe",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.0" },
       "devDeps": {},
       "factory": "createAudioTranscribeExtractor",
       "assetProcessingFlagKeys": ["audio_transcription"]
@@ -124,7 +124,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "transcript",
       "docsPath": "/docs/extractors/video/transcribe",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.0" },
       "devDeps": {},
       "factory": "createVideoTranscribeExtractor",
       "assetProcessingFlagKeys": ["video_transcription"]
@@ -143,7 +143,7 @@
       "inputModes": ["file", "url", "buffer"],
       "output": "frame descriptions",
       "docsPath": "/docs/extractors/video/frames",
-      "deps": { "ai": "^
+      "deps": { "ai": "^6.0.0" },
       "devDeps": {},
       "factory": "createVideoFramesExtractor",
       "assetProcessingFlagKeys": ["video_frames"]

package/registry/store/drizzle-postgres-pgvector/store.ts
CHANGED

@@ -1,9 +1,26 @@
 import { documents, chunks, embeddings } from "./schema";
 import type { Chunk, VectorStore } from "../../core/types";
 import { eq, like, sql, type SQL } from "drizzle-orm";
-import type { PgDatabase } from "drizzle-orm/pg-core";
-
-
+import type { PgDatabase, PgQueryResultHKT } from "drizzle-orm/pg-core";
+
+/**
+ * Accepts any Drizzle Postgres database instance regardless of schema type.
+ */
+type DrizzleDb = PgDatabase<PgQueryResultHKT, Record<string, unknown>>;
+
+/**
+ * Query row type for vector similarity search results.
+ */
+interface QueryRow {
+  id: string;
+  document_id: string;
+  source_id: string;
+  idx: number;
+  content: string;
+  token_count: number;
+  metadata: Record<string, unknown> | null;
+  score: number;
+}
 
 const sanitizeMetadata = (metadata: unknown) => {
   if (metadata === undefined) {
@@ -133,11 +150,11 @@ export const createDrizzleVectorStore = (db: DrizzleDb): VectorStore => ({
      `
    );
 
-    const rows = Array.isArray(result)
-      ? result
-      : ((result as { rows?:
+    const rows: QueryRow[] = Array.isArray(result)
+      ? (result as QueryRow[])
+      : ((result as { rows?: QueryRow[] }).rows ?? []);
 
-    return
+    return rows.map((row) => ({
      id: String(row.id),
      documentId: String(row.document_id),
      sourceId: String(row.source_id),
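
The store change replaces loosely typed raw-query handling with a `QueryRow` interface and a `DrizzleDb` alias built on `PgQueryResultHKT`, so results from different Postgres drivers can be normalized without `any`. A sketch of just that normalization step; `normalizeRows` and `toChunks` are illustrative names, and the mapped chunk shape is trimmed to the fields visible in the hunk:

```ts
// Row shape returned by the pgvector similarity query, as typed in the diff.
interface QueryRow {
  id: string;
  document_id: string;
  source_id: string;
  idx: number;
  content: string;
  token_count: number;
  metadata: Record<string, unknown> | null;
  score: number;
}

// Different Postgres drivers surface raw query results either as a bare array
// or as an object with a `rows` property; normalize both into QueryRow[].
function normalizeRows(result: unknown): QueryRow[] {
  return Array.isArray(result)
    ? (result as QueryRow[])
    : ((result as { rows?: QueryRow[] }).rows ?? []);
}

// Usage mirrors the diff's mapping into the store's chunk shape (trimmed here).
const toChunks = (result: unknown) =>
  normalizeRows(result).map((row) => ({
    id: String(row.id),
    documentId: String(row.document_id),
    sourceId: String(row.source_id),
    score: row.score,
  }));
```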