unrag 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/cli/index.js +611 -174
  2. package/package.json +12 -6
  3. package/registry/config/unrag.config.ts +9 -8
  4. package/registry/connectors/google-drive/_api-types.ts +60 -0
  5. package/registry/connectors/google-drive/client.ts +99 -38
  6. package/registry/connectors/google-drive/sync.ts +97 -69
  7. package/registry/connectors/google-drive/types.ts +76 -37
  8. package/registry/connectors/notion/client.ts +12 -3
  9. package/registry/connectors/notion/render.ts +62 -23
  10. package/registry/connectors/notion/sync.ts +30 -23
  11. package/registry/core/assets.ts +11 -10
  12. package/registry/core/config.ts +10 -25
  13. package/registry/core/context-engine.ts +71 -2
  14. package/registry/core/deep-merge.ts +45 -0
  15. package/registry/core/ingest.ts +117 -44
  16. package/registry/core/types.ts +96 -2
  17. package/registry/docs/unrag.md +6 -1
  18. package/registry/embedding/_shared.ts +25 -0
  19. package/registry/embedding/ai.ts +8 -68
  20. package/registry/embedding/azure.ts +88 -0
  21. package/registry/embedding/bedrock.ts +88 -0
  22. package/registry/embedding/cohere.ts +88 -0
  23. package/registry/embedding/google.ts +102 -0
  24. package/registry/embedding/mistral.ts +71 -0
  25. package/registry/embedding/ollama.ts +90 -0
  26. package/registry/embedding/openai.ts +88 -0
  27. package/registry/embedding/openrouter.ts +127 -0
  28. package/registry/embedding/together.ts +77 -0
  29. package/registry/embedding/vertex.ts +111 -0
  30. package/registry/embedding/voyage.ts +169 -0
  31. package/registry/extractors/audio-transcribe/index.ts +39 -23
  32. package/registry/extractors/file-docx/index.ts +8 -1
  33. package/registry/extractors/file-pptx/index.ts +22 -1
  34. package/registry/extractors/file-xlsx/index.ts +24 -1
  35. package/registry/extractors/image-caption-llm/index.ts +8 -3
  36. package/registry/extractors/image-ocr/index.ts +9 -4
  37. package/registry/extractors/pdf-llm/index.ts +9 -4
  38. package/registry/extractors/pdf-text-layer/index.ts +23 -2
  39. package/registry/extractors/video-frames/index.ts +8 -3
  40. package/registry/extractors/video-transcribe/index.ts +40 -24
  41. package/registry/manifest.json +346 -0
  42. package/registry/store/drizzle-postgres-pgvector/store.ts +26 -6
@@ -1,7 +1,21 @@
1
- import { experimental_transcribe as transcribe } from "ai";
2
- import type { AssetExtractor } from "../../core/types";
1
+ import { experimental_transcribe as transcribe, type TranscriptionModel } from "ai";
2
+ import type { AssetExtractor, ExtractedTextItem } from "../../core/types";
3
3
  import { getAssetBytes } from "../_shared/fetch";
4
4
 
5
+ /**
6
+ * Model reference type that accepts both string gateway IDs and TranscriptionModel instances.
7
+ */
8
+ type TranscriptionModelRef = string | TranscriptionModel;
9
+
10
+ /**
11
+ * Transcription segment from the AI SDK.
12
+ */
13
+ interface TranscriptionSegment {
14
+ text?: string;
15
+ startSecond?: number;
16
+ endSecond?: number;
17
+ }
18
+
5
19
  /**
6
20
  * Video transcription by sending the video file to the AI SDK transcription API.
7
21
  *
@@ -28,43 +42,45 @@ export function createVideoTranscribeExtractor(): AssetExtractor {
28
42
  const abortSignal = AbortSignal.timeout(cfg.timeoutMs);
29
43
 
30
44
  const result = await transcribe({
31
- model: cfg.model as any,
32
- audio: bytes as any,
45
+ model: cfg.model as TranscriptionModelRef,
46
+ audio: bytes,
33
47
  abortSignal,
34
48
  });
35
49
 
36
- const segments: any[] = Array.isArray((result as any)?.segments)
37
- ? (result as any).segments
50
+ const segments: TranscriptionSegment[] = Array.isArray(result.segments)
51
+ ? result.segments
38
52
  : [];
39
53
 
40
54
  if (segments.length > 0) {
55
+ const textItems: ExtractedTextItem[] = segments
56
+ .map((s, i) => {
57
+ const t = String(s?.text ?? "").trim();
58
+ if (!t) return null;
59
+ const start = Number(s?.startSecond ?? NaN);
60
+ const end = Number(s?.endSecond ?? NaN);
61
+ return {
62
+ label: `segment-${i + 1}`,
63
+ content: t,
64
+ ...(Number.isFinite(start) && Number.isFinite(end)
65
+ ? { timeRangeSec: [start, end] as [number, number] }
66
+ : {}),
67
+ };
68
+ })
69
+ .filter((item): item is ExtractedTextItem => item !== null);
70
+
41
71
  return {
42
- texts: segments
43
- .map((s, i) => {
44
- const t = String(s?.text ?? "").trim();
45
- if (!t) return null;
46
- const start = Number(s?.startSecond ?? NaN);
47
- const end = Number(s?.endSecond ?? NaN);
48
- return {
49
- label: `segment-${i + 1}`,
50
- content: t,
51
- ...(Number.isFinite(start) && Number.isFinite(end)
52
- ? { timeRangeSec: [start, end] as [number, number] }
53
- : {}),
54
- };
55
- })
56
- .filter(Boolean) as any,
72
+ texts: textItems,
57
73
  diagnostics: {
58
74
  model: cfg.model,
59
75
  seconds:
60
- typeof (result as any)?.durationInSeconds === "number"
61
- ? (result as any).durationInSeconds
76
+ typeof result.durationInSeconds === "number"
77
+ ? result.durationInSeconds
62
78
  : undefined,
63
79
  },
64
80
  };
65
81
  }
66
82
 
67
- const text = String((result as any)?.text ?? "").trim();
83
+ const text = (result.text ?? "").trim();
68
84
  if (!text) return { texts: [], diagnostics: { model: cfg.model } };
69
85
 
70
86
  return {
@@ -0,0 +1,346 @@
1
+ {
2
+ "version": 1,
3
+ "extractors": [
4
+ {
5
+ "id": "pdf-text-layer",
6
+ "extractorName": "pdf:text-layer",
7
+ "group": "PDF",
8
+ "label": "pdf-text-layer",
9
+ "description": "Fast/cheap extraction via PDF text layer",
10
+ "hint": "recommended",
11
+ "defaultSelected": true,
12
+ "workerOnly": false,
13
+ "configComplexity": "needs-dep",
14
+ "fileTypes": ["pdf"],
15
+ "inputModes": ["file", "url", "buffer"],
16
+ "output": "text",
17
+ "docsPath": "/docs/extractors/pdf/text-layer",
18
+ "deps": { "pdfjs-dist": "^5.4.149" },
19
+ "devDeps": {},
20
+ "factory": "createPdfTextLayerExtractor",
21
+ "assetProcessingFlagKeys": ["pdf_textLayer"]
22
+ },
23
+ {
24
+ "id": "pdf-llm",
25
+ "extractorName": "pdf:llm",
26
+ "group": "PDF",
27
+ "label": "pdf-llm",
28
+ "description": "LLM-based PDF extraction; higher cost",
29
+ "defaultSelected": false,
30
+ "workerOnly": false,
31
+ "configComplexity": "needs-api-key",
32
+ "fileTypes": ["pdf"],
33
+ "inputModes": ["file", "url", "buffer"],
34
+ "output": "text (markdown)",
35
+ "docsPath": "/docs/extractors/pdf/llm",
36
+ "deps": { "ai": "^6.0.3" },
37
+ "devDeps": {},
38
+ "factory": "createPdfLlmExtractor",
39
+ "assetProcessingFlagKeys": ["pdf_llmExtraction"]
40
+ },
41
+ {
42
+ "id": "pdf-ocr",
43
+ "extractorName": "pdf:ocr",
44
+ "group": "PDF",
45
+ "label": "pdf-ocr",
46
+ "description": "OCR scanned PDFs; requires native binaries",
47
+ "hint": "worker-only",
48
+ "defaultSelected": false,
49
+ "workerOnly": true,
50
+ "configComplexity": "advanced",
51
+ "fileTypes": ["pdf"],
52
+ "inputModes": ["file", "url", "buffer"],
53
+ "output": "text",
54
+ "docsPath": "/docs/extractors/pdf/ocr",
55
+ "deps": {},
56
+ "devDeps": {},
57
+ "factory": "createPdfOcrExtractor",
58
+ "assetProcessingFlagKeys": ["pdf_ocr"]
59
+ },
60
+ {
61
+ "id": "image-ocr",
62
+ "extractorName": "image:ocr",
63
+ "group": "Image",
64
+ "label": "image-ocr",
65
+ "description": "Extract text from images via vision LLM",
66
+ "defaultSelected": false,
67
+ "workerOnly": false,
68
+ "configComplexity": "needs-api-key",
69
+ "fileTypes": ["jpg", "png", "webp", "gif"],
70
+ "inputModes": ["file", "url", "buffer"],
71
+ "output": "text",
72
+ "docsPath": "/docs/extractors/image/ocr",
73
+ "deps": { "ai": "^6.0.0" },
74
+ "devDeps": {},
75
+ "factory": "createImageOcrExtractor",
76
+ "assetProcessingFlagKeys": ["image_ocr"]
77
+ },
78
+ {
79
+ "id": "image-caption-llm",
80
+ "extractorName": "image:caption-llm",
81
+ "group": "Image",
82
+ "label": "image-caption-llm",
83
+ "description": "Generate captions for images via vision LLM",
84
+ "defaultSelected": false,
85
+ "workerOnly": false,
86
+ "configComplexity": "needs-api-key",
87
+ "fileTypes": ["jpg", "png", "webp", "gif"],
88
+ "inputModes": ["file", "url", "buffer"],
89
+ "output": "caption",
90
+ "docsPath": "/docs/extractors/image/caption-llm",
91
+ "deps": { "ai": "^6.0.0" },
92
+ "devDeps": {},
93
+ "factory": "createImageCaptionLlmExtractor",
94
+ "assetProcessingFlagKeys": ["image_captionLlm"]
95
+ },
96
+ {
97
+ "id": "audio-transcribe",
98
+ "extractorName": "audio:transcribe",
99
+ "group": "Audio",
100
+ "label": "audio-transcribe",
101
+ "description": "Speech-to-text transcription",
102
+ "defaultSelected": false,
103
+ "workerOnly": false,
104
+ "configComplexity": "needs-api-key",
105
+ "fileTypes": ["mp3", "wav", "ogg", "m4a"],
106
+ "inputModes": ["file", "url", "buffer"],
107
+ "output": "transcript",
108
+ "docsPath": "/docs/extractors/audio/transcribe",
109
+ "deps": { "ai": "^6.0.0" },
110
+ "devDeps": {},
111
+ "factory": "createAudioTranscribeExtractor",
112
+ "assetProcessingFlagKeys": ["audio_transcription"]
113
+ },
114
+ {
115
+ "id": "video-transcribe",
116
+ "extractorName": "video:transcribe",
117
+ "group": "Video",
118
+ "label": "video-transcribe",
119
+ "description": "Transcribe video audio track",
120
+ "defaultSelected": false,
121
+ "workerOnly": false,
122
+ "configComplexity": "needs-api-key",
123
+ "fileTypes": ["mp4", "webm", "mov"],
124
+ "inputModes": ["file", "url", "buffer"],
125
+ "output": "transcript",
126
+ "docsPath": "/docs/extractors/video/transcribe",
127
+ "deps": { "ai": "^6.0.0" },
128
+ "devDeps": {},
129
+ "factory": "createVideoTranscribeExtractor",
130
+ "assetProcessingFlagKeys": ["video_transcription"]
131
+ },
132
+ {
133
+ "id": "video-frames",
134
+ "extractorName": "video:frames",
135
+ "group": "Video",
136
+ "label": "video-frames",
137
+ "description": "Sample frames + analyze via vision LLM; requires ffmpeg",
138
+ "hint": "worker-only",
139
+ "defaultSelected": false,
140
+ "workerOnly": true,
141
+ "configComplexity": "advanced",
142
+ "fileTypes": ["mp4", "webm", "mov"],
143
+ "inputModes": ["file", "url", "buffer"],
144
+ "output": "frame descriptions",
145
+ "docsPath": "/docs/extractors/video/frames",
146
+ "deps": { "ai": "^6.0.0" },
147
+ "devDeps": {},
148
+ "factory": "createVideoFramesExtractor",
149
+ "assetProcessingFlagKeys": ["video_frames"]
150
+ },
151
+ {
152
+ "id": "file-text",
153
+ "extractorName": "file:text",
154
+ "group": "Files",
155
+ "label": "file-text",
156
+ "description": "Extract text/markdown/json/html from common text files",
157
+ "hint": "recommended",
158
+ "defaultSelected": true,
159
+ "workerOnly": false,
160
+ "configComplexity": "zero-config",
161
+ "fileTypes": ["txt", "md", "json", "csv"],
162
+ "inputModes": ["file", "url", "buffer"],
163
+ "output": "text",
164
+ "docsPath": "/docs/extractors/file/text",
165
+ "deps": {},
166
+ "devDeps": {},
167
+ "factory": "createFileTextExtractor",
168
+ "assetProcessingFlagKeys": ["file_text"]
169
+ },
170
+ {
171
+ "id": "file-docx",
172
+ "extractorName": "file:docx",
173
+ "group": "Files",
174
+ "label": "file-docx",
175
+ "description": "Extract text from .docx files",
176
+ "defaultSelected": false,
177
+ "workerOnly": false,
178
+ "configComplexity": "needs-dep",
179
+ "fileTypes": ["docx"],
180
+ "inputModes": ["file", "url", "buffer"],
181
+ "output": "text",
182
+ "docsPath": "/docs/extractors/file/docx",
183
+ "deps": { "mammoth": "^1.10.0" },
184
+ "devDeps": {},
185
+ "factory": "createFileDocxExtractor",
186
+ "assetProcessingFlagKeys": ["file_docx"]
187
+ },
188
+ {
189
+ "id": "file-pptx",
190
+ "extractorName": "file:pptx",
191
+ "group": "Files",
192
+ "label": "file-pptx",
193
+ "description": "Extract text from .pptx slides",
194
+ "defaultSelected": false,
195
+ "workerOnly": false,
196
+ "configComplexity": "needs-dep",
197
+ "fileTypes": ["pptx"],
198
+ "inputModes": ["file", "url", "buffer"],
199
+ "output": "text",
200
+ "docsPath": "/docs/extractors/file/pptx",
201
+ "deps": { "jszip": "^3.10.1" },
202
+ "devDeps": {},
203
+ "factory": "createFilePptxExtractor",
204
+ "assetProcessingFlagKeys": ["file_pptx"]
205
+ },
206
+ {
207
+ "id": "file-xlsx",
208
+ "extractorName": "file:xlsx",
209
+ "group": "Files",
210
+ "label": "file-xlsx",
211
+ "description": "Extract tables from .xlsx spreadsheets",
212
+ "defaultSelected": false,
213
+ "workerOnly": false,
214
+ "configComplexity": "needs-dep",
215
+ "fileTypes": ["xlsx"],
216
+ "inputModes": ["file", "url", "buffer"],
217
+ "output": "text (csv)",
218
+ "docsPath": "/docs/extractors/file/xlsx",
219
+ "deps": { "xlsx": "^0.18.5" },
220
+ "devDeps": {},
221
+ "factory": "createFileXlsxExtractor",
222
+ "assetProcessingFlagKeys": ["file_xlsx"]
223
+ }
224
+ ],
225
+ "connectors": [
226
+ {
227
+ "id": "notion",
228
+ "displayName": "Notion",
229
+ "types": ["docs", "wiki", "db"],
230
+ "description": "Sync pages, databases, and blocks from Notion workspaces",
231
+ "status": "available",
232
+ "docsPath": "/docs/connectors/notion",
233
+ "deps": { "@notionhq/client": "^2.2.16" },
234
+ "devDeps": {},
235
+ "envVars": [
236
+ { "name": "NOTION_TOKEN", "required": true, "notes": "Server-only Notion integration token." }
237
+ ]
238
+ },
239
+ {
240
+ "id": "google-drive",
241
+ "displayName": "Google Drive",
242
+ "types": ["files", "docs"],
243
+ "description": "Ingest Docs/Sheets exports and shared folders",
244
+ "status": "available",
245
+ "docsPath": "/docs/connectors/google-drive",
246
+ "deps": { "googleapis": "^148.0.0", "google-auth-library": "^10.0.0" },
247
+ "devDeps": {},
248
+ "envVars": [
249
+ {
250
+ "name": "GOOGLE_SERVICE_ACCOUNT_JSON",
251
+ "required": false,
252
+ "notes": "Service account JSON credentials (server-only)."
253
+ },
254
+ { "name": "GOOGLE_CLIENT_ID", "required": false, "notes": "OAuth client id (server-only)." },
255
+ {
256
+ "name": "GOOGLE_CLIENT_SECRET",
257
+ "required": false,
258
+ "notes": "OAuth client secret (server-only)."
259
+ },
260
+ { "name": "GOOGLE_REDIRECT_URI", "required": false, "notes": "OAuth redirect URI." }
261
+ ]
262
+ },
263
+ {
264
+ "id": "github",
265
+ "displayName": "GitHub",
266
+ "types": ["code", "docs"],
267
+ "description": "Ingest repositories (Markdown, READMEs, docs folders) and issues/PRs",
268
+ "status": "coming-soon",
269
+ "docsPath": null,
270
+ "deps": {},
271
+ "devDeps": {}
272
+ },
273
+ {
274
+ "id": "gitlab",
275
+ "displayName": "GitLab",
276
+ "types": ["code", "docs"],
277
+ "description": "Ingest repos + wiki pages for self-hosted documentation",
278
+ "status": "coming-soon",
279
+ "docsPath": null,
280
+ "deps": {},
281
+ "devDeps": {}
282
+ },
283
+ {
284
+ "id": "slack",
285
+ "displayName": "Slack",
286
+ "types": ["chat"],
287
+ "description": "Ingest channels (messages + threads) as searchable knowledge",
288
+ "status": "coming-soon",
289
+ "docsPath": null,
290
+ "deps": {},
291
+ "devDeps": {}
292
+ },
293
+ {
294
+ "id": "discord",
295
+ "displayName": "Discord",
296
+ "types": ["chat"],
297
+ "description": "Ingest server channels and threads for community support knowledge",
298
+ "status": "coming-soon",
299
+ "docsPath": null,
300
+ "deps": {},
301
+ "devDeps": {}
302
+ },
303
+ {
304
+ "id": "linear",
305
+ "displayName": "Linear",
306
+ "types": ["issues", "project"],
307
+ "description": "Ingest issues and project updates for internal knowledge",
308
+ "status": "coming-soon",
309
+ "docsPath": null,
310
+ "deps": {},
311
+ "devDeps": {}
312
+ },
313
+ {
314
+ "id": "dropbox",
315
+ "displayName": "Dropbox",
316
+ "types": ["files"],
317
+ "description": "Ingest shared folders and exported docs/files",
318
+ "status": "coming-soon",
319
+ "docsPath": null,
320
+ "deps": {},
321
+ "devDeps": {}
322
+ },
323
+ {
324
+ "id": "onedrive",
325
+ "displayName": "OneDrive",
326
+ "types": ["files"],
327
+ "description": "Ingest files and Office exports from Microsoft 365",
328
+ "status": "coming-soon",
329
+ "docsPath": null,
330
+ "deps": {},
331
+ "devDeps": {}
332
+ },
333
+ {
334
+ "id": "teams",
335
+ "displayName": "Microsoft Teams",
336
+ "types": ["chat"],
337
+ "description": "Ingest channels and conversations for internal support knowledge",
338
+ "status": "coming-soon",
339
+ "docsPath": null,
340
+ "deps": {},
341
+ "devDeps": {}
342
+ }
343
+ ]
344
+ }
345
+
346
+
@@ -1,9 +1,26 @@
1
1
  import { documents, chunks, embeddings } from "./schema";
2
2
  import type { Chunk, VectorStore } from "../../core/types";
3
3
  import { eq, like, sql, type SQL } from "drizzle-orm";
4
- import type { PgDatabase } from "drizzle-orm/pg-core";
5
-
6
- type DrizzleDb = PgDatabase<any, any, any>;
4
+ import type { PgDatabase, PgQueryResultHKT } from "drizzle-orm/pg-core";
5
+
6
+ /**
7
+ * Accepts any Drizzle Postgres database instance regardless of schema type.
8
+ */
9
+ type DrizzleDb = PgDatabase<PgQueryResultHKT, Record<string, unknown>>;
10
+
11
+ /**
12
+ * Query row type for vector similarity search results.
13
+ */
14
+ interface QueryRow {
15
+ id: string;
16
+ document_id: string;
17
+ source_id: string;
18
+ idx: number;
19
+ content: string;
20
+ token_count: number;
21
+ metadata: Record<string, unknown> | null;
22
+ score: number;
23
+ }
7
24
 
8
25
  const sanitizeMetadata = (metadata: unknown) => {
9
26
  if (metadata === undefined) {
@@ -113,7 +130,7 @@ export const createDrizzleVectorStore = (db: DrizzleDb): VectorStore => ({
113
130
 
114
131
  const vectorLiteral = `[${embedding.join(",")}]`;
115
132
 
116
- const rows = await db.execute(
133
+ const result = await db.execute(
117
134
  sql`
118
135
  select
119
136
  c.id,
@@ -133,7 +150,11 @@ export const createDrizzleVectorStore = (db: DrizzleDb): VectorStore => ({
133
150
  `
134
151
  );
135
152
 
136
- return (rows as Array<Record<string, unknown>>).map((row) => ({
153
+ const rows: QueryRow[] = Array.isArray(result)
154
+ ? (result as QueryRow[])
155
+ : ((result as { rows?: QueryRow[] }).rows ?? []);
156
+
157
+ return rows.map((row) => ({
137
158
  id: String(row.id),
138
159
  documentId: String(row.document_id),
139
160
  sourceId: String(row.source_id),
@@ -157,4 +178,3 @@ export const createDrizzleVectorStore = (db: DrizzleDb): VectorStore => ({
157
178
  },
158
179
  });
159
180
 
160
-