unrag 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +199 -41
  3. package/package.json +2 -1
  4. package/registry/config/unrag.config.ts +140 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md CHANGED
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
10
10
  ## Usage
11
11
 
12
12
  ```bash
13
- bunx unrag init
13
+ bunx unrag@latest init
14
14
  ```
15
15
 
16
16
  ### Common flags
17
17
 
18
18
  ```bash
19
- bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
19
+ bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
20
20
  ```
21
21
 
22
22
  - `--store`: `drizzle` | `prisma` | `raw-sql`
package/dist/cli/index.js CHANGED
@@ -74,8 +74,7 @@ var writeText = async (filePath, content) => {
74
74
  var renderUnragConfig = (content, selection) => {
75
75
  const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
76
76
  const baseImports = [
77
- `import { createContextEngine, defineConfig } from "${installImportBase}/core";`,
78
- `import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
77
+ `import { defineUnragConfig } from "${installImportBase}/core";`
79
78
  ];
80
79
  const storeImports = [];
81
80
  const storeCreateLines = [];
@@ -93,24 +92,14 @@ var renderUnragConfig = (content, selection) => {
93
92
  `);
94
93
  const createEngineBlock = [
95
94
  `export function createUnragEngine() {`,
96
- ` const embedding = createAiEmbeddingProvider({`,
97
- ` model: unragConfig.embedding.model,`,
98
- ` timeoutMs: unragConfig.embedding.timeoutMs,`,
99
- ` });`,
100
95
  ...storeCreateLines,
101
96
  ``,
102
- ` return createContextEngine(`,
103
- ` defineConfig({`,
104
- ` embedding,`,
105
- ` store,`,
106
- ` defaults: unragConfig.chunking,`,
107
- ` })`,
108
- ` );`,
97
+ ` return unrag.createEngine({ store });`,
109
98
  `}`,
110
99
  ``,
111
100
  `export async function retrieve(query: string) {`,
112
101
  ` const engine = createUnragEngine();`,
113
- ` return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
102
+ ` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
114
103
  `}`
115
104
  ].join(`
116
105
  `);
@@ -147,6 +136,10 @@ async function copyRegistryFiles(selection) {
147
136
  src: path2.join(selection.registryRoot, "core/index.ts"),
148
137
  dest: path2.join(installBaseAbs, "core/index.ts")
149
138
  },
139
+ {
140
+ src: path2.join(selection.registryRoot, "core/assets.ts"),
141
+ dest: path2.join(installBaseAbs, "core/assets.ts")
142
+ },
150
143
  {
151
144
  src: path2.join(selection.registryRoot, "core/types.ts"),
152
145
  dest: path2.join(installBaseAbs, "core/types.ts")
@@ -163,6 +156,10 @@ async function copyRegistryFiles(selection) {
163
156
  src: path2.join(selection.registryRoot, "core/context-engine.ts"),
164
157
  dest: path2.join(installBaseAbs, "core/context-engine.ts")
165
158
  },
159
+ {
160
+ src: path2.join(selection.registryRoot, "core/delete.ts"),
161
+ dest: path2.join(installBaseAbs, "core/delete.ts")
162
+ },
166
163
  {
167
164
  src: path2.join(selection.registryRoot, "core/ingest.ts"),
168
165
  dest: path2.join(installBaseAbs, "core/ingest.ts")
@@ -262,6 +259,70 @@ async function copyConnectorFiles(selection) {
262
259
  await writeText(dest, raw);
263
260
  }
264
261
  }
262
+ async function copyExtractorFiles(selection) {
263
+ const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
264
+ const installBaseAbs = toAbs(selection.installDir);
265
+ const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
266
+ const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
267
+ if (!await exists(extractorRegistryAbs)) {
268
+ throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
269
+ }
270
+ const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
271
+ const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
272
+ const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
273
+ const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
274
+ const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
275
+ for (const src of extractorFiles) {
276
+ if (!await exists(src)) {
277
+ throw new Error(`Registry file missing: ${src}`);
278
+ }
279
+ const rel = path2.relative(extractorRegistryAbs, src);
280
+ const dest = path2.join(destRootAbs, rel);
281
+ if (await exists(dest)) {
282
+ if (nonInteractive) {
283
+ continue;
284
+ }
285
+ const answer = await confirm({
286
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
287
+ initialValue: false
288
+ });
289
+ if (isCancel(answer)) {
290
+ cancel("Cancelled.");
291
+ return;
292
+ }
293
+ if (!answer) {
294
+ continue;
295
+ }
296
+ }
297
+ const raw = await readText(src);
298
+ await writeText(dest, raw);
299
+ }
300
+ for (const src of sharedFiles) {
301
+ if (!await exists(src)) {
302
+ throw new Error(`Registry file missing: ${src}`);
303
+ }
304
+ const rel = path2.relative(sharedRegistryAbs, src);
305
+ const dest = path2.join(sharedDestRootAbs, rel);
306
+ if (await exists(dest)) {
307
+ if (nonInteractive) {
308
+ continue;
309
+ }
310
+ const answer = await confirm({
311
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
312
+ initialValue: false
313
+ });
314
+ if (isCancel(answer)) {
315
+ cancel("Cancelled.");
316
+ return;
317
+ }
318
+ if (!answer) {
319
+ continue;
320
+ }
321
+ }
322
+ const raw = await readText(src);
323
+ await writeText(dest, raw);
324
+ }
325
+ }
265
326
 
266
327
  // cli/lib/json.ts
267
328
  import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
@@ -347,6 +408,37 @@ function depsForConnector(connector) {
347
408
  }
348
409
  return { deps, devDeps };
349
410
  }
411
+ function depsForExtractor(extractor) {
412
+ const deps = {};
413
+ const devDeps = {};
414
+ if (extractor === "pdf-llm") {
415
+ deps["ai"] = "^5.0.113";
416
+ }
417
+ if (extractor === "pdf-text-layer") {
418
+ deps["pdfjs-dist"] = "^5.4.149";
419
+ }
420
+ if (extractor === "pdf-ocr") {}
421
+ if (extractor === "image-ocr" || extractor === "image-caption-llm") {
422
+ deps["ai"] = "^5.0.113";
423
+ }
424
+ if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
425
+ deps["ai"] = "^5.0.113";
426
+ }
427
+ if (extractor === "video-frames") {
428
+ deps["ai"] = "^5.0.113";
429
+ }
430
+ if (extractor === "file-text") {}
431
+ if (extractor === "file-docx") {
432
+ deps["mammoth"] = "^1.10.0";
433
+ }
434
+ if (extractor === "file-pptx") {
435
+ deps["jszip"] = "^3.10.1";
436
+ }
437
+ if (extractor === "file-xlsx") {
438
+ deps["xlsx"] = "^0.18.5";
439
+ }
440
+ return { deps, devDeps };
441
+ }
350
442
  function installCmd(pm) {
351
443
  if (pm === "bun")
352
444
  return "bun install";
@@ -550,7 +642,8 @@ async function initCommand(args) {
550
642
  storeAdapter: storeAdapterAnswer,
551
643
  aliasBase,
552
644
  version: CONFIG_VERSION,
553
- connectors: existing?.connectors ?? []
645
+ connectors: existing?.connectors ?? [],
646
+ extractors: existing?.extractors ?? []
554
647
  };
555
648
  await writeJsonFile(path5.join(root, CONFIG_FILE), config);
556
649
  const pm = await detectPackageManager(root);
@@ -592,6 +685,20 @@ function docsUrl(siteRelativePath) {
592
685
  var CONFIG_FILE2 = "unrag.json";
593
686
  var __filename3 = fileURLToPath2(import.meta.url);
594
687
  var __dirname3 = path6.dirname(__filename3);
688
+ var AVAILABLE_EXTRACTORS = [
689
+ "pdf-llm",
690
+ "pdf-text-layer",
691
+ "pdf-ocr",
692
+ "image-ocr",
693
+ "image-caption-llm",
694
+ "audio-transcribe",
695
+ "video-transcribe",
696
+ "video-frames",
697
+ "file-text",
698
+ "file-docx",
699
+ "file-pptx",
700
+ "file-xlsx"
701
+ ];
595
702
  var parseAddArgs = (args) => {
596
703
  const out = {};
597
704
  for (let i = 0;i < args.length; i++) {
@@ -600,8 +707,17 @@ var parseAddArgs = (args) => {
600
707
  out.yes = true;
601
708
  continue;
602
709
  }
603
- if (!out.connector && a && !a.startsWith("-")) {
604
- out.connector = a;
710
+ if (!out.kind && a && !a.startsWith("-")) {
711
+ if (a === "extractor") {
712
+ out.kind = "extractor";
713
+ continue;
714
+ }
715
+ out.kind = "connector";
716
+ out.name = a;
717
+ continue;
718
+ }
719
+ if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
720
+ out.name = a;
605
721
  continue;
606
722
  }
607
723
  }
@@ -613,23 +729,24 @@ async function addCommand(args) {
613
729
  throw new Error("Could not find a project root (no package.json found).");
614
730
  }
615
731
  const parsed = parseAddArgs(args);
616
- const connector = parsed.connector;
617
- if (!connector) {
618
- outro2(`Usage: unrag add <connector>
619
-
620
- Available connectors: notion`);
621
- return;
622
- }
623
- if (connector !== "notion") {
624
- outro2(`Unknown connector: ${connector}
625
-
626
- Available connectors: notion`);
732
+ const kind = parsed.kind ?? "connector";
733
+ const name = parsed.name;
734
+ if (!name) {
735
+ outro2([
736
+ "Usage:",
737
+ " unrag add <connector>",
738
+ " unrag add extractor <name>",
739
+ "",
740
+ "Available connectors: notion",
741
+ `Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
742
+ ].join(`
743
+ `));
627
744
  return;
628
745
  }
629
746
  const configPath = path6.join(root, CONFIG_FILE2);
630
747
  const config = await readJsonFile(configPath);
631
748
  if (!config?.installDir) {
632
- throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
749
+ throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
633
750
  }
634
751
  const cliPackageRoot = await findUp(__dirname3, "package.json");
635
752
  if (!cliPackageRoot) {
@@ -637,29 +754,70 @@ Available connectors: notion`);
637
754
  }
638
755
  const registryRoot = path6.join(cliPackageRoot, "registry");
639
756
  const nonInteractive = parsed.yes || !process.stdin.isTTY;
640
- await copyConnectorFiles({
757
+ const pkg = await readPackageJson(root);
758
+ if (kind === "connector") {
759
+ const connector = name;
760
+ if (connector !== "notion") {
761
+ outro2(`Unknown connector: ${name}
762
+
763
+ Available connectors: notion`);
764
+ return;
765
+ }
766
+ await copyConnectorFiles({
767
+ projectRoot: root,
768
+ registryRoot,
769
+ installDir: config.installDir,
770
+ connector,
771
+ yes: nonInteractive
772
+ });
773
+ const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
774
+ const merged2 = mergeDeps(pkg, deps2, devDeps2);
775
+ if (merged2.changes.length > 0) {
776
+ await writePackageJson(root, merged2.pkg);
777
+ }
778
+ const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
779
+ await writeJsonFile(configPath, { ...config, connectors });
780
+ outro2([
781
+ `Installed connector: ${connector}.`,
782
+ "",
783
+ `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
784
+ `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
785
+ "",
786
+ merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
787
+ nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
788
+ ].filter(Boolean).join(`
789
+ `));
790
+ return;
791
+ }
792
+ const extractor = name;
793
+ if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
794
+ outro2(`Unknown extractor: ${name}
795
+
796
+ Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
797
+ return;
798
+ }
799
+ await copyExtractorFiles({
641
800
  projectRoot: root,
642
801
  registryRoot,
643
802
  installDir: config.installDir,
644
- connector,
803
+ extractor,
645
804
  yes: nonInteractive
646
805
  });
647
- const pkg = await readPackageJson(root);
648
- const { deps, devDeps } = depsForConnector(connector);
806
+ const { deps, devDeps } = depsForExtractor(extractor);
649
807
  const merged = mergeDeps(pkg, deps, devDeps);
650
808
  if (merged.changes.length > 0) {
651
809
  await writePackageJson(root, merged.pkg);
652
810
  }
653
- const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
654
- await writeJsonFile(configPath, { ...config, connectors });
811
+ const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
812
+ await writeJsonFile(configPath, { ...config, extractors });
655
813
  outro2([
656
- `Installed connector: ${connector}.`,
814
+ `Installed extractor: ${extractor}.`,
657
815
  "",
658
- `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
659
- `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
816
+ `- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
660
817
  "",
661
818
  merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
662
- nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
819
+ "",
820
+ `Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
663
821
  ].filter(Boolean).join(`
664
822
  `));
665
823
  }
@@ -688,8 +846,8 @@ function renderHelp() {
688
846
  " --alias <@name> Import alias base (e.g. @unrag)",
689
847
  "",
690
848
  "Examples:",
691
- " bunx unrag init",
692
- " bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag",
849
+ " bunx unrag@latest init",
850
+ " bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
693
851
  " bunx unrag add notion --yes",
694
852
  "",
695
853
  "Docs:",
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "unrag",
3
3
  "type": "module",
4
+ "repository": "https://github.com/BetterStacks/unrag",
4
5
  "bin": {
5
6
  "unrag": "./dist/cli/index.js"
6
7
  },
7
- "version": "0.2.2",
8
+ "version": "0.2.3",
8
9
  "private": false,
9
10
  "license": "Apache-2.0",
10
11
  "devDependencies": {
package/registry/config/unrag.config.ts CHANGED
@@ -2,10 +2,10 @@
2
2
  * Root Unrag config (generated).
3
3
  *
4
4
  * This file is meant to be the single place you tweak:
5
+ * - Defaults (chunking + retrieval)
6
+ * - Engine settings (storage, asset processing, extractors)
5
7
  * - Embedding provider/model/timeouts
6
- * - Chunking defaults
7
- * - Retrieval defaults
8
- * - How you construct your DB client (Pool/Prisma/etc)
8
+ * - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
9
9
  *
10
10
  * The files under your install dir (e.g. `lib/unrag/**`) are intended to be
11
11
  * treated like vendored source code.
@@ -13,7 +13,8 @@
13
13
 
14
14
  // __UNRAG_IMPORTS__
15
15
 
16
- export const unragConfig = {
16
+ export const unrag = defineUnragConfig({
17
+ defaults: {
17
18
  chunking: {
18
19
  chunkSize: 200,
19
20
  chunkOverlap: 40,
@@ -21,11 +22,143 @@ export const unragConfig = {
21
22
  retrieval: {
22
23
  topK: 8,
23
24
  },
25
+ },
24
26
  embedding: {
25
- model: "openai/text-embedding-3-small",
26
- timeoutMs: 15_000,
27
+ provider: "ai",
28
+ config: {
29
+ type: "text",
30
+ model: "openai/text-embedding-3-small",
31
+ timeoutMs: 15_000,
32
+ },
33
+ },
34
+ engine: {
35
+ /**
36
+ * Storage controls.
37
+ *
38
+ * - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
39
+ * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
40
+ */
41
+ storage: {
42
+ storeChunkContent: true,
43
+ storeDocumentContent: true,
44
+ },
45
+ /**
46
+ * Optional extractor modules that can process non-text assets into text outputs.
47
+ *
48
+ * To install:
49
+ * - `unrag add extractor pdf-llm`
50
+ *
51
+ * Then import it in this file and add it here, for example:
52
+ * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
53
+ * - `extractors: [createPdfLlmExtractor()]`
54
+ */
55
+ extractors: [],
56
+ /**
57
+ * Rich media processing controls.
58
+ *
59
+ * Notes:
60
+ * - The library defaults are cost-safe (PDF LLM extraction is off).
61
+ * - This generated config opts you into PDF extraction for convenience.
62
+ * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
63
+ */
64
+ assetProcessing: {
65
+ onUnsupportedAsset: "skip",
66
+ onError: "skip",
67
+ concurrency: 4,
68
+ fetch: {
69
+ enabled: true,
70
+ maxBytes: 15 * 1024 * 1024,
71
+ timeoutMs: 20_000,
72
+ // allowedHosts: ["..."], // recommended to mitigate SSRF
73
+ },
74
+ pdf: {
75
+ // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
76
+ textLayer: {
77
+ enabled: false,
78
+ maxBytes: 15 * 1024 * 1024,
79
+ maxOutputChars: 200_000,
80
+ minChars: 200,
81
+ // maxPages: 200,
82
+ },
83
+ llmExtraction: {
84
+ enabled: true,
85
+ model: "google/gemini-2.0-flash",
86
+ prompt:
87
+ "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
88
+ timeoutMs: 60_000,
89
+ maxBytes: 15 * 1024 * 1024,
90
+ maxOutputChars: 200_000,
91
+ },
92
+ // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
93
+ ocr: {
94
+ enabled: false,
95
+ maxBytes: 15 * 1024 * 1024,
96
+ maxOutputChars: 200_000,
97
+ minChars: 200,
98
+ // maxPages: 200,
99
+ // pdftoppmPath: "/usr/bin/pdftoppm",
100
+ // tesseractPath: "/usr/bin/tesseract",
101
+ // dpi: 200,
102
+ // lang: "eng",
103
+ },
104
+ },
105
+ image: {
106
+ ocr: {
107
+ enabled: false,
108
+ model: "google/gemini-2.0-flash",
109
+ prompt:
110
+ "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
111
+ timeoutMs: 60_000,
112
+ maxBytes: 10 * 1024 * 1024,
113
+ maxOutputChars: 50_000,
114
+ },
115
+ captionLlm: {
116
+ enabled: false,
117
+ model: "google/gemini-2.0-flash",
118
+ prompt:
119
+ "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
120
+ timeoutMs: 60_000,
121
+ maxBytes: 10 * 1024 * 1024,
122
+ maxOutputChars: 10_000,
123
+ },
124
+ },
125
+ audio: {
126
+ transcription: {
127
+ enabled: false,
128
+ model: "openai/whisper-1",
129
+ timeoutMs: 120_000,
130
+ maxBytes: 25 * 1024 * 1024,
131
+ },
132
+ },
133
+ video: {
134
+ transcription: {
135
+ enabled: false,
136
+ model: "openai/whisper-1",
137
+ timeoutMs: 120_000,
138
+ maxBytes: 50 * 1024 * 1024,
139
+ },
140
+ frames: {
141
+ enabled: false,
142
+ sampleFps: 0.2,
143
+ maxFrames: 50,
144
+ // ffmpegPath: "/usr/bin/ffmpeg",
145
+ maxBytes: 50 * 1024 * 1024,
146
+ model: "google/gemini-2.0-flash",
147
+ prompt:
148
+ "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
149
+ timeoutMs: 60_000,
150
+ maxOutputChars: 50_000,
151
+ },
152
+ },
153
+ file: {
154
+ text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
155
+ docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
156
+ pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
157
+ xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
158
+ },
159
+ },
27
160
  },
28
- } as const;
161
+ } as const);
29
162
 
30
163
  // __UNRAG_CREATE_ENGINE__
31
164
 
package/registry/connectors/notion/render.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import type { AssetInput, AssetKind, Metadata } from "../../core";
2
+
1
3
  type RichText = { plain_text?: string };
2
4
 
3
5
  export type NotionBlock = {
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
20
22
 
21
23
  const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
22
24
 
25
+ const asString = (v: unknown) => String(v ?? "").trim();
26
+
27
+ const supportedAssetKinds = new Set<AssetKind>([
28
+ "image",
29
+ "pdf",
30
+ "audio",
31
+ "video",
32
+ "file",
33
+ ]);
34
+
35
+ const toAssetKind = (notionType: string): AssetKind | null => {
36
+ const t = notionType as AssetKind;
37
+ return supportedAssetKinds.has(t) ? t : null;
38
+ };
39
+
40
+ const pickUrl = (payload: any): string | undefined => {
41
+ const type = String(payload?.type ?? "");
42
+ if (type === "external") return asString(payload?.external?.url);
43
+ if (type === "file") return asString(payload?.file?.url);
44
+ return undefined;
45
+ };
46
+
47
+ const pickCaption = (payload: any): string => {
48
+ // Notion captions are typically an array of rich text items.
49
+ return rt(payload?.caption);
50
+ };
51
+
52
+ const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
53
+ if (assetKind === "pdf") return "application/pdf";
54
+ // Notion does not consistently include media types; keep it optional.
55
+ return asString(payload?.media_type) || undefined;
56
+ };
57
+
58
+ const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
59
+
60
+ export function extractNotionAssets(
61
+ nodes: NotionBlockNode[],
62
+ opts: { maxDepth?: number } = {}
63
+ ): AssetInput[] {
64
+ const maxDepth = opts.maxDepth ?? 6;
65
+ const out: AssetInput[] = [];
66
+
67
+ const walk = (node: NotionBlockNode, depth: number) => {
68
+ if (depth > maxDepth) return;
69
+ const b = node.block as any;
70
+ const kind = toAssetKind(String(b.type ?? ""));
71
+ if (kind) {
72
+ const payload = b[kind];
73
+ const url = pickUrl(payload);
74
+ if (url) {
75
+ const caption = pickCaption(payload).trim();
76
+ const mediaType = inferMediaType(kind, payload);
77
+ out.push({
78
+ assetId: String(b.id),
79
+ kind,
80
+ data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
81
+ uri: url,
82
+ ...(caption ? { text: caption } : {}),
83
+ metadata: asMetadata({
84
+ connector: "notion",
85
+ notionBlockId: String(b.id),
86
+ notionBlockType: String(b.type),
87
+ }),
88
+ });
89
+ }
90
+ }
91
+
92
+ for (const child of node.children) {
93
+ walk(child, depth + 1);
94
+ }
95
+ };
96
+
97
+ for (const n of nodes) walk(n, 0);
98
+ return out;
99
+ }
100
+
23
101
  export function renderNotionBlocksToText(
24
102
  nodes: NotionBlockNode[],
25
103
  opts: { maxDepth?: number } = {}
package/registry/connectors/notion/sync.ts CHANGED
@@ -1,8 +1,12 @@
1
- import type { ContextEngine } from "../../core";
2
- import type { IngestResult } from "../../core/types";
1
+ import type { IngestResult } from "../../core";
3
2
  import { createNotionClient, type NotionClient } from "./client";
4
3
  import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
5
- import { renderNotionBlocksToText, type NotionBlock, type NotionBlockNode } from "./render";
4
+ import {
5
+ extractNotionAssets,
6
+ renderNotionBlocksToText,
7
+ type NotionBlock,
8
+ type NotionBlockNode,
9
+ } from "./render";
6
10
  import type {
7
11
  BuildNotionPageIngestInputArgs,
8
12
  NotionPageDocument,
@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
29
33
  sourceId,
30
34
  content: args.content,
31
35
  metadata: args.metadata ?? {},
36
+ assets: args.assets ?? [],
32
37
  };
33
38
  }
34
39
 
@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
108
113
  const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
109
114
  const body = renderNotionBlocksToText(tree);
110
115
  const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
116
+ const assets = extractNotionAssets(tree);
111
117
 
112
118
  const metadata = {
113
119
  connector: "notion",
@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
121
127
  const ingest = buildNotionPageIngestInput({
122
128
  pageId,
123
129
  content,
130
+ assets,
124
131
  metadata: metadata as any,
125
132
  sourceIdPrefix: args.sourceIdPrefix,
126
133
  });
@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
129
136
  sourceId: ingest.sourceId,
130
137
  content: ingest.content,
131
138
  metadata: ingest.metadata ?? {},
139
+ assets: ingest.assets ?? [],
132
140
  };
133
141
  }
134
142
 
@@ -178,6 +186,7 @@ export async function syncNotionPages(
178
186
  const result: IngestResult = await input.engine.ingest({
179
187
  sourceId: doc.sourceId,
180
188
  content: doc.content,
189
+ assets: doc.assets,
181
190
  metadata: doc.metadata as any,
182
191
  });
183
192