unrag 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +251 -42
  3. package/package.json +2 -1
  4. package/registry/config/unrag.config.ts +140 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md CHANGED
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
10
10
  ## Usage
11
11
 
12
12
  ```bash
13
- bunx unrag init
13
+ bunx unrag@latest init
14
14
  ```
15
15
 
16
16
  ### Common flags
17
17
 
18
18
  ```bash
19
- bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
19
+ bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
20
20
  ```
21
21
 
22
22
  - `--store`: `drizzle` | `prisma` | `raw-sql`
package/dist/cli/index.js CHANGED
@@ -74,8 +74,7 @@ var writeText = async (filePath, content) => {
74
74
  var renderUnragConfig = (content, selection) => {
75
75
  const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
76
76
  const baseImports = [
77
- `import { createContextEngine, defineConfig } from "${installImportBase}/core";`,
78
- `import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
77
+ `import { defineUnragConfig } from "${installImportBase}/core";`
79
78
  ];
80
79
  const storeImports = [];
81
80
  const storeCreateLines = [];
@@ -93,24 +92,14 @@ var renderUnragConfig = (content, selection) => {
93
92
  `);
94
93
  const createEngineBlock = [
95
94
  `export function createUnragEngine() {`,
96
- ` const embedding = createAiEmbeddingProvider({`,
97
- ` model: unragConfig.embedding.model,`,
98
- ` timeoutMs: unragConfig.embedding.timeoutMs,`,
99
- ` });`,
100
95
  ...storeCreateLines,
101
96
  ``,
102
- ` return createContextEngine(`,
103
- ` defineConfig({`,
104
- ` embedding,`,
105
- ` store,`,
106
- ` defaults: unragConfig.chunking,`,
107
- ` })`,
108
- ` );`,
97
+ ` return unrag.createEngine({ store });`,
109
98
  `}`,
110
99
  ``,
111
100
  `export async function retrieve(query: string) {`,
112
101
  ` const engine = createUnragEngine();`,
113
- ` return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
102
+ ` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
114
103
  `}`
115
104
  ].join(`
116
105
  `);
@@ -147,6 +136,10 @@ async function copyRegistryFiles(selection) {
147
136
  src: path2.join(selection.registryRoot, "core/index.ts"),
148
137
  dest: path2.join(installBaseAbs, "core/index.ts")
149
138
  },
139
+ {
140
+ src: path2.join(selection.registryRoot, "core/assets.ts"),
141
+ dest: path2.join(installBaseAbs, "core/assets.ts")
142
+ },
150
143
  {
151
144
  src: path2.join(selection.registryRoot, "core/types.ts"),
152
145
  dest: path2.join(installBaseAbs, "core/types.ts")
@@ -163,6 +156,10 @@ async function copyRegistryFiles(selection) {
163
156
  src: path2.join(selection.registryRoot, "core/context-engine.ts"),
164
157
  dest: path2.join(installBaseAbs, "core/context-engine.ts")
165
158
  },
159
+ {
160
+ src: path2.join(selection.registryRoot, "core/delete.ts"),
161
+ dest: path2.join(installBaseAbs, "core/delete.ts")
162
+ },
166
163
  {
167
164
  src: path2.join(selection.registryRoot, "core/ingest.ts"),
168
165
  dest: path2.join(installBaseAbs, "core/ingest.ts")
@@ -262,6 +259,70 @@ async function copyConnectorFiles(selection) {
262
259
  await writeText(dest, raw);
263
260
  }
264
261
  }
262
+ async function copyExtractorFiles(selection) {
263
+ const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
264
+ const installBaseAbs = toAbs(selection.installDir);
265
+ const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
266
+ const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
267
+ if (!await exists(extractorRegistryAbs)) {
268
+ throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
269
+ }
270
+ const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
271
+ const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
272
+ const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
273
+ const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
274
+ const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
275
+ for (const src of extractorFiles) {
276
+ if (!await exists(src)) {
277
+ throw new Error(`Registry file missing: ${src}`);
278
+ }
279
+ const rel = path2.relative(extractorRegistryAbs, src);
280
+ const dest = path2.join(destRootAbs, rel);
281
+ if (await exists(dest)) {
282
+ if (nonInteractive) {
283
+ continue;
284
+ }
285
+ const answer = await confirm({
286
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
287
+ initialValue: false
288
+ });
289
+ if (isCancel(answer)) {
290
+ cancel("Cancelled.");
291
+ return;
292
+ }
293
+ if (!answer) {
294
+ continue;
295
+ }
296
+ }
297
+ const raw = await readText(src);
298
+ await writeText(dest, raw);
299
+ }
300
+ for (const src of sharedFiles) {
301
+ if (!await exists(src)) {
302
+ throw new Error(`Registry file missing: ${src}`);
303
+ }
304
+ const rel = path2.relative(sharedRegistryAbs, src);
305
+ const dest = path2.join(sharedDestRootAbs, rel);
306
+ if (await exists(dest)) {
307
+ if (nonInteractive) {
308
+ continue;
309
+ }
310
+ const answer = await confirm({
311
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
312
+ initialValue: false
313
+ });
314
+ if (isCancel(answer)) {
315
+ cancel("Cancelled.");
316
+ return;
317
+ }
318
+ if (!answer) {
319
+ continue;
320
+ }
321
+ }
322
+ const raw = await readText(src);
323
+ await writeText(dest, raw);
324
+ }
325
+ }
265
326
 
266
327
  // cli/lib/json.ts
267
328
  import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
@@ -347,6 +408,37 @@ function depsForConnector(connector) {
347
408
  }
348
409
  return { deps, devDeps };
349
410
  }
411
+ function depsForExtractor(extractor) {
412
+ const deps = {};
413
+ const devDeps = {};
414
+ if (extractor === "pdf-llm") {
415
+ deps["ai"] = "^5.0.113";
416
+ }
417
+ if (extractor === "pdf-text-layer") {
418
+ deps["pdfjs-dist"] = "^5.4.149";
419
+ }
420
+ if (extractor === "pdf-ocr") {}
421
+ if (extractor === "image-ocr" || extractor === "image-caption-llm") {
422
+ deps["ai"] = "^5.0.113";
423
+ }
424
+ if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
425
+ deps["ai"] = "^5.0.113";
426
+ }
427
+ if (extractor === "video-frames") {
428
+ deps["ai"] = "^5.0.113";
429
+ }
430
+ if (extractor === "file-text") {}
431
+ if (extractor === "file-docx") {
432
+ deps["mammoth"] = "^1.10.0";
433
+ }
434
+ if (extractor === "file-pptx") {
435
+ deps["jszip"] = "^3.10.1";
436
+ }
437
+ if (extractor === "file-xlsx") {
438
+ deps["xlsx"] = "^0.18.5";
439
+ }
440
+ return { deps, devDeps };
441
+ }
350
442
  function installCmd(pm) {
351
443
  if (pm === "bun")
352
444
  return "bun install";
@@ -550,7 +642,8 @@ async function initCommand(args) {
550
642
  storeAdapter: storeAdapterAnswer,
551
643
  aliasBase,
552
644
  version: CONFIG_VERSION,
553
- connectors: existing?.connectors ?? []
645
+ connectors: existing?.connectors ?? [],
646
+ extractors: existing?.extractors ?? []
554
647
  };
555
648
  await writeJsonFile(path5.join(root, CONFIG_FILE), config);
556
649
  const pm = await detectPackageManager(root);
@@ -578,9 +671,34 @@ async function initCommand(args) {
578
671
  import { outro as outro2 } from "@clack/prompts";
579
672
  import path6 from "node:path";
580
673
  import { fileURLToPath as fileURLToPath2 } from "node:url";
674
+
675
+ // cli/lib/constants.ts
676
+ var UNRAG_SITE_URL = (process.env.UNRAG_SITE_URL ?? process.env.UNRAG_DOCS_BASE_URL)?.trim() || "https://unrag.dev";
677
+ var UNRAG_GITHUB_REPO_URL = "https://github.com/BetterStacks/unrag";
678
+ function docsUrl(siteRelativePath) {
679
+ const p = siteRelativePath.startsWith("/") ? siteRelativePath : `/${siteRelativePath}`;
680
+ const base = UNRAG_SITE_URL.endsWith("/") ? UNRAG_SITE_URL : `${UNRAG_SITE_URL}/`;
681
+ return new URL(p.replace(/^\/+/, "/"), base).toString();
682
+ }
683
+
684
+ // cli/commands/add.ts
581
685
  var CONFIG_FILE2 = "unrag.json";
582
686
  var __filename3 = fileURLToPath2(import.meta.url);
583
687
  var __dirname3 = path6.dirname(__filename3);
688
+ var AVAILABLE_EXTRACTORS = [
689
+ "pdf-llm",
690
+ "pdf-text-layer",
691
+ "pdf-ocr",
692
+ "image-ocr",
693
+ "image-caption-llm",
694
+ "audio-transcribe",
695
+ "video-transcribe",
696
+ "video-frames",
697
+ "file-text",
698
+ "file-docx",
699
+ "file-pptx",
700
+ "file-xlsx"
701
+ ];
584
702
  var parseAddArgs = (args) => {
585
703
  const out = {};
586
704
  for (let i = 0;i < args.length; i++) {
@@ -589,8 +707,17 @@ var parseAddArgs = (args) => {
589
707
  out.yes = true;
590
708
  continue;
591
709
  }
592
- if (!out.connector && !a.startsWith("-")) {
593
- out.connector = a;
710
+ if (!out.kind && a && !a.startsWith("-")) {
711
+ if (a === "extractor") {
712
+ out.kind = "extractor";
713
+ continue;
714
+ }
715
+ out.kind = "connector";
716
+ out.name = a;
717
+ continue;
718
+ }
719
+ if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
720
+ out.name = a;
594
721
  continue;
595
722
  }
596
723
  }
@@ -602,23 +729,24 @@ async function addCommand(args) {
602
729
  throw new Error("Could not find a project root (no package.json found).");
603
730
  }
604
731
  const parsed = parseAddArgs(args);
605
- const connector = parsed.connector;
606
- if (!connector) {
607
- outro2(`Usage: unrag add <connector>
608
-
609
- Available connectors: notion`);
610
- return;
611
- }
612
- if (connector !== "notion") {
613
- outro2(`Unknown connector: ${connector}
614
-
615
- Available connectors: notion`);
732
+ const kind = parsed.kind ?? "connector";
733
+ const name = parsed.name;
734
+ if (!name) {
735
+ outro2([
736
+ "Usage:",
737
+ " unrag add <connector>",
738
+ " unrag add extractor <name>",
739
+ "",
740
+ "Available connectors: notion",
741
+ `Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
742
+ ].join(`
743
+ `));
616
744
  return;
617
745
  }
618
746
  const configPath = path6.join(root, CONFIG_FILE2);
619
747
  const config = await readJsonFile(configPath);
620
748
  if (!config?.installDir) {
621
- throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
749
+ throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
622
750
  }
623
751
  const cliPackageRoot = await findUp(__dirname3, "package.json");
624
752
  if (!cliPackageRoot) {
@@ -626,40 +754,120 @@ Available connectors: notion`);
626
754
  }
627
755
  const registryRoot = path6.join(cliPackageRoot, "registry");
628
756
  const nonInteractive = parsed.yes || !process.stdin.isTTY;
629
- await copyConnectorFiles({
757
+ const pkg = await readPackageJson(root);
758
+ if (kind === "connector") {
759
+ const connector = name;
760
+ if (connector !== "notion") {
761
+ outro2(`Unknown connector: ${name}
762
+
763
+ Available connectors: notion`);
764
+ return;
765
+ }
766
+ await copyConnectorFiles({
767
+ projectRoot: root,
768
+ registryRoot,
769
+ installDir: config.installDir,
770
+ connector,
771
+ yes: nonInteractive
772
+ });
773
+ const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
774
+ const merged2 = mergeDeps(pkg, deps2, devDeps2);
775
+ if (merged2.changes.length > 0) {
776
+ await writePackageJson(root, merged2.pkg);
777
+ }
778
+ const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
779
+ await writeJsonFile(configPath, { ...config, connectors });
780
+ outro2([
781
+ `Installed connector: ${connector}.`,
782
+ "",
783
+ `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
784
+ `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
785
+ "",
786
+ merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
787
+ nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
788
+ ].filter(Boolean).join(`
789
+ `));
790
+ return;
791
+ }
792
+ const extractor = name;
793
+ if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
794
+ outro2(`Unknown extractor: ${name}
795
+
796
+ Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
797
+ return;
798
+ }
799
+ await copyExtractorFiles({
630
800
  projectRoot: root,
631
801
  registryRoot,
632
802
  installDir: config.installDir,
633
- connector,
803
+ extractor,
634
804
  yes: nonInteractive
635
805
  });
636
- const pkg = await readPackageJson(root);
637
- const { deps, devDeps } = depsForConnector(connector);
806
+ const { deps, devDeps } = depsForExtractor(extractor);
638
807
  const merged = mergeDeps(pkg, deps, devDeps);
639
808
  if (merged.changes.length > 0) {
640
809
  await writePackageJson(root, merged.pkg);
641
810
  }
642
- const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
643
- await writeJsonFile(configPath, { ...config, connectors });
811
+ const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
812
+ await writeJsonFile(configPath, { ...config, extractors });
644
813
  outro2([
645
- `Installed connector: ${connector}.`,
814
+ `Installed extractor: ${extractor}.`,
646
815
  "",
647
- `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
648
- `- Docs: /docs/connectors/${connector}`,
816
+ `- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
649
817
  "",
650
818
  merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
651
- nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
819
+ "",
820
+ `Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
652
821
  ].filter(Boolean).join(`
653
822
  `));
654
823
  }
655
824
 
656
825
  // cli/run.ts
826
+ function renderHelp() {
827
+ return [
828
+ "unrag — vendor-in RAG primitives (ingest/retrieve + adapters) into your repo.",
829
+ "",
830
+ "Usage:",
831
+ " bunx unrag <command> [options]",
832
+ " npx unrag <command> [options]",
833
+ "",
834
+ "Commands:",
835
+ " init Install core files (config + store adapter templates)",
836
+ " add <connector> Install a connector (currently: notion)",
837
+ " help Show this help",
838
+ "",
839
+ "Global options:",
840
+ " -h, --help Show help",
841
+ " -y, --yes Non-interactive; accept defaults",
842
+ "",
843
+ "init options:",
844
+ " --store <adapter> drizzle | prisma | raw-sql",
845
+ " --dir <path> Install directory (alias: --install-dir)",
846
+ " --alias <@name> Import alias base (e.g. @unrag)",
847
+ "",
848
+ "Examples:",
849
+ " bunx unrag@latest init",
850
+ " bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
851
+ " bunx unrag add notion --yes",
852
+ "",
853
+ "Docs:",
854
+ ` - Quickstart: ${docsUrl("/docs/getting-started/quickstart")}`,
855
+ ` - CLI: ${docsUrl("/docs/reference/cli")}`,
856
+ ` - Notion: ${docsUrl("/docs/connectors/notion")}`,
857
+ "",
858
+ "Repo:",
859
+ ` ${UNRAG_GITHUB_REPO_URL}`,
860
+ "",
861
+ "Tip:",
862
+ " After `init`, open the generated unrag.md for schema + env vars (DATABASE_URL)."
863
+ ].join(`
864
+ `);
865
+ }
657
866
  async function run(argv) {
658
867
  const [, , command, ...rest] = argv;
659
868
  intro("unrag");
660
869
  if (!command || command === "help" || command === "--help" || command === "-h") {
661
- outro3(["Usage:", "", "- unrag init", "- unrag add <connector>"].join(`
662
- `));
870
+ outro3(renderHelp());
663
871
  return;
664
872
  }
665
873
  if (command === "init") {
@@ -670,7 +878,8 @@ async function run(argv) {
670
878
  await addCommand(rest);
671
879
  return;
672
880
  }
673
- outro3(`Unknown command: ${command}`);
881
+ outro3([`Unknown command: ${command}`, "", renderHelp()].join(`
882
+ `));
674
883
  process.exitCode = 1;
675
884
  }
676
885
 
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "unrag",
3
3
  "type": "module",
4
+ "repository": "https://github.com/BetterStacks/unrag",
4
5
  "bin": {
5
6
  "unrag": "./dist/cli/index.js"
6
7
  },
7
- "version": "0.2.1",
8
+ "version": "0.2.3",
8
9
  "private": false,
9
10
  "license": "Apache-2.0",
10
11
  "devDependencies": {
package/registry/config/unrag.config.ts CHANGED
@@ -2,10 +2,10 @@
2
2
  * Root Unrag config (generated).
3
3
  *
4
4
  * This file is meant to be the single place you tweak:
5
+ * - Defaults (chunking + retrieval)
6
+ * - Engine settings (storage, asset processing, extractors)
5
7
  * - Embedding provider/model/timeouts
6
- * - Chunking defaults
7
- * - Retrieval defaults
8
- * - How you construct your DB client (Pool/Prisma/etc)
8
+ * - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
9
9
  *
10
10
  * The files under your install dir (e.g. `lib/unrag/**`) are intended to be
11
11
  * treated like vendored source code.
@@ -13,7 +13,8 @@
13
13
 
14
14
  // __UNRAG_IMPORTS__
15
15
 
16
- export const unragConfig = {
16
+ export const unrag = defineUnragConfig({
17
+ defaults: {
17
18
  chunking: {
18
19
  chunkSize: 200,
19
20
  chunkOverlap: 40,
@@ -21,11 +22,143 @@ export const unragConfig = {
21
22
  retrieval: {
22
23
  topK: 8,
23
24
  },
25
+ },
24
26
  embedding: {
25
- model: "openai/text-embedding-3-small",
26
- timeoutMs: 15_000,
27
+ provider: "ai",
28
+ config: {
29
+ type: "text",
30
+ model: "openai/text-embedding-3-small",
31
+ timeoutMs: 15_000,
32
+ },
33
+ },
34
+ engine: {
35
+ /**
36
+ * Storage controls.
37
+ *
38
+ * - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
39
+ * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
40
+ */
41
+ storage: {
42
+ storeChunkContent: true,
43
+ storeDocumentContent: true,
44
+ },
45
+ /**
46
+ * Optional extractor modules that can process non-text assets into text outputs.
47
+ *
48
+ * To install:
49
+ * - `unrag add extractor pdf-llm`
50
+ *
51
+ * Then import it in this file and add it here, for example:
52
+ * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
53
+ * - `extractors: [createPdfLlmExtractor()]`
54
+ */
55
+ extractors: [],
56
+ /**
57
+ * Rich media processing controls.
58
+ *
59
+ * Notes:
60
+ * - The library defaults are cost-safe (PDF LLM extraction is off).
61
+ * - This generated config opts you into PDF extraction for convenience.
62
+ * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
63
+ */
64
+ assetProcessing: {
65
+ onUnsupportedAsset: "skip",
66
+ onError: "skip",
67
+ concurrency: 4,
68
+ fetch: {
69
+ enabled: true,
70
+ maxBytes: 15 * 1024 * 1024,
71
+ timeoutMs: 20_000,
72
+ // allowedHosts: ["..."], // recommended to mitigate SSRF
73
+ },
74
+ pdf: {
75
+ // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
76
+ textLayer: {
77
+ enabled: false,
78
+ maxBytes: 15 * 1024 * 1024,
79
+ maxOutputChars: 200_000,
80
+ minChars: 200,
81
+ // maxPages: 200,
82
+ },
83
+ llmExtraction: {
84
+ enabled: true,
85
+ model: "google/gemini-2.0-flash",
86
+ prompt:
87
+ "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
88
+ timeoutMs: 60_000,
89
+ maxBytes: 15 * 1024 * 1024,
90
+ maxOutputChars: 200_000,
91
+ },
92
+ // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
93
+ ocr: {
94
+ enabled: false,
95
+ maxBytes: 15 * 1024 * 1024,
96
+ maxOutputChars: 200_000,
97
+ minChars: 200,
98
+ // maxPages: 200,
99
+ // pdftoppmPath: "/usr/bin/pdftoppm",
100
+ // tesseractPath: "/usr/bin/tesseract",
101
+ // dpi: 200,
102
+ // lang: "eng",
103
+ },
104
+ },
105
+ image: {
106
+ ocr: {
107
+ enabled: false,
108
+ model: "google/gemini-2.0-flash",
109
+ prompt:
110
+ "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
111
+ timeoutMs: 60_000,
112
+ maxBytes: 10 * 1024 * 1024,
113
+ maxOutputChars: 50_000,
114
+ },
115
+ captionLlm: {
116
+ enabled: false,
117
+ model: "google/gemini-2.0-flash",
118
+ prompt:
119
+ "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
120
+ timeoutMs: 60_000,
121
+ maxBytes: 10 * 1024 * 1024,
122
+ maxOutputChars: 10_000,
123
+ },
124
+ },
125
+ audio: {
126
+ transcription: {
127
+ enabled: false,
128
+ model: "openai/whisper-1",
129
+ timeoutMs: 120_000,
130
+ maxBytes: 25 * 1024 * 1024,
131
+ },
132
+ },
133
+ video: {
134
+ transcription: {
135
+ enabled: false,
136
+ model: "openai/whisper-1",
137
+ timeoutMs: 120_000,
138
+ maxBytes: 50 * 1024 * 1024,
139
+ },
140
+ frames: {
141
+ enabled: false,
142
+ sampleFps: 0.2,
143
+ maxFrames: 50,
144
+ // ffmpegPath: "/usr/bin/ffmpeg",
145
+ maxBytes: 50 * 1024 * 1024,
146
+ model: "google/gemini-2.0-flash",
147
+ prompt:
148
+ "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
149
+ timeoutMs: 60_000,
150
+ maxOutputChars: 50_000,
151
+ },
152
+ },
153
+ file: {
154
+ text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
155
+ docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
156
+ pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
157
+ xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
158
+ },
159
+ },
27
160
  },
28
- } as const;
161
+ } as const);
29
162
 
30
163
  // __UNRAG_CREATE_ENGINE__
31
164
 
package/registry/connectors/notion/render.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import type { AssetInput, AssetKind, Metadata } from "../../core";
2
+
1
3
  type RichText = { plain_text?: string };
2
4
 
3
5
  export type NotionBlock = {
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
20
22
 
21
23
  const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
22
24
 
25
+ const asString = (v: unknown) => String(v ?? "").trim();
26
+
27
+ const supportedAssetKinds = new Set<AssetKind>([
28
+ "image",
29
+ "pdf",
30
+ "audio",
31
+ "video",
32
+ "file",
33
+ ]);
34
+
35
+ const toAssetKind = (notionType: string): AssetKind | null => {
36
+ const t = notionType as AssetKind;
37
+ return supportedAssetKinds.has(t) ? t : null;
38
+ };
39
+
40
+ const pickUrl = (payload: any): string | undefined => {
41
+ const type = String(payload?.type ?? "");
42
+ if (type === "external") return asString(payload?.external?.url);
43
+ if (type === "file") return asString(payload?.file?.url);
44
+ return undefined;
45
+ };
46
+
47
+ const pickCaption = (payload: any): string => {
48
+ // Notion captions are typically an array of rich text items.
49
+ return rt(payload?.caption);
50
+ };
51
+
52
+ const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
53
+ if (assetKind === "pdf") return "application/pdf";
54
+ // Notion does not consistently include media types; keep it optional.
55
+ return asString(payload?.media_type) || undefined;
56
+ };
57
+
58
+ const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
59
+
60
+ export function extractNotionAssets(
61
+ nodes: NotionBlockNode[],
62
+ opts: { maxDepth?: number } = {}
63
+ ): AssetInput[] {
64
+ const maxDepth = opts.maxDepth ?? 6;
65
+ const out: AssetInput[] = [];
66
+
67
+ const walk = (node: NotionBlockNode, depth: number) => {
68
+ if (depth > maxDepth) return;
69
+ const b = node.block as any;
70
+ const kind = toAssetKind(String(b.type ?? ""));
71
+ if (kind) {
72
+ const payload = b[kind];
73
+ const url = pickUrl(payload);
74
+ if (url) {
75
+ const caption = pickCaption(payload).trim();
76
+ const mediaType = inferMediaType(kind, payload);
77
+ out.push({
78
+ assetId: String(b.id),
79
+ kind,
80
+ data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
81
+ uri: url,
82
+ ...(caption ? { text: caption } : {}),
83
+ metadata: asMetadata({
84
+ connector: "notion",
85
+ notionBlockId: String(b.id),
86
+ notionBlockType: String(b.type),
87
+ }),
88
+ });
89
+ }
90
+ }
91
+
92
+ for (const child of node.children) {
93
+ walk(child, depth + 1);
94
+ }
95
+ };
96
+
97
+ for (const n of nodes) walk(n, 0);
98
+ return out;
99
+ }
100
+
23
101
  export function renderNotionBlocksToText(
24
102
  nodes: NotionBlockNode[],
25
103
  opts: { maxDepth?: number } = {}