unrag 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +199 -41
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +140 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md
CHANGED
|
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
|
|
|
10
10
|
## Usage
|
|
11
11
|
|
|
12
12
|
```bash
|
|
13
|
-
bunx unrag init
|
|
13
|
+
bunx unrag@latest init
|
|
14
14
|
```
|
|
15
15
|
|
|
16
16
|
### Common flags
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
19
|
+
bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
20
20
|
```
|
|
21
21
|
|
|
22
22
|
- `--store`: `drizzle` | `prisma` | `raw-sql`
|
package/dist/cli/index.js
CHANGED
|
@@ -74,8 +74,7 @@ var writeText = async (filePath, content) => {
|
|
|
74
74
|
var renderUnragConfig = (content, selection) => {
|
|
75
75
|
const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
|
|
76
76
|
const baseImports = [
|
|
77
|
-
`import {
|
|
78
|
-
`import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
|
|
77
|
+
`import { defineUnragConfig } from "${installImportBase}/core";`
|
|
79
78
|
];
|
|
80
79
|
const storeImports = [];
|
|
81
80
|
const storeCreateLines = [];
|
|
@@ -93,24 +92,14 @@ var renderUnragConfig = (content, selection) => {
|
|
|
93
92
|
`);
|
|
94
93
|
const createEngineBlock = [
|
|
95
94
|
`export function createUnragEngine() {`,
|
|
96
|
-
` const embedding = createAiEmbeddingProvider({`,
|
|
97
|
-
` model: unragConfig.embedding.model,`,
|
|
98
|
-
` timeoutMs: unragConfig.embedding.timeoutMs,`,
|
|
99
|
-
` });`,
|
|
100
95
|
...storeCreateLines,
|
|
101
96
|
``,
|
|
102
|
-
` return
|
|
103
|
-
` defineConfig({`,
|
|
104
|
-
` embedding,`,
|
|
105
|
-
` store,`,
|
|
106
|
-
` defaults: unragConfig.chunking,`,
|
|
107
|
-
` })`,
|
|
108
|
-
` );`,
|
|
97
|
+
` return unrag.createEngine({ store });`,
|
|
109
98
|
`}`,
|
|
110
99
|
``,
|
|
111
100
|
`export async function retrieve(query: string) {`,
|
|
112
101
|
` const engine = createUnragEngine();`,
|
|
113
|
-
` return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
|
|
102
|
+
` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
|
|
114
103
|
`}`
|
|
115
104
|
].join(`
|
|
116
105
|
`);
|
|
@@ -147,6 +136,10 @@ async function copyRegistryFiles(selection) {
|
|
|
147
136
|
src: path2.join(selection.registryRoot, "core/index.ts"),
|
|
148
137
|
dest: path2.join(installBaseAbs, "core/index.ts")
|
|
149
138
|
},
|
|
139
|
+
{
|
|
140
|
+
src: path2.join(selection.registryRoot, "core/assets.ts"),
|
|
141
|
+
dest: path2.join(installBaseAbs, "core/assets.ts")
|
|
142
|
+
},
|
|
150
143
|
{
|
|
151
144
|
src: path2.join(selection.registryRoot, "core/types.ts"),
|
|
152
145
|
dest: path2.join(installBaseAbs, "core/types.ts")
|
|
@@ -163,6 +156,10 @@ async function copyRegistryFiles(selection) {
|
|
|
163
156
|
src: path2.join(selection.registryRoot, "core/context-engine.ts"),
|
|
164
157
|
dest: path2.join(installBaseAbs, "core/context-engine.ts")
|
|
165
158
|
},
|
|
159
|
+
{
|
|
160
|
+
src: path2.join(selection.registryRoot, "core/delete.ts"),
|
|
161
|
+
dest: path2.join(installBaseAbs, "core/delete.ts")
|
|
162
|
+
},
|
|
166
163
|
{
|
|
167
164
|
src: path2.join(selection.registryRoot, "core/ingest.ts"),
|
|
168
165
|
dest: path2.join(installBaseAbs, "core/ingest.ts")
|
|
@@ -262,6 +259,70 @@ async function copyConnectorFiles(selection) {
|
|
|
262
259
|
await writeText(dest, raw);
|
|
263
260
|
}
|
|
264
261
|
}
|
|
262
|
+
async function copyExtractorFiles(selection) {
|
|
263
|
+
const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
|
|
264
|
+
const installBaseAbs = toAbs(selection.installDir);
|
|
265
|
+
const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
|
|
266
|
+
const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
|
|
267
|
+
if (!await exists(extractorRegistryAbs)) {
|
|
268
|
+
throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
|
|
269
|
+
}
|
|
270
|
+
const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
|
|
271
|
+
const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
|
|
272
|
+
const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
|
|
273
|
+
const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
|
|
274
|
+
const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
|
|
275
|
+
for (const src of extractorFiles) {
|
|
276
|
+
if (!await exists(src)) {
|
|
277
|
+
throw new Error(`Registry file missing: ${src}`);
|
|
278
|
+
}
|
|
279
|
+
const rel = path2.relative(extractorRegistryAbs, src);
|
|
280
|
+
const dest = path2.join(destRootAbs, rel);
|
|
281
|
+
if (await exists(dest)) {
|
|
282
|
+
if (nonInteractive) {
|
|
283
|
+
continue;
|
|
284
|
+
}
|
|
285
|
+
const answer = await confirm({
|
|
286
|
+
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
287
|
+
initialValue: false
|
|
288
|
+
});
|
|
289
|
+
if (isCancel(answer)) {
|
|
290
|
+
cancel("Cancelled.");
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
293
|
+
if (!answer) {
|
|
294
|
+
continue;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
const raw = await readText(src);
|
|
298
|
+
await writeText(dest, raw);
|
|
299
|
+
}
|
|
300
|
+
for (const src of sharedFiles) {
|
|
301
|
+
if (!await exists(src)) {
|
|
302
|
+
throw new Error(`Registry file missing: ${src}`);
|
|
303
|
+
}
|
|
304
|
+
const rel = path2.relative(sharedRegistryAbs, src);
|
|
305
|
+
const dest = path2.join(sharedDestRootAbs, rel);
|
|
306
|
+
if (await exists(dest)) {
|
|
307
|
+
if (nonInteractive) {
|
|
308
|
+
continue;
|
|
309
|
+
}
|
|
310
|
+
const answer = await confirm({
|
|
311
|
+
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
312
|
+
initialValue: false
|
|
313
|
+
});
|
|
314
|
+
if (isCancel(answer)) {
|
|
315
|
+
cancel("Cancelled.");
|
|
316
|
+
return;
|
|
317
|
+
}
|
|
318
|
+
if (!answer) {
|
|
319
|
+
continue;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
const raw = await readText(src);
|
|
323
|
+
await writeText(dest, raw);
|
|
324
|
+
}
|
|
325
|
+
}
|
|
265
326
|
|
|
266
327
|
// cli/lib/json.ts
|
|
267
328
|
import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
|
|
@@ -347,6 +408,37 @@ function depsForConnector(connector) {
|
|
|
347
408
|
}
|
|
348
409
|
return { deps, devDeps };
|
|
349
410
|
}
|
|
411
|
+
function depsForExtractor(extractor) {
|
|
412
|
+
const deps = {};
|
|
413
|
+
const devDeps = {};
|
|
414
|
+
if (extractor === "pdf-llm") {
|
|
415
|
+
deps["ai"] = "^5.0.113";
|
|
416
|
+
}
|
|
417
|
+
if (extractor === "pdf-text-layer") {
|
|
418
|
+
deps["pdfjs-dist"] = "^5.4.149";
|
|
419
|
+
}
|
|
420
|
+
if (extractor === "pdf-ocr") {}
|
|
421
|
+
if (extractor === "image-ocr" || extractor === "image-caption-llm") {
|
|
422
|
+
deps["ai"] = "^5.0.113";
|
|
423
|
+
}
|
|
424
|
+
if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
|
|
425
|
+
deps["ai"] = "^5.0.113";
|
|
426
|
+
}
|
|
427
|
+
if (extractor === "video-frames") {
|
|
428
|
+
deps["ai"] = "^5.0.113";
|
|
429
|
+
}
|
|
430
|
+
if (extractor === "file-text") {}
|
|
431
|
+
if (extractor === "file-docx") {
|
|
432
|
+
deps["mammoth"] = "^1.10.0";
|
|
433
|
+
}
|
|
434
|
+
if (extractor === "file-pptx") {
|
|
435
|
+
deps["jszip"] = "^3.10.1";
|
|
436
|
+
}
|
|
437
|
+
if (extractor === "file-xlsx") {
|
|
438
|
+
deps["xlsx"] = "^0.18.5";
|
|
439
|
+
}
|
|
440
|
+
return { deps, devDeps };
|
|
441
|
+
}
|
|
350
442
|
function installCmd(pm) {
|
|
351
443
|
if (pm === "bun")
|
|
352
444
|
return "bun install";
|
|
@@ -550,7 +642,8 @@ async function initCommand(args) {
|
|
|
550
642
|
storeAdapter: storeAdapterAnswer,
|
|
551
643
|
aliasBase,
|
|
552
644
|
version: CONFIG_VERSION,
|
|
553
|
-
connectors: existing?.connectors ?? []
|
|
645
|
+
connectors: existing?.connectors ?? [],
|
|
646
|
+
extractors: existing?.extractors ?? []
|
|
554
647
|
};
|
|
555
648
|
await writeJsonFile(path5.join(root, CONFIG_FILE), config);
|
|
556
649
|
const pm = await detectPackageManager(root);
|
|
@@ -592,6 +685,20 @@ function docsUrl(siteRelativePath) {
|
|
|
592
685
|
var CONFIG_FILE2 = "unrag.json";
|
|
593
686
|
var __filename3 = fileURLToPath2(import.meta.url);
|
|
594
687
|
var __dirname3 = path6.dirname(__filename3);
|
|
688
|
+
var AVAILABLE_EXTRACTORS = [
|
|
689
|
+
"pdf-llm",
|
|
690
|
+
"pdf-text-layer",
|
|
691
|
+
"pdf-ocr",
|
|
692
|
+
"image-ocr",
|
|
693
|
+
"image-caption-llm",
|
|
694
|
+
"audio-transcribe",
|
|
695
|
+
"video-transcribe",
|
|
696
|
+
"video-frames",
|
|
697
|
+
"file-text",
|
|
698
|
+
"file-docx",
|
|
699
|
+
"file-pptx",
|
|
700
|
+
"file-xlsx"
|
|
701
|
+
];
|
|
595
702
|
var parseAddArgs = (args) => {
|
|
596
703
|
const out = {};
|
|
597
704
|
for (let i = 0;i < args.length; i++) {
|
|
@@ -600,8 +707,17 @@ var parseAddArgs = (args) => {
|
|
|
600
707
|
out.yes = true;
|
|
601
708
|
continue;
|
|
602
709
|
}
|
|
603
|
-
if (!out.
|
|
604
|
-
|
|
710
|
+
if (!out.kind && a && !a.startsWith("-")) {
|
|
711
|
+
if (a === "extractor") {
|
|
712
|
+
out.kind = "extractor";
|
|
713
|
+
continue;
|
|
714
|
+
}
|
|
715
|
+
out.kind = "connector";
|
|
716
|
+
out.name = a;
|
|
717
|
+
continue;
|
|
718
|
+
}
|
|
719
|
+
if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
|
|
720
|
+
out.name = a;
|
|
605
721
|
continue;
|
|
606
722
|
}
|
|
607
723
|
}
|
|
@@ -613,23 +729,24 @@ async function addCommand(args) {
|
|
|
613
729
|
throw new Error("Could not find a project root (no package.json found).");
|
|
614
730
|
}
|
|
615
731
|
const parsed = parseAddArgs(args);
|
|
616
|
-
const
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
732
|
+
const kind = parsed.kind ?? "connector";
|
|
733
|
+
const name = parsed.name;
|
|
734
|
+
if (!name) {
|
|
735
|
+
outro2([
|
|
736
|
+
"Usage:",
|
|
737
|
+
" unrag add <connector>",
|
|
738
|
+
" unrag add extractor <name>",
|
|
739
|
+
"",
|
|
740
|
+
"Available connectors: notion",
|
|
741
|
+
`Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
|
|
742
|
+
].join(`
|
|
743
|
+
`));
|
|
627
744
|
return;
|
|
628
745
|
}
|
|
629
746
|
const configPath = path6.join(root, CONFIG_FILE2);
|
|
630
747
|
const config = await readJsonFile(configPath);
|
|
631
748
|
if (!config?.installDir) {
|
|
632
|
-
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
|
|
749
|
+
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
|
|
633
750
|
}
|
|
634
751
|
const cliPackageRoot = await findUp(__dirname3, "package.json");
|
|
635
752
|
if (!cliPackageRoot) {
|
|
@@ -637,29 +754,70 @@ Available connectors: notion`);
|
|
|
637
754
|
}
|
|
638
755
|
const registryRoot = path6.join(cliPackageRoot, "registry");
|
|
639
756
|
const nonInteractive = parsed.yes || !process.stdin.isTTY;
|
|
640
|
-
await copyConnectorFiles({
|
|
757
|
+
const pkg = await readPackageJson(root);
|
|
758
|
+
if (kind === "connector") {
|
|
759
|
+
const connector = name;
|
|
760
|
+
if (connector !== "notion") {
|
|
761
|
+
outro2(`Unknown connector: ${name}
|
|
762
|
+
|
|
763
|
+
Available connectors: notion`);
|
|
764
|
+
return;
|
|
765
|
+
}
|
|
766
|
+
await copyConnectorFiles({
|
|
767
|
+
projectRoot: root,
|
|
768
|
+
registryRoot,
|
|
769
|
+
installDir: config.installDir,
|
|
770
|
+
connector,
|
|
771
|
+
yes: nonInteractive
|
|
772
|
+
});
|
|
773
|
+
const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
|
|
774
|
+
const merged2 = mergeDeps(pkg, deps2, devDeps2);
|
|
775
|
+
if (merged2.changes.length > 0) {
|
|
776
|
+
await writePackageJson(root, merged2.pkg);
|
|
777
|
+
}
|
|
778
|
+
const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
|
|
779
|
+
await writeJsonFile(configPath, { ...config, connectors });
|
|
780
|
+
outro2([
|
|
781
|
+
`Installed connector: ${connector}.`,
|
|
782
|
+
"",
|
|
783
|
+
`- Code: ${path6.join(config.installDir, "connectors", connector)}`,
|
|
784
|
+
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
785
|
+
"",
|
|
786
|
+
merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
787
|
+
nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
|
|
788
|
+
].filter(Boolean).join(`
|
|
789
|
+
`));
|
|
790
|
+
return;
|
|
791
|
+
}
|
|
792
|
+
const extractor = name;
|
|
793
|
+
if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
|
|
794
|
+
outro2(`Unknown extractor: ${name}
|
|
795
|
+
|
|
796
|
+
Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
|
|
797
|
+
return;
|
|
798
|
+
}
|
|
799
|
+
await copyExtractorFiles({
|
|
641
800
|
projectRoot: root,
|
|
642
801
|
registryRoot,
|
|
643
802
|
installDir: config.installDir,
|
|
644
|
-
|
|
803
|
+
extractor,
|
|
645
804
|
yes: nonInteractive
|
|
646
805
|
});
|
|
647
|
-
const
|
|
648
|
-
const { deps, devDeps } = depsForConnector(connector);
|
|
806
|
+
const { deps, devDeps } = depsForExtractor(extractor);
|
|
649
807
|
const merged = mergeDeps(pkg, deps, devDeps);
|
|
650
808
|
if (merged.changes.length > 0) {
|
|
651
809
|
await writePackageJson(root, merged.pkg);
|
|
652
810
|
}
|
|
653
|
-
const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
|
|
654
|
-
await writeJsonFile(configPath, { ...config, connectors });
|
|
811
|
+
const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
|
|
812
|
+
await writeJsonFile(configPath, { ...config, extractors });
|
|
655
813
|
outro2([
|
|
656
|
-
`Installed connector: ${connector}.`,
|
|
814
|
+
`Installed extractor: ${extractor}.`,
|
|
657
815
|
"",
|
|
658
|
-
`- Code: ${path6.join(config.installDir, "connectors", connector)}`,
|
|
659
|
-
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
816
|
+
`- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
|
|
660
817
|
"",
|
|
661
818
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
662
|
-
|
|
819
|
+
"",
|
|
820
|
+
`Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
|
|
663
821
|
].filter(Boolean).join(`
|
|
664
822
|
`));
|
|
665
823
|
}
|
|
@@ -688,8 +846,8 @@ function renderHelp() {
|
|
|
688
846
|
" --alias <@name> Import alias base (e.g. @unrag)",
|
|
689
847
|
"",
|
|
690
848
|
"Examples:",
|
|
691
|
-
" bunx unrag init",
|
|
692
|
-
" bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
849
|
+
" bunx unrag@latest init",
|
|
850
|
+
" bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
693
851
|
" bunx unrag add notion --yes",
|
|
694
852
|
"",
|
|
695
853
|
"Docs:",
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unrag",
|
|
3
3
|
"type": "module",
|
|
4
|
+
"repository": "https://github.com/BetterStacks/unrag",
|
|
4
5
|
"bin": {
|
|
5
6
|
"unrag": "./dist/cli/index.js"
|
|
6
7
|
},
|
|
7
|
-
"version": "0.2.2",
|
|
8
|
+
"version": "0.2.3",
|
|
8
9
|
"private": false,
|
|
9
10
|
"license": "Apache-2.0",
|
|
10
11
|
"devDependencies": {
|
|
package/registry/config/unrag.config.ts
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
* Root Unrag config (generated).
|
|
3
3
|
*
|
|
4
4
|
* This file is meant to be the single place you tweak:
|
|
5
|
+
* - Defaults (chunking + retrieval)
|
|
6
|
+
* - Engine settings (storage, asset processing, extractors)
|
|
5
7
|
* - Embedding provider/model/timeouts
|
|
6
|
-
* - Chunking defaults
|
|
7
|
-
* - Retrieval defaults
|
|
8
|
-
* - How you construct your DB client (Pool/Prisma/etc)
|
|
8
|
+
* - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
|
|
9
9
|
*
|
|
10
10
|
* The files under your install dir (e.g. `lib/unrag/**`) are intended to be
|
|
11
11
|
* treated like vendored source code.
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
|
|
14
14
|
// __UNRAG_IMPORTS__
|
|
15
15
|
|
|
16
|
-
export const unragConfig = {
|
|
16
|
+
export const unrag = defineUnragConfig({
|
|
17
|
+
defaults: {
|
|
17
18
|
chunking: {
|
|
18
19
|
chunkSize: 200,
|
|
19
20
|
chunkOverlap: 40,
|
|
@@ -21,11 +22,143 @@ export const unragConfig = {
|
|
|
21
22
|
retrieval: {
|
|
22
23
|
topK: 8,
|
|
23
24
|
},
|
|
25
|
+
},
|
|
24
26
|
embedding: {
|
|
25
|
-
|
|
26
|
-
|
|
27
|
+
provider: "ai",
|
|
28
|
+
config: {
|
|
29
|
+
type: "text",
|
|
30
|
+
model: "openai/text-embedding-3-small",
|
|
31
|
+
timeoutMs: 15_000,
|
|
32
|
+
},
|
|
33
|
+
},
|
|
34
|
+
engine: {
|
|
35
|
+
/**
|
|
36
|
+
* Storage controls.
|
|
37
|
+
*
|
|
38
|
+
* - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
|
|
39
|
+
* - storeDocumentContent: whether the full original document text is stored in `documents.content`.
|
|
40
|
+
*/
|
|
41
|
+
storage: {
|
|
42
|
+
storeChunkContent: true,
|
|
43
|
+
storeDocumentContent: true,
|
|
44
|
+
},
|
|
45
|
+
/**
|
|
46
|
+
* Optional extractor modules that can process non-text assets into text outputs.
|
|
47
|
+
*
|
|
48
|
+
* To install:
|
|
49
|
+
* - `unrag add extractor pdf-llm`
|
|
50
|
+
*
|
|
51
|
+
* Then import it in this file and add it here, for example:
|
|
52
|
+
* - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
|
|
53
|
+
* - `extractors: [createPdfLlmExtractor()]`
|
|
54
|
+
*/
|
|
55
|
+
extractors: [],
|
|
56
|
+
/**
|
|
57
|
+
* Rich media processing controls.
|
|
58
|
+
*
|
|
59
|
+
* Notes:
|
|
60
|
+
* - The library defaults are cost-safe (PDF LLM extraction is off).
|
|
61
|
+
* - This generated config opts you into PDF extraction for convenience.
|
|
62
|
+
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
63
|
+
*/
|
|
64
|
+
assetProcessing: {
|
|
65
|
+
onUnsupportedAsset: "skip",
|
|
66
|
+
onError: "skip",
|
|
67
|
+
concurrency: 4,
|
|
68
|
+
fetch: {
|
|
69
|
+
enabled: true,
|
|
70
|
+
maxBytes: 15 * 1024 * 1024,
|
|
71
|
+
timeoutMs: 20_000,
|
|
72
|
+
// allowedHosts: ["..."], // recommended to mitigate SSRF
|
|
73
|
+
},
|
|
74
|
+
pdf: {
|
|
75
|
+
// Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
|
|
76
|
+
textLayer: {
|
|
77
|
+
enabled: false,
|
|
78
|
+
maxBytes: 15 * 1024 * 1024,
|
|
79
|
+
maxOutputChars: 200_000,
|
|
80
|
+
minChars: 200,
|
|
81
|
+
// maxPages: 200,
|
|
82
|
+
},
|
|
83
|
+
llmExtraction: {
|
|
84
|
+
enabled: true,
|
|
85
|
+
model: "google/gemini-2.0-flash",
|
|
86
|
+
prompt:
|
|
87
|
+
"Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
|
|
88
|
+
timeoutMs: 60_000,
|
|
89
|
+
maxBytes: 15 * 1024 * 1024,
|
|
90
|
+
maxOutputChars: 200_000,
|
|
91
|
+
},
|
|
92
|
+
// Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
|
|
93
|
+
ocr: {
|
|
94
|
+
enabled: false,
|
|
95
|
+
maxBytes: 15 * 1024 * 1024,
|
|
96
|
+
maxOutputChars: 200_000,
|
|
97
|
+
minChars: 200,
|
|
98
|
+
// maxPages: 200,
|
|
99
|
+
// pdftoppmPath: "/usr/bin/pdftoppm",
|
|
100
|
+
// tesseractPath: "/usr/bin/tesseract",
|
|
101
|
+
// dpi: 200,
|
|
102
|
+
// lang: "eng",
|
|
103
|
+
},
|
|
104
|
+
},
|
|
105
|
+
image: {
|
|
106
|
+
ocr: {
|
|
107
|
+
enabled: false,
|
|
108
|
+
model: "google/gemini-2.0-flash",
|
|
109
|
+
prompt:
|
|
110
|
+
"Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
111
|
+
timeoutMs: 60_000,
|
|
112
|
+
maxBytes: 10 * 1024 * 1024,
|
|
113
|
+
maxOutputChars: 50_000,
|
|
114
|
+
},
|
|
115
|
+
captionLlm: {
|
|
116
|
+
enabled: false,
|
|
117
|
+
model: "google/gemini-2.0-flash",
|
|
118
|
+
prompt:
|
|
119
|
+
"Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
|
|
120
|
+
timeoutMs: 60_000,
|
|
121
|
+
maxBytes: 10 * 1024 * 1024,
|
|
122
|
+
maxOutputChars: 10_000,
|
|
123
|
+
},
|
|
124
|
+
},
|
|
125
|
+
audio: {
|
|
126
|
+
transcription: {
|
|
127
|
+
enabled: false,
|
|
128
|
+
model: "openai/whisper-1",
|
|
129
|
+
timeoutMs: 120_000,
|
|
130
|
+
maxBytes: 25 * 1024 * 1024,
|
|
131
|
+
},
|
|
132
|
+
},
|
|
133
|
+
video: {
|
|
134
|
+
transcription: {
|
|
135
|
+
enabled: false,
|
|
136
|
+
model: "openai/whisper-1",
|
|
137
|
+
timeoutMs: 120_000,
|
|
138
|
+
maxBytes: 50 * 1024 * 1024,
|
|
139
|
+
},
|
|
140
|
+
frames: {
|
|
141
|
+
enabled: false,
|
|
142
|
+
sampleFps: 0.2,
|
|
143
|
+
maxFrames: 50,
|
|
144
|
+
// ffmpegPath: "/usr/bin/ffmpeg",
|
|
145
|
+
maxBytes: 50 * 1024 * 1024,
|
|
146
|
+
model: "google/gemini-2.0-flash",
|
|
147
|
+
prompt:
|
|
148
|
+
"Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
149
|
+
timeoutMs: 60_000,
|
|
150
|
+
maxOutputChars: 50_000,
|
|
151
|
+
},
|
|
152
|
+
},
|
|
153
|
+
file: {
|
|
154
|
+
text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
155
|
+
docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
156
|
+
pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
157
|
+
xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
158
|
+
},
|
|
159
|
+
},
|
|
27
160
|
},
|
|
28
|
-
} as const;
|
|
161
|
+
} as const);
|
|
29
162
|
|
|
30
163
|
// __UNRAG_CREATE_ENGINE__
|
|
31
164
|
|
|
package/registry/connectors/notion/render.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import type { AssetInput, AssetKind, Metadata } from "../../core";
|
|
2
|
+
|
|
1
3
|
type RichText = { plain_text?: string };
|
|
2
4
|
|
|
3
5
|
export type NotionBlock = {
|
|
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
|
|
|
20
22
|
|
|
21
23
|
const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
|
|
22
24
|
|
|
25
|
+
const asString = (v: unknown) => String(v ?? "").trim();
|
|
26
|
+
|
|
27
|
+
const supportedAssetKinds = new Set<AssetKind>([
|
|
28
|
+
"image",
|
|
29
|
+
"pdf",
|
|
30
|
+
"audio",
|
|
31
|
+
"video",
|
|
32
|
+
"file",
|
|
33
|
+
]);
|
|
34
|
+
|
|
35
|
+
const toAssetKind = (notionType: string): AssetKind | null => {
|
|
36
|
+
const t = notionType as AssetKind;
|
|
37
|
+
return supportedAssetKinds.has(t) ? t : null;
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
const pickUrl = (payload: any): string | undefined => {
|
|
41
|
+
const type = String(payload?.type ?? "");
|
|
42
|
+
if (type === "external") return asString(payload?.external?.url);
|
|
43
|
+
if (type === "file") return asString(payload?.file?.url);
|
|
44
|
+
return undefined;
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
const pickCaption = (payload: any): string => {
|
|
48
|
+
// Notion captions are typically an array of rich text items.
|
|
49
|
+
return rt(payload?.caption);
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
|
|
53
|
+
if (assetKind === "pdf") return "application/pdf";
|
|
54
|
+
// Notion does not consistently include media types; keep it optional.
|
|
55
|
+
return asString(payload?.media_type) || undefined;
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
|
|
59
|
+
|
|
60
|
+
export function extractNotionAssets(
|
|
61
|
+
nodes: NotionBlockNode[],
|
|
62
|
+
opts: { maxDepth?: number } = {}
|
|
63
|
+
): AssetInput[] {
|
|
64
|
+
const maxDepth = opts.maxDepth ?? 6;
|
|
65
|
+
const out: AssetInput[] = [];
|
|
66
|
+
|
|
67
|
+
const walk = (node: NotionBlockNode, depth: number) => {
|
|
68
|
+
if (depth > maxDepth) return;
|
|
69
|
+
const b = node.block as any;
|
|
70
|
+
const kind = toAssetKind(String(b.type ?? ""));
|
|
71
|
+
if (kind) {
|
|
72
|
+
const payload = b[kind];
|
|
73
|
+
const url = pickUrl(payload);
|
|
74
|
+
if (url) {
|
|
75
|
+
const caption = pickCaption(payload).trim();
|
|
76
|
+
const mediaType = inferMediaType(kind, payload);
|
|
77
|
+
out.push({
|
|
78
|
+
assetId: String(b.id),
|
|
79
|
+
kind,
|
|
80
|
+
data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
|
|
81
|
+
uri: url,
|
|
82
|
+
...(caption ? { text: caption } : {}),
|
|
83
|
+
metadata: asMetadata({
|
|
84
|
+
connector: "notion",
|
|
85
|
+
notionBlockId: String(b.id),
|
|
86
|
+
notionBlockType: String(b.type),
|
|
87
|
+
}),
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
for (const child of node.children) {
|
|
93
|
+
walk(child, depth + 1);
|
|
94
|
+
}
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
for (const n of nodes) walk(n, 0);
|
|
98
|
+
return out;
|
|
99
|
+
}
|
|
100
|
+
|
|
23
101
|
export function renderNotionBlocksToText(
|
|
24
102
|
nodes: NotionBlockNode[],
|
|
25
103
|
opts: { maxDepth?: number } = {}
|
|
package/registry/connectors/notion/sync.ts
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import type { IngestResult } from "../../core/types";
|
|
1
|
+
import type { IngestResult } from "../../core";
|
|
3
2
|
import { createNotionClient, type NotionClient } from "./client";
|
|
4
3
|
import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
|
|
5
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
extractNotionAssets,
|
|
6
|
+
renderNotionBlocksToText,
|
|
7
|
+
type NotionBlock,
|
|
8
|
+
type NotionBlockNode,
|
|
9
|
+
} from "./render";
|
|
6
10
|
import type {
|
|
7
11
|
BuildNotionPageIngestInputArgs,
|
|
8
12
|
NotionPageDocument,
|
|
@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
|
|
|
29
33
|
sourceId,
|
|
30
34
|
content: args.content,
|
|
31
35
|
metadata: args.metadata ?? {},
|
|
36
|
+
assets: args.assets ?? [],
|
|
32
37
|
};
|
|
33
38
|
}
|
|
34
39
|
|
|
@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
108
113
|
const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
|
|
109
114
|
const body = renderNotionBlocksToText(tree);
|
|
110
115
|
const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
|
|
116
|
+
const assets = extractNotionAssets(tree);
|
|
111
117
|
|
|
112
118
|
const metadata = {
|
|
113
119
|
connector: "notion",
|
|
@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
121
127
|
const ingest = buildNotionPageIngestInput({
|
|
122
128
|
pageId,
|
|
123
129
|
content,
|
|
130
|
+
assets,
|
|
124
131
|
metadata: metadata as any,
|
|
125
132
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
126
133
|
});
|
|
@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
129
136
|
sourceId: ingest.sourceId,
|
|
130
137
|
content: ingest.content,
|
|
131
138
|
metadata: ingest.metadata ?? {},
|
|
139
|
+
assets: ingest.assets ?? [],
|
|
132
140
|
};
|
|
133
141
|
}
|
|
134
142
|
|
|
@@ -178,6 +186,7 @@ export async function syncNotionPages(
|
|
|
178
186
|
const result: IngestResult = await input.engine.ingest({
|
|
179
187
|
sourceId: doc.sourceId,
|
|
180
188
|
content: doc.content,
|
|
189
|
+
assets: doc.assets,
|
|
181
190
|
metadata: doc.metadata as any,
|
|
182
191
|
});
|
|
183
192
|
|