unrag 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +251 -42
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +140 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md
CHANGED
|
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
|
|
|
10
10
|
## Usage
|
|
11
11
|
|
|
12
12
|
```bash
|
|
13
|
-
bunx unrag init
|
|
13
|
+
bunx unrag@latest init
|
|
14
14
|
```
|
|
15
15
|
|
|
16
16
|
### Common flags
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
19
|
+
bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
20
20
|
```
|
|
21
21
|
|
|
22
22
|
- `--store`: `drizzle` | `prisma` | `raw-sql`
|
package/dist/cli/index.js
CHANGED
|
@@ -74,8 +74,7 @@ var writeText = async (filePath, content) => {
|
|
|
74
74
|
var renderUnragConfig = (content, selection) => {
|
|
75
75
|
const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
|
|
76
76
|
const baseImports = [
|
|
77
|
-
`import {
|
|
78
|
-
`import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
|
|
77
|
+
`import { defineUnragConfig } from "${installImportBase}/core";`
|
|
79
78
|
];
|
|
80
79
|
const storeImports = [];
|
|
81
80
|
const storeCreateLines = [];
|
|
@@ -93,24 +92,14 @@ var renderUnragConfig = (content, selection) => {
|
|
|
93
92
|
`);
|
|
94
93
|
const createEngineBlock = [
|
|
95
94
|
`export function createUnragEngine() {`,
|
|
96
|
-
` const embedding = createAiEmbeddingProvider({`,
|
|
97
|
-
` model: unragConfig.embedding.model,`,
|
|
98
|
-
` timeoutMs: unragConfig.embedding.timeoutMs,`,
|
|
99
|
-
` });`,
|
|
100
95
|
...storeCreateLines,
|
|
101
96
|
``,
|
|
102
|
-
` return
|
|
103
|
-
` defineConfig({`,
|
|
104
|
-
` embedding,`,
|
|
105
|
-
` store,`,
|
|
106
|
-
` defaults: unragConfig.chunking,`,
|
|
107
|
-
` })`,
|
|
108
|
-
` );`,
|
|
97
|
+
` return unrag.createEngine({ store });`,
|
|
109
98
|
`}`,
|
|
110
99
|
``,
|
|
111
100
|
`export async function retrieve(query: string) {`,
|
|
112
101
|
` const engine = createUnragEngine();`,
|
|
113
|
-
` return engine.retrieve({ query, topK:
|
|
102
|
+
` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
|
|
114
103
|
`}`
|
|
115
104
|
].join(`
|
|
116
105
|
`);
|
|
@@ -147,6 +136,10 @@ async function copyRegistryFiles(selection) {
|
|
|
147
136
|
src: path2.join(selection.registryRoot, "core/index.ts"),
|
|
148
137
|
dest: path2.join(installBaseAbs, "core/index.ts")
|
|
149
138
|
},
|
|
139
|
+
{
|
|
140
|
+
src: path2.join(selection.registryRoot, "core/assets.ts"),
|
|
141
|
+
dest: path2.join(installBaseAbs, "core/assets.ts")
|
|
142
|
+
},
|
|
150
143
|
{
|
|
151
144
|
src: path2.join(selection.registryRoot, "core/types.ts"),
|
|
152
145
|
dest: path2.join(installBaseAbs, "core/types.ts")
|
|
@@ -163,6 +156,10 @@ async function copyRegistryFiles(selection) {
|
|
|
163
156
|
src: path2.join(selection.registryRoot, "core/context-engine.ts"),
|
|
164
157
|
dest: path2.join(installBaseAbs, "core/context-engine.ts")
|
|
165
158
|
},
|
|
159
|
+
{
|
|
160
|
+
src: path2.join(selection.registryRoot, "core/delete.ts"),
|
|
161
|
+
dest: path2.join(installBaseAbs, "core/delete.ts")
|
|
162
|
+
},
|
|
166
163
|
{
|
|
167
164
|
src: path2.join(selection.registryRoot, "core/ingest.ts"),
|
|
168
165
|
dest: path2.join(installBaseAbs, "core/ingest.ts")
|
|
@@ -262,6 +259,70 @@ async function copyConnectorFiles(selection) {
|
|
|
262
259
|
await writeText(dest, raw);
|
|
263
260
|
}
|
|
264
261
|
}
|
|
262
|
+
/**
 * Copy a list of registry files into a destination directory, preserving each
 * file's path relative to `srcRootAbs`.
 *
 * Existing destination files are left untouched in non-interactive mode;
 * otherwise the user is asked per file whether to overwrite it.
 *
 * @returns `false` if the user cancelled a prompt (nothing further is copied),
 *          `true` once every file has been handled.
 */
async function copyExtractorTree({ files, srcRootAbs, destRootAbs, projectRoot, nonInteractive }) {
  for (const src of files) {
    if (!await exists(src)) {
      throw new Error(`Registry file missing: ${src}`);
    }
    const dest = path2.join(destRootAbs, path2.relative(srcRootAbs, src));
    if (await exists(dest)) {
      if (nonInteractive) {
        // Never clobber user-edited vendored files without an explicit "yes".
        continue;
      }
      const answer = await confirm({
        message: `Overwrite ${path2.relative(projectRoot, dest)}?`,
        initialValue: false
      });
      if (isCancel(answer)) {
        cancel("Cancelled.");
        return false;
      }
      if (!answer) {
        continue;
      }
    }
    await writeText(dest, await readText(src));
  }
  return true;
}

/**
 * Install one extractor module plus the shared extractor helpers into the
 * project's install directory.
 *
 * Reads `<registryRoot>/extractors/<extractor>` and, when present,
 * `<registryRoot>/extractors/_shared`, and writes them under
 * `<projectRoot>/<installDir>/extractors/...`.
 *
 * Prompts are skipped when `selection.yes` is truthy or stdin is not a TTY.
 * If the user cancels a prompt the function returns early without throwing;
 * files copied before the cancellation stay on disk.
 *
 * @param selection `{ projectRoot, registryRoot, installDir, extractor, yes }`
 * @throws Error when the extractor directory does not exist in the registry,
 *         or when a listed registry file has disappeared.
 */
async function copyExtractorFiles(selection) {
  const installBaseAbs = path2.join(selection.projectRoot, selection.installDir);
  const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
  const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
  if (!await exists(extractorRegistryAbs)) {
    throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
  }
  // Both listings are taken up front, before anything is written.
  const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
  const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
  const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;

  const extractorCopied = await copyExtractorTree({
    files: extractorFiles,
    srcRootAbs: extractorRegistryAbs,
    destRootAbs: path2.join(installBaseAbs, "extractors", selection.extractor),
    projectRoot: selection.projectRoot,
    nonInteractive
  });
  if (!extractorCopied) {
    // User cancelled: do not go on to the shared helpers.
    return;
  }
  await copyExtractorTree({
    files: sharedFiles,
    srcRootAbs: sharedRegistryAbs,
    destRootAbs: path2.join(installBaseAbs, "extractors", "_shared"),
    projectRoot: selection.projectRoot,
    nonInteractive
  });
}
|
|
265
326
|
|
|
266
327
|
// cli/lib/json.ts
|
|
267
328
|
import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
|
|
@@ -347,6 +408,37 @@ function depsForConnector(connector) {
|
|
|
347
408
|
}
|
|
348
409
|
return { deps, devDeps };
|
|
349
410
|
}
|
|
411
|
+
/**
 * Return the npm dependencies that must be added to the host project's
 * package.json when the given extractor is installed.
 *
 * Extractors with no extra runtime dependency (`pdf-ocr`, `file-text`) and
 * unrecognised names both yield empty maps. A fresh object is returned on
 * every call so callers may mutate the result.
 *
 * @param extractor extractor name, e.g. "pdf-llm"
 * @returns `{ deps, devDeps }` maps of package name to semver range
 */
function depsForExtractor(extractor) {
  // Single source of truth for the AI SDK range shared by all model-backed extractors.
  const aiSdkRange = "^5.0.113";
  const resolveDeps = () => {
    switch (extractor) {
      case "pdf-llm":
      case "image-ocr":
      case "image-caption-llm":
      case "audio-transcribe":
      case "video-transcribe":
      case "video-frames":
        return { ai: aiSdkRange };
      case "pdf-text-layer":
        return { "pdfjs-dist": "^5.4.149" };
      case "file-docx":
        return { mammoth: "^1.10.0" };
      case "file-pptx":
        return { jszip: "^3.10.1" };
      case "file-xlsx":
        return { xlsx: "^0.18.5" };
      default:
        // Includes "pdf-ocr" and "file-text", which add no npm dependencies.
        return {};
    }
  };
  return { deps: resolveDeps(), devDeps: {} };
}
|
|
350
442
|
function installCmd(pm) {
|
|
351
443
|
if (pm === "bun")
|
|
352
444
|
return "bun install";
|
|
@@ -550,7 +642,8 @@ async function initCommand(args) {
|
|
|
550
642
|
storeAdapter: storeAdapterAnswer,
|
|
551
643
|
aliasBase,
|
|
552
644
|
version: CONFIG_VERSION,
|
|
553
|
-
connectors: existing?.connectors ?? []
|
|
645
|
+
connectors: existing?.connectors ?? [],
|
|
646
|
+
extractors: existing?.extractors ?? []
|
|
554
647
|
};
|
|
555
648
|
await writeJsonFile(path5.join(root, CONFIG_FILE), config);
|
|
556
649
|
const pm = await detectPackageManager(root);
|
|
@@ -578,9 +671,34 @@ async function initCommand(args) {
|
|
|
578
671
|
import { outro as outro2 } from "@clack/prompts";
|
|
579
672
|
import path6 from "node:path";
|
|
580
673
|
import { fileURLToPath as fileURLToPath2 } from "node:url";
|
|
674
|
+
|
|
675
|
+
// cli/lib/constants.ts
|
|
676
|
+
/**
 * Base URL of the documentation site.
 *
 * `UNRAG_SITE_URL` wins over `UNRAG_DOCS_BASE_URL`; a variable that is unset,
 * empty, or whitespace-only is ignored so it cannot mask the next candidate.
 */
var UNRAG_SITE_URL = process.env.UNRAG_SITE_URL?.trim() || process.env.UNRAG_DOCS_BASE_URL?.trim() || "https://unrag.dev";
var UNRAG_GITHUB_REPO_URL = "https://github.com/BetterStacks/unrag";

/**
 * Build an absolute docs URL from a site-relative path.
 *
 * The path is resolved *underneath* the base URL, so a base that carries its
 * own path prefix (e.g. "https://example.com/unrag") is preserved rather than
 * being replaced by the site-relative path.
 *
 * @param siteRelativePath path such as "/docs/reference/cli"; leading slashes are optional
 * @param siteUrl base URL to resolve against (defaults to `UNRAG_SITE_URL`)
 * @returns the absolute URL as a string
 * @throws TypeError if `siteUrl` is not a valid absolute URL
 */
function docsUrl(siteRelativePath, siteUrl = UNRAG_SITE_URL) {
  const base = siteUrl.endsWith("/") ? siteUrl : `${siteUrl}/`;
  // "./" forces relative resolution even if the path happens to look like a scheme.
  const relative = `./${siteRelativePath.replace(/^\/+/, "")}`;
  return new URL(relative, base).toString();
}
|
|
683
|
+
|
|
684
|
+
// cli/commands/add.ts
|
|
581
685
|
var CONFIG_FILE2 = "unrag.json";
|
|
582
686
|
var __filename3 = fileURLToPath2(import.meta.url);
|
|
583
687
|
var __dirname3 = path6.dirname(__filename3);
|
|
688
|
+
var AVAILABLE_EXTRACTORS = [
|
|
689
|
+
"pdf-llm",
|
|
690
|
+
"pdf-text-layer",
|
|
691
|
+
"pdf-ocr",
|
|
692
|
+
"image-ocr",
|
|
693
|
+
"image-caption-llm",
|
|
694
|
+
"audio-transcribe",
|
|
695
|
+
"video-transcribe",
|
|
696
|
+
"video-frames",
|
|
697
|
+
"file-text",
|
|
698
|
+
"file-docx",
|
|
699
|
+
"file-pptx",
|
|
700
|
+
"file-xlsx"
|
|
701
|
+
];
|
|
584
702
|
var parseAddArgs = (args) => {
|
|
585
703
|
const out = {};
|
|
586
704
|
for (let i = 0;i < args.length; i++) {
|
|
@@ -589,8 +707,17 @@ var parseAddArgs = (args) => {
|
|
|
589
707
|
out.yes = true;
|
|
590
708
|
continue;
|
|
591
709
|
}
|
|
592
|
-
if (!out.
|
|
593
|
-
|
|
710
|
+
if (!out.kind && a && !a.startsWith("-")) {
|
|
711
|
+
if (a === "extractor") {
|
|
712
|
+
out.kind = "extractor";
|
|
713
|
+
continue;
|
|
714
|
+
}
|
|
715
|
+
out.kind = "connector";
|
|
716
|
+
out.name = a;
|
|
717
|
+
continue;
|
|
718
|
+
}
|
|
719
|
+
if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
|
|
720
|
+
out.name = a;
|
|
594
721
|
continue;
|
|
595
722
|
}
|
|
596
723
|
}
|
|
@@ -602,23 +729,24 @@ async function addCommand(args) {
|
|
|
602
729
|
throw new Error("Could not find a project root (no package.json found).");
|
|
603
730
|
}
|
|
604
731
|
const parsed = parseAddArgs(args);
|
|
605
|
-
const
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
732
|
+
const kind = parsed.kind ?? "connector";
|
|
733
|
+
const name = parsed.name;
|
|
734
|
+
if (!name) {
|
|
735
|
+
outro2([
|
|
736
|
+
"Usage:",
|
|
737
|
+
" unrag add <connector>",
|
|
738
|
+
" unrag add extractor <name>",
|
|
739
|
+
"",
|
|
740
|
+
"Available connectors: notion",
|
|
741
|
+
`Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
|
|
742
|
+
].join(`
|
|
743
|
+
`));
|
|
616
744
|
return;
|
|
617
745
|
}
|
|
618
746
|
const configPath = path6.join(root, CONFIG_FILE2);
|
|
619
747
|
const config = await readJsonFile(configPath);
|
|
620
748
|
if (!config?.installDir) {
|
|
621
|
-
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
|
|
749
|
+
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
|
|
622
750
|
}
|
|
623
751
|
const cliPackageRoot = await findUp(__dirname3, "package.json");
|
|
624
752
|
if (!cliPackageRoot) {
|
|
@@ -626,40 +754,120 @@ Available connectors: notion`);
|
|
|
626
754
|
}
|
|
627
755
|
const registryRoot = path6.join(cliPackageRoot, "registry");
|
|
628
756
|
const nonInteractive = parsed.yes || !process.stdin.isTTY;
|
|
629
|
-
await
|
|
757
|
+
const pkg = await readPackageJson(root);
|
|
758
|
+
if (kind === "connector") {
|
|
759
|
+
const connector = name;
|
|
760
|
+
if (connector !== "notion") {
|
|
761
|
+
outro2(`Unknown connector: ${name}
|
|
762
|
+
|
|
763
|
+
Available connectors: notion`);
|
|
764
|
+
return;
|
|
765
|
+
}
|
|
766
|
+
await copyConnectorFiles({
|
|
767
|
+
projectRoot: root,
|
|
768
|
+
registryRoot,
|
|
769
|
+
installDir: config.installDir,
|
|
770
|
+
connector,
|
|
771
|
+
yes: nonInteractive
|
|
772
|
+
});
|
|
773
|
+
const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
|
|
774
|
+
const merged2 = mergeDeps(pkg, deps2, devDeps2);
|
|
775
|
+
if (merged2.changes.length > 0) {
|
|
776
|
+
await writePackageJson(root, merged2.pkg);
|
|
777
|
+
}
|
|
778
|
+
const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
|
|
779
|
+
await writeJsonFile(configPath, { ...config, connectors });
|
|
780
|
+
outro2([
|
|
781
|
+
`Installed connector: ${connector}.`,
|
|
782
|
+
"",
|
|
783
|
+
`- Code: ${path6.join(config.installDir, "connectors", connector)}`,
|
|
784
|
+
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
785
|
+
"",
|
|
786
|
+
merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
787
|
+
nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
|
|
788
|
+
].filter(Boolean).join(`
|
|
789
|
+
`));
|
|
790
|
+
return;
|
|
791
|
+
}
|
|
792
|
+
const extractor = name;
|
|
793
|
+
if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
|
|
794
|
+
outro2(`Unknown extractor: ${name}
|
|
795
|
+
|
|
796
|
+
Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
|
|
797
|
+
return;
|
|
798
|
+
}
|
|
799
|
+
await copyExtractorFiles({
|
|
630
800
|
projectRoot: root,
|
|
631
801
|
registryRoot,
|
|
632
802
|
installDir: config.installDir,
|
|
633
|
-
|
|
803
|
+
extractor,
|
|
634
804
|
yes: nonInteractive
|
|
635
805
|
});
|
|
636
|
-
const
|
|
637
|
-
const { deps, devDeps } = depsForConnector(connector);
|
|
806
|
+
const { deps, devDeps } = depsForExtractor(extractor);
|
|
638
807
|
const merged = mergeDeps(pkg, deps, devDeps);
|
|
639
808
|
if (merged.changes.length > 0) {
|
|
640
809
|
await writePackageJson(root, merged.pkg);
|
|
641
810
|
}
|
|
642
|
-
const
|
|
643
|
-
await writeJsonFile(configPath, { ...config,
|
|
811
|
+
const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
|
|
812
|
+
await writeJsonFile(configPath, { ...config, extractors });
|
|
644
813
|
outro2([
|
|
645
|
-
`Installed
|
|
814
|
+
`Installed extractor: ${extractor}.`,
|
|
646
815
|
"",
|
|
647
|
-
`- Code: ${path6.join(config.installDir, "
|
|
648
|
-
`- Docs: /docs/connectors/${connector}`,
|
|
816
|
+
`- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
|
|
649
817
|
"",
|
|
650
818
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
651
|
-
|
|
819
|
+
"",
|
|
820
|
+
`Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
|
|
652
821
|
].filter(Boolean).join(`
|
|
653
822
|
`));
|
|
654
823
|
}
|
|
655
824
|
|
|
656
825
|
// cli/run.ts
|
|
826
|
+
/**
 * Build the multi-line help text shown for `unrag help`, `--help`, a missing
 * command, and (appended to the error) an unknown command.
 *
 * Lists both forms of `add` (connector and extractor) so the help agrees with
 * the usage message printed by the `add` command itself.
 *
 * @returns the help text, lines joined with "\n"
 */
function renderHelp() {
  const lines = [
    "unrag — vendor-in RAG primitives (ingest/retrieve + adapters) into your repo.",
    "",
    "Usage:",
    " bunx unrag <command> [options]",
    " npx unrag <command> [options]",
    "",
    "Commands:",
    " init Install core files (config + store adapter templates)",
    " add <connector> Install a connector (currently: notion)",
    " add extractor <name> Install an extractor (e.g. pdf-llm)",
    " help Show this help",
    "",
    "Global options:",
    " -h, --help Show help",
    " -y, --yes Non-interactive; accept defaults",
    "",
    "init options:",
    " --store <adapter> drizzle | prisma | raw-sql",
    " --dir <path> Install directory (alias: --install-dir)",
    " --alias <@name> Import alias base (e.g. @unrag)",
    "",
    "Examples:",
    " bunx unrag@latest init",
    " bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
    " bunx unrag@latest add notion --yes",
    " bunx unrag@latest add extractor pdf-llm --yes",
    "",
    "Docs:",
    ` - Quickstart: ${docsUrl("/docs/getting-started/quickstart")}`,
    ` - CLI: ${docsUrl("/docs/reference/cli")}`,
    ` - Notion: ${docsUrl("/docs/connectors/notion")}`,
    "",
    "Repo:",
    ` ${UNRAG_GITHUB_REPO_URL}`,
    "",
    "Tip:",
    " After `init`, open the generated unrag.md for schema + env vars (DATABASE_URL)."
  ];
  return lines.join("\n");
}
|
|
657
866
|
async function run(argv) {
|
|
658
867
|
const [, , command, ...rest] = argv;
|
|
659
868
|
intro("unrag");
|
|
660
869
|
if (!command || command === "help" || command === "--help" || command === "-h") {
|
|
661
|
-
outro3(
|
|
662
|
-
`));
|
|
870
|
+
outro3(renderHelp());
|
|
663
871
|
return;
|
|
664
872
|
}
|
|
665
873
|
if (command === "init") {
|
|
@@ -670,7 +878,8 @@ async function run(argv) {
|
|
|
670
878
|
await addCommand(rest);
|
|
671
879
|
return;
|
|
672
880
|
}
|
|
673
|
-
outro3(`Unknown command: ${command}`
|
|
881
|
+
outro3([`Unknown command: ${command}`, "", renderHelp()].join(`
|
|
882
|
+
`));
|
|
674
883
|
process.exitCode = 1;
|
|
675
884
|
}
|
|
676
885
|
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unrag",
|
|
3
3
|
"type": "module",
|
|
4
|
+
"repository": "https://github.com/BetterStacks/unrag",
|
|
4
5
|
"bin": {
|
|
5
6
|
"unrag": "./dist/cli/index.js"
|
|
6
7
|
},
|
|
7
|
-
"version": "0.2.
|
|
8
|
+
"version": "0.2.3",
|
|
8
9
|
"private": false,
|
|
9
10
|
"license": "Apache-2.0",
|
|
10
11
|
"devDependencies": {
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
* Root Unrag config (generated).
|
|
3
3
|
*
|
|
4
4
|
* This file is meant to be the single place you tweak:
|
|
5
|
+
* - Defaults (chunking + retrieval)
|
|
6
|
+
* - Engine settings (storage, asset processing, extractors)
|
|
5
7
|
* - Embedding provider/model/timeouts
|
|
6
|
-
* -
|
|
7
|
-
* - Retrieval defaults
|
|
8
|
-
* - How you construct your DB client (Pool/Prisma/etc)
|
|
8
|
+
* - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
|
|
9
9
|
*
|
|
10
10
|
* The files under your install dir (e.g. `lib/unrag/**`) are intended to be
|
|
11
11
|
* treated like vendored source code.
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
|
|
14
14
|
// __UNRAG_IMPORTS__
|
|
15
15
|
|
|
16
|
-
export const
|
|
16
|
+
export const unrag = defineUnragConfig({
|
|
17
|
+
defaults: {
|
|
17
18
|
chunking: {
|
|
18
19
|
chunkSize: 200,
|
|
19
20
|
chunkOverlap: 40,
|
|
@@ -21,11 +22,143 @@ export const unragConfig = {
|
|
|
21
22
|
retrieval: {
|
|
22
23
|
topK: 8,
|
|
23
24
|
},
|
|
25
|
+
},
|
|
24
26
|
embedding: {
|
|
25
|
-
|
|
26
|
-
|
|
27
|
+
provider: "ai",
|
|
28
|
+
config: {
|
|
29
|
+
type: "text",
|
|
30
|
+
model: "openai/text-embedding-3-small",
|
|
31
|
+
timeoutMs: 15_000,
|
|
32
|
+
},
|
|
33
|
+
},
|
|
34
|
+
engine: {
|
|
35
|
+
/**
|
|
36
|
+
* Storage controls.
|
|
37
|
+
*
|
|
38
|
+
* - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
|
|
39
|
+
* - storeDocumentContent: whether the full original document text is stored in `documents.content`.
|
|
40
|
+
*/
|
|
41
|
+
storage: {
|
|
42
|
+
storeChunkContent: true,
|
|
43
|
+
storeDocumentContent: true,
|
|
44
|
+
},
|
|
45
|
+
/**
|
|
46
|
+
* Optional extractor modules that can process non-text assets into text outputs.
|
|
47
|
+
*
|
|
48
|
+
* To install:
|
|
49
|
+
* - `unrag add extractor pdf-llm`
|
|
50
|
+
*
|
|
51
|
+
* Then import it in this file and add it here, for example:
|
|
52
|
+
* - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
|
|
53
|
+
* - `extractors: [createPdfLlmExtractor()]`
|
|
54
|
+
*/
|
|
55
|
+
extractors: [],
|
|
56
|
+
/**
|
|
57
|
+
* Rich media processing controls.
|
|
58
|
+
*
|
|
59
|
+
* Notes:
|
|
60
|
+
* - The library defaults are cost-safe (PDF LLM extraction is off).
|
|
61
|
+
* - This generated config opts you into PDF extraction for convenience.
|
|
62
|
+
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
63
|
+
*/
|
|
64
|
+
assetProcessing: {
|
|
65
|
+
onUnsupportedAsset: "skip",
|
|
66
|
+
onError: "skip",
|
|
67
|
+
concurrency: 4,
|
|
68
|
+
fetch: {
|
|
69
|
+
enabled: true,
|
|
70
|
+
maxBytes: 15 * 1024 * 1024,
|
|
71
|
+
timeoutMs: 20_000,
|
|
72
|
+
// allowedHosts: ["..."], // recommended to mitigate SSRF
|
|
73
|
+
},
|
|
74
|
+
pdf: {
|
|
75
|
+
// Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
|
|
76
|
+
textLayer: {
|
|
77
|
+
enabled: false,
|
|
78
|
+
maxBytes: 15 * 1024 * 1024,
|
|
79
|
+
maxOutputChars: 200_000,
|
|
80
|
+
minChars: 200,
|
|
81
|
+
// maxPages: 200,
|
|
82
|
+
},
|
|
83
|
+
llmExtraction: {
|
|
84
|
+
enabled: true,
|
|
85
|
+
model: "google/gemini-2.0-flash",
|
|
86
|
+
prompt:
|
|
87
|
+
"Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
|
|
88
|
+
timeoutMs: 60_000,
|
|
89
|
+
maxBytes: 15 * 1024 * 1024,
|
|
90
|
+
maxOutputChars: 200_000,
|
|
91
|
+
},
|
|
92
|
+
// Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
|
|
93
|
+
ocr: {
|
|
94
|
+
enabled: false,
|
|
95
|
+
maxBytes: 15 * 1024 * 1024,
|
|
96
|
+
maxOutputChars: 200_000,
|
|
97
|
+
minChars: 200,
|
|
98
|
+
// maxPages: 200,
|
|
99
|
+
// pdftoppmPath: "/usr/bin/pdftoppm",
|
|
100
|
+
// tesseractPath: "/usr/bin/tesseract",
|
|
101
|
+
// dpi: 200,
|
|
102
|
+
// lang: "eng",
|
|
103
|
+
},
|
|
104
|
+
},
|
|
105
|
+
image: {
|
|
106
|
+
ocr: {
|
|
107
|
+
enabled: false,
|
|
108
|
+
model: "google/gemini-2.0-flash",
|
|
109
|
+
prompt:
|
|
110
|
+
"Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
111
|
+
timeoutMs: 60_000,
|
|
112
|
+
maxBytes: 10 * 1024 * 1024,
|
|
113
|
+
maxOutputChars: 50_000,
|
|
114
|
+
},
|
|
115
|
+
captionLlm: {
|
|
116
|
+
enabled: false,
|
|
117
|
+
model: "google/gemini-2.0-flash",
|
|
118
|
+
prompt:
|
|
119
|
+
"Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
|
|
120
|
+
timeoutMs: 60_000,
|
|
121
|
+
maxBytes: 10 * 1024 * 1024,
|
|
122
|
+
maxOutputChars: 10_000,
|
|
123
|
+
},
|
|
124
|
+
},
|
|
125
|
+
audio: {
|
|
126
|
+
transcription: {
|
|
127
|
+
enabled: false,
|
|
128
|
+
model: "openai/whisper-1",
|
|
129
|
+
timeoutMs: 120_000,
|
|
130
|
+
maxBytes: 25 * 1024 * 1024,
|
|
131
|
+
},
|
|
132
|
+
},
|
|
133
|
+
video: {
|
|
134
|
+
transcription: {
|
|
135
|
+
enabled: false,
|
|
136
|
+
model: "openai/whisper-1",
|
|
137
|
+
timeoutMs: 120_000,
|
|
138
|
+
maxBytes: 50 * 1024 * 1024,
|
|
139
|
+
},
|
|
140
|
+
frames: {
|
|
141
|
+
enabled: false,
|
|
142
|
+
sampleFps: 0.2,
|
|
143
|
+
maxFrames: 50,
|
|
144
|
+
// ffmpegPath: "/usr/bin/ffmpeg",
|
|
145
|
+
maxBytes: 50 * 1024 * 1024,
|
|
146
|
+
model: "google/gemini-2.0-flash",
|
|
147
|
+
prompt:
|
|
148
|
+
"Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
149
|
+
timeoutMs: 60_000,
|
|
150
|
+
maxOutputChars: 50_000,
|
|
151
|
+
},
|
|
152
|
+
},
|
|
153
|
+
file: {
|
|
154
|
+
text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
155
|
+
docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
156
|
+
pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
157
|
+
xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
|
|
158
|
+
},
|
|
159
|
+
},
|
|
27
160
|
},
|
|
28
|
-
} as const;
|
|
161
|
+
} as const);
|
|
29
162
|
|
|
30
163
|
// __UNRAG_CREATE_ENGINE__
|
|
31
164
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import type { AssetInput, AssetKind, Metadata } from "../../core";
|
|
2
|
+
|
|
1
3
|
type RichText = { plain_text?: string };
|
|
2
4
|
|
|
3
5
|
export type NotionBlock = {
|
|
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
|
|
|
20
22
|
|
|
21
23
|
const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
|
|
22
24
|
|
|
25
|
+
/** Coerce any value to a trimmed string; `null` and `undefined` become "". */
const asString = (v: unknown) => String(v ?? "").trim();

/** Notion block types that are ingested as assets. */
const supportedAssetKinds = new Set<AssetKind>([
  "image",
  "pdf",
  "audio",
  "video",
  "file",
]);

/** Map a Notion block type to an asset kind, or `null` if the block is not an asset. */
const toAssetKind = (notionType: string): AssetKind | null => {
  const t = notionType as AssetKind;
  return supportedAssetKinds.has(t) ? t : null;
};

/**
 * Read the download URL out of a block payload.
 *
 * Handles the two payload shapes this connector reads: `type: "external"`
 * (URL under `external.url`) and `type: "file"` (URL under `file.url`).
 *
 * @returns the trimmed URL, or `undefined` when the payload has another type
 *          or carries no usable URL.
 */
const pickUrl = (payload: unknown): string | undefined => {
  if (typeof payload !== "object" || payload === null) return undefined;
  const source = payload as {
    type?: unknown;
    external?: { url?: unknown } | null;
    file?: { url?: unknown } | null;
  };
  const type = String(source.type ?? "");
  // `|| undefined` keeps the declared contract: an empty URL is "no URL".
  if (type === "external") return asString(source.external?.url) || undefined;
  if (type === "file") return asString(source.file?.url) || undefined;
  return undefined;
};

/** Render the payload's `caption` field to plain text via `rt`. */
const pickCaption = (payload: unknown): string => {
  // Notion captions are typically an array of rich text items.
  return rt((payload as { caption?: unknown } | null | undefined)?.caption);
};

/**
 * Best-effort media type for an asset: PDFs are always "application/pdf";
 * otherwise the payload's `media_type` is used when present.
 */
const inferMediaType = (assetKind: AssetKind, payload: unknown): string | undefined => {
  if (assetKind === "pdf") return "application/pdf";
  // Notion does not consistently include media types; keep it optional.
  return asString((payload as { media_type?: unknown } | null | undefined)?.media_type) || undefined;
};

/** Cast a plain record to the core `Metadata` type (unchecked by design). */
const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;
|
|
59
|
+
|
|
60
|
+
/**
 * Collect asset inputs from a Notion block tree.
 *
 * Walks the tree depth-first in document order. A block produces an asset when
 * its type maps to a supported asset kind and its payload yields a URL; the
 * block's caption, if non-empty, is attached as `text`. Blocks nested deeper
 * than `opts.maxDepth` (default 6, top-level nodes are depth 0) are ignored.
 *
 * @param nodes top-level block nodes
 * @param opts  traversal options
 * @returns the assets found, in traversal order
 */
export function extractNotionAssets(
  nodes: NotionBlockNode[],
  opts: { maxDepth?: number } = {}
): AssetInput[] {
  const depthLimit = opts.maxDepth ?? 6;
  const assets: AssetInput[] = [];

  const visit = (current: NotionBlockNode, level: number): void => {
    if (level > depthLimit) return;

    const raw = current.block as any;
    const assetKind = toAssetKind(String(raw.type ?? ""));
    if (assetKind) {
      // The payload lives under a key named after the block type.
      const payload = raw[assetKind];
      const url = pickUrl(payload);
      if (url) {
        const captionText = pickCaption(payload).trim();
        const mediaType = inferMediaType(assetKind, payload);
        const blockId = String(raw.id);
        assets.push({
          assetId: blockId,
          kind: assetKind,
          data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
          uri: url,
          ...(captionText ? { text: captionText } : {}),
          metadata: asMetadata({
            connector: "notion",
            notionBlockId: blockId,
            notionBlockType: String(raw.type),
          }),
        });
      }
    }

    for (const child of current.children) {
      visit(child, level + 1);
    }
  };

  for (const root of nodes) {
    visit(root, 0);
  }
  return assets;
}
|
|
100
|
+
|
|
23
101
|
export function renderNotionBlocksToText(
|
|
24
102
|
nodes: NotionBlockNode[],
|
|
25
103
|
opts: { maxDepth?: number } = {}
|