unrag 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +408 -50
- package/package.json +3 -1
- package/registry/config/unrag.config.ts +164 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md
CHANGED
|
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
|
|
|
10
10
|
## Usage
|
|
11
11
|
|
|
12
12
|
```bash
|
|
13
|
-
bunx unrag init
|
|
13
|
+
bunx unrag@latest init
|
|
14
14
|
```
|
|
15
15
|
|
|
16
16
|
### Common flags
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
19
|
+
bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
|
|
20
20
|
```
|
|
21
21
|
|
|
22
22
|
- `--store`: `drizzle` | `prisma` | `raw-sql`
|
package/dist/cli/index.js
CHANGED
|
@@ -4,7 +4,15 @@
|
|
|
4
4
|
import { intro, outro as outro3 } from "@clack/prompts";
|
|
5
5
|
|
|
6
6
|
// cli/commands/init.ts
|
|
7
|
-
import {
|
|
7
|
+
import {
|
|
8
|
+
cancel as cancel2,
|
|
9
|
+
confirm as confirm2,
|
|
10
|
+
groupMultiselect,
|
|
11
|
+
isCancel as isCancel2,
|
|
12
|
+
outro,
|
|
13
|
+
select,
|
|
14
|
+
text
|
|
15
|
+
} from "@clack/prompts";
|
|
8
16
|
import path5 from "node:path";
|
|
9
17
|
import { fileURLToPath } from "node:url";
|
|
10
18
|
|
|
@@ -71,11 +79,41 @@ var writeText = async (filePath, content) => {
|
|
|
71
79
|
await ensureDir(path2.dirname(filePath));
|
|
72
80
|
await writeFile(filePath, content, "utf8");
|
|
73
81
|
};
|
|
82
|
+
var EXTRACTOR_FACTORY = {
|
|
83
|
+
"pdf-llm": "createPdfLlmExtractor",
|
|
84
|
+
"pdf-text-layer": "createPdfTextLayerExtractor",
|
|
85
|
+
"pdf-ocr": "createPdfOcrExtractor",
|
|
86
|
+
"image-ocr": "createImageOcrExtractor",
|
|
87
|
+
"image-caption-llm": "createImageCaptionLlmExtractor",
|
|
88
|
+
"audio-transcribe": "createAudioTranscribeExtractor",
|
|
89
|
+
"video-transcribe": "createVideoTranscribeExtractor",
|
|
90
|
+
"video-frames": "createVideoFramesExtractor",
|
|
91
|
+
"file-text": "createFileTextExtractor",
|
|
92
|
+
"file-docx": "createFileDocxExtractor",
|
|
93
|
+
"file-pptx": "createFilePptxExtractor",
|
|
94
|
+
"file-xlsx": "createFileXlsxExtractor"
|
|
95
|
+
};
|
|
96
|
+
var EXTRACTOR_FLAG_KEYS = {
|
|
97
|
+
"pdf-text-layer": ["pdf_textLayer"],
|
|
98
|
+
"pdf-llm": ["pdf_llmExtraction"],
|
|
99
|
+
"pdf-ocr": ["pdf_ocr"],
|
|
100
|
+
"image-ocr": ["image_ocr"],
|
|
101
|
+
"image-caption-llm": ["image_captionLlm"],
|
|
102
|
+
"audio-transcribe": ["audio_transcription"],
|
|
103
|
+
"video-transcribe": ["video_transcription"],
|
|
104
|
+
"video-frames": ["video_frames"],
|
|
105
|
+
"file-text": ["file_text"],
|
|
106
|
+
"file-docx": ["file_docx"],
|
|
107
|
+
"file-pptx": ["file_pptx"],
|
|
108
|
+
"file-xlsx": ["file_xlsx"]
|
|
109
|
+
};
|
|
110
|
+
var ALL_FLAG_KEYS = Array.from(new Set(Object.values(EXTRACTOR_FLAG_KEYS).flat())).sort();
|
|
74
111
|
var renderUnragConfig = (content, selection) => {
|
|
75
112
|
const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
|
|
113
|
+
const richMedia = selection.richMedia ?? { enabled: false, extractors: [] };
|
|
114
|
+
const selectedExtractors = Array.from(new Set(richMedia.extractors ?? [])).sort();
|
|
76
115
|
const baseImports = [
|
|
77
|
-
`import {
|
|
78
|
-
`import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
|
|
116
|
+
`import { defineUnragConfig } from "${installImportBase}/core";`
|
|
79
117
|
];
|
|
80
118
|
const storeImports = [];
|
|
81
119
|
const storeCreateLines = [];
|
|
@@ -89,32 +127,40 @@ var renderUnragConfig = (content, selection) => {
|
|
|
89
127
|
storeImports.push(`import { createPrismaVectorStore } from "${installImportBase}/store/prisma";`, `import { PrismaClient } from "@prisma/client";`);
|
|
90
128
|
storeCreateLines.push(` const prisma = (globalThis as any).__unragPrisma ?? new PrismaClient();`, ` (globalThis as any).__unragPrisma = prisma;`, ` const store = createPrismaVectorStore(prisma);`);
|
|
91
129
|
}
|
|
92
|
-
const
|
|
130
|
+
const extractorImports = [];
|
|
131
|
+
if (richMedia.enabled && selectedExtractors.length > 0) {
|
|
132
|
+
for (const ex of selectedExtractors) {
|
|
133
|
+
const factory = EXTRACTOR_FACTORY[ex];
|
|
134
|
+
extractorImports.push(`import { ${factory} } from "${installImportBase}/extractors/${ex}";`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
const importsBlock = [...baseImports, ...storeImports, ...extractorImports].join(`
|
|
93
138
|
`);
|
|
94
139
|
const createEngineBlock = [
|
|
95
140
|
`export function createUnragEngine() {`,
|
|
96
|
-
` const embedding = createAiEmbeddingProvider({`,
|
|
97
|
-
` model: unragConfig.embedding.model,`,
|
|
98
|
-
` timeoutMs: unragConfig.embedding.timeoutMs,`,
|
|
99
|
-
` });`,
|
|
100
141
|
...storeCreateLines,
|
|
101
142
|
``,
|
|
102
|
-
` return
|
|
103
|
-
` defineConfig({`,
|
|
104
|
-
` embedding,`,
|
|
105
|
-
` store,`,
|
|
106
|
-
` defaults: unragConfig.chunking,`,
|
|
107
|
-
` })`,
|
|
108
|
-
` );`,
|
|
109
|
-
`}`,
|
|
110
|
-
``,
|
|
111
|
-
`export async function retrieve(query: string) {`,
|
|
112
|
-
` const engine = createUnragEngine();`,
|
|
113
|
-
` return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
|
|
143
|
+
` return unrag.createEngine({ store });`,
|
|
114
144
|
`}`
|
|
115
145
|
].join(`
|
|
116
146
|
`);
|
|
117
|
-
|
|
147
|
+
let out = content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
|
|
148
|
+
out = out.replace('type: "text", // __UNRAG_EMBEDDING_TYPE__', richMedia.enabled ? 'type: "multimodal",' : 'type: "text",').replace('model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__', richMedia.enabled ? 'model: "cohere/embed-v4.0",' : 'model: "openai/text-embedding-3-small",');
|
|
149
|
+
const enabledFlagKeys = new Set;
|
|
150
|
+
if (richMedia.enabled) {
|
|
151
|
+
for (const ex of selectedExtractors) {
|
|
152
|
+
for (const k of EXTRACTOR_FLAG_KEYS[ex] ?? []) {
|
|
153
|
+
enabledFlagKeys.add(k);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
for (const k of ALL_FLAG_KEYS) {
|
|
158
|
+
out = out.replace(`enabled: false, // __UNRAG_FLAG_${k}__`, `enabled: ${enabledFlagKeys.has(k) ? "true" : "false"},`);
|
|
159
|
+
}
|
|
160
|
+
const extractorLines = richMedia.enabled && selectedExtractors.length > 0 ? selectedExtractors.map((ex) => ` ${EXTRACTOR_FACTORY[ex]}(),`).join(`
|
|
161
|
+
`) : "";
|
|
162
|
+
out = out.replace(" // __UNRAG_EXTRACTORS__", extractorLines);
|
|
163
|
+
return out;
|
|
118
164
|
};
|
|
119
165
|
var renderDocs = (content, selection) => {
|
|
120
166
|
const notes = [];
|
|
@@ -147,6 +193,10 @@ async function copyRegistryFiles(selection) {
|
|
|
147
193
|
src: path2.join(selection.registryRoot, "core/index.ts"),
|
|
148
194
|
dest: path2.join(installBaseAbs, "core/index.ts")
|
|
149
195
|
},
|
|
196
|
+
{
|
|
197
|
+
src: path2.join(selection.registryRoot, "core/assets.ts"),
|
|
198
|
+
dest: path2.join(installBaseAbs, "core/assets.ts")
|
|
199
|
+
},
|
|
150
200
|
{
|
|
151
201
|
src: path2.join(selection.registryRoot, "core/types.ts"),
|
|
152
202
|
dest: path2.join(installBaseAbs, "core/types.ts")
|
|
@@ -163,6 +213,10 @@ async function copyRegistryFiles(selection) {
|
|
|
163
213
|
src: path2.join(selection.registryRoot, "core/context-engine.ts"),
|
|
164
214
|
dest: path2.join(installBaseAbs, "core/context-engine.ts")
|
|
165
215
|
},
|
|
216
|
+
{
|
|
217
|
+
src: path2.join(selection.registryRoot, "core/delete.ts"),
|
|
218
|
+
dest: path2.join(installBaseAbs, "core/delete.ts")
|
|
219
|
+
},
|
|
166
220
|
{
|
|
167
221
|
src: path2.join(selection.registryRoot, "core/ingest.ts"),
|
|
168
222
|
dest: path2.join(installBaseAbs, "core/ingest.ts")
|
|
@@ -262,6 +316,62 @@ async function copyConnectorFiles(selection) {
|
|
|
262
316
|
await writeText(dest, raw);
|
|
263
317
|
}
|
|
264
318
|
}
|
|
319
|
+
async function copyExtractorFiles(selection) {
|
|
320
|
+
const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
|
|
321
|
+
const installBaseAbs = toAbs(selection.installDir);
|
|
322
|
+
const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
|
|
323
|
+
const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
|
|
324
|
+
if (!await exists(extractorRegistryAbs)) {
|
|
325
|
+
throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
|
|
326
|
+
}
|
|
327
|
+
const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
|
|
328
|
+
const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
|
|
329
|
+
const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
|
|
330
|
+
const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
|
|
331
|
+
const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
|
|
332
|
+
const shouldWrite = async (src, dest) => {
|
|
333
|
+
if (!await exists(dest))
|
|
334
|
+
return true;
|
|
335
|
+
if (nonInteractive)
|
|
336
|
+
return false;
|
|
337
|
+
try {
|
|
338
|
+
const [srcRaw, destRaw] = await Promise.all([readText(src), readText(dest)]);
|
|
339
|
+
if (srcRaw === destRaw)
|
|
340
|
+
return false;
|
|
341
|
+
} catch {}
|
|
342
|
+
const answer = await confirm({
|
|
343
|
+
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
344
|
+
initialValue: false
|
|
345
|
+
});
|
|
346
|
+
if (isCancel(answer)) {
|
|
347
|
+
cancel("Cancelled.");
|
|
348
|
+
return false;
|
|
349
|
+
}
|
|
350
|
+
return Boolean(answer);
|
|
351
|
+
};
|
|
352
|
+
for (const src of extractorFiles) {
|
|
353
|
+
if (!await exists(src)) {
|
|
354
|
+
throw new Error(`Registry file missing: ${src}`);
|
|
355
|
+
}
|
|
356
|
+
const rel = path2.relative(extractorRegistryAbs, src);
|
|
357
|
+
const dest = path2.join(destRootAbs, rel);
|
|
358
|
+
if (!await shouldWrite(src, dest))
|
|
359
|
+
continue;
|
|
360
|
+
const raw = await readText(src);
|
|
361
|
+
await writeText(dest, raw);
|
|
362
|
+
}
|
|
363
|
+
for (const src of sharedFiles) {
|
|
364
|
+
if (!await exists(src)) {
|
|
365
|
+
throw new Error(`Registry file missing: ${src}`);
|
|
366
|
+
}
|
|
367
|
+
const rel = path2.relative(sharedRegistryAbs, src);
|
|
368
|
+
const dest = path2.join(sharedDestRootAbs, rel);
|
|
369
|
+
if (!await shouldWrite(src, dest))
|
|
370
|
+
continue;
|
|
371
|
+
const raw = await readText(src);
|
|
372
|
+
await writeText(dest, raw);
|
|
373
|
+
}
|
|
374
|
+
}
|
|
265
375
|
|
|
266
376
|
// cli/lib/json.ts
|
|
267
377
|
import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
|
|
@@ -347,6 +457,37 @@ function depsForConnector(connector) {
|
|
|
347
457
|
}
|
|
348
458
|
return { deps, devDeps };
|
|
349
459
|
}
|
|
460
|
+
function depsForExtractor(extractor) {
|
|
461
|
+
const deps = {};
|
|
462
|
+
const devDeps = {};
|
|
463
|
+
if (extractor === "pdf-llm") {
|
|
464
|
+
deps["ai"] = "^5.0.113";
|
|
465
|
+
}
|
|
466
|
+
if (extractor === "pdf-text-layer") {
|
|
467
|
+
deps["pdfjs-dist"] = "^5.4.149";
|
|
468
|
+
}
|
|
469
|
+
if (extractor === "pdf-ocr") {}
|
|
470
|
+
if (extractor === "image-ocr" || extractor === "image-caption-llm") {
|
|
471
|
+
deps["ai"] = "^5.0.113";
|
|
472
|
+
}
|
|
473
|
+
if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
|
|
474
|
+
deps["ai"] = "^5.0.113";
|
|
475
|
+
}
|
|
476
|
+
if (extractor === "video-frames") {
|
|
477
|
+
deps["ai"] = "^5.0.113";
|
|
478
|
+
}
|
|
479
|
+
if (extractor === "file-text") {}
|
|
480
|
+
if (extractor === "file-docx") {
|
|
481
|
+
deps["mammoth"] = "^1.10.0";
|
|
482
|
+
}
|
|
483
|
+
if (extractor === "file-pptx") {
|
|
484
|
+
deps["jszip"] = "^3.10.1";
|
|
485
|
+
}
|
|
486
|
+
if (extractor === "file-xlsx") {
|
|
487
|
+
deps["xlsx"] = "^0.18.5";
|
|
488
|
+
}
|
|
489
|
+
return { deps, devDeps };
|
|
490
|
+
}
|
|
350
491
|
function installCmd(pm) {
|
|
351
492
|
if (pm === "bun")
|
|
352
493
|
return "bun install";
|
|
@@ -460,9 +601,93 @@ var parseInitArgs = (args) => {
|
|
|
460
601
|
}
|
|
461
602
|
continue;
|
|
462
603
|
}
|
|
604
|
+
if (a === "--rich-media") {
|
|
605
|
+
out.richMedia = true;
|
|
606
|
+
continue;
|
|
607
|
+
}
|
|
608
|
+
if (a === "--no-rich-media") {
|
|
609
|
+
out.richMedia = false;
|
|
610
|
+
continue;
|
|
611
|
+
}
|
|
612
|
+
if (a === "--extractors") {
|
|
613
|
+
const v = args[i + 1];
|
|
614
|
+
if (v) {
|
|
615
|
+
out.extractors = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
616
|
+
i++;
|
|
617
|
+
}
|
|
618
|
+
continue;
|
|
619
|
+
}
|
|
463
620
|
}
|
|
464
621
|
return out;
|
|
465
622
|
};
|
|
623
|
+
var DEFAULT_RICH_MEDIA_EXTRACTORS = ["pdf-text-layer", "file-text"];
|
|
624
|
+
var EXTRACTOR_OPTIONS = [
|
|
625
|
+
{
|
|
626
|
+
group: "PDF",
|
|
627
|
+
value: "pdf-text-layer",
|
|
628
|
+
label: `pdf-text-layer (Fast/cheap extraction via PDF text layer)`,
|
|
629
|
+
hint: "recommended"
|
|
630
|
+
},
|
|
631
|
+
{
|
|
632
|
+
group: "PDF",
|
|
633
|
+
value: "pdf-llm",
|
|
634
|
+
label: `pdf-llm (LLM-based PDF extraction; higher cost)`
|
|
635
|
+
},
|
|
636
|
+
{
|
|
637
|
+
group: "PDF",
|
|
638
|
+
value: "pdf-ocr",
|
|
639
|
+
label: `pdf-ocr (OCR scanned PDFs; requires native binaries)`,
|
|
640
|
+
hint: "worker-only"
|
|
641
|
+
},
|
|
642
|
+
{
|
|
643
|
+
group: "Image",
|
|
644
|
+
value: "image-ocr",
|
|
645
|
+
label: `image-ocr (Extract text from images via vision LLM)`
|
|
646
|
+
},
|
|
647
|
+
{
|
|
648
|
+
group: "Image",
|
|
649
|
+
value: "image-caption-llm",
|
|
650
|
+
label: `image-caption-llm (Generate captions for images via vision LLM)`
|
|
651
|
+
},
|
|
652
|
+
{
|
|
653
|
+
group: "Audio",
|
|
654
|
+
value: "audio-transcribe",
|
|
655
|
+
label: `audio-transcribe (Speech-to-text transcription)`
|
|
656
|
+
},
|
|
657
|
+
{
|
|
658
|
+
group: "Video",
|
|
659
|
+
value: "video-transcribe",
|
|
660
|
+
label: `video-transcribe (Transcribe video audio track)`
|
|
661
|
+
},
|
|
662
|
+
{
|
|
663
|
+
group: "Video",
|
|
664
|
+
value: "video-frames",
|
|
665
|
+
label: `video-frames (Sample frames + analyze via vision LLM; requires ffmpeg)`,
|
|
666
|
+
hint: "worker-only"
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
group: "Files",
|
|
670
|
+
value: "file-text",
|
|
671
|
+
label: `file-text (Extract text/markdown/json/html from common text files)`,
|
|
672
|
+
hint: "recommended"
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
group: "Files",
|
|
676
|
+
value: "file-docx",
|
|
677
|
+
label: `file-docx (Extract text from .docx files)`
|
|
678
|
+
},
|
|
679
|
+
{
|
|
680
|
+
group: "Files",
|
|
681
|
+
value: "file-pptx",
|
|
682
|
+
label: `file-pptx (Extract text from .pptx slides)`
|
|
683
|
+
},
|
|
684
|
+
{
|
|
685
|
+
group: "Files",
|
|
686
|
+
value: "file-xlsx",
|
|
687
|
+
label: `file-xlsx (Extract tables from .xlsx spreadsheets)`
|
|
688
|
+
}
|
|
689
|
+
];
|
|
690
|
+
var AVAILABLE_EXTRACTORS = new Set(EXTRACTOR_OPTIONS.map((o) => o.value));
|
|
466
691
|
async function initCommand(args) {
|
|
467
692
|
const root = await tryFindProjectRoot(process.cwd());
|
|
468
693
|
if (!root) {
|
|
@@ -531,17 +756,71 @@ async function initCommand(args) {
|
|
|
531
756
|
return;
|
|
532
757
|
}
|
|
533
758
|
const aliasBase = String(aliasAnswer).trim();
|
|
759
|
+
if (parsed.richMedia === false && (parsed.extractors ?? []).length > 0) {
|
|
760
|
+
throw new Error('Cannot use "--no-rich-media" together with "--extractors".');
|
|
761
|
+
}
|
|
762
|
+
const extractorsFromArgs = (parsed.extractors ?? []).filter((x) => AVAILABLE_EXTRACTORS.has(x)).sort();
|
|
763
|
+
const richMediaAnswer = extractorsFromArgs.length > 0 ? true : typeof parsed.richMedia === "boolean" ? parsed.richMedia : nonInteractive ? false : await confirm2({
|
|
764
|
+
message: "Enable rich media ingestion (PDF/images/audio/video/files)? This also enables multimodal image embeddings (you can change this later).",
|
|
765
|
+
initialValue: false
|
|
766
|
+
});
|
|
767
|
+
if (isCancel2(richMediaAnswer)) {
|
|
768
|
+
cancel2("Cancelled.");
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
const richMediaEnabled = Boolean(richMediaAnswer);
|
|
772
|
+
const selectedExtractorsAnswer = richMediaEnabled || extractorsFromArgs.length > 0 ? nonInteractive ? extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS : await groupMultiselect({
|
|
773
|
+
message: "Select extractors to enable (space to toggle, enter to confirm)",
|
|
774
|
+
options: EXTRACTOR_OPTIONS.reduce((acc, opt) => {
|
|
775
|
+
acc[opt.group] ??= [];
|
|
776
|
+
acc[opt.group].push({
|
|
777
|
+
value: opt.value,
|
|
778
|
+
label: opt.label,
|
|
779
|
+
...opt.hint ? { hint: opt.hint } : {}
|
|
780
|
+
});
|
|
781
|
+
return acc;
|
|
782
|
+
}, {}),
|
|
783
|
+
initialValues: extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS,
|
|
784
|
+
required: false
|
|
785
|
+
}) : [];
|
|
786
|
+
if (isCancel2(selectedExtractorsAnswer)) {
|
|
787
|
+
cancel2("Cancelled.");
|
|
788
|
+
return;
|
|
789
|
+
}
|
|
790
|
+
const selectedExtractors = Array.from(new Set(Array.isArray(selectedExtractorsAnswer) ? selectedExtractorsAnswer : [])).sort();
|
|
534
791
|
const selection = {
|
|
535
792
|
installDir,
|
|
536
793
|
storeAdapter: storeAdapterAnswer,
|
|
537
794
|
projectRoot: root,
|
|
538
795
|
registryRoot,
|
|
539
|
-
aliasBase
|
|
796
|
+
aliasBase,
|
|
797
|
+
richMedia: richMediaEnabled ? {
|
|
798
|
+
enabled: true,
|
|
799
|
+
extractors: selectedExtractors
|
|
800
|
+
} : { enabled: false, extractors: [] }
|
|
540
801
|
};
|
|
541
802
|
await copyRegistryFiles(selection);
|
|
803
|
+
if (richMediaEnabled && selectedExtractors.length > 0) {
|
|
804
|
+
for (const extractor of selectedExtractors) {
|
|
805
|
+
await copyExtractorFiles({
|
|
806
|
+
projectRoot: root,
|
|
807
|
+
registryRoot,
|
|
808
|
+
installDir,
|
|
809
|
+
extractor,
|
|
810
|
+
yes: nonInteractive
|
|
811
|
+
});
|
|
812
|
+
}
|
|
813
|
+
}
|
|
542
814
|
const pkg = await readPackageJson(root);
|
|
543
815
|
const { deps, devDeps } = depsForAdapter(storeAdapterAnswer);
|
|
544
|
-
const
|
|
816
|
+
const extractorDeps = {};
|
|
817
|
+
const extractorDevDeps = {};
|
|
818
|
+
for (const ex of selectedExtractors) {
|
|
819
|
+
const r = depsForExtractor(ex);
|
|
820
|
+
Object.assign(extractorDeps, r.deps);
|
|
821
|
+
Object.assign(extractorDevDeps, r.devDeps);
|
|
822
|
+
}
|
|
823
|
+
const merged = mergeDeps(pkg, { ...deps, ...extractorDeps }, { ...devDeps, ...extractorDevDeps });
|
|
545
824
|
if (merged.changes.length > 0) {
|
|
546
825
|
await writePackageJson(root, merged.pkg);
|
|
547
826
|
}
|
|
@@ -550,7 +829,11 @@ async function initCommand(args) {
|
|
|
550
829
|
storeAdapter: storeAdapterAnswer,
|
|
551
830
|
aliasBase,
|
|
552
831
|
version: CONFIG_VERSION,
|
|
553
|
-
connectors: existing?.connectors ?? []
|
|
832
|
+
connectors: existing?.connectors ?? [],
|
|
833
|
+
extractors: Array.from(new Set([
|
|
834
|
+
...existing?.extractors ?? [],
|
|
835
|
+
...richMediaEnabled ? selectedExtractors : []
|
|
836
|
+
])).sort()
|
|
554
837
|
};
|
|
555
838
|
await writeJsonFile(path5.join(root, CONFIG_FILE), config);
|
|
556
839
|
const pm = await detectPackageManager(root);
|
|
@@ -564,6 +847,11 @@ async function initCommand(args) {
|
|
|
564
847
|
`- Docs: ${path5.join(installDir, "unrag.md")}`,
|
|
565
848
|
`- Config: unrag.config.ts`,
|
|
566
849
|
`- Imports: ${aliasBase}/* and ${aliasBase}/config`,
|
|
850
|
+
"",
|
|
851
|
+
`- Rich media: ${richMediaEnabled ? "enabled" : "disabled"}`,
|
|
852
|
+
richMediaEnabled ? `- Embeddings: multimodal enabled (images can be embedded directly)` : `- Embeddings: text-only (no direct image embedding)`,
|
|
853
|
+
richMediaEnabled ? `- Extractors: ${selectedExtractors.length > 0 ? selectedExtractors.join(", ") : "none"}` : "",
|
|
854
|
+
richMediaEnabled ? ` Tip: you can tweak extractors + assetProcessing flags in unrag.config.ts later.` : ` Tip: re-run \`unrag init --rich-media\` (or edit unrag.config.ts) to enable rich media later.`,
|
|
567
855
|
isNext ? tsconfigResult.changed ? `- Next.js: updated ${tsconfigResult.file} (added aliases)` : `- Next.js: no tsconfig changes needed` : `- Next.js: not detected`,
|
|
568
856
|
"",
|
|
569
857
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
@@ -592,6 +880,20 @@ function docsUrl(siteRelativePath) {
|
|
|
592
880
|
var CONFIG_FILE2 = "unrag.json";
|
|
593
881
|
var __filename3 = fileURLToPath2(import.meta.url);
|
|
594
882
|
var __dirname3 = path6.dirname(__filename3);
|
|
883
|
+
var AVAILABLE_EXTRACTORS2 = [
|
|
884
|
+
"pdf-llm",
|
|
885
|
+
"pdf-text-layer",
|
|
886
|
+
"pdf-ocr",
|
|
887
|
+
"image-ocr",
|
|
888
|
+
"image-caption-llm",
|
|
889
|
+
"audio-transcribe",
|
|
890
|
+
"video-transcribe",
|
|
891
|
+
"video-frames",
|
|
892
|
+
"file-text",
|
|
893
|
+
"file-docx",
|
|
894
|
+
"file-pptx",
|
|
895
|
+
"file-xlsx"
|
|
896
|
+
];
|
|
595
897
|
var parseAddArgs = (args) => {
|
|
596
898
|
const out = {};
|
|
597
899
|
for (let i = 0;i < args.length; i++) {
|
|
@@ -600,8 +902,17 @@ var parseAddArgs = (args) => {
|
|
|
600
902
|
out.yes = true;
|
|
601
903
|
continue;
|
|
602
904
|
}
|
|
603
|
-
if (!out.
|
|
604
|
-
|
|
905
|
+
if (!out.kind && a && !a.startsWith("-")) {
|
|
906
|
+
if (a === "extractor") {
|
|
907
|
+
out.kind = "extractor";
|
|
908
|
+
continue;
|
|
909
|
+
}
|
|
910
|
+
out.kind = "connector";
|
|
911
|
+
out.name = a;
|
|
912
|
+
continue;
|
|
913
|
+
}
|
|
914
|
+
if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
|
|
915
|
+
out.name = a;
|
|
605
916
|
continue;
|
|
606
917
|
}
|
|
607
918
|
}
|
|
@@ -613,23 +924,24 @@ async function addCommand(args) {
|
|
|
613
924
|
throw new Error("Could not find a project root (no package.json found).");
|
|
614
925
|
}
|
|
615
926
|
const parsed = parseAddArgs(args);
|
|
616
|
-
const
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
927
|
+
const kind = parsed.kind ?? "connector";
|
|
928
|
+
const name = parsed.name;
|
|
929
|
+
if (!name) {
|
|
930
|
+
outro2([
|
|
931
|
+
"Usage:",
|
|
932
|
+
" unrag add <connector>",
|
|
933
|
+
" unrag add extractor <name>",
|
|
934
|
+
"",
|
|
935
|
+
"Available connectors: notion",
|
|
936
|
+
`Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`
|
|
937
|
+
].join(`
|
|
938
|
+
`));
|
|
627
939
|
return;
|
|
628
940
|
}
|
|
629
941
|
const configPath = path6.join(root, CONFIG_FILE2);
|
|
630
942
|
const config = await readJsonFile(configPath);
|
|
631
943
|
if (!config?.installDir) {
|
|
632
|
-
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
|
|
944
|
+
throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
|
|
633
945
|
}
|
|
634
946
|
const cliPackageRoot = await findUp(__dirname3, "package.json");
|
|
635
947
|
if (!cliPackageRoot) {
|
|
@@ -637,29 +949,70 @@ Available connectors: notion`);
|
|
|
637
949
|
}
|
|
638
950
|
const registryRoot = path6.join(cliPackageRoot, "registry");
|
|
639
951
|
const nonInteractive = parsed.yes || !process.stdin.isTTY;
|
|
640
|
-
await
|
|
952
|
+
const pkg = await readPackageJson(root);
|
|
953
|
+
if (kind === "connector") {
|
|
954
|
+
const connector = name;
|
|
955
|
+
if (connector !== "notion") {
|
|
956
|
+
outro2(`Unknown connector: ${name}
|
|
957
|
+
|
|
958
|
+
Available connectors: notion`);
|
|
959
|
+
return;
|
|
960
|
+
}
|
|
961
|
+
await copyConnectorFiles({
|
|
962
|
+
projectRoot: root,
|
|
963
|
+
registryRoot,
|
|
964
|
+
installDir: config.installDir,
|
|
965
|
+
connector,
|
|
966
|
+
yes: nonInteractive
|
|
967
|
+
});
|
|
968
|
+
const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
|
|
969
|
+
const merged2 = mergeDeps(pkg, deps2, devDeps2);
|
|
970
|
+
if (merged2.changes.length > 0) {
|
|
971
|
+
await writePackageJson(root, merged2.pkg);
|
|
972
|
+
}
|
|
973
|
+
const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
|
|
974
|
+
await writeJsonFile(configPath, { ...config, connectors });
|
|
975
|
+
outro2([
|
|
976
|
+
`Installed connector: ${connector}.`,
|
|
977
|
+
"",
|
|
978
|
+
`- Code: ${path6.join(config.installDir, "connectors", connector)}`,
|
|
979
|
+
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
980
|
+
"",
|
|
981
|
+
merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
982
|
+
nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
|
|
983
|
+
].filter(Boolean).join(`
|
|
984
|
+
`));
|
|
985
|
+
return;
|
|
986
|
+
}
|
|
987
|
+
const extractor = name;
|
|
988
|
+
if (!extractor || !AVAILABLE_EXTRACTORS2.includes(extractor)) {
|
|
989
|
+
outro2(`Unknown extractor: ${name}
|
|
990
|
+
|
|
991
|
+
Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`);
|
|
992
|
+
return;
|
|
993
|
+
}
|
|
994
|
+
await copyExtractorFiles({
|
|
641
995
|
projectRoot: root,
|
|
642
996
|
registryRoot,
|
|
643
997
|
installDir: config.installDir,
|
|
644
|
-
|
|
998
|
+
extractor,
|
|
645
999
|
yes: nonInteractive
|
|
646
1000
|
});
|
|
647
|
-
const
|
|
648
|
-
const { deps, devDeps } = depsForConnector(connector);
|
|
1001
|
+
const { deps, devDeps } = depsForExtractor(extractor);
|
|
649
1002
|
const merged = mergeDeps(pkg, deps, devDeps);
|
|
650
1003
|
if (merged.changes.length > 0) {
|
|
651
1004
|
await writePackageJson(root, merged.pkg);
|
|
652
1005
|
}
|
|
653
|
-
const
|
|
654
|
-
await writeJsonFile(configPath, { ...config,
|
|
1006
|
+
const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
|
|
1007
|
+
await writeJsonFile(configPath, { ...config, extractors });
|
|
655
1008
|
outro2([
|
|
656
|
-
`Installed
|
|
1009
|
+
`Installed extractor: ${extractor}.`,
|
|
657
1010
|
"",
|
|
658
|
-
`- Code: ${path6.join(config.installDir, "
|
|
659
|
-
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
1011
|
+
`- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
|
|
660
1012
|
"",
|
|
661
1013
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
662
|
-
|
|
1014
|
+
"",
|
|
1015
|
+
`Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
|
|
663
1016
|
].filter(Boolean).join(`
|
|
664
1017
|
`));
|
|
665
1018
|
}
|
|
@@ -686,10 +1039,15 @@ function renderHelp() {
|
|
|
686
1039
|
" --store <adapter> drizzle | prisma | raw-sql",
|
|
687
1040
|
" --dir <path> Install directory (alias: --install-dir)",
|
|
688
1041
|
" --alias <@name> Import alias base (e.g. @unrag)",
|
|
1042
|
+
" --rich-media Enable rich media setup (also enables multimodal embeddings)",
|
|
1043
|
+
" --no-rich-media Disable rich media setup",
|
|
1044
|
+
" --extractors <list> Comma-separated extractors (implies --rich-media)",
|
|
689
1045
|
"",
|
|
690
1046
|
"Examples:",
|
|
691
|
-
" bunx unrag init",
|
|
692
|
-
" bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
1047
|
+
" bunx unrag@latest init",
|
|
1048
|
+
" bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
1049
|
+
" bunx unrag@latest init --yes --rich-media",
|
|
1050
|
+
" bunx unrag@latest init --yes --extractors pdf-text-layer,file-text",
|
|
693
1051
|
" bunx unrag add notion --yes",
|
|
694
1052
|
"",
|
|
695
1053
|
"Docs:",
|
package/package.json
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "unrag",
|
|
3
3
|
"type": "module",
|
|
4
|
+
"repository": "https://github.com/BetterStacks/unrag",
|
|
5
|
+
"homepage": "https://unrag.dev",
|
|
4
6
|
"bin": {
|
|
5
7
|
"unrag": "./dist/cli/index.js"
|
|
6
8
|
},
|
|
7
|
-
"version": "0.2.
|
|
9
|
+
"version": "0.2.4",
|
|
8
10
|
"private": false,
|
|
9
11
|
"license": "Apache-2.0",
|
|
10
12
|
"devDependencies": {
|