unrag 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +247 -47
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +41 -17
package/dist/cli/index.js
CHANGED
|
@@ -4,7 +4,15 @@
|
|
|
4
4
|
import { intro, outro as outro3 } from "@clack/prompts";
|
|
5
5
|
|
|
6
6
|
// cli/commands/init.ts
|
|
7
|
-
import {
|
|
7
|
+
import {
|
|
8
|
+
cancel as cancel2,
|
|
9
|
+
confirm as confirm2,
|
|
10
|
+
groupMultiselect,
|
|
11
|
+
isCancel as isCancel2,
|
|
12
|
+
outro,
|
|
13
|
+
select,
|
|
14
|
+
text
|
|
15
|
+
} from "@clack/prompts";
|
|
8
16
|
import path5 from "node:path";
|
|
9
17
|
import { fileURLToPath } from "node:url";
|
|
10
18
|
|
|
@@ -71,8 +79,39 @@ var writeText = async (filePath, content) => {
|
|
|
71
79
|
await ensureDir(path2.dirname(filePath));
|
|
72
80
|
await writeFile(filePath, content, "utf8");
|
|
73
81
|
};
|
|
82
|
+
var EXTRACTOR_FACTORY = {
|
|
83
|
+
"pdf-llm": "createPdfLlmExtractor",
|
|
84
|
+
"pdf-text-layer": "createPdfTextLayerExtractor",
|
|
85
|
+
"pdf-ocr": "createPdfOcrExtractor",
|
|
86
|
+
"image-ocr": "createImageOcrExtractor",
|
|
87
|
+
"image-caption-llm": "createImageCaptionLlmExtractor",
|
|
88
|
+
"audio-transcribe": "createAudioTranscribeExtractor",
|
|
89
|
+
"video-transcribe": "createVideoTranscribeExtractor",
|
|
90
|
+
"video-frames": "createVideoFramesExtractor",
|
|
91
|
+
"file-text": "createFileTextExtractor",
|
|
92
|
+
"file-docx": "createFileDocxExtractor",
|
|
93
|
+
"file-pptx": "createFilePptxExtractor",
|
|
94
|
+
"file-xlsx": "createFileXlsxExtractor"
|
|
95
|
+
};
|
|
96
|
+
var EXTRACTOR_FLAG_KEYS = {
|
|
97
|
+
"pdf-text-layer": ["pdf_textLayer"],
|
|
98
|
+
"pdf-llm": ["pdf_llmExtraction"],
|
|
99
|
+
"pdf-ocr": ["pdf_ocr"],
|
|
100
|
+
"image-ocr": ["image_ocr"],
|
|
101
|
+
"image-caption-llm": ["image_captionLlm"],
|
|
102
|
+
"audio-transcribe": ["audio_transcription"],
|
|
103
|
+
"video-transcribe": ["video_transcription"],
|
|
104
|
+
"video-frames": ["video_frames"],
|
|
105
|
+
"file-text": ["file_text"],
|
|
106
|
+
"file-docx": ["file_docx"],
|
|
107
|
+
"file-pptx": ["file_pptx"],
|
|
108
|
+
"file-xlsx": ["file_xlsx"]
|
|
109
|
+
};
|
|
110
|
+
var ALL_FLAG_KEYS = Array.from(new Set(Object.values(EXTRACTOR_FLAG_KEYS).flat())).sort();
|
|
74
111
|
var renderUnragConfig = (content, selection) => {
|
|
75
112
|
const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
|
|
113
|
+
const richMedia = selection.richMedia ?? { enabled: false, extractors: [] };
|
|
114
|
+
const selectedExtractors = Array.from(new Set(richMedia.extractors ?? [])).sort();
|
|
76
115
|
const baseImports = [
|
|
77
116
|
`import { defineUnragConfig } from "${installImportBase}/core";`
|
|
78
117
|
];
|
|
@@ -88,22 +127,40 @@ var renderUnragConfig = (content, selection) => {
|
|
|
88
127
|
storeImports.push(`import { createPrismaVectorStore } from "${installImportBase}/store/prisma";`, `import { PrismaClient } from "@prisma/client";`);
|
|
89
128
|
storeCreateLines.push(` const prisma = (globalThis as any).__unragPrisma ?? new PrismaClient();`, ` (globalThis as any).__unragPrisma = prisma;`, ` const store = createPrismaVectorStore(prisma);`);
|
|
90
129
|
}
|
|
91
|
-
const
|
|
130
|
+
const extractorImports = [];
|
|
131
|
+
if (richMedia.enabled && selectedExtractors.length > 0) {
|
|
132
|
+
for (const ex of selectedExtractors) {
|
|
133
|
+
const factory = EXTRACTOR_FACTORY[ex];
|
|
134
|
+
extractorImports.push(`import { ${factory} } from "${installImportBase}/extractors/${ex}";`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
const importsBlock = [...baseImports, ...storeImports, ...extractorImports].join(`
|
|
92
138
|
`);
|
|
93
139
|
const createEngineBlock = [
|
|
94
140
|
`export function createUnragEngine() {`,
|
|
95
141
|
...storeCreateLines,
|
|
96
142
|
``,
|
|
97
143
|
` return unrag.createEngine({ store });`,
|
|
98
|
-
`}`,
|
|
99
|
-
``,
|
|
100
|
-
`export async function retrieve(query: string) {`,
|
|
101
|
-
` const engine = createUnragEngine();`,
|
|
102
|
-
` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
|
|
103
144
|
`}`
|
|
104
145
|
].join(`
|
|
105
146
|
`);
|
|
106
|
-
|
|
147
|
+
let out = content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
|
|
148
|
+
out = out.replace('type: "text", // __UNRAG_EMBEDDING_TYPE__', richMedia.enabled ? 'type: "multimodal",' : 'type: "text",').replace('model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__', richMedia.enabled ? 'model: "cohere/embed-v4.0",' : 'model: "openai/text-embedding-3-small",');
|
|
149
|
+
const enabledFlagKeys = new Set;
|
|
150
|
+
if (richMedia.enabled) {
|
|
151
|
+
for (const ex of selectedExtractors) {
|
|
152
|
+
for (const k of EXTRACTOR_FLAG_KEYS[ex] ?? []) {
|
|
153
|
+
enabledFlagKeys.add(k);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
for (const k of ALL_FLAG_KEYS) {
|
|
158
|
+
out = out.replace(`enabled: false, // __UNRAG_FLAG_${k}__`, `enabled: ${enabledFlagKeys.has(k) ? "true" : "false"},`);
|
|
159
|
+
}
|
|
160
|
+
const extractorLines = richMedia.enabled && selectedExtractors.length > 0 ? selectedExtractors.map((ex) => ` ${EXTRACTOR_FACTORY[ex]}(),`).join(`
|
|
161
|
+
`) : "";
|
|
162
|
+
out = out.replace(" // __UNRAG_EXTRACTORS__", extractorLines);
|
|
163
|
+
return out;
|
|
107
164
|
};
|
|
108
165
|
var renderDocs = (content, selection) => {
|
|
109
166
|
const notes = [];
|
|
@@ -272,28 +329,34 @@ async function copyExtractorFiles(selection) {
|
|
|
272
329
|
const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
|
|
273
330
|
const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
|
|
274
331
|
const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
|
|
332
|
+
const shouldWrite = async (src, dest) => {
|
|
333
|
+
if (!await exists(dest))
|
|
334
|
+
return true;
|
|
335
|
+
if (nonInteractive)
|
|
336
|
+
return false;
|
|
337
|
+
try {
|
|
338
|
+
const [srcRaw, destRaw] = await Promise.all([readText(src), readText(dest)]);
|
|
339
|
+
if (srcRaw === destRaw)
|
|
340
|
+
return false;
|
|
341
|
+
} catch {}
|
|
342
|
+
const answer = await confirm({
|
|
343
|
+
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
344
|
+
initialValue: false
|
|
345
|
+
});
|
|
346
|
+
if (isCancel(answer)) {
|
|
347
|
+
cancel("Cancelled.");
|
|
348
|
+
return false;
|
|
349
|
+
}
|
|
350
|
+
return Boolean(answer);
|
|
351
|
+
};
|
|
275
352
|
for (const src of extractorFiles) {
|
|
276
353
|
if (!await exists(src)) {
|
|
277
354
|
throw new Error(`Registry file missing: ${src}`);
|
|
278
355
|
}
|
|
279
356
|
const rel = path2.relative(extractorRegistryAbs, src);
|
|
280
357
|
const dest = path2.join(destRootAbs, rel);
|
|
281
|
-
if (await
|
|
282
|
-
|
|
283
|
-
continue;
|
|
284
|
-
}
|
|
285
|
-
const answer = await confirm({
|
|
286
|
-
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
287
|
-
initialValue: false
|
|
288
|
-
});
|
|
289
|
-
if (isCancel(answer)) {
|
|
290
|
-
cancel("Cancelled.");
|
|
291
|
-
return;
|
|
292
|
-
}
|
|
293
|
-
if (!answer) {
|
|
294
|
-
continue;
|
|
295
|
-
}
|
|
296
|
-
}
|
|
358
|
+
if (!await shouldWrite(src, dest))
|
|
359
|
+
continue;
|
|
297
360
|
const raw = await readText(src);
|
|
298
361
|
await writeText(dest, raw);
|
|
299
362
|
}
|
|
@@ -303,22 +366,8 @@ async function copyExtractorFiles(selection) {
|
|
|
303
366
|
}
|
|
304
367
|
const rel = path2.relative(sharedRegistryAbs, src);
|
|
305
368
|
const dest = path2.join(sharedDestRootAbs, rel);
|
|
306
|
-
if (await
|
|
307
|
-
|
|
308
|
-
continue;
|
|
309
|
-
}
|
|
310
|
-
const answer = await confirm({
|
|
311
|
-
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
312
|
-
initialValue: false
|
|
313
|
-
});
|
|
314
|
-
if (isCancel(answer)) {
|
|
315
|
-
cancel("Cancelled.");
|
|
316
|
-
return;
|
|
317
|
-
}
|
|
318
|
-
if (!answer) {
|
|
319
|
-
continue;
|
|
320
|
-
}
|
|
321
|
-
}
|
|
369
|
+
if (!await shouldWrite(src, dest))
|
|
370
|
+
continue;
|
|
322
371
|
const raw = await readText(src);
|
|
323
372
|
await writeText(dest, raw);
|
|
324
373
|
}
|
|
@@ -552,9 +601,93 @@ var parseInitArgs = (args) => {
|
|
|
552
601
|
}
|
|
553
602
|
continue;
|
|
554
603
|
}
|
|
604
|
+
if (a === "--rich-media") {
|
|
605
|
+
out.richMedia = true;
|
|
606
|
+
continue;
|
|
607
|
+
}
|
|
608
|
+
if (a === "--no-rich-media") {
|
|
609
|
+
out.richMedia = false;
|
|
610
|
+
continue;
|
|
611
|
+
}
|
|
612
|
+
if (a === "--extractors") {
|
|
613
|
+
const v = args[i + 1];
|
|
614
|
+
if (v) {
|
|
615
|
+
out.extractors = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
616
|
+
i++;
|
|
617
|
+
}
|
|
618
|
+
continue;
|
|
619
|
+
}
|
|
555
620
|
}
|
|
556
621
|
return out;
|
|
557
622
|
};
|
|
623
|
+
var DEFAULT_RICH_MEDIA_EXTRACTORS = ["pdf-text-layer", "file-text"];
|
|
624
|
+
var EXTRACTOR_OPTIONS = [
|
|
625
|
+
{
|
|
626
|
+
group: "PDF",
|
|
627
|
+
value: "pdf-text-layer",
|
|
628
|
+
label: `pdf-text-layer (Fast/cheap extraction via PDF text layer)`,
|
|
629
|
+
hint: "recommended"
|
|
630
|
+
},
|
|
631
|
+
{
|
|
632
|
+
group: "PDF",
|
|
633
|
+
value: "pdf-llm",
|
|
634
|
+
label: `pdf-llm (LLM-based PDF extraction; higher cost)`
|
|
635
|
+
},
|
|
636
|
+
{
|
|
637
|
+
group: "PDF",
|
|
638
|
+
value: "pdf-ocr",
|
|
639
|
+
label: `pdf-ocr (OCR scanned PDFs; requires native binaries)`,
|
|
640
|
+
hint: "worker-only"
|
|
641
|
+
},
|
|
642
|
+
{
|
|
643
|
+
group: "Image",
|
|
644
|
+
value: "image-ocr",
|
|
645
|
+
label: `image-ocr (Extract text from images via vision LLM)`
|
|
646
|
+
},
|
|
647
|
+
{
|
|
648
|
+
group: "Image",
|
|
649
|
+
value: "image-caption-llm",
|
|
650
|
+
label: `image-caption-llm (Generate captions for images via vision LLM)`
|
|
651
|
+
},
|
|
652
|
+
{
|
|
653
|
+
group: "Audio",
|
|
654
|
+
value: "audio-transcribe",
|
|
655
|
+
label: `audio-transcribe (Speech-to-text transcription)`
|
|
656
|
+
},
|
|
657
|
+
{
|
|
658
|
+
group: "Video",
|
|
659
|
+
value: "video-transcribe",
|
|
660
|
+
label: `video-transcribe (Transcribe video audio track)`
|
|
661
|
+
},
|
|
662
|
+
{
|
|
663
|
+
group: "Video",
|
|
664
|
+
value: "video-frames",
|
|
665
|
+
label: `video-frames (Sample frames + analyze via vision LLM; requires ffmpeg)`,
|
|
666
|
+
hint: "worker-only"
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
group: "Files",
|
|
670
|
+
value: "file-text",
|
|
671
|
+
label: `file-text (Extract text/markdown/json/html from common text files)`,
|
|
672
|
+
hint: "recommended"
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
group: "Files",
|
|
676
|
+
value: "file-docx",
|
|
677
|
+
label: `file-docx (Extract text from .docx files)`
|
|
678
|
+
},
|
|
679
|
+
{
|
|
680
|
+
group: "Files",
|
|
681
|
+
value: "file-pptx",
|
|
682
|
+
label: `file-pptx (Extract text from .pptx slides)`
|
|
683
|
+
},
|
|
684
|
+
{
|
|
685
|
+
group: "Files",
|
|
686
|
+
value: "file-xlsx",
|
|
687
|
+
label: `file-xlsx (Extract tables from .xlsx spreadsheets)`
|
|
688
|
+
}
|
|
689
|
+
];
|
|
690
|
+
var AVAILABLE_EXTRACTORS = new Set(EXTRACTOR_OPTIONS.map((o) => o.value));
|
|
558
691
|
async function initCommand(args) {
|
|
559
692
|
const root = await tryFindProjectRoot(process.cwd());
|
|
560
693
|
if (!root) {
|
|
@@ -623,17 +756,71 @@ async function initCommand(args) {
|
|
|
623
756
|
return;
|
|
624
757
|
}
|
|
625
758
|
const aliasBase = String(aliasAnswer).trim();
|
|
759
|
+
if (parsed.richMedia === false && (parsed.extractors ?? []).length > 0) {
|
|
760
|
+
throw new Error('Cannot use "--no-rich-media" together with "--extractors".');
|
|
761
|
+
}
|
|
762
|
+
const extractorsFromArgs = (parsed.extractors ?? []).filter((x) => AVAILABLE_EXTRACTORS.has(x)).sort();
|
|
763
|
+
const richMediaAnswer = extractorsFromArgs.length > 0 ? true : typeof parsed.richMedia === "boolean" ? parsed.richMedia : nonInteractive ? false : await confirm2({
|
|
764
|
+
message: "Enable rich media ingestion (PDF/images/audio/video/files)? This also enables multimodal image embeddings (you can change this later).",
|
|
765
|
+
initialValue: false
|
|
766
|
+
});
|
|
767
|
+
if (isCancel2(richMediaAnswer)) {
|
|
768
|
+
cancel2("Cancelled.");
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
const richMediaEnabled = Boolean(richMediaAnswer);
|
|
772
|
+
const selectedExtractorsAnswer = richMediaEnabled || extractorsFromArgs.length > 0 ? nonInteractive ? extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS : await groupMultiselect({
|
|
773
|
+
message: "Select extractors to enable (space to toggle, enter to confirm)",
|
|
774
|
+
options: EXTRACTOR_OPTIONS.reduce((acc, opt) => {
|
|
775
|
+
acc[opt.group] ??= [];
|
|
776
|
+
acc[opt.group].push({
|
|
777
|
+
value: opt.value,
|
|
778
|
+
label: opt.label,
|
|
779
|
+
...opt.hint ? { hint: opt.hint } : {}
|
|
780
|
+
});
|
|
781
|
+
return acc;
|
|
782
|
+
}, {}),
|
|
783
|
+
initialValues: extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS,
|
|
784
|
+
required: false
|
|
785
|
+
}) : [];
|
|
786
|
+
if (isCancel2(selectedExtractorsAnswer)) {
|
|
787
|
+
cancel2("Cancelled.");
|
|
788
|
+
return;
|
|
789
|
+
}
|
|
790
|
+
const selectedExtractors = Array.from(new Set(Array.isArray(selectedExtractorsAnswer) ? selectedExtractorsAnswer : [])).sort();
|
|
626
791
|
const selection = {
|
|
627
792
|
installDir,
|
|
628
793
|
storeAdapter: storeAdapterAnswer,
|
|
629
794
|
projectRoot: root,
|
|
630
795
|
registryRoot,
|
|
631
|
-
aliasBase
|
|
796
|
+
aliasBase,
|
|
797
|
+
richMedia: richMediaEnabled ? {
|
|
798
|
+
enabled: true,
|
|
799
|
+
extractors: selectedExtractors
|
|
800
|
+
} : { enabled: false, extractors: [] }
|
|
632
801
|
};
|
|
633
802
|
await copyRegistryFiles(selection);
|
|
803
|
+
if (richMediaEnabled && selectedExtractors.length > 0) {
|
|
804
|
+
for (const extractor of selectedExtractors) {
|
|
805
|
+
await copyExtractorFiles({
|
|
806
|
+
projectRoot: root,
|
|
807
|
+
registryRoot,
|
|
808
|
+
installDir,
|
|
809
|
+
extractor,
|
|
810
|
+
yes: nonInteractive
|
|
811
|
+
});
|
|
812
|
+
}
|
|
813
|
+
}
|
|
634
814
|
const pkg = await readPackageJson(root);
|
|
635
815
|
const { deps, devDeps } = depsForAdapter(storeAdapterAnswer);
|
|
636
|
-
const
|
|
816
|
+
const extractorDeps = {};
|
|
817
|
+
const extractorDevDeps = {};
|
|
818
|
+
for (const ex of selectedExtractors) {
|
|
819
|
+
const r = depsForExtractor(ex);
|
|
820
|
+
Object.assign(extractorDeps, r.deps);
|
|
821
|
+
Object.assign(extractorDevDeps, r.devDeps);
|
|
822
|
+
}
|
|
823
|
+
const merged = mergeDeps(pkg, { ...deps, ...extractorDeps }, { ...devDeps, ...extractorDevDeps });
|
|
637
824
|
if (merged.changes.length > 0) {
|
|
638
825
|
await writePackageJson(root, merged.pkg);
|
|
639
826
|
}
|
|
@@ -643,7 +830,10 @@ async function initCommand(args) {
|
|
|
643
830
|
aliasBase,
|
|
644
831
|
version: CONFIG_VERSION,
|
|
645
832
|
connectors: existing?.connectors ?? [],
|
|
646
|
-
extractors:
|
|
833
|
+
extractors: Array.from(new Set([
|
|
834
|
+
...existing?.extractors ?? [],
|
|
835
|
+
...richMediaEnabled ? selectedExtractors : []
|
|
836
|
+
])).sort()
|
|
647
837
|
};
|
|
648
838
|
await writeJsonFile(path5.join(root, CONFIG_FILE), config);
|
|
649
839
|
const pm = await detectPackageManager(root);
|
|
@@ -657,6 +847,11 @@ async function initCommand(args) {
|
|
|
657
847
|
`- Docs: ${path5.join(installDir, "unrag.md")}`,
|
|
658
848
|
`- Config: unrag.config.ts`,
|
|
659
849
|
`- Imports: ${aliasBase}/* and ${aliasBase}/config`,
|
|
850
|
+
"",
|
|
851
|
+
`- Rich media: ${richMediaEnabled ? "enabled" : "disabled"}`,
|
|
852
|
+
richMediaEnabled ? `- Embeddings: multimodal enabled (images can be embedded directly)` : `- Embeddings: text-only (no direct image embedding)`,
|
|
853
|
+
richMediaEnabled ? `- Extractors: ${selectedExtractors.length > 0 ? selectedExtractors.join(", ") : "none"}` : "",
|
|
854
|
+
richMediaEnabled ? ` Tip: you can tweak extractors + assetProcessing flags in unrag.config.ts later.` : ` Tip: re-run \`unrag init --rich-media\` (or edit unrag.config.ts) to enable rich media later.`,
|
|
660
855
|
isNext ? tsconfigResult.changed ? `- Next.js: updated ${tsconfigResult.file} (added aliases)` : `- Next.js: no tsconfig changes needed` : `- Next.js: not detected`,
|
|
661
856
|
"",
|
|
662
857
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
@@ -685,7 +880,7 @@ function docsUrl(siteRelativePath) {
|
|
|
685
880
|
var CONFIG_FILE2 = "unrag.json";
|
|
686
881
|
var __filename3 = fileURLToPath2(import.meta.url);
|
|
687
882
|
var __dirname3 = path6.dirname(__filename3);
|
|
688
|
-
var
|
|
883
|
+
var AVAILABLE_EXTRACTORS2 = [
|
|
689
884
|
"pdf-llm",
|
|
690
885
|
"pdf-text-layer",
|
|
691
886
|
"pdf-ocr",
|
|
@@ -738,7 +933,7 @@ async function addCommand(args) {
|
|
|
738
933
|
" unrag add extractor <name>",
|
|
739
934
|
"",
|
|
740
935
|
"Available connectors: notion",
|
|
741
|
-
`Available extractors: ${
|
|
936
|
+
`Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`
|
|
742
937
|
].join(`
|
|
743
938
|
`));
|
|
744
939
|
return;
|
|
@@ -790,10 +985,10 @@ Available connectors: notion`);
|
|
|
790
985
|
return;
|
|
791
986
|
}
|
|
792
987
|
const extractor = name;
|
|
793
|
-
if (!extractor || !
|
|
988
|
+
if (!extractor || !AVAILABLE_EXTRACTORS2.includes(extractor)) {
|
|
794
989
|
outro2(`Unknown extractor: ${name}
|
|
795
990
|
|
|
796
|
-
Available extractors: ${
|
|
991
|
+
Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`);
|
|
797
992
|
return;
|
|
798
993
|
}
|
|
799
994
|
await copyExtractorFiles({
|
|
@@ -844,10 +1039,15 @@ function renderHelp() {
|
|
|
844
1039
|
" --store <adapter> drizzle | prisma | raw-sql",
|
|
845
1040
|
" --dir <path> Install directory (alias: --install-dir)",
|
|
846
1041
|
" --alias <@name> Import alias base (e.g. @unrag)",
|
|
1042
|
+
" --rich-media Enable rich media setup (also enables multimodal embeddings)",
|
|
1043
|
+
" --no-rich-media Disable rich media setup",
|
|
1044
|
+
" --extractors <list> Comma-separated extractors (implies --rich-media)",
|
|
847
1045
|
"",
|
|
848
1046
|
"Examples:",
|
|
849
1047
|
" bunx unrag@latest init",
|
|
850
1048
|
" bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
1049
|
+
" bunx unrag@latest init --yes --rich-media",
|
|
1050
|
+
" bunx unrag@latest init --yes --extractors pdf-text-layer,file-text",
|
|
851
1051
|
" bunx unrag add notion --yes",
|
|
852
1052
|
"",
|
|
853
1053
|
"Docs:",
|
package/package.json
CHANGED
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
"name": "unrag",
|
|
3
3
|
"type": "module",
|
|
4
4
|
"repository": "https://github.com/BetterStacks/unrag",
|
|
5
|
+
"homepage": "https://unrag.dev",
|
|
5
6
|
"bin": {
|
|
6
7
|
"unrag": "./dist/cli/index.js"
|
|
7
8
|
},
|
|
8
|
-
"version": "0.2.
|
|
9
|
+
"version": "0.2.4",
|
|
9
10
|
"private": false,
|
|
10
11
|
"license": "Apache-2.0",
|
|
11
12
|
"devDependencies": {
|
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
* treated like vendored source code.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
// @ts-nocheck
|
|
15
|
+
|
|
14
16
|
// __UNRAG_IMPORTS__
|
|
15
17
|
|
|
16
18
|
export const unrag = defineUnragConfig({
|
|
@@ -26,8 +28,8 @@ export const unrag = defineUnragConfig({
|
|
|
26
28
|
embedding: {
|
|
27
29
|
provider: "ai",
|
|
28
30
|
config: {
|
|
29
|
-
type: "text",
|
|
30
|
-
model: "openai/text-embedding-3-small",
|
|
31
|
+
type: "text", // __UNRAG_EMBEDDING_TYPE__
|
|
32
|
+
model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
|
|
31
33
|
timeoutMs: 15_000,
|
|
32
34
|
},
|
|
33
35
|
},
|
|
@@ -52,13 +54,15 @@ export const unrag = defineUnragConfig({
|
|
|
52
54
|
* - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
|
|
53
55
|
* - `extractors: [createPdfLlmExtractor()]`
|
|
54
56
|
*/
|
|
55
|
-
extractors: [
|
|
57
|
+
extractors: [
|
|
58
|
+
// __UNRAG_EXTRACTORS__
|
|
59
|
+
],
|
|
56
60
|
/**
|
|
57
61
|
* Rich media processing controls.
|
|
58
62
|
*
|
|
59
63
|
* Notes:
|
|
60
|
-
* -
|
|
61
|
-
* -
|
|
64
|
+
* - This generated config is cost-safe by default (all extraction is off).
|
|
65
|
+
* - `unrag init` can enable rich media + multimodal embeddings for you.
|
|
62
66
|
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
63
67
|
*/
|
|
64
68
|
assetProcessing: {
|
|
@@ -74,14 +78,14 @@ export const unrag = defineUnragConfig({
|
|
|
74
78
|
pdf: {
|
|
75
79
|
// Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
|
|
76
80
|
textLayer: {
|
|
77
|
-
enabled: false,
|
|
81
|
+
enabled: false, // __UNRAG_FLAG_pdf_textLayer__
|
|
78
82
|
maxBytes: 15 * 1024 * 1024,
|
|
79
83
|
maxOutputChars: 200_000,
|
|
80
84
|
minChars: 200,
|
|
81
85
|
// maxPages: 200,
|
|
82
86
|
},
|
|
83
87
|
llmExtraction: {
|
|
84
|
-
enabled:
|
|
88
|
+
enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
|
|
85
89
|
model: "google/gemini-2.0-flash",
|
|
86
90
|
prompt:
|
|
87
91
|
"Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
|
|
@@ -91,7 +95,7 @@ export const unrag = defineUnragConfig({
|
|
|
91
95
|
},
|
|
92
96
|
// Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
|
|
93
97
|
ocr: {
|
|
94
|
-
enabled: false,
|
|
98
|
+
enabled: false, // __UNRAG_FLAG_pdf_ocr__
|
|
95
99
|
maxBytes: 15 * 1024 * 1024,
|
|
96
100
|
maxOutputChars: 200_000,
|
|
97
101
|
minChars: 200,
|
|
@@ -104,7 +108,7 @@ export const unrag = defineUnragConfig({
|
|
|
104
108
|
},
|
|
105
109
|
image: {
|
|
106
110
|
ocr: {
|
|
107
|
-
enabled: false,
|
|
111
|
+
enabled: false, // __UNRAG_FLAG_image_ocr__
|
|
108
112
|
model: "google/gemini-2.0-flash",
|
|
109
113
|
prompt:
|
|
110
114
|
"Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
@@ -113,7 +117,7 @@ export const unrag = defineUnragConfig({
|
|
|
113
117
|
maxOutputChars: 50_000,
|
|
114
118
|
},
|
|
115
119
|
captionLlm: {
|
|
116
|
-
enabled: false,
|
|
120
|
+
enabled: false, // __UNRAG_FLAG_image_captionLlm__
|
|
117
121
|
model: "google/gemini-2.0-flash",
|
|
118
122
|
prompt:
|
|
119
123
|
"Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
|
|
@@ -124,7 +128,7 @@ export const unrag = defineUnragConfig({
|
|
|
124
128
|
},
|
|
125
129
|
audio: {
|
|
126
130
|
transcription: {
|
|
127
|
-
enabled: false,
|
|
131
|
+
enabled: false, // __UNRAG_FLAG_audio_transcription__
|
|
128
132
|
model: "openai/whisper-1",
|
|
129
133
|
timeoutMs: 120_000,
|
|
130
134
|
maxBytes: 25 * 1024 * 1024,
|
|
@@ -132,13 +136,13 @@ export const unrag = defineUnragConfig({
|
|
|
132
136
|
},
|
|
133
137
|
video: {
|
|
134
138
|
transcription: {
|
|
135
|
-
enabled: false,
|
|
139
|
+
enabled: false, // __UNRAG_FLAG_video_transcription__
|
|
136
140
|
model: "openai/whisper-1",
|
|
137
141
|
timeoutMs: 120_000,
|
|
138
142
|
maxBytes: 50 * 1024 * 1024,
|
|
139
143
|
},
|
|
140
144
|
frames: {
|
|
141
|
-
enabled: false,
|
|
145
|
+
enabled: false, // __UNRAG_FLAG_video_frames__
|
|
142
146
|
sampleFps: 0.2,
|
|
143
147
|
maxFrames: 50,
|
|
144
148
|
// ffmpegPath: "/usr/bin/ffmpeg",
|
|
@@ -151,10 +155,30 @@ export const unrag = defineUnragConfig({
|
|
|
151
155
|
},
|
|
152
156
|
},
|
|
153
157
|
file: {
|
|
154
|
-
text: {
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
+
text: {
|
|
159
|
+
enabled: false, // __UNRAG_FLAG_file_text__
|
|
160
|
+
maxBytes: 5 * 1024 * 1024,
|
|
161
|
+
maxOutputChars: 200_000,
|
|
162
|
+
minChars: 50,
|
|
163
|
+
},
|
|
164
|
+
docx: {
|
|
165
|
+
enabled: false, // __UNRAG_FLAG_file_docx__
|
|
166
|
+
maxBytes: 15 * 1024 * 1024,
|
|
167
|
+
maxOutputChars: 200_000,
|
|
168
|
+
minChars: 50,
|
|
169
|
+
},
|
|
170
|
+
pptx: {
|
|
171
|
+
enabled: false, // __UNRAG_FLAG_file_pptx__
|
|
172
|
+
maxBytes: 30 * 1024 * 1024,
|
|
173
|
+
maxOutputChars: 200_000,
|
|
174
|
+
minChars: 50,
|
|
175
|
+
},
|
|
176
|
+
xlsx: {
|
|
177
|
+
enabled: false, // __UNRAG_FLAG_file_xlsx__
|
|
178
|
+
maxBytes: 30 * 1024 * 1024,
|
|
179
|
+
maxOutputChars: 200_000,
|
|
180
|
+
minChars: 50,
|
|
181
|
+
},
|
|
158
182
|
},
|
|
159
183
|
},
|
|
160
184
|
},
|