unrag 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +256 -51
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +41 -17
- package/registry/connectors/google-drive/client.ts +171 -0
- package/registry/connectors/google-drive/index.ts +10 -0
- package/registry/connectors/google-drive/mime.ts +76 -0
- package/registry/connectors/google-drive/sync.ts +528 -0
- package/registry/connectors/google-drive/types.ts +127 -0
package/dist/cli/index.js
CHANGED
|
@@ -4,7 +4,15 @@
|
|
|
4
4
|
import { intro, outro as outro3 } from "@clack/prompts";
|
|
5
5
|
|
|
6
6
|
// cli/commands/init.ts
|
|
7
|
-
import {
|
|
7
|
+
import {
|
|
8
|
+
cancel as cancel2,
|
|
9
|
+
confirm as confirm2,
|
|
10
|
+
groupMultiselect,
|
|
11
|
+
isCancel as isCancel2,
|
|
12
|
+
outro,
|
|
13
|
+
select,
|
|
14
|
+
text
|
|
15
|
+
} from "@clack/prompts";
|
|
8
16
|
import path5 from "node:path";
|
|
9
17
|
import { fileURLToPath } from "node:url";
|
|
10
18
|
|
|
@@ -71,8 +79,39 @@ var writeText = async (filePath, content) => {
|
|
|
71
79
|
await ensureDir(path2.dirname(filePath));
|
|
72
80
|
await writeFile(filePath, content, "utf8");
|
|
73
81
|
};
|
|
82
|
+
var EXTRACTOR_FACTORY = {
|
|
83
|
+
"pdf-llm": "createPdfLlmExtractor",
|
|
84
|
+
"pdf-text-layer": "createPdfTextLayerExtractor",
|
|
85
|
+
"pdf-ocr": "createPdfOcrExtractor",
|
|
86
|
+
"image-ocr": "createImageOcrExtractor",
|
|
87
|
+
"image-caption-llm": "createImageCaptionLlmExtractor",
|
|
88
|
+
"audio-transcribe": "createAudioTranscribeExtractor",
|
|
89
|
+
"video-transcribe": "createVideoTranscribeExtractor",
|
|
90
|
+
"video-frames": "createVideoFramesExtractor",
|
|
91
|
+
"file-text": "createFileTextExtractor",
|
|
92
|
+
"file-docx": "createFileDocxExtractor",
|
|
93
|
+
"file-pptx": "createFilePptxExtractor",
|
|
94
|
+
"file-xlsx": "createFileXlsxExtractor"
|
|
95
|
+
};
|
|
96
|
+
var EXTRACTOR_FLAG_KEYS = {
|
|
97
|
+
"pdf-text-layer": ["pdf_textLayer"],
|
|
98
|
+
"pdf-llm": ["pdf_llmExtraction"],
|
|
99
|
+
"pdf-ocr": ["pdf_ocr"],
|
|
100
|
+
"image-ocr": ["image_ocr"],
|
|
101
|
+
"image-caption-llm": ["image_captionLlm"],
|
|
102
|
+
"audio-transcribe": ["audio_transcription"],
|
|
103
|
+
"video-transcribe": ["video_transcription"],
|
|
104
|
+
"video-frames": ["video_frames"],
|
|
105
|
+
"file-text": ["file_text"],
|
|
106
|
+
"file-docx": ["file_docx"],
|
|
107
|
+
"file-pptx": ["file_pptx"],
|
|
108
|
+
"file-xlsx": ["file_xlsx"]
|
|
109
|
+
};
|
|
110
|
+
var ALL_FLAG_KEYS = Array.from(new Set(Object.values(EXTRACTOR_FLAG_KEYS).flat())).sort();
|
|
74
111
|
var renderUnragConfig = (content, selection) => {
|
|
75
112
|
const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
|
|
113
|
+
const richMedia = selection.richMedia ?? { enabled: false, extractors: [] };
|
|
114
|
+
const selectedExtractors = Array.from(new Set(richMedia.extractors ?? [])).sort();
|
|
76
115
|
const baseImports = [
|
|
77
116
|
`import { defineUnragConfig } from "${installImportBase}/core";`
|
|
78
117
|
];
|
|
@@ -88,22 +127,40 @@ var renderUnragConfig = (content, selection) => {
|
|
|
88
127
|
storeImports.push(`import { createPrismaVectorStore } from "${installImportBase}/store/prisma";`, `import { PrismaClient } from "@prisma/client";`);
|
|
89
128
|
storeCreateLines.push(` const prisma = (globalThis as any).__unragPrisma ?? new PrismaClient();`, ` (globalThis as any).__unragPrisma = prisma;`, ` const store = createPrismaVectorStore(prisma);`);
|
|
90
129
|
}
|
|
91
|
-
const
|
|
130
|
+
const extractorImports = [];
|
|
131
|
+
if (richMedia.enabled && selectedExtractors.length > 0) {
|
|
132
|
+
for (const ex of selectedExtractors) {
|
|
133
|
+
const factory = EXTRACTOR_FACTORY[ex];
|
|
134
|
+
extractorImports.push(`import { ${factory} } from "${installImportBase}/extractors/${ex}";`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
const importsBlock = [...baseImports, ...storeImports, ...extractorImports].join(`
|
|
92
138
|
`);
|
|
93
139
|
const createEngineBlock = [
|
|
94
140
|
`export function createUnragEngine() {`,
|
|
95
141
|
...storeCreateLines,
|
|
96
142
|
``,
|
|
97
143
|
` return unrag.createEngine({ store });`,
|
|
98
|
-
`}`,
|
|
99
|
-
``,
|
|
100
|
-
`export async function retrieve(query: string) {`,
|
|
101
|
-
` const engine = createUnragEngine();`,
|
|
102
|
-
` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
|
|
103
144
|
`}`
|
|
104
145
|
].join(`
|
|
105
146
|
`);
|
|
106
|
-
|
|
147
|
+
let out = content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
|
|
148
|
+
out = out.replace('type: "text", // __UNRAG_EMBEDDING_TYPE__', richMedia.enabled ? 'type: "multimodal",' : 'type: "text",').replace('model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__', richMedia.enabled ? 'model: "cohere/embed-v4.0",' : 'model: "openai/text-embedding-3-small",');
|
|
149
|
+
const enabledFlagKeys = new Set;
|
|
150
|
+
if (richMedia.enabled) {
|
|
151
|
+
for (const ex of selectedExtractors) {
|
|
152
|
+
for (const k of EXTRACTOR_FLAG_KEYS[ex] ?? []) {
|
|
153
|
+
enabledFlagKeys.add(k);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
for (const k of ALL_FLAG_KEYS) {
|
|
158
|
+
out = out.replace(`enabled: false, // __UNRAG_FLAG_${k}__`, `enabled: ${enabledFlagKeys.has(k) ? "true" : "false"},`);
|
|
159
|
+
}
|
|
160
|
+
const extractorLines = richMedia.enabled && selectedExtractors.length > 0 ? selectedExtractors.map((ex) => ` ${EXTRACTOR_FACTORY[ex]}(),`).join(`
|
|
161
|
+
`) : "";
|
|
162
|
+
out = out.replace(" // __UNRAG_EXTRACTORS__", extractorLines);
|
|
163
|
+
return out;
|
|
107
164
|
};
|
|
108
165
|
var renderDocs = (content, selection) => {
|
|
109
166
|
const notes = [];
|
|
@@ -272,28 +329,34 @@ async function copyExtractorFiles(selection) {
|
|
|
272
329
|
const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
|
|
273
330
|
const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
|
|
274
331
|
const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
|
|
332
|
+
const shouldWrite = async (src, dest) => {
|
|
333
|
+
if (!await exists(dest))
|
|
334
|
+
return true;
|
|
335
|
+
if (nonInteractive)
|
|
336
|
+
return false;
|
|
337
|
+
try {
|
|
338
|
+
const [srcRaw, destRaw] = await Promise.all([readText(src), readText(dest)]);
|
|
339
|
+
if (srcRaw === destRaw)
|
|
340
|
+
return false;
|
|
341
|
+
} catch {}
|
|
342
|
+
const answer = await confirm({
|
|
343
|
+
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
344
|
+
initialValue: false
|
|
345
|
+
});
|
|
346
|
+
if (isCancel(answer)) {
|
|
347
|
+
cancel("Cancelled.");
|
|
348
|
+
return false;
|
|
349
|
+
}
|
|
350
|
+
return Boolean(answer);
|
|
351
|
+
};
|
|
275
352
|
for (const src of extractorFiles) {
|
|
276
353
|
if (!await exists(src)) {
|
|
277
354
|
throw new Error(`Registry file missing: ${src}`);
|
|
278
355
|
}
|
|
279
356
|
const rel = path2.relative(extractorRegistryAbs, src);
|
|
280
357
|
const dest = path2.join(destRootAbs, rel);
|
|
281
|
-
if (await
|
|
282
|
-
|
|
283
|
-
continue;
|
|
284
|
-
}
|
|
285
|
-
const answer = await confirm({
|
|
286
|
-
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
287
|
-
initialValue: false
|
|
288
|
-
});
|
|
289
|
-
if (isCancel(answer)) {
|
|
290
|
-
cancel("Cancelled.");
|
|
291
|
-
return;
|
|
292
|
-
}
|
|
293
|
-
if (!answer) {
|
|
294
|
-
continue;
|
|
295
|
-
}
|
|
296
|
-
}
|
|
358
|
+
if (!await shouldWrite(src, dest))
|
|
359
|
+
continue;
|
|
297
360
|
const raw = await readText(src);
|
|
298
361
|
await writeText(dest, raw);
|
|
299
362
|
}
|
|
@@ -303,22 +366,8 @@ async function copyExtractorFiles(selection) {
|
|
|
303
366
|
}
|
|
304
367
|
const rel = path2.relative(sharedRegistryAbs, src);
|
|
305
368
|
const dest = path2.join(sharedDestRootAbs, rel);
|
|
306
|
-
if (await
|
|
307
|
-
|
|
308
|
-
continue;
|
|
309
|
-
}
|
|
310
|
-
const answer = await confirm({
|
|
311
|
-
message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
|
|
312
|
-
initialValue: false
|
|
313
|
-
});
|
|
314
|
-
if (isCancel(answer)) {
|
|
315
|
-
cancel("Cancelled.");
|
|
316
|
-
return;
|
|
317
|
-
}
|
|
318
|
-
if (!answer) {
|
|
319
|
-
continue;
|
|
320
|
-
}
|
|
321
|
-
}
|
|
369
|
+
if (!await shouldWrite(src, dest))
|
|
370
|
+
continue;
|
|
322
371
|
const raw = await readText(src);
|
|
323
372
|
await writeText(dest, raw);
|
|
324
373
|
}
|
|
@@ -406,6 +455,10 @@ function depsForConnector(connector) {
|
|
|
406
455
|
if (connector === "notion") {
|
|
407
456
|
deps["@notionhq/client"] = "^2.2.16";
|
|
408
457
|
}
|
|
458
|
+
if (connector === "google-drive") {
|
|
459
|
+
deps["googleapis"] = "^148.0.0";
|
|
460
|
+
deps["google-auth-library"] = "^10.0.0";
|
|
461
|
+
}
|
|
409
462
|
return { deps, devDeps };
|
|
410
463
|
}
|
|
411
464
|
function depsForExtractor(extractor) {
|
|
@@ -552,9 +605,93 @@ var parseInitArgs = (args) => {
|
|
|
552
605
|
}
|
|
553
606
|
continue;
|
|
554
607
|
}
|
|
608
|
+
if (a === "--rich-media") {
|
|
609
|
+
out.richMedia = true;
|
|
610
|
+
continue;
|
|
611
|
+
}
|
|
612
|
+
if (a === "--no-rich-media") {
|
|
613
|
+
out.richMedia = false;
|
|
614
|
+
continue;
|
|
615
|
+
}
|
|
616
|
+
if (a === "--extractors") {
|
|
617
|
+
const v = args[i + 1];
|
|
618
|
+
if (v) {
|
|
619
|
+
out.extractors = v.split(",").map((s) => s.trim()).filter(Boolean);
|
|
620
|
+
i++;
|
|
621
|
+
}
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
555
624
|
}
|
|
556
625
|
return out;
|
|
557
626
|
};
|
|
627
|
+
var DEFAULT_RICH_MEDIA_EXTRACTORS = ["pdf-text-layer", "file-text"];
|
|
628
|
+
var EXTRACTOR_OPTIONS = [
|
|
629
|
+
{
|
|
630
|
+
group: "PDF",
|
|
631
|
+
value: "pdf-text-layer",
|
|
632
|
+
label: `pdf-text-layer (Fast/cheap extraction via PDF text layer)`,
|
|
633
|
+
hint: "recommended"
|
|
634
|
+
},
|
|
635
|
+
{
|
|
636
|
+
group: "PDF",
|
|
637
|
+
value: "pdf-llm",
|
|
638
|
+
label: `pdf-llm (LLM-based PDF extraction; higher cost)`
|
|
639
|
+
},
|
|
640
|
+
{
|
|
641
|
+
group: "PDF",
|
|
642
|
+
value: "pdf-ocr",
|
|
643
|
+
label: `pdf-ocr (OCR scanned PDFs; requires native binaries)`,
|
|
644
|
+
hint: "worker-only"
|
|
645
|
+
},
|
|
646
|
+
{
|
|
647
|
+
group: "Image",
|
|
648
|
+
value: "image-ocr",
|
|
649
|
+
label: `image-ocr (Extract text from images via vision LLM)`
|
|
650
|
+
},
|
|
651
|
+
{
|
|
652
|
+
group: "Image",
|
|
653
|
+
value: "image-caption-llm",
|
|
654
|
+
label: `image-caption-llm (Generate captions for images via vision LLM)`
|
|
655
|
+
},
|
|
656
|
+
{
|
|
657
|
+
group: "Audio",
|
|
658
|
+
value: "audio-transcribe",
|
|
659
|
+
label: `audio-transcribe (Speech-to-text transcription)`
|
|
660
|
+
},
|
|
661
|
+
{
|
|
662
|
+
group: "Video",
|
|
663
|
+
value: "video-transcribe",
|
|
664
|
+
label: `video-transcribe (Transcribe video audio track)`
|
|
665
|
+
},
|
|
666
|
+
{
|
|
667
|
+
group: "Video",
|
|
668
|
+
value: "video-frames",
|
|
669
|
+
label: `video-frames (Sample frames + analyze via vision LLM; requires ffmpeg)`,
|
|
670
|
+
hint: "worker-only"
|
|
671
|
+
},
|
|
672
|
+
{
|
|
673
|
+
group: "Files",
|
|
674
|
+
value: "file-text",
|
|
675
|
+
label: `file-text (Extract text/markdown/json/html from common text files)`,
|
|
676
|
+
hint: "recommended"
|
|
677
|
+
},
|
|
678
|
+
{
|
|
679
|
+
group: "Files",
|
|
680
|
+
value: "file-docx",
|
|
681
|
+
label: `file-docx (Extract text from .docx files)`
|
|
682
|
+
},
|
|
683
|
+
{
|
|
684
|
+
group: "Files",
|
|
685
|
+
value: "file-pptx",
|
|
686
|
+
label: `file-pptx (Extract text from .pptx slides)`
|
|
687
|
+
},
|
|
688
|
+
{
|
|
689
|
+
group: "Files",
|
|
690
|
+
value: "file-xlsx",
|
|
691
|
+
label: `file-xlsx (Extract tables from .xlsx spreadsheets)`
|
|
692
|
+
}
|
|
693
|
+
];
|
|
694
|
+
var AVAILABLE_EXTRACTORS = new Set(EXTRACTOR_OPTIONS.map((o) => o.value));
|
|
558
695
|
async function initCommand(args) {
|
|
559
696
|
const root = await tryFindProjectRoot(process.cwd());
|
|
560
697
|
if (!root) {
|
|
@@ -623,17 +760,71 @@ async function initCommand(args) {
|
|
|
623
760
|
return;
|
|
624
761
|
}
|
|
625
762
|
const aliasBase = String(aliasAnswer).trim();
|
|
763
|
+
if (parsed.richMedia === false && (parsed.extractors ?? []).length > 0) {
|
|
764
|
+
throw new Error('Cannot use "--no-rich-media" together with "--extractors".');
|
|
765
|
+
}
|
|
766
|
+
const extractorsFromArgs = (parsed.extractors ?? []).filter((x) => AVAILABLE_EXTRACTORS.has(x)).sort();
|
|
767
|
+
const richMediaAnswer = extractorsFromArgs.length > 0 ? true : typeof parsed.richMedia === "boolean" ? parsed.richMedia : nonInteractive ? false : await confirm2({
|
|
768
|
+
message: "Enable rich media ingestion (PDF/images/audio/video/files)? This also enables multimodal image embeddings (you can change this later).",
|
|
769
|
+
initialValue: false
|
|
770
|
+
});
|
|
771
|
+
if (isCancel2(richMediaAnswer)) {
|
|
772
|
+
cancel2("Cancelled.");
|
|
773
|
+
return;
|
|
774
|
+
}
|
|
775
|
+
const richMediaEnabled = Boolean(richMediaAnswer);
|
|
776
|
+
const selectedExtractorsAnswer = richMediaEnabled || extractorsFromArgs.length > 0 ? nonInteractive ? extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS : await groupMultiselect({
|
|
777
|
+
message: "Select extractors to enable (space to toggle, enter to confirm)",
|
|
778
|
+
options: EXTRACTOR_OPTIONS.reduce((acc, opt) => {
|
|
779
|
+
acc[opt.group] ??= [];
|
|
780
|
+
acc[opt.group].push({
|
|
781
|
+
value: opt.value,
|
|
782
|
+
label: opt.label,
|
|
783
|
+
...opt.hint ? { hint: opt.hint } : {}
|
|
784
|
+
});
|
|
785
|
+
return acc;
|
|
786
|
+
}, {}),
|
|
787
|
+
initialValues: extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS,
|
|
788
|
+
required: false
|
|
789
|
+
}) : [];
|
|
790
|
+
if (isCancel2(selectedExtractorsAnswer)) {
|
|
791
|
+
cancel2("Cancelled.");
|
|
792
|
+
return;
|
|
793
|
+
}
|
|
794
|
+
const selectedExtractors = Array.from(new Set(Array.isArray(selectedExtractorsAnswer) ? selectedExtractorsAnswer : [])).sort();
|
|
626
795
|
const selection = {
|
|
627
796
|
installDir,
|
|
628
797
|
storeAdapter: storeAdapterAnswer,
|
|
629
798
|
projectRoot: root,
|
|
630
799
|
registryRoot,
|
|
631
|
-
aliasBase
|
|
800
|
+
aliasBase,
|
|
801
|
+
richMedia: richMediaEnabled ? {
|
|
802
|
+
enabled: true,
|
|
803
|
+
extractors: selectedExtractors
|
|
804
|
+
} : { enabled: false, extractors: [] }
|
|
632
805
|
};
|
|
633
806
|
await copyRegistryFiles(selection);
|
|
807
|
+
if (richMediaEnabled && selectedExtractors.length > 0) {
|
|
808
|
+
for (const extractor of selectedExtractors) {
|
|
809
|
+
await copyExtractorFiles({
|
|
810
|
+
projectRoot: root,
|
|
811
|
+
registryRoot,
|
|
812
|
+
installDir,
|
|
813
|
+
extractor,
|
|
814
|
+
yes: nonInteractive
|
|
815
|
+
});
|
|
816
|
+
}
|
|
817
|
+
}
|
|
634
818
|
const pkg = await readPackageJson(root);
|
|
635
819
|
const { deps, devDeps } = depsForAdapter(storeAdapterAnswer);
|
|
636
|
-
const
|
|
820
|
+
const extractorDeps = {};
|
|
821
|
+
const extractorDevDeps = {};
|
|
822
|
+
for (const ex of selectedExtractors) {
|
|
823
|
+
const r = depsForExtractor(ex);
|
|
824
|
+
Object.assign(extractorDeps, r.deps);
|
|
825
|
+
Object.assign(extractorDevDeps, r.devDeps);
|
|
826
|
+
}
|
|
827
|
+
const merged = mergeDeps(pkg, { ...deps, ...extractorDeps }, { ...devDeps, ...extractorDevDeps });
|
|
637
828
|
if (merged.changes.length > 0) {
|
|
638
829
|
await writePackageJson(root, merged.pkg);
|
|
639
830
|
}
|
|
@@ -643,7 +834,10 @@ async function initCommand(args) {
|
|
|
643
834
|
aliasBase,
|
|
644
835
|
version: CONFIG_VERSION,
|
|
645
836
|
connectors: existing?.connectors ?? [],
|
|
646
|
-
extractors:
|
|
837
|
+
extractors: Array.from(new Set([
|
|
838
|
+
...existing?.extractors ?? [],
|
|
839
|
+
...richMediaEnabled ? selectedExtractors : []
|
|
840
|
+
])).sort()
|
|
647
841
|
};
|
|
648
842
|
await writeJsonFile(path5.join(root, CONFIG_FILE), config);
|
|
649
843
|
const pm = await detectPackageManager(root);
|
|
@@ -657,6 +851,11 @@ async function initCommand(args) {
|
|
|
657
851
|
`- Docs: ${path5.join(installDir, "unrag.md")}`,
|
|
658
852
|
`- Config: unrag.config.ts`,
|
|
659
853
|
`- Imports: ${aliasBase}/* and ${aliasBase}/config`,
|
|
854
|
+
"",
|
|
855
|
+
`- Rich media: ${richMediaEnabled ? "enabled" : "disabled"}`,
|
|
856
|
+
richMediaEnabled ? `- Embeddings: multimodal enabled (images can be embedded directly)` : `- Embeddings: text-only (no direct image embedding)`,
|
|
857
|
+
richMediaEnabled ? `- Extractors: ${selectedExtractors.length > 0 ? selectedExtractors.join(", ") : "none"}` : "",
|
|
858
|
+
richMediaEnabled ? ` Tip: you can tweak extractors + assetProcessing flags in unrag.config.ts later.` : ` Tip: re-run \`unrag init --rich-media\` (or edit unrag.config.ts) to enable rich media later.`,
|
|
660
859
|
isNext ? tsconfigResult.changed ? `- Next.js: updated ${tsconfigResult.file} (added aliases)` : `- Next.js: no tsconfig changes needed` : `- Next.js: not detected`,
|
|
661
860
|
"",
|
|
662
861
|
merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
@@ -685,7 +884,7 @@ function docsUrl(siteRelativePath) {
|
|
|
685
884
|
var CONFIG_FILE2 = "unrag.json";
|
|
686
885
|
var __filename3 = fileURLToPath2(import.meta.url);
|
|
687
886
|
var __dirname3 = path6.dirname(__filename3);
|
|
688
|
-
var AVAILABLE_EXTRACTORS = [
|
|
887
|
+
var AVAILABLE_EXTRACTORS2 = [
|
|
689
888
|
"pdf-llm",
|
|
690
889
|
"pdf-text-layer",
|
|
691
890
|
"pdf-ocr",
|
|
@@ -699,6 +898,7 @@ var AVAILABLE_EXTRACTORS = [
|
|
|
699
898
|
"file-pptx",
|
|
700
899
|
"file-xlsx"
|
|
701
900
|
];
|
|
901
|
+
var AVAILABLE_CONNECTORS = ["notion", "google-drive"];
|
|
702
902
|
var parseAddArgs = (args) => {
|
|
703
903
|
const out = {};
|
|
704
904
|
for (let i = 0;i < args.length; i++) {
|
|
@@ -737,8 +937,8 @@ async function addCommand(args) {
|
|
|
737
937
|
" unrag add <connector>",
|
|
738
938
|
" unrag add extractor <name>",
|
|
739
939
|
"",
|
|
740
|
-
|
|
741
|
-
`Available extractors: ${
|
|
940
|
+
`Available connectors: ${AVAILABLE_CONNECTORS.join(", ")}`,
|
|
941
|
+
`Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`
|
|
742
942
|
].join(`
|
|
743
943
|
`));
|
|
744
944
|
return;
|
|
@@ -757,10 +957,10 @@ async function addCommand(args) {
|
|
|
757
957
|
const pkg = await readPackageJson(root);
|
|
758
958
|
if (kind === "connector") {
|
|
759
959
|
const connector = name;
|
|
760
|
-
if (connector
|
|
960
|
+
if (!connector || !AVAILABLE_CONNECTORS.includes(connector)) {
|
|
761
961
|
outro2(`Unknown connector: ${name}
|
|
762
962
|
|
|
763
|
-
Available connectors: notion`);
|
|
963
|
+
Available connectors: ${AVAILABLE_CONNECTORS.join(", ")}`);
|
|
764
964
|
return;
|
|
765
965
|
}
|
|
766
966
|
await copyConnectorFiles({
|
|
@@ -784,16 +984,16 @@ Available connectors: notion`);
|
|
|
784
984
|
`- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
|
|
785
985
|
"",
|
|
786
986
|
merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
|
|
787
|
-
nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
|
|
987
|
+
nonInteractive ? "" : connector === "notion" ? "Tip: keep NOTION_TOKEN server-side only (env var)." : connector === "google-drive" ? "Tip: keep Google OAuth refresh tokens and service account keys server-side only." : ""
|
|
788
988
|
].filter(Boolean).join(`
|
|
789
989
|
`));
|
|
790
990
|
return;
|
|
791
991
|
}
|
|
792
992
|
const extractor = name;
|
|
793
|
-
if (!extractor || !
|
|
993
|
+
if (!extractor || !AVAILABLE_EXTRACTORS2.includes(extractor)) {
|
|
794
994
|
outro2(`Unknown extractor: ${name}
|
|
795
995
|
|
|
796
|
-
Available extractors: ${
|
|
996
|
+
Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`);
|
|
797
997
|
return;
|
|
798
998
|
}
|
|
799
999
|
await copyExtractorFiles({
|
|
@@ -844,10 +1044,15 @@ function renderHelp() {
|
|
|
844
1044
|
" --store <adapter> drizzle | prisma | raw-sql",
|
|
845
1045
|
" --dir <path> Install directory (alias: --install-dir)",
|
|
846
1046
|
" --alias <@name> Import alias base (e.g. @unrag)",
|
|
1047
|
+
" --rich-media Enable rich media setup (also enables multimodal embeddings)",
|
|
1048
|
+
" --no-rich-media Disable rich media setup",
|
|
1049
|
+
" --extractors <list> Comma-separated extractors (implies --rich-media)",
|
|
847
1050
|
"",
|
|
848
1051
|
"Examples:",
|
|
849
1052
|
" bunx unrag@latest init",
|
|
850
1053
|
" bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
|
|
1054
|
+
" bunx unrag@latest init --yes --rich-media",
|
|
1055
|
+
" bunx unrag@latest init --yes --extractors pdf-text-layer,file-text",
|
|
851
1056
|
" bunx unrag add notion --yes",
|
|
852
1057
|
"",
|
|
853
1058
|
"Docs:",
|
package/package.json
CHANGED
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
"name": "unrag",
|
|
3
3
|
"type": "module",
|
|
4
4
|
"repository": "https://github.com/BetterStacks/unrag",
|
|
5
|
+
"homepage": "https://unrag.dev",
|
|
5
6
|
"bin": {
|
|
6
7
|
"unrag": "./dist/cli/index.js"
|
|
7
8
|
},
|
|
8
|
-
"version": "0.2.3",
|
|
9
|
+
"version": "0.2.5",
|
|
9
10
|
"private": false,
|
|
10
11
|
"license": "Apache-2.0",
|
|
11
12
|
"devDependencies": {
|
package/registry/config/unrag.config.ts
CHANGED
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
* treated like vendored source code.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
// @ts-nocheck
|
|
15
|
+
|
|
14
16
|
// __UNRAG_IMPORTS__
|
|
15
17
|
|
|
16
18
|
export const unrag = defineUnragConfig({
|
|
@@ -26,8 +28,8 @@ export const unrag = defineUnragConfig({
|
|
|
26
28
|
embedding: {
|
|
27
29
|
provider: "ai",
|
|
28
30
|
config: {
|
|
29
|
-
type: "text",
|
|
30
|
-
model: "openai/text-embedding-3-small",
|
|
31
|
+
type: "text", // __UNRAG_EMBEDDING_TYPE__
|
|
32
|
+
model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
|
|
31
33
|
timeoutMs: 15_000,
|
|
32
34
|
},
|
|
33
35
|
},
|
|
@@ -52,13 +54,15 @@ export const unrag = defineUnragConfig({
|
|
|
52
54
|
* - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
|
|
53
55
|
* - `extractors: [createPdfLlmExtractor()]`
|
|
54
56
|
*/
|
|
55
|
-
extractors: [
|
|
57
|
+
extractors: [
|
|
58
|
+
// __UNRAG_EXTRACTORS__
|
|
59
|
+
],
|
|
56
60
|
/**
|
|
57
61
|
* Rich media processing controls.
|
|
58
62
|
*
|
|
59
63
|
* Notes:
|
|
60
|
-
* -
|
|
61
|
-
* -
|
|
64
|
+
* - This generated config is cost-safe by default (all extraction is off).
|
|
65
|
+
* - `unrag init` can enable rich media + multimodal embeddings for you.
|
|
62
66
|
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
63
67
|
*/
|
|
64
68
|
assetProcessing: {
|
|
@@ -74,14 +78,14 @@ export const unrag = defineUnragConfig({
|
|
|
74
78
|
pdf: {
|
|
75
79
|
// Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
|
|
76
80
|
textLayer: {
|
|
77
|
-
enabled: false,
|
|
81
|
+
enabled: false, // __UNRAG_FLAG_pdf_textLayer__
|
|
78
82
|
maxBytes: 15 * 1024 * 1024,
|
|
79
83
|
maxOutputChars: 200_000,
|
|
80
84
|
minChars: 200,
|
|
81
85
|
// maxPages: 200,
|
|
82
86
|
},
|
|
83
87
|
llmExtraction: {
|
|
84
|
-
enabled:
|
|
88
|
+
enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
|
|
85
89
|
model: "google/gemini-2.0-flash",
|
|
86
90
|
prompt:
|
|
87
91
|
"Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
|
|
@@ -91,7 +95,7 @@ export const unrag = defineUnragConfig({
|
|
|
91
95
|
},
|
|
92
96
|
// Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
|
|
93
97
|
ocr: {
|
|
94
|
-
enabled: false,
|
|
98
|
+
enabled: false, // __UNRAG_FLAG_pdf_ocr__
|
|
95
99
|
maxBytes: 15 * 1024 * 1024,
|
|
96
100
|
maxOutputChars: 200_000,
|
|
97
101
|
minChars: 200,
|
|
@@ -104,7 +108,7 @@ export const unrag = defineUnragConfig({
|
|
|
104
108
|
},
|
|
105
109
|
image: {
|
|
106
110
|
ocr: {
|
|
107
|
-
enabled: false,
|
|
111
|
+
enabled: false, // __UNRAG_FLAG_image_ocr__
|
|
108
112
|
model: "google/gemini-2.0-flash",
|
|
109
113
|
prompt:
|
|
110
114
|
"Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
@@ -113,7 +117,7 @@ export const unrag = defineUnragConfig({
|
|
|
113
117
|
maxOutputChars: 50_000,
|
|
114
118
|
},
|
|
115
119
|
captionLlm: {
|
|
116
|
-
enabled: false,
|
|
120
|
+
enabled: false, // __UNRAG_FLAG_image_captionLlm__
|
|
117
121
|
model: "google/gemini-2.0-flash",
|
|
118
122
|
prompt:
|
|
119
123
|
"Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
|
|
@@ -124,7 +128,7 @@ export const unrag = defineUnragConfig({
|
|
|
124
128
|
},
|
|
125
129
|
audio: {
|
|
126
130
|
transcription: {
|
|
127
|
-
enabled: false,
|
|
131
|
+
enabled: false, // __UNRAG_FLAG_audio_transcription__
|
|
128
132
|
model: "openai/whisper-1",
|
|
129
133
|
timeoutMs: 120_000,
|
|
130
134
|
maxBytes: 25 * 1024 * 1024,
|
|
@@ -132,13 +136,13 @@ export const unrag = defineUnragConfig({
|
|
|
132
136
|
},
|
|
133
137
|
video: {
|
|
134
138
|
transcription: {
|
|
135
|
-
enabled: false,
|
|
139
|
+
enabled: false, // __UNRAG_FLAG_video_transcription__
|
|
136
140
|
model: "openai/whisper-1",
|
|
137
141
|
timeoutMs: 120_000,
|
|
138
142
|
maxBytes: 50 * 1024 * 1024,
|
|
139
143
|
},
|
|
140
144
|
frames: {
|
|
141
|
-
enabled: false,
|
|
145
|
+
enabled: false, // __UNRAG_FLAG_video_frames__
|
|
142
146
|
sampleFps: 0.2,
|
|
143
147
|
maxFrames: 50,
|
|
144
148
|
// ffmpegPath: "/usr/bin/ffmpeg",
|
|
@@ -151,10 +155,30 @@ export const unrag = defineUnragConfig({
|
|
|
151
155
|
},
|
|
152
156
|
},
|
|
153
157
|
file: {
|
|
154
|
-
text: {
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
+
text: {
|
|
159
|
+
enabled: false, // __UNRAG_FLAG_file_text__
|
|
160
|
+
maxBytes: 5 * 1024 * 1024,
|
|
161
|
+
maxOutputChars: 200_000,
|
|
162
|
+
minChars: 50,
|
|
163
|
+
},
|
|
164
|
+
docx: {
|
|
165
|
+
enabled: false, // __UNRAG_FLAG_file_docx__
|
|
166
|
+
maxBytes: 15 * 1024 * 1024,
|
|
167
|
+
maxOutputChars: 200_000,
|
|
168
|
+
minChars: 50,
|
|
169
|
+
},
|
|
170
|
+
pptx: {
|
|
171
|
+
enabled: false, // __UNRAG_FLAG_file_pptx__
|
|
172
|
+
maxBytes: 30 * 1024 * 1024,
|
|
173
|
+
maxOutputChars: 200_000,
|
|
174
|
+
minChars: 50,
|
|
175
|
+
},
|
|
176
|
+
xlsx: {
|
|
177
|
+
enabled: false, // __UNRAG_FLAG_file_xlsx__
|
|
178
|
+
maxBytes: 30 * 1024 * 1024,
|
|
179
|
+
maxOutputChars: 200_000,
|
|
180
|
+
minChars: 50,
|
|
181
|
+
},
|
|
158
182
|
},
|
|
159
183
|
},
|
|
160
184
|
},
|