unrag 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -4,7 +4,15 @@
4
4
  import { intro, outro as outro3 } from "@clack/prompts";
5
5
 
6
6
  // cli/commands/init.ts
7
- import { cancel as cancel2, isCancel as isCancel2, outro, select, text } from "@clack/prompts";
7
+ import {
8
+ cancel as cancel2,
9
+ confirm as confirm2,
10
+ groupMultiselect,
11
+ isCancel as isCancel2,
12
+ outro,
13
+ select,
14
+ text
15
+ } from "@clack/prompts";
8
16
  import path5 from "node:path";
9
17
  import { fileURLToPath } from "node:url";
10
18
 
@@ -71,8 +79,39 @@ var writeText = async (filePath, content) => {
71
79
  await ensureDir(path2.dirname(filePath));
72
80
  await writeFile(filePath, content, "utf8");
73
81
  };
82
+ var EXTRACTOR_FACTORY = {
83
+ "pdf-llm": "createPdfLlmExtractor",
84
+ "pdf-text-layer": "createPdfTextLayerExtractor",
85
+ "pdf-ocr": "createPdfOcrExtractor",
86
+ "image-ocr": "createImageOcrExtractor",
87
+ "image-caption-llm": "createImageCaptionLlmExtractor",
88
+ "audio-transcribe": "createAudioTranscribeExtractor",
89
+ "video-transcribe": "createVideoTranscribeExtractor",
90
+ "video-frames": "createVideoFramesExtractor",
91
+ "file-text": "createFileTextExtractor",
92
+ "file-docx": "createFileDocxExtractor",
93
+ "file-pptx": "createFilePptxExtractor",
94
+ "file-xlsx": "createFileXlsxExtractor"
95
+ };
96
+ var EXTRACTOR_FLAG_KEYS = {
97
+ "pdf-text-layer": ["pdf_textLayer"],
98
+ "pdf-llm": ["pdf_llmExtraction"],
99
+ "pdf-ocr": ["pdf_ocr"],
100
+ "image-ocr": ["image_ocr"],
101
+ "image-caption-llm": ["image_captionLlm"],
102
+ "audio-transcribe": ["audio_transcription"],
103
+ "video-transcribe": ["video_transcription"],
104
+ "video-frames": ["video_frames"],
105
+ "file-text": ["file_text"],
106
+ "file-docx": ["file_docx"],
107
+ "file-pptx": ["file_pptx"],
108
+ "file-xlsx": ["file_xlsx"]
109
+ };
110
+ var ALL_FLAG_KEYS = Array.from(new Set(Object.values(EXTRACTOR_FLAG_KEYS).flat())).sort();
74
111
  var renderUnragConfig = (content, selection) => {
75
112
  const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
113
+ const richMedia = selection.richMedia ?? { enabled: false, extractors: [] };
114
+ const selectedExtractors = Array.from(new Set(richMedia.extractors ?? [])).sort();
76
115
  const baseImports = [
77
116
  `import { defineUnragConfig } from "${installImportBase}/core";`
78
117
  ];
@@ -88,22 +127,40 @@ var renderUnragConfig = (content, selection) => {
88
127
  storeImports.push(`import { createPrismaVectorStore } from "${installImportBase}/store/prisma";`, `import { PrismaClient } from "@prisma/client";`);
89
128
  storeCreateLines.push(` const prisma = (globalThis as any).__unragPrisma ?? new PrismaClient();`, ` (globalThis as any).__unragPrisma = prisma;`, ` const store = createPrismaVectorStore(prisma);`);
90
129
  }
91
- const importsBlock = [...baseImports, ...storeImports].join(`
130
+ const extractorImports = [];
131
+ if (richMedia.enabled && selectedExtractors.length > 0) {
132
+ for (const ex of selectedExtractors) {
133
+ const factory = EXTRACTOR_FACTORY[ex];
134
+ extractorImports.push(`import { ${factory} } from "${installImportBase}/extractors/${ex}";`);
135
+ }
136
+ }
137
+ const importsBlock = [...baseImports, ...storeImports, ...extractorImports].join(`
92
138
  `);
93
139
  const createEngineBlock = [
94
140
  `export function createUnragEngine() {`,
95
141
  ...storeCreateLines,
96
142
  ``,
97
143
  ` return unrag.createEngine({ store });`,
98
- `}`,
99
- ``,
100
- `export async function retrieve(query: string) {`,
101
- ` const engine = createUnragEngine();`,
102
- ` return engine.retrieve({ query, topK: unrag.defaults.retrieval.topK });`,
103
144
  `}`
104
145
  ].join(`
105
146
  `);
106
- return content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
147
+ let out = content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
148
+ out = out.replace('type: "text", // __UNRAG_EMBEDDING_TYPE__', richMedia.enabled ? 'type: "multimodal",' : 'type: "text",').replace('model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__', richMedia.enabled ? 'model: "cohere/embed-v4.0",' : 'model: "openai/text-embedding-3-small",');
149
+ const enabledFlagKeys = new Set;
150
+ if (richMedia.enabled) {
151
+ for (const ex of selectedExtractors) {
152
+ for (const k of EXTRACTOR_FLAG_KEYS[ex] ?? []) {
153
+ enabledFlagKeys.add(k);
154
+ }
155
+ }
156
+ }
157
+ for (const k of ALL_FLAG_KEYS) {
158
+ out = out.replace(`enabled: false, // __UNRAG_FLAG_${k}__`, `enabled: ${enabledFlagKeys.has(k) ? "true" : "false"},`);
159
+ }
160
+ const extractorLines = richMedia.enabled && selectedExtractors.length > 0 ? selectedExtractors.map((ex) => ` ${EXTRACTOR_FACTORY[ex]}(),`).join(`
161
+ `) : "";
162
+ out = out.replace(" // __UNRAG_EXTRACTORS__", extractorLines);
163
+ return out;
107
164
  };
108
165
  var renderDocs = (content, selection) => {
109
166
  const notes = [];
@@ -272,28 +329,34 @@ async function copyExtractorFiles(selection) {
272
329
  const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
273
330
  const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
274
331
  const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
332
+ const shouldWrite = async (src, dest) => {
333
+ if (!await exists(dest))
334
+ return true;
335
+ if (nonInteractive)
336
+ return false;
337
+ try {
338
+ const [srcRaw, destRaw] = await Promise.all([readText(src), readText(dest)]);
339
+ if (srcRaw === destRaw)
340
+ return false;
341
+ } catch {}
342
+ const answer = await confirm({
343
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
344
+ initialValue: false
345
+ });
346
+ if (isCancel(answer)) {
347
+ cancel("Cancelled.");
348
+ return false;
349
+ }
350
+ return Boolean(answer);
351
+ };
275
352
  for (const src of extractorFiles) {
276
353
  if (!await exists(src)) {
277
354
  throw new Error(`Registry file missing: ${src}`);
278
355
  }
279
356
  const rel = path2.relative(extractorRegistryAbs, src);
280
357
  const dest = path2.join(destRootAbs, rel);
281
- if (await exists(dest)) {
282
- if (nonInteractive) {
283
- continue;
284
- }
285
- const answer = await confirm({
286
- message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
287
- initialValue: false
288
- });
289
- if (isCancel(answer)) {
290
- cancel("Cancelled.");
291
- return;
292
- }
293
- if (!answer) {
294
- continue;
295
- }
296
- }
358
+ if (!await shouldWrite(src, dest))
359
+ continue;
297
360
  const raw = await readText(src);
298
361
  await writeText(dest, raw);
299
362
  }
@@ -303,22 +366,8 @@ async function copyExtractorFiles(selection) {
303
366
  }
304
367
  const rel = path2.relative(sharedRegistryAbs, src);
305
368
  const dest = path2.join(sharedDestRootAbs, rel);
306
- if (await exists(dest)) {
307
- if (nonInteractive) {
308
- continue;
309
- }
310
- const answer = await confirm({
311
- message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
312
- initialValue: false
313
- });
314
- if (isCancel(answer)) {
315
- cancel("Cancelled.");
316
- return;
317
- }
318
- if (!answer) {
319
- continue;
320
- }
321
- }
369
+ if (!await shouldWrite(src, dest))
370
+ continue;
322
371
  const raw = await readText(src);
323
372
  await writeText(dest, raw);
324
373
  }
@@ -406,6 +455,10 @@ function depsForConnector(connector) {
406
455
  if (connector === "notion") {
407
456
  deps["@notionhq/client"] = "^2.2.16";
408
457
  }
458
+ if (connector === "google-drive") {
459
+ deps["googleapis"] = "^148.0.0";
460
+ deps["google-auth-library"] = "^10.0.0";
461
+ }
409
462
  return { deps, devDeps };
410
463
  }
411
464
  function depsForExtractor(extractor) {
@@ -552,9 +605,93 @@ var parseInitArgs = (args) => {
552
605
  }
553
606
  continue;
554
607
  }
608
+ if (a === "--rich-media") {
609
+ out.richMedia = true;
610
+ continue;
611
+ }
612
+ if (a === "--no-rich-media") {
613
+ out.richMedia = false;
614
+ continue;
615
+ }
616
+ if (a === "--extractors") {
617
+ const v = args[i + 1];
618
+ if (v) {
619
+ out.extractors = v.split(",").map((s) => s.trim()).filter(Boolean);
620
+ i++;
621
+ }
622
+ continue;
623
+ }
555
624
  }
556
625
  return out;
557
626
  };
627
+ var DEFAULT_RICH_MEDIA_EXTRACTORS = ["pdf-text-layer", "file-text"];
628
+ var EXTRACTOR_OPTIONS = [
629
+ {
630
+ group: "PDF",
631
+ value: "pdf-text-layer",
632
+ label: `pdf-text-layer (Fast/cheap extraction via PDF text layer)`,
633
+ hint: "recommended"
634
+ },
635
+ {
636
+ group: "PDF",
637
+ value: "pdf-llm",
638
+ label: `pdf-llm (LLM-based PDF extraction; higher cost)`
639
+ },
640
+ {
641
+ group: "PDF",
642
+ value: "pdf-ocr",
643
+ label: `pdf-ocr (OCR scanned PDFs; requires native binaries)`,
644
+ hint: "worker-only"
645
+ },
646
+ {
647
+ group: "Image",
648
+ value: "image-ocr",
649
+ label: `image-ocr (Extract text from images via vision LLM)`
650
+ },
651
+ {
652
+ group: "Image",
653
+ value: "image-caption-llm",
654
+ label: `image-caption-llm (Generate captions for images via vision LLM)`
655
+ },
656
+ {
657
+ group: "Audio",
658
+ value: "audio-transcribe",
659
+ label: `audio-transcribe (Speech-to-text transcription)`
660
+ },
661
+ {
662
+ group: "Video",
663
+ value: "video-transcribe",
664
+ label: `video-transcribe (Transcribe video audio track)`
665
+ },
666
+ {
667
+ group: "Video",
668
+ value: "video-frames",
669
+ label: `video-frames (Sample frames + analyze via vision LLM; requires ffmpeg)`,
670
+ hint: "worker-only"
671
+ },
672
+ {
673
+ group: "Files",
674
+ value: "file-text",
675
+ label: `file-text (Extract text/markdown/json/html from common text files)`,
676
+ hint: "recommended"
677
+ },
678
+ {
679
+ group: "Files",
680
+ value: "file-docx",
681
+ label: `file-docx (Extract text from .docx files)`
682
+ },
683
+ {
684
+ group: "Files",
685
+ value: "file-pptx",
686
+ label: `file-pptx (Extract text from .pptx slides)`
687
+ },
688
+ {
689
+ group: "Files",
690
+ value: "file-xlsx",
691
+ label: `file-xlsx (Extract tables from .xlsx spreadsheets)`
692
+ }
693
+ ];
694
+ var AVAILABLE_EXTRACTORS = new Set(EXTRACTOR_OPTIONS.map((o) => o.value));
558
695
  async function initCommand(args) {
559
696
  const root = await tryFindProjectRoot(process.cwd());
560
697
  if (!root) {
@@ -623,17 +760,71 @@ async function initCommand(args) {
623
760
  return;
624
761
  }
625
762
  const aliasBase = String(aliasAnswer).trim();
763
+ if (parsed.richMedia === false && (parsed.extractors ?? []).length > 0) {
764
+ throw new Error('Cannot use "--no-rich-media" together with "--extractors".');
765
+ }
766
+ const extractorsFromArgs = (parsed.extractors ?? []).filter((x) => AVAILABLE_EXTRACTORS.has(x)).sort();
767
+ const richMediaAnswer = extractorsFromArgs.length > 0 ? true : typeof parsed.richMedia === "boolean" ? parsed.richMedia : nonInteractive ? false : await confirm2({
768
+ message: "Enable rich media ingestion (PDF/images/audio/video/files)? This also enables multimodal image embeddings (you can change this later).",
769
+ initialValue: false
770
+ });
771
+ if (isCancel2(richMediaAnswer)) {
772
+ cancel2("Cancelled.");
773
+ return;
774
+ }
775
+ const richMediaEnabled = Boolean(richMediaAnswer);
776
+ const selectedExtractorsAnswer = richMediaEnabled || extractorsFromArgs.length > 0 ? nonInteractive ? extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS : await groupMultiselect({
777
+ message: "Select extractors to enable (space to toggle, enter to confirm)",
778
+ options: EXTRACTOR_OPTIONS.reduce((acc, opt) => {
779
+ acc[opt.group] ??= [];
780
+ acc[opt.group].push({
781
+ value: opt.value,
782
+ label: opt.label,
783
+ ...opt.hint ? { hint: opt.hint } : {}
784
+ });
785
+ return acc;
786
+ }, {}),
787
+ initialValues: extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS,
788
+ required: false
789
+ }) : [];
790
+ if (isCancel2(selectedExtractorsAnswer)) {
791
+ cancel2("Cancelled.");
792
+ return;
793
+ }
794
+ const selectedExtractors = Array.from(new Set(Array.isArray(selectedExtractorsAnswer) ? selectedExtractorsAnswer : [])).sort();
626
795
  const selection = {
627
796
  installDir,
628
797
  storeAdapter: storeAdapterAnswer,
629
798
  projectRoot: root,
630
799
  registryRoot,
631
- aliasBase
800
+ aliasBase,
801
+ richMedia: richMediaEnabled ? {
802
+ enabled: true,
803
+ extractors: selectedExtractors
804
+ } : { enabled: false, extractors: [] }
632
805
  };
633
806
  await copyRegistryFiles(selection);
807
+ if (richMediaEnabled && selectedExtractors.length > 0) {
808
+ for (const extractor of selectedExtractors) {
809
+ await copyExtractorFiles({
810
+ projectRoot: root,
811
+ registryRoot,
812
+ installDir,
813
+ extractor,
814
+ yes: nonInteractive
815
+ });
816
+ }
817
+ }
634
818
  const pkg = await readPackageJson(root);
635
819
  const { deps, devDeps } = depsForAdapter(storeAdapterAnswer);
636
- const merged = mergeDeps(pkg, deps, devDeps);
820
+ const extractorDeps = {};
821
+ const extractorDevDeps = {};
822
+ for (const ex of selectedExtractors) {
823
+ const r = depsForExtractor(ex);
824
+ Object.assign(extractorDeps, r.deps);
825
+ Object.assign(extractorDevDeps, r.devDeps);
826
+ }
827
+ const merged = mergeDeps(pkg, { ...deps, ...extractorDeps }, { ...devDeps, ...extractorDevDeps });
637
828
  if (merged.changes.length > 0) {
638
829
  await writePackageJson(root, merged.pkg);
639
830
  }
@@ -643,7 +834,10 @@ async function initCommand(args) {
643
834
  aliasBase,
644
835
  version: CONFIG_VERSION,
645
836
  connectors: existing?.connectors ?? [],
646
- extractors: existing?.extractors ?? []
837
+ extractors: Array.from(new Set([
838
+ ...existing?.extractors ?? [],
839
+ ...richMediaEnabled ? selectedExtractors : []
840
+ ])).sort()
647
841
  };
648
842
  await writeJsonFile(path5.join(root, CONFIG_FILE), config);
649
843
  const pm = await detectPackageManager(root);
@@ -657,6 +851,11 @@ async function initCommand(args) {
657
851
  `- Docs: ${path5.join(installDir, "unrag.md")}`,
658
852
  `- Config: unrag.config.ts`,
659
853
  `- Imports: ${aliasBase}/* and ${aliasBase}/config`,
854
+ "",
855
+ `- Rich media: ${richMediaEnabled ? "enabled" : "disabled"}`,
856
+ richMediaEnabled ? `- Embeddings: multimodal enabled (images can be embedded directly)` : `- Embeddings: text-only (no direct image embedding)`,
857
+ richMediaEnabled ? `- Extractors: ${selectedExtractors.length > 0 ? selectedExtractors.join(", ") : "none"}` : "",
858
+ richMediaEnabled ? ` Tip: you can tweak extractors + assetProcessing flags in unrag.config.ts later.` : ` Tip: re-run \`unrag init --rich-media\` (or edit unrag.config.ts) to enable rich media later.`,
660
859
  isNext ? tsconfigResult.changed ? `- Next.js: updated ${tsconfigResult.file} (added aliases)` : `- Next.js: no tsconfig changes needed` : `- Next.js: not detected`,
661
860
  "",
662
861
  merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
@@ -685,7 +884,7 @@ function docsUrl(siteRelativePath) {
685
884
  var CONFIG_FILE2 = "unrag.json";
686
885
  var __filename3 = fileURLToPath2(import.meta.url);
687
886
  var __dirname3 = path6.dirname(__filename3);
688
- var AVAILABLE_EXTRACTORS = [
887
+ var AVAILABLE_EXTRACTORS2 = [
689
888
  "pdf-llm",
690
889
  "pdf-text-layer",
691
890
  "pdf-ocr",
@@ -699,6 +898,7 @@ var AVAILABLE_EXTRACTORS = [
699
898
  "file-pptx",
700
899
  "file-xlsx"
701
900
  ];
901
+ var AVAILABLE_CONNECTORS = ["notion", "google-drive"];
702
902
  var parseAddArgs = (args) => {
703
903
  const out = {};
704
904
  for (let i = 0;i < args.length; i++) {
@@ -737,8 +937,8 @@ async function addCommand(args) {
737
937
  " unrag add <connector>",
738
938
  " unrag add extractor <name>",
739
939
  "",
740
- "Available connectors: notion",
741
- `Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`
940
+ `Available connectors: ${AVAILABLE_CONNECTORS.join(", ")}`,
941
+ `Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`
742
942
  ].join(`
743
943
  `));
744
944
  return;
@@ -757,10 +957,10 @@ async function addCommand(args) {
757
957
  const pkg = await readPackageJson(root);
758
958
  if (kind === "connector") {
759
959
  const connector = name;
760
- if (connector !== "notion") {
960
+ if (!connector || !AVAILABLE_CONNECTORS.includes(connector)) {
761
961
  outro2(`Unknown connector: ${name}
762
962
 
763
- Available connectors: notion`);
963
+ Available connectors: ${AVAILABLE_CONNECTORS.join(", ")}`);
764
964
  return;
765
965
  }
766
966
  await copyConnectorFiles({
@@ -784,16 +984,16 @@ Available connectors: notion`);
784
984
  `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
785
985
  "",
786
986
  merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
787
- nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
987
+ nonInteractive ? "" : connector === "notion" ? "Tip: keep NOTION_TOKEN server-side only (env var)." : connector === "google-drive" ? "Tip: keep Google OAuth refresh tokens and service account keys server-side only." : ""
788
988
  ].filter(Boolean).join(`
789
989
  `));
790
990
  return;
791
991
  }
792
992
  const extractor = name;
793
- if (!extractor || !AVAILABLE_EXTRACTORS.includes(extractor)) {
993
+ if (!extractor || !AVAILABLE_EXTRACTORS2.includes(extractor)) {
794
994
  outro2(`Unknown extractor: ${name}
795
995
 
796
- Available extractors: ${AVAILABLE_EXTRACTORS.join(", ")}`);
996
+ Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`);
797
997
  return;
798
998
  }
799
999
  await copyExtractorFiles({
@@ -844,10 +1044,15 @@ function renderHelp() {
844
1044
  " --store <adapter> drizzle | prisma | raw-sql",
845
1045
  " --dir <path> Install directory (alias: --install-dir)",
846
1046
  " --alias <@name> Import alias base (e.g. @unrag)",
1047
+ " --rich-media Enable rich media setup (also enables multimodal embeddings)",
1048
+ " --no-rich-media Disable rich media setup",
1049
+ " --extractors <list> Comma-separated extractors (implies --rich-media)",
847
1050
  "",
848
1051
  "Examples:",
849
1052
  " bunx unrag@latest init",
850
1053
  " bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
1054
+ " bunx unrag@latest init --yes --rich-media",
1055
+ " bunx unrag@latest init --yes --extractors pdf-text-layer,file-text",
851
1056
  " bunx unrag add notion --yes",
852
1057
  "",
853
1058
  "Docs:",
package/package.json CHANGED
@@ -2,10 +2,11 @@
2
2
  "name": "unrag",
3
3
  "type": "module",
4
4
  "repository": "https://github.com/BetterStacks/unrag",
5
+ "homepage": "https://unrag.dev",
5
6
  "bin": {
6
7
  "unrag": "./dist/cli/index.js"
7
8
  },
8
- "version": "0.2.3",
9
+ "version": "0.2.5",
9
10
  "private": false,
10
11
  "license": "Apache-2.0",
11
12
  "devDependencies": {
@@ -11,6 +11,8 @@
11
11
  * treated like vendored source code.
12
12
  */
13
13
 
14
+ // @ts-nocheck
15
+
14
16
  // __UNRAG_IMPORTS__
15
17
 
16
18
  export const unrag = defineUnragConfig({
@@ -26,8 +28,8 @@ export const unrag = defineUnragConfig({
26
28
  embedding: {
27
29
  provider: "ai",
28
30
  config: {
29
- type: "text",
30
- model: "openai/text-embedding-3-small",
31
+ type: "text", // __UNRAG_EMBEDDING_TYPE__
32
+ model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
31
33
  timeoutMs: 15_000,
32
34
  },
33
35
  },
@@ -52,13 +54,15 @@ export const unrag = defineUnragConfig({
52
54
  * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
53
55
  * - `extractors: [createPdfLlmExtractor()]`
54
56
  */
55
- extractors: [],
57
+ extractors: [
58
+ // __UNRAG_EXTRACTORS__
59
+ ],
56
60
  /**
57
61
  * Rich media processing controls.
58
62
  *
59
63
  * Notes:
60
- * - The library defaults are cost-safe (PDF LLM extraction is off).
61
- * - This generated config opts you into PDF extraction for convenience.
64
+ * - This generated config is cost-safe by default (all extraction is off).
65
+ * - `unrag init` can enable rich media + multimodal embeddings for you.
62
66
  * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
63
67
  */
64
68
  assetProcessing: {
@@ -74,14 +78,14 @@ export const unrag = defineUnragConfig({
74
78
  pdf: {
75
79
  // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
76
80
  textLayer: {
77
- enabled: false,
81
+ enabled: false, // __UNRAG_FLAG_pdf_textLayer__
78
82
  maxBytes: 15 * 1024 * 1024,
79
83
  maxOutputChars: 200_000,
80
84
  minChars: 200,
81
85
  // maxPages: 200,
82
86
  },
83
87
  llmExtraction: {
84
- enabled: true,
88
+ enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
85
89
  model: "google/gemini-2.0-flash",
86
90
  prompt:
87
91
  "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
@@ -91,7 +95,7 @@ export const unrag = defineUnragConfig({
91
95
  },
92
96
  // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
93
97
  ocr: {
94
- enabled: false,
98
+ enabled: false, // __UNRAG_FLAG_pdf_ocr__
95
99
  maxBytes: 15 * 1024 * 1024,
96
100
  maxOutputChars: 200_000,
97
101
  minChars: 200,
@@ -104,7 +108,7 @@ export const unrag = defineUnragConfig({
104
108
  },
105
109
  image: {
106
110
  ocr: {
107
- enabled: false,
111
+ enabled: false, // __UNRAG_FLAG_image_ocr__
108
112
  model: "google/gemini-2.0-flash",
109
113
  prompt:
110
114
  "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
@@ -113,7 +117,7 @@ export const unrag = defineUnragConfig({
113
117
  maxOutputChars: 50_000,
114
118
  },
115
119
  captionLlm: {
116
- enabled: false,
120
+ enabled: false, // __UNRAG_FLAG_image_captionLlm__
117
121
  model: "google/gemini-2.0-flash",
118
122
  prompt:
119
123
  "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
@@ -124,7 +128,7 @@ export const unrag = defineUnragConfig({
124
128
  },
125
129
  audio: {
126
130
  transcription: {
127
- enabled: false,
131
+ enabled: false, // __UNRAG_FLAG_audio_transcription__
128
132
  model: "openai/whisper-1",
129
133
  timeoutMs: 120_000,
130
134
  maxBytes: 25 * 1024 * 1024,
@@ -132,13 +136,13 @@ export const unrag = defineUnragConfig({
132
136
  },
133
137
  video: {
134
138
  transcription: {
135
- enabled: false,
139
+ enabled: false, // __UNRAG_FLAG_video_transcription__
136
140
  model: "openai/whisper-1",
137
141
  timeoutMs: 120_000,
138
142
  maxBytes: 50 * 1024 * 1024,
139
143
  },
140
144
  frames: {
141
- enabled: false,
145
+ enabled: false, // __UNRAG_FLAG_video_frames__
142
146
  sampleFps: 0.2,
143
147
  maxFrames: 50,
144
148
  // ffmpegPath: "/usr/bin/ffmpeg",
@@ -151,10 +155,30 @@ export const unrag = defineUnragConfig({
151
155
  },
152
156
  },
153
157
  file: {
154
- text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
155
- docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
156
- pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
157
- xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
158
+ text: {
159
+ enabled: false, // __UNRAG_FLAG_file_text__
160
+ maxBytes: 5 * 1024 * 1024,
161
+ maxOutputChars: 200_000,
162
+ minChars: 50,
163
+ },
164
+ docx: {
165
+ enabled: false, // __UNRAG_FLAG_file_docx__
166
+ maxBytes: 15 * 1024 * 1024,
167
+ maxOutputChars: 200_000,
168
+ minChars: 50,
169
+ },
170
+ pptx: {
171
+ enabled: false, // __UNRAG_FLAG_file_pptx__
172
+ maxBytes: 30 * 1024 * 1024,
173
+ maxOutputChars: 200_000,
174
+ minChars: 50,
175
+ },
176
+ xlsx: {
177
+ enabled: false, // __UNRAG_FLAG_file_xlsx__
178
+ maxBytes: 30 * 1024 * 1024,
179
+ maxOutputChars: 200_000,
180
+ minChars: 50,
181
+ },
158
182
  },
159
183
  },
160
184
  },