unrag 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +408 -50
  3. package/package.json +3 -1
  4. package/registry/config/unrag.config.ts +164 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/README.md CHANGED
@@ -10,13 +10,13 @@ It installs small, auditable source files into your repo:
10
10
  ## Usage
11
11
 
12
12
  ```bash
13
- bunx unrag init
13
+ bunx unrag@latest init
14
14
  ```
15
15
 
16
16
  ### Common flags
17
17
 
18
18
  ```bash
19
- bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag
19
+ bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag
20
20
  ```
21
21
 
22
22
  - `--store`: `drizzle` | `prisma` | `raw-sql`
package/dist/cli/index.js CHANGED
@@ -4,7 +4,15 @@
4
4
  import { intro, outro as outro3 } from "@clack/prompts";
5
5
 
6
6
  // cli/commands/init.ts
7
- import { cancel as cancel2, isCancel as isCancel2, outro, select, text } from "@clack/prompts";
7
+ import {
8
+ cancel as cancel2,
9
+ confirm as confirm2,
10
+ groupMultiselect,
11
+ isCancel as isCancel2,
12
+ outro,
13
+ select,
14
+ text
15
+ } from "@clack/prompts";
8
16
  import path5 from "node:path";
9
17
  import { fileURLToPath } from "node:url";
10
18
 
@@ -71,11 +79,41 @@ var writeText = async (filePath, content) => {
71
79
  await ensureDir(path2.dirname(filePath));
72
80
  await writeFile(filePath, content, "utf8");
73
81
  };
82
+ var EXTRACTOR_FACTORY = {
83
+ "pdf-llm": "createPdfLlmExtractor",
84
+ "pdf-text-layer": "createPdfTextLayerExtractor",
85
+ "pdf-ocr": "createPdfOcrExtractor",
86
+ "image-ocr": "createImageOcrExtractor",
87
+ "image-caption-llm": "createImageCaptionLlmExtractor",
88
+ "audio-transcribe": "createAudioTranscribeExtractor",
89
+ "video-transcribe": "createVideoTranscribeExtractor",
90
+ "video-frames": "createVideoFramesExtractor",
91
+ "file-text": "createFileTextExtractor",
92
+ "file-docx": "createFileDocxExtractor",
93
+ "file-pptx": "createFilePptxExtractor",
94
+ "file-xlsx": "createFileXlsxExtractor"
95
+ };
96
+ var EXTRACTOR_FLAG_KEYS = {
97
+ "pdf-text-layer": ["pdf_textLayer"],
98
+ "pdf-llm": ["pdf_llmExtraction"],
99
+ "pdf-ocr": ["pdf_ocr"],
100
+ "image-ocr": ["image_ocr"],
101
+ "image-caption-llm": ["image_captionLlm"],
102
+ "audio-transcribe": ["audio_transcription"],
103
+ "video-transcribe": ["video_transcription"],
104
+ "video-frames": ["video_frames"],
105
+ "file-text": ["file_text"],
106
+ "file-docx": ["file_docx"],
107
+ "file-pptx": ["file_pptx"],
108
+ "file-xlsx": ["file_xlsx"]
109
+ };
110
+ var ALL_FLAG_KEYS = Array.from(new Set(Object.values(EXTRACTOR_FLAG_KEYS).flat())).sort();
74
111
  var renderUnragConfig = (content, selection) => {
75
112
  const installImportBase = `./${selection.installDir.replace(/\\/g, "/")}`;
113
+ const richMedia = selection.richMedia ?? { enabled: false, extractors: [] };
114
+ const selectedExtractors = Array.from(new Set(richMedia.extractors ?? [])).sort();
76
115
  const baseImports = [
77
- `import { createContextEngine, defineConfig } from "${installImportBase}/core";`,
78
- `import { createAiEmbeddingProvider } from "${installImportBase}/embedding/ai";`
116
+ `import { defineUnragConfig } from "${installImportBase}/core";`
79
117
  ];
80
118
  const storeImports = [];
81
119
  const storeCreateLines = [];
@@ -89,32 +127,40 @@ var renderUnragConfig = (content, selection) => {
89
127
  storeImports.push(`import { createPrismaVectorStore } from "${installImportBase}/store/prisma";`, `import { PrismaClient } from "@prisma/client";`);
90
128
  storeCreateLines.push(` const prisma = (globalThis as any).__unragPrisma ?? new PrismaClient();`, ` (globalThis as any).__unragPrisma = prisma;`, ` const store = createPrismaVectorStore(prisma);`);
91
129
  }
92
- const importsBlock = [...baseImports, ...storeImports].join(`
130
+ const extractorImports = [];
131
+ if (richMedia.enabled && selectedExtractors.length > 0) {
132
+ for (const ex of selectedExtractors) {
133
+ const factory = EXTRACTOR_FACTORY[ex];
134
+ extractorImports.push(`import { ${factory} } from "${installImportBase}/extractors/${ex}";`);
135
+ }
136
+ }
137
+ const importsBlock = [...baseImports, ...storeImports, ...extractorImports].join(`
93
138
  `);
94
139
  const createEngineBlock = [
95
140
  `export function createUnragEngine() {`,
96
- ` const embedding = createAiEmbeddingProvider({`,
97
- ` model: unragConfig.embedding.model,`,
98
- ` timeoutMs: unragConfig.embedding.timeoutMs,`,
99
- ` });`,
100
141
  ...storeCreateLines,
101
142
  ``,
102
- ` return createContextEngine(`,
103
- ` defineConfig({`,
104
- ` embedding,`,
105
- ` store,`,
106
- ` defaults: unragConfig.chunking,`,
107
- ` })`,
108
- ` );`,
109
- `}`,
110
- ``,
111
- `export async function retrieve(query: string) {`,
112
- ` const engine = createUnragEngine();`,
113
- ` return engine.retrieve({ query, topK: unragConfig.retrieval.topK });`,
143
+ ` return unrag.createEngine({ store });`,
114
144
  `}`
115
145
  ].join(`
116
146
  `);
117
- return content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
147
+ let out = content.replace("// __UNRAG_IMPORTS__", importsBlock).replace("// __UNRAG_CREATE_ENGINE__", createEngineBlock);
148
+ out = out.replace('type: "text", // __UNRAG_EMBEDDING_TYPE__', richMedia.enabled ? 'type: "multimodal",' : 'type: "text",').replace('model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__', richMedia.enabled ? 'model: "cohere/embed-v4.0",' : 'model: "openai/text-embedding-3-small",');
149
+ const enabledFlagKeys = new Set;
150
+ if (richMedia.enabled) {
151
+ for (const ex of selectedExtractors) {
152
+ for (const k of EXTRACTOR_FLAG_KEYS[ex] ?? []) {
153
+ enabledFlagKeys.add(k);
154
+ }
155
+ }
156
+ }
157
+ for (const k of ALL_FLAG_KEYS) {
158
+ out = out.replace(`enabled: false, // __UNRAG_FLAG_${k}__`, `enabled: ${enabledFlagKeys.has(k) ? "true" : "false"},`);
159
+ }
160
+ const extractorLines = richMedia.enabled && selectedExtractors.length > 0 ? selectedExtractors.map((ex) => ` ${EXTRACTOR_FACTORY[ex]}(),`).join(`
161
+ `) : "";
162
+ out = out.replace(" // __UNRAG_EXTRACTORS__", extractorLines);
163
+ return out;
118
164
  };
119
165
  var renderDocs = (content, selection) => {
120
166
  const notes = [];
@@ -147,6 +193,10 @@ async function copyRegistryFiles(selection) {
147
193
  src: path2.join(selection.registryRoot, "core/index.ts"),
148
194
  dest: path2.join(installBaseAbs, "core/index.ts")
149
195
  },
196
+ {
197
+ src: path2.join(selection.registryRoot, "core/assets.ts"),
198
+ dest: path2.join(installBaseAbs, "core/assets.ts")
199
+ },
150
200
  {
151
201
  src: path2.join(selection.registryRoot, "core/types.ts"),
152
202
  dest: path2.join(installBaseAbs, "core/types.ts")
@@ -163,6 +213,10 @@ async function copyRegistryFiles(selection) {
163
213
  src: path2.join(selection.registryRoot, "core/context-engine.ts"),
164
214
  dest: path2.join(installBaseAbs, "core/context-engine.ts")
165
215
  },
216
+ {
217
+ src: path2.join(selection.registryRoot, "core/delete.ts"),
218
+ dest: path2.join(installBaseAbs, "core/delete.ts")
219
+ },
166
220
  {
167
221
  src: path2.join(selection.registryRoot, "core/ingest.ts"),
168
222
  dest: path2.join(installBaseAbs, "core/ingest.ts")
@@ -262,6 +316,62 @@ async function copyConnectorFiles(selection) {
262
316
  await writeText(dest, raw);
263
317
  }
264
318
  }
319
+ async function copyExtractorFiles(selection) {
320
+ const toAbs = (projectRelative) => path2.join(selection.projectRoot, projectRelative);
321
+ const installBaseAbs = toAbs(selection.installDir);
322
+ const extractorRegistryAbs = path2.join(selection.registryRoot, "extractors", selection.extractor);
323
+ const sharedRegistryAbs = path2.join(selection.registryRoot, "extractors", "_shared");
324
+ if (!await exists(extractorRegistryAbs)) {
325
+ throw new Error(`Unknown extractor registry: ${path2.relative(selection.registryRoot, extractorRegistryAbs)}`);
326
+ }
327
+ const extractorFiles = await listFilesRecursive(extractorRegistryAbs);
328
+ const sharedFiles = await exists(sharedRegistryAbs) ? await listFilesRecursive(sharedRegistryAbs) : [];
329
+ const destRootAbs = path2.join(installBaseAbs, "extractors", selection.extractor);
330
+ const sharedDestRootAbs = path2.join(installBaseAbs, "extractors", "_shared");
331
+ const nonInteractive = Boolean(selection.yes) || !process.stdin.isTTY;
332
+ const shouldWrite = async (src, dest) => {
333
+ if (!await exists(dest))
334
+ return true;
335
+ if (nonInteractive)
336
+ return false;
337
+ try {
338
+ const [srcRaw, destRaw] = await Promise.all([readText(src), readText(dest)]);
339
+ if (srcRaw === destRaw)
340
+ return false;
341
+ } catch {}
342
+ const answer = await confirm({
343
+ message: `Overwrite ${path2.relative(selection.projectRoot, dest)}?`,
344
+ initialValue: false
345
+ });
346
+ if (isCancel(answer)) {
347
+ cancel("Cancelled.");
348
+ return false;
349
+ }
350
+ return Boolean(answer);
351
+ };
352
+ for (const src of extractorFiles) {
353
+ if (!await exists(src)) {
354
+ throw new Error(`Registry file missing: ${src}`);
355
+ }
356
+ const rel = path2.relative(extractorRegistryAbs, src);
357
+ const dest = path2.join(destRootAbs, rel);
358
+ if (!await shouldWrite(src, dest))
359
+ continue;
360
+ const raw = await readText(src);
361
+ await writeText(dest, raw);
362
+ }
363
+ for (const src of sharedFiles) {
364
+ if (!await exists(src)) {
365
+ throw new Error(`Registry file missing: ${src}`);
366
+ }
367
+ const rel = path2.relative(sharedRegistryAbs, src);
368
+ const dest = path2.join(sharedDestRootAbs, rel);
369
+ if (!await shouldWrite(src, dest))
370
+ continue;
371
+ const raw = await readText(src);
372
+ await writeText(dest, raw);
373
+ }
374
+ }
265
375
 
266
376
  // cli/lib/json.ts
267
377
  import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
@@ -347,6 +457,37 @@ function depsForConnector(connector) {
347
457
  }
348
458
  return { deps, devDeps };
349
459
  }
460
+ function depsForExtractor(extractor) {
461
+ const deps = {};
462
+ const devDeps = {};
463
+ if (extractor === "pdf-llm") {
464
+ deps["ai"] = "^5.0.113";
465
+ }
466
+ if (extractor === "pdf-text-layer") {
467
+ deps["pdfjs-dist"] = "^5.4.149";
468
+ }
469
+ if (extractor === "pdf-ocr") {}
470
+ if (extractor === "image-ocr" || extractor === "image-caption-llm") {
471
+ deps["ai"] = "^5.0.113";
472
+ }
473
+ if (extractor === "audio-transcribe" || extractor === "video-transcribe") {
474
+ deps["ai"] = "^5.0.113";
475
+ }
476
+ if (extractor === "video-frames") {
477
+ deps["ai"] = "^5.0.113";
478
+ }
479
+ if (extractor === "file-text") {}
480
+ if (extractor === "file-docx") {
481
+ deps["mammoth"] = "^1.10.0";
482
+ }
483
+ if (extractor === "file-pptx") {
484
+ deps["jszip"] = "^3.10.1";
485
+ }
486
+ if (extractor === "file-xlsx") {
487
+ deps["xlsx"] = "^0.18.5";
488
+ }
489
+ return { deps, devDeps };
490
+ }
350
491
  function installCmd(pm) {
351
492
  if (pm === "bun")
352
493
  return "bun install";
@@ -460,9 +601,93 @@ var parseInitArgs = (args) => {
460
601
  }
461
602
  continue;
462
603
  }
604
+ if (a === "--rich-media") {
605
+ out.richMedia = true;
606
+ continue;
607
+ }
608
+ if (a === "--no-rich-media") {
609
+ out.richMedia = false;
610
+ continue;
611
+ }
612
+ if (a === "--extractors") {
613
+ const v = args[i + 1];
614
+ if (v) {
615
+ out.extractors = v.split(",").map((s) => s.trim()).filter(Boolean);
616
+ i++;
617
+ }
618
+ continue;
619
+ }
463
620
  }
464
621
  return out;
465
622
  };
623
+ var DEFAULT_RICH_MEDIA_EXTRACTORS = ["pdf-text-layer", "file-text"];
624
+ var EXTRACTOR_OPTIONS = [
625
+ {
626
+ group: "PDF",
627
+ value: "pdf-text-layer",
628
+ label: `pdf-text-layer (Fast/cheap extraction via PDF text layer)`,
629
+ hint: "recommended"
630
+ },
631
+ {
632
+ group: "PDF",
633
+ value: "pdf-llm",
634
+ label: `pdf-llm (LLM-based PDF extraction; higher cost)`
635
+ },
636
+ {
637
+ group: "PDF",
638
+ value: "pdf-ocr",
639
+ label: `pdf-ocr (OCR scanned PDFs; requires native binaries)`,
640
+ hint: "worker-only"
641
+ },
642
+ {
643
+ group: "Image",
644
+ value: "image-ocr",
645
+ label: `image-ocr (Extract text from images via vision LLM)`
646
+ },
647
+ {
648
+ group: "Image",
649
+ value: "image-caption-llm",
650
+ label: `image-caption-llm (Generate captions for images via vision LLM)`
651
+ },
652
+ {
653
+ group: "Audio",
654
+ value: "audio-transcribe",
655
+ label: `audio-transcribe (Speech-to-text transcription)`
656
+ },
657
+ {
658
+ group: "Video",
659
+ value: "video-transcribe",
660
+ label: `video-transcribe (Transcribe video audio track)`
661
+ },
662
+ {
663
+ group: "Video",
664
+ value: "video-frames",
665
+ label: `video-frames (Sample frames + analyze via vision LLM; requires ffmpeg)`,
666
+ hint: "worker-only"
667
+ },
668
+ {
669
+ group: "Files",
670
+ value: "file-text",
671
+ label: `file-text (Extract text/markdown/json/html from common text files)`,
672
+ hint: "recommended"
673
+ },
674
+ {
675
+ group: "Files",
676
+ value: "file-docx",
677
+ label: `file-docx (Extract text from .docx files)`
678
+ },
679
+ {
680
+ group: "Files",
681
+ value: "file-pptx",
682
+ label: `file-pptx (Extract text from .pptx slides)`
683
+ },
684
+ {
685
+ group: "Files",
686
+ value: "file-xlsx",
687
+ label: `file-xlsx (Extract tables from .xlsx spreadsheets)`
688
+ }
689
+ ];
690
+ var AVAILABLE_EXTRACTORS = new Set(EXTRACTOR_OPTIONS.map((o) => o.value));
466
691
  async function initCommand(args) {
467
692
  const root = await tryFindProjectRoot(process.cwd());
468
693
  if (!root) {
@@ -531,17 +756,71 @@ async function initCommand(args) {
531
756
  return;
532
757
  }
533
758
  const aliasBase = String(aliasAnswer).trim();
759
+ if (parsed.richMedia === false && (parsed.extractors ?? []).length > 0) {
760
+ throw new Error('Cannot use "--no-rich-media" together with "--extractors".');
761
+ }
762
+ const extractorsFromArgs = (parsed.extractors ?? []).filter((x) => AVAILABLE_EXTRACTORS.has(x)).sort();
763
+ const richMediaAnswer = extractorsFromArgs.length > 0 ? true : typeof parsed.richMedia === "boolean" ? parsed.richMedia : nonInteractive ? false : await confirm2({
764
+ message: "Enable rich media ingestion (PDF/images/audio/video/files)? This also enables multimodal image embeddings (you can change this later).",
765
+ initialValue: false
766
+ });
767
+ if (isCancel2(richMediaAnswer)) {
768
+ cancel2("Cancelled.");
769
+ return;
770
+ }
771
+ const richMediaEnabled = Boolean(richMediaAnswer);
772
+ const selectedExtractorsAnswer = richMediaEnabled || extractorsFromArgs.length > 0 ? nonInteractive ? extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS : await groupMultiselect({
773
+ message: "Select extractors to enable (space to toggle, enter to confirm)",
774
+ options: EXTRACTOR_OPTIONS.reduce((acc, opt) => {
775
+ acc[opt.group] ??= [];
776
+ acc[opt.group].push({
777
+ value: opt.value,
778
+ label: opt.label,
779
+ ...opt.hint ? { hint: opt.hint } : {}
780
+ });
781
+ return acc;
782
+ }, {}),
783
+ initialValues: extractorsFromArgs.length > 0 ? extractorsFromArgs : DEFAULT_RICH_MEDIA_EXTRACTORS,
784
+ required: false
785
+ }) : [];
786
+ if (isCancel2(selectedExtractorsAnswer)) {
787
+ cancel2("Cancelled.");
788
+ return;
789
+ }
790
+ const selectedExtractors = Array.from(new Set(Array.isArray(selectedExtractorsAnswer) ? selectedExtractorsAnswer : [])).sort();
534
791
  const selection = {
535
792
  installDir,
536
793
  storeAdapter: storeAdapterAnswer,
537
794
  projectRoot: root,
538
795
  registryRoot,
539
- aliasBase
796
+ aliasBase,
797
+ richMedia: richMediaEnabled ? {
798
+ enabled: true,
799
+ extractors: selectedExtractors
800
+ } : { enabled: false, extractors: [] }
540
801
  };
541
802
  await copyRegistryFiles(selection);
803
+ if (richMediaEnabled && selectedExtractors.length > 0) {
804
+ for (const extractor of selectedExtractors) {
805
+ await copyExtractorFiles({
806
+ projectRoot: root,
807
+ registryRoot,
808
+ installDir,
809
+ extractor,
810
+ yes: nonInteractive
811
+ });
812
+ }
813
+ }
542
814
  const pkg = await readPackageJson(root);
543
815
  const { deps, devDeps } = depsForAdapter(storeAdapterAnswer);
544
- const merged = mergeDeps(pkg, deps, devDeps);
816
+ const extractorDeps = {};
817
+ const extractorDevDeps = {};
818
+ for (const ex of selectedExtractors) {
819
+ const r = depsForExtractor(ex);
820
+ Object.assign(extractorDeps, r.deps);
821
+ Object.assign(extractorDevDeps, r.devDeps);
822
+ }
823
+ const merged = mergeDeps(pkg, { ...deps, ...extractorDeps }, { ...devDeps, ...extractorDevDeps });
545
824
  if (merged.changes.length > 0) {
546
825
  await writePackageJson(root, merged.pkg);
547
826
  }
@@ -550,7 +829,11 @@ async function initCommand(args) {
550
829
  storeAdapter: storeAdapterAnswer,
551
830
  aliasBase,
552
831
  version: CONFIG_VERSION,
553
- connectors: existing?.connectors ?? []
832
+ connectors: existing?.connectors ?? [],
833
+ extractors: Array.from(new Set([
834
+ ...existing?.extractors ?? [],
835
+ ...richMediaEnabled ? selectedExtractors : []
836
+ ])).sort()
554
837
  };
555
838
  await writeJsonFile(path5.join(root, CONFIG_FILE), config);
556
839
  const pm = await detectPackageManager(root);
@@ -564,6 +847,11 @@ async function initCommand(args) {
564
847
  `- Docs: ${path5.join(installDir, "unrag.md")}`,
565
848
  `- Config: unrag.config.ts`,
566
849
  `- Imports: ${aliasBase}/* and ${aliasBase}/config`,
850
+ "",
851
+ `- Rich media: ${richMediaEnabled ? "enabled" : "disabled"}`,
852
+ richMediaEnabled ? `- Embeddings: multimodal enabled (images can be embedded directly)` : `- Embeddings: text-only (no direct image embedding)`,
853
+ richMediaEnabled ? `- Extractors: ${selectedExtractors.length > 0 ? selectedExtractors.join(", ") : "none"}` : "",
854
+ richMediaEnabled ? ` Tip: you can tweak extractors + assetProcessing flags in unrag.config.ts later.` : ` Tip: re-run \`unrag init --rich-media\` (or edit unrag.config.ts) to enable rich media later.`,
567
855
  isNext ? tsconfigResult.changed ? `- Next.js: updated ${tsconfigResult.file} (added aliases)` : `- Next.js: no tsconfig changes needed` : `- Next.js: not detected`,
568
856
  "",
569
857
  merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
@@ -592,6 +880,20 @@ function docsUrl(siteRelativePath) {
592
880
  var CONFIG_FILE2 = "unrag.json";
593
881
  var __filename3 = fileURLToPath2(import.meta.url);
594
882
  var __dirname3 = path6.dirname(__filename3);
883
+ var AVAILABLE_EXTRACTORS2 = [
884
+ "pdf-llm",
885
+ "pdf-text-layer",
886
+ "pdf-ocr",
887
+ "image-ocr",
888
+ "image-caption-llm",
889
+ "audio-transcribe",
890
+ "video-transcribe",
891
+ "video-frames",
892
+ "file-text",
893
+ "file-docx",
894
+ "file-pptx",
895
+ "file-xlsx"
896
+ ];
595
897
  var parseAddArgs = (args) => {
596
898
  const out = {};
597
899
  for (let i = 0;i < args.length; i++) {
@@ -600,8 +902,17 @@ var parseAddArgs = (args) => {
600
902
  out.yes = true;
601
903
  continue;
602
904
  }
603
- if (!out.connector && a && !a.startsWith("-")) {
604
- out.connector = a;
905
+ if (!out.kind && a && !a.startsWith("-")) {
906
+ if (a === "extractor") {
907
+ out.kind = "extractor";
908
+ continue;
909
+ }
910
+ out.kind = "connector";
911
+ out.name = a;
912
+ continue;
913
+ }
914
+ if (out.kind === "extractor" && !out.name && a && !a.startsWith("-")) {
915
+ out.name = a;
605
916
  continue;
606
917
  }
607
918
  }
@@ -613,23 +924,24 @@ async function addCommand(args) {
613
924
  throw new Error("Could not find a project root (no package.json found).");
614
925
  }
615
926
  const parsed = parseAddArgs(args);
616
- const connector = parsed.connector;
617
- if (!connector) {
618
- outro2(`Usage: unrag add <connector>
619
-
620
- Available connectors: notion`);
621
- return;
622
- }
623
- if (connector !== "notion") {
624
- outro2(`Unknown connector: ${connector}
625
-
626
- Available connectors: notion`);
927
+ const kind = parsed.kind ?? "connector";
928
+ const name = parsed.name;
929
+ if (!name) {
930
+ outro2([
931
+ "Usage:",
932
+ " unrag add <connector>",
933
+ " unrag add extractor <name>",
934
+ "",
935
+ "Available connectors: notion",
936
+ `Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`
937
+ ].join(`
938
+ `));
627
939
  return;
628
940
  }
629
941
  const configPath = path6.join(root, CONFIG_FILE2);
630
942
  const config = await readJsonFile(configPath);
631
943
  if (!config?.installDir) {
632
- throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag init\` first.`);
944
+ throw new Error(`Missing ${CONFIG_FILE2}. Run \`unrag@latest init\` first.`);
633
945
  }
634
946
  const cliPackageRoot = await findUp(__dirname3, "package.json");
635
947
  if (!cliPackageRoot) {
@@ -637,29 +949,70 @@ Available connectors: notion`);
637
949
  }
638
950
  const registryRoot = path6.join(cliPackageRoot, "registry");
639
951
  const nonInteractive = parsed.yes || !process.stdin.isTTY;
640
- await copyConnectorFiles({
952
+ const pkg = await readPackageJson(root);
953
+ if (kind === "connector") {
954
+ const connector = name;
955
+ if (connector !== "notion") {
956
+ outro2(`Unknown connector: ${name}
957
+
958
+ Available connectors: notion`);
959
+ return;
960
+ }
961
+ await copyConnectorFiles({
962
+ projectRoot: root,
963
+ registryRoot,
964
+ installDir: config.installDir,
965
+ connector,
966
+ yes: nonInteractive
967
+ });
968
+ const { deps: deps2, devDeps: devDeps2 } = depsForConnector(connector);
969
+ const merged2 = mergeDeps(pkg, deps2, devDeps2);
970
+ if (merged2.changes.length > 0) {
971
+ await writePackageJson(root, merged2.pkg);
972
+ }
973
+ const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
974
+ await writeJsonFile(configPath, { ...config, connectors });
975
+ outro2([
976
+ `Installed connector: ${connector}.`,
977
+ "",
978
+ `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
979
+ `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
980
+ "",
981
+ merged2.changes.length > 0 ? `Added deps: ${merged2.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
982
+ nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
983
+ ].filter(Boolean).join(`
984
+ `));
985
+ return;
986
+ }
987
+ const extractor = name;
988
+ if (!extractor || !AVAILABLE_EXTRACTORS2.includes(extractor)) {
989
+ outro2(`Unknown extractor: ${name}
990
+
991
+ Available extractors: ${AVAILABLE_EXTRACTORS2.join(", ")}`);
992
+ return;
993
+ }
994
+ await copyExtractorFiles({
641
995
  projectRoot: root,
642
996
  registryRoot,
643
997
  installDir: config.installDir,
644
- connector,
998
+ extractor,
645
999
  yes: nonInteractive
646
1000
  });
647
- const pkg = await readPackageJson(root);
648
- const { deps, devDeps } = depsForConnector(connector);
1001
+ const { deps, devDeps } = depsForExtractor(extractor);
649
1002
  const merged = mergeDeps(pkg, deps, devDeps);
650
1003
  if (merged.changes.length > 0) {
651
1004
  await writePackageJson(root, merged.pkg);
652
1005
  }
653
- const connectors = Array.from(new Set([...config.connectors ?? [], connector])).sort();
654
- await writeJsonFile(configPath, { ...config, connectors });
1006
+ const extractors = Array.from(new Set([...config.extractors ?? [], extractor])).sort();
1007
+ await writeJsonFile(configPath, { ...config, extractors });
655
1008
  outro2([
656
- `Installed connector: ${connector}.`,
1009
+ `Installed extractor: ${extractor}.`,
657
1010
  "",
658
- `- Code: ${path6.join(config.installDir, "connectors", connector)}`,
659
- `- Docs: ${docsUrl(`/docs/connectors/${connector}`)}`,
1011
+ `- Code: ${path6.join(config.installDir, "extractors", extractor)}`,
660
1012
  "",
661
1013
  merged.changes.length > 0 ? `Added deps: ${merged.changes.map((c) => c.name).join(", ")}` : "Added deps: none",
662
- nonInteractive ? "" : "Tip: keep NOTION_TOKEN server-side only (env var)."
1014
+ "",
1015
+ `Next: import the extractor and pass it to createContextEngine({ extractors: [...] }).`
663
1016
  ].filter(Boolean).join(`
664
1017
  `));
665
1018
  }
@@ -686,10 +1039,15 @@ function renderHelp() {
686
1039
  " --store <adapter> drizzle | prisma | raw-sql",
687
1040
  " --dir <path> Install directory (alias: --install-dir)",
688
1041
  " --alias <@name> Import alias base (e.g. @unrag)",
1042
+ " --rich-media Enable rich media setup (also enables multimodal embeddings)",
1043
+ " --no-rich-media Disable rich media setup",
1044
+ " --extractors <list> Comma-separated extractors (implies --rich-media)",
689
1045
  "",
690
1046
  "Examples:",
691
- " bunx unrag init",
692
- " bunx unrag init --yes --store drizzle --dir lib/unrag --alias @unrag",
1047
+ " bunx unrag@latest init",
1048
+ " bunx unrag@latest init --yes --store drizzle --dir lib/unrag --alias @unrag",
1049
+ " bunx unrag@latest init --yes --rich-media",
1050
+ " bunx unrag@latest init --yes --extractors pdf-text-layer,file-text",
693
1051
  " bunx unrag add notion --yes",
694
1052
  "",
695
1053
  "Docs:",
package/package.json CHANGED
@@ -1,10 +1,12 @@
1
1
  {
2
2
  "name": "unrag",
3
3
  "type": "module",
4
+ "repository": "https://github.com/BetterStacks/unrag",
5
+ "homepage": "https://unrag.dev",
4
6
  "bin": {
5
7
  "unrag": "./dist/cli/index.js"
6
8
  },
7
- "version": "0.2.2",
9
+ "version": "0.2.4",
8
10
  "private": false,
9
11
  "license": "Apache-2.0",
10
12
  "devDependencies": {