unrag 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -396,6 +396,10 @@ async function copyRegistryFiles(selection) {
396
396
  src: path2.join(selection.registryRoot, "core/retrieve.ts"),
397
397
  dest: path2.join(installBaseAbs, "core/retrieve.ts")
398
398
  },
399
+ {
400
+ src: path2.join(selection.registryRoot, "core/rerank.ts"),
401
+ dest: path2.join(installBaseAbs, "core/rerank.ts")
402
+ },
399
403
  {
400
404
  src: path2.join(selection.registryRoot, "embedding/_shared.ts"),
401
405
  dest: path2.join(installBaseAbs, "embedding/_shared.ts")
@@ -707,6 +711,9 @@ function isPresetPayloadV1(x) {
707
711
  return false;
708
712
  if (!Array.isArray(o.modules.extractors) || !Array.isArray(o.modules.connectors))
709
713
  return false;
714
+ if ("batteries" in o.modules && o.modules.batteries != null && !Array.isArray(o.modules.batteries)) {
715
+ return false;
716
+ }
710
717
  return true;
711
718
  }
712
719
  function toPresetUrl(input) {
@@ -879,6 +886,7 @@ function depsForBattery(battery) {
879
886
  deps["ai"] = "^6.0.3";
880
887
  deps["@ai-sdk/cohere"] = "^3.0.1";
881
888
  }
889
+ if (battery === "eval") {}
882
890
  return { deps, devDeps };
883
891
  }
884
892
  function installCmd(pm) {
@@ -983,6 +991,314 @@ async function patchTsconfigPaths(params) {
983
991
  return { changed: true, file: configFile };
984
992
  }
985
993
 
994
+ // cli/commands/init.ts
995
+ import { writeFile as writeFile5 } from "node:fs/promises";
996
+
997
+ // cli/lib/evalBatteryScaffold.ts
998
/**
 * Sample eval dataset scaffolded for the "eval" battery.
 * Serialised with JSON.stringify, so key order here is the key order on disk.
 */
var EVAL_SAMPLE_DATASET_V1 = {
  version: "1",
  id: "sample",
  description: "Tiny dataset to validate retrieval changes.",

  // Run defaults carried inside the dataset file itself.
  defaults: {
    topK: 10,
    scopePrefix: "eval:sample:",
    mode: "retrieve",
    thresholds: {
      min: { recallAtK: 0.75 }
    }
  },

  // Two short documents, each targeted by exactly one query below.
  documents: [
    {
      sourceId: "eval:sample:doc:refund-policy",
      content: "Refunds are available within 30 days of purchase, provided you have a receipt."
    },
    {
      sourceId: "eval:sample:doc:contact-support",
      content: "Contact support by emailing support@example.com. Response times are typically under 24 hours."
    }
  ],

  queries: [
    {
      id: "q_refund_window",
      query: "How long do I have to request a refund?",
      relevant: {
        sourceIds: ["eval:sample:doc:refund-policy"]
      }
    },
    {
      id: "q_contact_support",
      query: "How do I contact support?",
      relevant: {
        sourceIds: ["eval:sample:doc:contact-support"]
      }
    }
  ]
};

/** Default contents for the scaffolded eval config file. */
var EVAL_CONFIG_DEFAULT = {
  thresholds: {
    min: { recallAtK: 0.75 }
  },
  cleanup: "none",
  ingest: true
};

/** package.json scripts offered when the eval battery is scaffolded. */
var EVAL_PACKAGE_JSON_SCRIPTS = {
  "unrag:eval": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json",
  "unrag:eval:ci": "bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci"
};
1040
/**
 * Render the TypeScript source of the generated eval runner script.
 *
 * The generated file imports `../unrag.config` and `<installDir>/eval` relative
 * to its own location, so the install directory is turned into a `../`-prefixed
 * POSIX import specifier.
 *
 * @param opts - Render options.
 * @param opts.installDir - Project-relative directory holding the installed
 *   unrag code. Backslashes, leading `./` and trailing slashes are normalised
 *   so the emitted import specifier is clean (e.g. `./lib/unrag/` -> `../lib/unrag`).
 * @returns The complete script source as a string.
 */
function renderEvalRunnerScript(opts) {
  const installDirPosix = opts.installDir
    .replace(/\\/g, "/") // Windows separators -> POSIX
    .replace(/\/+$/, "") // drop trailing slashes
    .replace(/^(?:\.\/+)+/, ""); // drop leading "./"
  // An install dir that is the project root itself maps to plain "..".
  const installImportBase = installDirPosix === "" || installDirPosix === "." ? ".." : `../${installDirPosix}`;
  // NOTE: inside the template, backticks, `${` and backslashes are escaped so
  // that they appear literally in the generated file.
  return `/**
 * Unrag eval runner entrypoint (generated).
 *
 * You own this file — customize it freely.
 */

import path from "node:path";
import { access, readFile } from "node:fs/promises";

import { createUnragEngine } from "../unrag.config";
import {
  runEval,
  readEvalReportFromFile,
  writeEvalReport,
  writeEvalSummaryMd,
  diffEvalReports,
  writeEvalDiffJson,
  writeEvalDiffMd,
  type EvalMode,
  type EvalThresholds,
  type EvalCleanupPolicy,
} from "${installImportBase}/eval";

type CliArgs = {
  dataset?: string;
  baseline?: string;
  outputDir?: string;
  mode?: EvalMode;
  topK?: number;
  rerankTopK?: number;
  scopePrefix?: string;
  ingest?: boolean;
  cleanup?: EvalCleanupPolicy;
  thresholds?: Partial<EvalThresholds>;
  ci?: boolean;
  allowAssets?: boolean;
  allowNonEvalPrefix?: boolean;
  yes?: boolean;
  includeNdcg?: boolean;
};

async function fileExists(p: string): Promise<boolean> {
  try {
    await access(p);
    return true;
  } catch {
    return false;
  }
}

async function loadEnvFilesBestEffort(projectRoot: string) {
  const nodeEnv = process.env.NODE_ENV ?? "development";
  const candidates = [
    ".env",
    ".env.local",
    \`.env.\${nodeEnv}\`,
    \`.env.\${nodeEnv}.local\`,
  ];
  for (const rel of candidates) {
    const abs = path.join(projectRoot, rel);
    if (!(await fileExists(abs))) continue;
    try {
      const raw = await readFile(abs, "utf8");
      for (const line of raw.split(/\\r?\\n/)) {
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith("#")) continue;
        const eq = trimmed.indexOf("=");
        if (eq < 0) continue;
        const key = trimmed.slice(0, eq).trim();
        const value = trimmed.slice(eq + 1).trim().replace(/^"|"$/g, "");
        if (!key) continue;
        if (process.env[key] == null) process.env[key] = value;
      }
    } catch {
      // ignore
    }
  }
}

function parseThresholdExpr(expr: string): Partial<EvalThresholds> {
  // Accept both:
  // - "min.recallAtK=0.75"
  // - "recallAtK=0.75" (shorthand for min)
  const [lhsRaw, rhsRaw] = String(expr ?? "").split("=");
  const lhs = (lhsRaw ?? "").trim();
  const rhs = Number(String(rhsRaw ?? "").trim());
  if (!lhs || Number.isNaN(rhs)) return {};

  const parts = lhs.split(".").map((p) => p.trim()).filter(Boolean);
  const level = parts.length === 2 ? parts[0] : "min";
  const metric = parts.length === 2 ? parts[1] : parts[0];
  if (level !== "min") return {};

  const allowed = new Set(["hitAtK", "precisionAtK", "recallAtK", "mrrAtK", "ndcgAtK"]);
  if (!allowed.has(metric)) return {};
  return { min: { [metric]: rhs } } as any;
}

function mergeThresholds(
  a: Partial<EvalThresholds> | undefined,
  b: Partial<EvalThresholds> | undefined
): Partial<EvalThresholds> | undefined {
  if (!a && !b) return undefined;
  const out: any = { ...(a ?? {}) };
  if (b?.min) out.min = { ...(out.min ?? {}), ...(b.min as any) };
  return out;
}

function parseArgs(argv: string[]): CliArgs {
  const out: CliArgs = {};
  const thresholds: Partial<EvalThresholds>[] = [];

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--dataset") out.dataset = argv[++i];
    else if (a === "--baseline") out.baseline = argv[++i];
    else if (a === "--outputDir" || a === "--output-dir") out.outputDir = argv[++i];
    else if (a === "--mode") out.mode = argv[++i] as EvalMode;
    else if (a === "--topK" || a === "--top-k") out.topK = Number(argv[++i]);
    else if (a === "--rerankTopK" || a === "--rerank-top-k") out.rerankTopK = Number(argv[++i]);
    else if (a === "--scopePrefix" || a === "--scope-prefix") out.scopePrefix = argv[++i];
    else if (a === "--no-ingest") out.ingest = false;
    else if (a === "--cleanup") out.cleanup = argv[++i] as EvalCleanupPolicy;
    else if (a === "--threshold") thresholds.push(parseThresholdExpr(argv[++i] ?? ""));
    else if (a === "--ci") out.ci = true;
    else if (a === "--allow-assets") out.allowAssets = true;
    else if (a === "--allow-non-eval-prefix" || a === "--allow-custom-prefix") out.allowNonEvalPrefix = true;
    else if (a === "--yes" || a === "-y") out.yes = true;
    else if (a === "--include-ndcg") out.includeNdcg = true;
    else if (a === "--help" || a === "-h") {
      printHelp();
      process.exit(0);
    }
  }

  for (const t of thresholds) out.thresholds = mergeThresholds(out.thresholds ?? {}, t);
  return out;
}

function printHelp() {
  console.log(
    [
      "unrag-eval — retrieval eval harness",
      "",
      "Usage:",
      " bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json",
      "",
      "Options:",
      " --dataset <path> Dataset JSON path (required)",
      " --baseline <report.json> Baseline report for diffing",
      " --output-dir <dir> Output dir (default: .unrag/eval/runs/<ts>-<datasetId>)",
      " --mode retrieve|retrieve+rerank Override mode",
      " --top-k <n> Override topK",
      " --rerank-top-k <n> In rerank mode, retrieve N candidates before reranking (default: topK*3)",
      " --scope-prefix <prefix> Override scopePrefix",
      " --no-ingest Skip dataset document ingest",
      " --cleanup none|on-success|always Cleanup policy when ingesting",
      " --threshold <k=v> Repeatable thresholds (e.g. min.recallAtK=0.75)",
      " --ci CI mode (non-interactive)",
      " --yes, -y Allow dangerous operations when explicitly enabled",
      " --allow-assets Allow documents[].assets ingestion (advanced)",
      " --allow-custom-prefix Allow scopePrefix outside eval:* (dangerous)",
      " --include-ndcg Compute nDCG@k (optional)",
    ].join("\\n")
  );
}

async function readConfigFile(projectRoot: string): Promise<any | null> {
  const abs = path.join(projectRoot, ".unrag/eval/config.json");
  if (!(await fileExists(abs))) return null;
  const raw = await readFile(abs, "utf8");
  try {
    return JSON.parse(raw);
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    throw new Error(\`Failed to parse .unrag/eval/config.json: \${msg}\`);
  }
}

function sanitizeMode(v: any): EvalMode | undefined {
  if (v === "retrieve" || v === "retrieve+rerank") return v;
  return undefined;
}

function sanitizeCleanup(v: any): EvalCleanupPolicy | undefined {
  if (v === "none" || v === "on-success" || v === "always") return v;
  return undefined;
}

async function main() {
  const projectRoot = path.join(process.cwd());
  await loadEnvFilesBestEffort(projectRoot);

  const cli = parseArgs(process.argv.slice(2));
  const cfg = await readConfigFile(projectRoot);

  const datasetPath = cli.dataset ?? cfg?.dataset ?? ".unrag/eval/datasets/sample.json";
  if (!datasetPath) throw new Error("--dataset is required");

  const engine = createUnragEngine();

  const mode = sanitizeMode(cli.mode ?? cfg?.mode) ?? undefined;
  const cleanup = sanitizeCleanup(cli.cleanup ?? cfg?.cleanup) ?? undefined;

  const result = await runEval({
    engine,
    datasetPath,
    mode,
    // Number.isFinite rejects the NaN produced by a missing or non-numeric flag value.
    topK: Number.isFinite(cli.topK) ? cli.topK : undefined,
    rerankTopK: Number.isFinite(cli.rerankTopK) ? cli.rerankTopK : undefined,
    scopePrefix: typeof cli.scopePrefix === "string" ? cli.scopePrefix : undefined,
    ingest: typeof cli.ingest === "boolean" ? cli.ingest : (typeof cfg?.ingest === "boolean" ? cfg.ingest : undefined),
    cleanup,
    thresholds: mergeThresholds(cfg?.thresholds, cli.thresholds),
    ci: Boolean(cli.ci),
    allowAssets: Boolean(cli.allowAssets),
    allowNonEvalPrefix: Boolean(cli.allowNonEvalPrefix),
    yes: Boolean(cli.yes),
    includeNdcg: Boolean(cli.includeNdcg),
  });

  const outputDir = cli.outputDir ?? cfg?.outputDir ?? result.outputDir;

  const reportPath = await writeEvalReport(outputDir, result.report);
  const summaryPath = await writeEvalSummaryMd(outputDir, result.report);

  let diffPaths: { json: string; md: string } | null = null;
  const baselinePath = cli.baseline ?? cfg?.baseline;
  if (baselinePath) {
    const baseline = await readEvalReportFromFile(baselinePath);
    const diff = diffEvalReports({ baseline, candidate: result.report, baselinePath, candidatePath: reportPath });
    const diffJson = await writeEvalDiffJson(outputDir, diff);
    const diffMd = await writeEvalDiffMd(outputDir, diff);
    diffPaths = { json: diffJson, md: diffMd };
  }

  console.log(
    [
      \`[unrag:eval] Wrote report: \${reportPath}\`,
      \`[unrag:eval] Wrote summary: \${summaryPath}\`,
      diffPaths ? \`[unrag:eval] Wrote diff: \${diffPaths.json} (+ \${diffPaths.md})\` : "",
      result.thresholdFailures.length > 0
        ? \`[unrag:eval] Threshold failures:\\n- \${result.thresholdFailures.join("\\n- ")}\`
        : \`[unrag:eval] Thresholds: pass\`,
    ]
      .filter(Boolean)
      .join("\\n")
  );

  process.exitCode = result.exitCode;
}

main().catch((err) => {
  const msg = err instanceof Error ? err.stack ?? err.message : String(err);
  console.error(\`[unrag:eval] Error: \${msg}\`);
  process.exitCode = 2;
});
`;
}
1301
+
986
1302
  // cli/commands/init.ts
987
1303
  var CONFIG_FILE = "unrag.json";
988
1304
  var CONFIG_VERSION = 1;
@@ -1069,6 +1385,7 @@ var parseInitArgs = (args) => {
1069
1385
  };
1070
1386
  var toExtractors = (xs) => (Array.isArray(xs) ? xs : []).map((s) => String(s).trim()).filter(Boolean);
1071
1387
  var toConnectors = (xs) => (Array.isArray(xs) ? xs : []).map((s) => String(s).trim()).filter(Boolean);
1388
/**
 * Normalise a battery list: every entry is stringified and trimmed, and
 * entries that end up empty are dropped. Anything that is not an array
 * yields an empty list.
 */
var toBatteries = (xs) => {
  if (!Array.isArray(xs)) {
    return [];
  }
  return xs.reduce((ids, raw) => {
    const id = String(raw).trim();
    if (id) {
      ids.push(id);
    }
    return ids;
  }, []);
};
1072
1389
  async function initCommand(args) {
1073
1390
  const root = await tryFindProjectRoot(process.cwd());
1074
1391
  if (!root) {
@@ -1291,7 +1608,34 @@ async function initCommand(args) {
1291
1608
  Object.assign(connectorDeps, r.deps);
1292
1609
  Object.assign(connectorDevDeps, r.devDeps);
1293
1610
  }
1294
- const merged = mergeDeps(pkg, { ...deps, ...embeddingDeps.deps, ...extractorDeps, ...connectorDeps }, { ...devDeps, ...embeddingDeps.devDeps, ...extractorDevDeps, ...connectorDevDeps });
1611
+ const batteriesFromPreset = preset ? Array.from(new Set(toBatteries(preset.modules?.batteries))).sort() : [];
1612
+ const availableBatteryIds = new Set((manifest.batteries ?? []).filter((b) => b.status === "available").map((b) => String(b.id)));
1613
+ if (preset) {
1614
+ const unknown = batteriesFromPreset.filter((b) => !availableBatteryIds.has(b));
1615
+ if (unknown.length > 0) {
1616
+ throw new Error(`Preset contains unknown/unavailable batteries: ${unknown.join(", ")}`);
1617
+ }
1618
+ }
1619
+ if (batteriesFromPreset.length > 0) {
1620
+ for (const battery of batteriesFromPreset) {
1621
+ await copyBatteryFiles({
1622
+ projectRoot: root,
1623
+ registryRoot,
1624
+ installDir,
1625
+ battery,
1626
+ yes: nonInteractive,
1627
+ overwrite: overwritePolicy
1628
+ });
1629
+ }
1630
+ }
1631
+ const batteryDeps = {};
1632
+ const batteryDevDeps = {};
1633
+ for (const b of batteriesFromPreset) {
1634
+ const r = depsForBattery(b);
1635
+ Object.assign(batteryDeps, r.deps);
1636
+ Object.assign(batteryDevDeps, r.devDeps);
1637
+ }
1638
+ const merged = mergeDeps(pkg, { ...deps, ...embeddingDeps.deps, ...extractorDeps, ...connectorDeps, ...batteryDeps }, { ...devDeps, ...embeddingDeps.devDeps, ...extractorDevDeps, ...connectorDevDeps, ...batteryDevDeps });
1295
1639
  if (merged.changes.length > 0) {
1296
1640
  await writePackageJson(root, merged.pkg);
1297
1641
  if (!noInstall) {
@@ -1308,9 +1652,42 @@ async function initCommand(args) {
1308
1652
  extractors: Array.from(new Set([
1309
1653
  ...existing?.extractors ?? [],
1310
1654
  ...richMediaEnabled ? selectedExtractors : []
1311
- ])).sort()
1655
+ ])).sort(),
1656
+ batteries: Array.from(new Set([...existing?.batteries ?? [], ...batteriesFromPreset])).sort()
1312
1657
  };
1313
1658
  await writeJsonFile(path6.join(root, CONFIG_FILE), config);
1659
+ const writeTextFile = async (absPath, content) => {
1660
+ await ensureDir(path6.dirname(absPath));
1661
+ await writeFile5(absPath, content, "utf8");
1662
+ };
1663
+ const writeIfMissing = async (absPath, content) => {
1664
+ if (await exists(absPath))
1665
+ return false;
1666
+ await writeTextFile(absPath, content);
1667
+ return true;
1668
+ };
1669
+ if (batteriesFromPreset.includes("eval")) {
1670
+ const datasetAbs = path6.join(root, ".unrag/eval/datasets/sample.json");
1671
+ const evalConfigAbs = path6.join(root, ".unrag/eval/config.json");
1672
+ const scriptAbs = path6.join(root, "scripts/unrag-eval.ts");
1673
+ await writeIfMissing(datasetAbs, JSON.stringify(EVAL_SAMPLE_DATASET_V1, null, 2) + `
1674
+ `);
1675
+ await writeIfMissing(evalConfigAbs, JSON.stringify(EVAL_CONFIG_DEFAULT, null, 2) + `
1676
+ `);
1677
+ await writeIfMissing(scriptAbs, renderEvalRunnerScript({ installDir }));
1678
+ const pkg2 = await readPackageJson(root);
1679
+ const existingScripts = pkg2.scripts ?? {};
1680
+ const toAdd = {};
1681
+ for (const [name, cmd] of Object.entries(EVAL_PACKAGE_JSON_SCRIPTS)) {
1682
+ if (!(name in existingScripts)) {
1683
+ toAdd[name] = cmd;
1684
+ }
1685
+ }
1686
+ if (Object.keys(toAdd).length > 0) {
1687
+ pkg2.scripts = { ...existingScripts, ...toAdd };
1688
+ await writePackageJson(root, pkg2);
1689
+ }
1690
+ }
1314
1691
  const pm = await detectPackageManager(root);
1315
1692
  const installLine = merged.changes.length === 0 ? "Dependencies already satisfied." : noInstall ? `Next: run \`${installCmd(pm)}\`` : "Dependencies installed.";
1316
1693
  const isNext = Boolean((merged.pkg.dependencies ?? {})["next"]) || Boolean((merged.pkg.devDependencies ?? {})["next"]);
@@ -1437,12 +1814,92 @@ async function initCommand(args) {
1437
1814
  }
1438
1815
 
1439
1816
  // cli/commands/add.ts
1440
- import { outro as outro2 } from "@clack/prompts";
1817
+ import { cancel as cancel3, confirm as confirm3, isCancel as isCancel3, outro as outro2, select as select2, text as text2 } from "@clack/prompts";
1818
+ import { writeFile as writeFile6 } from "node:fs/promises";
1441
1819
  import path7 from "node:path";
1442
1820
  import { fileURLToPath as fileURLToPath2 } from "node:url";
1443
1821
  var CONFIG_FILE2 = "unrag.json";
1444
1822
  var __filename3 = fileURLToPath2(import.meta.url);
1445
1823
  var __dirname3 = path7.dirname(__filename3);
1824
/**
 * Write `content` to `absPath` as UTF-8, calling `ensureDir` on the parent
 * directory first.
 */
var writeTextFile = async (absPath, content) => {
  const parentDir = path7.dirname(absPath);
  await ensureDir(parentDir);
  await writeFile6(absPath, content, "utf8");
};
1828
/**
 * Decide whether a scaffold file may be written.
 *
 * - Missing file: always write.
 * - Existing file in non-interactive mode: never overwrite.
 * - Existing file in interactive mode: ask for confirmation (default "no");
 *   cancelling the prompt counts as "no".
 */
var shouldWriteFile = async (absPath, projectRoot, nonInteractive) => {
  const alreadyPresent = await exists(absPath);
  if (!alreadyPresent) {
    return true;
  }
  if (nonInteractive) {
    return false;
  }
  const displayPath = path7.relative(projectRoot, absPath);
  const answer = await confirm3({
    message: `Overwrite ${displayPath}?`,
    initialValue: false
  });
  if (!isCancel3(answer)) {
    return Boolean(answer);
  }
  cancel3("Cancelled.");
  return false;
};
1843
/**
 * Add scripts to package.json, resolving name clashes with existing scripts.
 *
 * In non-interactive mode every clashing script is skipped. Otherwise the user
 * is asked per clash whether to keep the existing script, overwrite it, or add
 * the new one under a different name. Cancelling any prompt aborts without
 * writing anything.
 *
 * @param args.projectRoot - Passed through to `writePackageJson`.
 * @param args.pkg - Parsed package.json; its `scripts` field is replaced when
 *   something is added.
 * @param args.scripts - Desired scripts, name -> command.
 * @param args.nonInteractive - When truthy, never prompt.
 * @returns `{ added, kept }`: names written, and clashing names left untouched.
 */
var addPackageJsonScripts = async (args) => {
  const existing = args.pkg.scripts ?? {};
  const desired = args.scripts;
  const nonInteractive = args.nonInteractive;
  const conflicting = Object.keys(desired).filter((name) => name in existing);
  const toAdd = { ...desired };

  // Shared exit for a cancelled prompt: nothing added, everything reported as kept.
  const abort = () => {
    cancel3("Cancelled.");
    return { added: [], kept: Object.keys(desired) };
  };

  if (nonInteractive) {
    for (const name of conflicting) {
      delete toAdd[name];
    }
  } else {
    for (const scriptName of conflicting) {
      const action = await select2({
        message: `Script "${scriptName}" already exists. What would you like to do?`,
        options: [
          { value: "keep", label: "Keep existing", hint: existing[scriptName] },
          { value: "overwrite", label: "Overwrite", hint: desired[scriptName] },
          { value: "rename", label: "Add with different name", hint: `${scriptName}:new` }
        ],
        initialValue: "keep"
      });
      if (isCancel3(action)) {
        return abort();
      }
      if (action === "keep") {
        delete toAdd[scriptName];
        continue;
      }
      if (action !== "rename") {
        // "overwrite": leave the entry in toAdd so it replaces the existing script.
        continue;
      }
      const newName = await text2({
        message: `New script name for ${scriptName}`,
        initialValue: `${scriptName}:new`,
        validate: (v) => {
          const candidate = String(v).trim();
          if (!candidate) {
            return "Script name is required";
          }
          if (candidate in existing || candidate in toAdd) {
            return "Script name already exists";
          }
          return;
        }
      });
      if (isCancel3(newName)) {
        return abort();
      }
      // Move the command from the clashing name to the chosen one.
      const renamedTo = String(newName).trim();
      const command = toAdd[scriptName];
      delete toAdd[scriptName];
      toAdd[renamedTo] = command;
    }
  }

  const added = Object.keys(toAdd);
  if (added.length > 0) {
    args.pkg.scripts = { ...existing, ...toAdd };
    await writePackageJson(args.projectRoot, args.pkg);
  }
  const kept = conflicting.filter((name) => !(name in toAdd));
  return { added, kept };
};
1446
1903
  var parseAddArgs = (args) => {
1447
1904
  const out = {};
1448
1905
  for (let i = 0;i < args.length; i++) {
@@ -1539,6 +1996,339 @@ Available batteries: ${Array.from(availableBatteries).join(", ")}`);
1539
1996
  }
1540
1997
  const batteries = Array.from(new Set([...config.batteries ?? [], battery])).sort();
1541
1998
  await writeJsonFile(configPath, { ...config, batteries });
1999
+ if (battery === "eval") {
2000
+ const datasetAbs = path7.join(root, ".unrag/eval/datasets/sample.json");
2001
+ const configAbs = path7.join(root, ".unrag/eval/config.json");
2002
+ const scriptAbs = path7.join(root, "scripts/unrag-eval.ts");
2003
+ const sampleDataset = {
2004
+ version: "1",
2005
+ id: "sample",
2006
+ description: "Tiny dataset to validate retrieval changes.",
2007
+ defaults: {
2008
+ topK: 10,
2009
+ scopePrefix: "eval:sample:",
2010
+ mode: "retrieve",
2011
+ thresholds: { min: { recallAtK: 0.75 } }
2012
+ },
2013
+ documents: [
2014
+ {
2015
+ sourceId: "eval:sample:doc:refund-policy",
2016
+ content: "Refunds are available within 30 days of purchase, provided you have a receipt."
2017
+ },
2018
+ {
2019
+ sourceId: "eval:sample:doc:contact-support",
2020
+ content: "Contact support by emailing support@example.com. Response times are typically under 24 hours."
2021
+ }
2022
+ ],
2023
+ queries: [
2024
+ {
2025
+ id: "q_refund_window",
2026
+ query: "How long do I have to request a refund?",
2027
+ relevant: { sourceIds: ["eval:sample:doc:refund-policy"] }
2028
+ },
2029
+ {
2030
+ id: "q_contact_support",
2031
+ query: "How do I contact support?",
2032
+ relevant: { sourceIds: ["eval:sample:doc:contact-support"] }
2033
+ }
2034
+ ]
2035
+ };
2036
+ const evalConfig = {
2037
+ thresholds: { min: { recallAtK: 0.75 } },
2038
+ cleanup: "none",
2039
+ ingest: true
2040
+ };
2041
+ const installImportBase = `../${config.installDir.replace(/\\/g, "/")}`;
2042
+ const script = `/**
2043
+ * Unrag eval runner entrypoint (generated).
2044
+ *
2045
+ * You own this file — customize it freely.
2046
+ */
2047
+
2048
+ import path from "node:path";
2049
+ import { access, readFile } from "node:fs/promises";
2050
+
2051
+ import { createUnragEngine } from "../unrag.config";
2052
+ import {
2053
+ runEval,
2054
+ readEvalReportFromFile,
2055
+ writeEvalReport,
2056
+ writeEvalSummaryMd,
2057
+ diffEvalReports,
2058
+ writeEvalDiffJson,
2059
+ writeEvalDiffMd,
2060
+ type EvalMode,
2061
+ type EvalThresholds,
2062
+ type EvalCleanupPolicy,
2063
+ } from "${installImportBase}/eval";
2064
+
2065
+ type CliArgs = {
2066
+ dataset?: string;
2067
+ baseline?: string;
2068
+ outputDir?: string;
2069
+ mode?: EvalMode;
2070
+ topK?: number;
2071
+ rerankTopK?: number;
2072
+ scopePrefix?: string;
2073
+ ingest?: boolean;
2074
+ cleanup?: EvalCleanupPolicy;
2075
+ thresholds?: Partial<EvalThresholds>;
2076
+ ci?: boolean;
2077
+ allowAssets?: boolean;
2078
+ allowNonEvalPrefix?: boolean;
2079
+ yes?: boolean;
2080
+ includeNdcg?: boolean;
2081
+ };
2082
+
2083
+ async function fileExists(p: string): Promise<boolean> {
2084
+ try {
2085
+ await access(p);
2086
+ return true;
2087
+ } catch {
2088
+ return false;
2089
+ }
2090
+ }
2091
+
2092
+ async function loadEnvFilesBestEffort(projectRoot: string) {
2093
+ const nodeEnv = process.env.NODE_ENV ?? "development";
2094
+ const candidates = [
2095
+ ".env",
2096
+ ".env.local",
2097
+ \`.env.\${nodeEnv}\`,
2098
+ \`.env.\${nodeEnv}.local\`,
2099
+ ];
2100
+ for (const rel of candidates) {
2101
+ const abs = path.join(projectRoot, rel);
2102
+ if (!(await fileExists(abs))) continue;
2103
+ const raw = await readFile(abs, "utf8").catch(() => "");
2104
+ for (const line of raw.split(/\\r?\\n/)) {
2105
+ const s = line.trim();
2106
+ if (!s || s.startsWith("#")) continue;
2107
+ const eq = s.indexOf("=");
2108
+ if (eq < 0) continue;
2109
+ const key = s.slice(0, eq).trim();
2110
+ const value = s.slice(eq + 1).trim().replace(/^"|"$/g, "");
2111
+ if (!key) continue;
2112
+ if (process.env[key] === undefined) process.env[key] = value;
2113
+ }
2114
+ }
2115
+ }
2116
+
2117
+ function parseThresholdExpr(expr: string): Partial<EvalThresholds> {
2118
+ const s = String(expr ?? "").trim();
2119
+ const eq = s.indexOf("=");
2120
+ if (eq < 0) throw new Error(\`Invalid --threshold: "\${s}" (expected key=value)\`);
2121
+ const key = s.slice(0, eq).trim();
2122
+ const value = Number(s.slice(eq + 1).trim());
2123
+ if (!Number.isFinite(value)) throw new Error(\`Invalid --threshold value: "\${s}"\`);
2124
+
2125
+ const out: Partial<EvalThresholds> = {};
2126
+ if (key === "min.hitAtK") out.min = { hitAtK: value };
2127
+ else if (key === "min.recallAtK") out.min = { recallAtK: value };
2128
+ else if (key === "min.mrrAtK") out.min = { mrrAtK: value };
2129
+ else if (key === "max.p95TotalMs") out.max = { p95TotalMs: value };
2130
+ else throw new Error(\`Unknown threshold key: "\${key}"\`);
2131
+ return out;
2132
+ }
2133
+
2134
+ function mergeThresholds(a: Partial<EvalThresholds>, b: Partial<EvalThresholds>): Partial<EvalThresholds> {
2135
+ return {
2136
+ min: { ...(a.min ?? {}), ...(b.min ?? {}) },
2137
+ max: { ...(a.max ?? {}), ...(b.max ?? {}) },
2138
+ };
2139
+ }
2140
+
2141
+ function parseArgs(argv: string[]): CliArgs {
2142
+ const out: CliArgs = {};
2143
+ const thresholds: Partial<EvalThresholds>[] = [];
2144
+
2145
+ for (let i = 0; i < argv.length; i++) {
2146
+ const a = argv[i];
2147
+ if (a === "--dataset") out.dataset = argv[++i];
2148
+ else if (a === "--baseline") out.baseline = argv[++i];
2149
+ else if (a === "--outputDir" || a === "--output-dir") out.outputDir = argv[++i];
2150
+ else if (a === "--mode") out.mode = argv[++i] as EvalMode;
2151
+ else if (a === "--topK" || a === "--top-k") out.topK = Number(argv[++i]);
2152
+ else if (a === "--rerankTopK" || a === "--rerank-top-k") out.rerankTopK = Number(argv[++i]);
2153
+ else if (a === "--scopePrefix" || a === "--scope-prefix") out.scopePrefix = argv[++i];
2154
+ else if (a === "--no-ingest") out.ingest = false;
2155
+ else if (a === "--cleanup") out.cleanup = argv[++i] as EvalCleanupPolicy;
2156
+ else if (a === "--threshold") thresholds.push(parseThresholdExpr(argv[++i] ?? ""));
2157
+ else if (a === "--ci") out.ci = true;
2158
+ else if (a === "--allow-assets") out.allowAssets = true;
2159
+ else if (a === "--allow-non-eval-prefix" || a === "--allow-custom-prefix") out.allowNonEvalPrefix = true;
2160
+ else if (a === "--yes" || a === "-y") out.yes = true;
2161
+ else if (a === "--include-ndcg") out.includeNdcg = true;
2162
+ else if (a === "--help" || a === "-h") {
2163
+ printHelp();
2164
+ process.exit(0);
2165
+ }
2166
+ }
2167
+
2168
+ for (const t of thresholds) out.thresholds = mergeThresholds(out.thresholds ?? {}, t);
2169
+ return out;
2170
+ }
2171
+
2172
+ function printHelp() {
2173
+ console.log(
2174
+ [
2175
+ "unrag-eval — retrieval eval harness",
2176
+ "",
2177
+ "Usage:",
2178
+ " bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json",
2179
+ "",
2180
+ "Options:",
2181
+ " --dataset <path> Dataset JSON path (required)",
2182
+ " --baseline <report.json> Baseline report for diffing",
2183
+ " --output-dir <dir> Output dir (default: .unrag/eval/runs/<ts>-<datasetId>)",
2184
+ " --mode retrieve|retrieve+rerank Override mode",
2185
+ " --top-k <n> Override topK",
2186
+ " --rerank-top-k <n> In rerank mode, retrieve N candidates before reranking (default: topK*3)",
2187
+ " --scope-prefix <prefix> Override scopePrefix",
2188
+ " --no-ingest Skip dataset document ingest",
2189
+ " --cleanup none|on-success|always Cleanup policy when ingesting",
2190
+ " --threshold <k=v> Repeatable thresholds (e.g. min.recallAtK=0.75)",
2191
+ " --ci CI mode (non-interactive)",
2192
+ " --yes, -y Allow dangerous operations when explicitly enabled",
2193
+ " --allow-assets Allow documents[].assets ingestion (advanced)",
2194
+ " --allow-custom-prefix Allow scopePrefix outside eval:* (dangerous)",
2195
+ " --include-ndcg Compute nDCG@k (optional)",
2196
+ ].join("\\n")
2197
+ );
2198
+ }
2199
+
2200
+ async function readConfigFile(projectRoot: string): Promise<any | null> {
2201
+ const abs = path.join(projectRoot, ".unrag/eval/config.json");
2202
+ if (!(await fileExists(abs))) return null;
2203
+ const raw = await readFile(abs, "utf8");
2204
+ try {
2205
+ return JSON.parse(raw);
2206
+ } catch (e) {
2207
+ const msg = e instanceof Error ? e.message : String(e);
2208
+ throw new Error(\`Failed to parse .unrag/eval/config.json: \${msg}\`);
2209
+ }
2210
+ }
2211
+
2212
+ function sanitizeMode(v: any): EvalMode | undefined {
2213
+ if (v === "retrieve" || v === "retrieve+rerank") return v;
2214
+ return undefined;
2215
+ }
2216
+
2217
+ function sanitizeCleanup(v: any): EvalCleanupPolicy | undefined {
2218
+ if (v === "none" || v === "on-success" || v === "always") return v;
2219
+ return undefined;
2220
+ }
2221
+
2222
+ async function main() {
2223
+ const projectRoot = path.join(process.cwd());
2224
+ await loadEnvFilesBestEffort(projectRoot);
2225
+
2226
+ const cli = parseArgs(process.argv.slice(2));
2227
+ const cfg = await readConfigFile(projectRoot);
2228
+
2229
+ const datasetPath = cli.dataset ?? cfg?.dataset ?? ".unrag/eval/datasets/sample.json";
2230
+ if (!datasetPath) throw new Error("--dataset is required");
2231
+
2232
+ const engine = createUnragEngine();
2233
+
2234
+ const thresholds: Partial<EvalThresholds> = mergeThresholds(cfg?.thresholds ?? {}, cli.thresholds ?? {});
2235
+
2236
+ const result = await runEval({
2237
+ engine,
2238
+ datasetPath,
2239
+ mode: cli.mode ?? sanitizeMode(cfg?.mode),
2240
+ topK: cli.topK ?? (typeof cfg?.topK === "number" ? cfg.topK : undefined),
2241
+ rerankTopK: cli.rerankTopK ?? (typeof cfg?.rerankTopK === "number" ? cfg.rerankTopK : undefined),
2242
+ scopePrefix: cli.scopePrefix ?? (typeof cfg?.scopePrefix === "string" ? cfg.scopePrefix : undefined),
2243
+ ingest: cli.ingest ?? (typeof cfg?.ingest === "boolean" ? cfg.ingest : undefined),
2244
+ cleanup: cli.cleanup ?? sanitizeCleanup(cfg?.cleanup) ?? "none",
2245
+ includeNdcg: cli.includeNdcg ?? Boolean(cfg?.includeNdcg),
2246
+ allowAssets: cli.allowAssets ?? Boolean(cfg?.allowAssets),
2247
+ allowNonEvalPrefix: cli.allowNonEvalPrefix ?? Boolean(cfg?.allowNonEvalPrefix),
2248
+ confirmedDangerousDelete: Boolean(cli.yes),
2249
+ thresholds,
2250
+ });
2251
+
2252
+ const ts = new Date().toISOString().replace(/[:.]/g, "-");
2253
+ const outputDir =
2254
+ cli.outputDir ??
2255
+ cfg?.outputDir ??
2256
+ path.join(".unrag/eval/runs", \`\${ts}-\${result.report.dataset.id}\`);
2257
+
2258
+ const reportPath = await writeEvalReport(outputDir, result.report);
2259
+ const summaryPath = await writeEvalSummaryMd(outputDir, result.report);
2260
+
2261
+ let diffPaths: { json: string; md: string } | null = null;
2262
+ const baselinePath = cli.baseline ?? cfg?.baseline;
2263
+ if (baselinePath) {
2264
+ const baseline = await readEvalReportFromFile(baselinePath);
2265
+ const diff = diffEvalReports({ baseline, candidate: result.report, baselinePath, candidatePath: reportPath });
2266
+ const diffJson = await writeEvalDiffJson(outputDir, diff);
2267
+ const diffMd = await writeEvalDiffMd(outputDir, diff);
2268
+ diffPaths = { json: diffJson, md: diffMd };
2269
+ }
2270
+
2271
+ console.log(
2272
+ [
2273
+ \`[unrag:eval] Wrote report: \${reportPath}\`,
2274
+ \`[unrag:eval] Wrote summary: \${summaryPath}\`,
2275
+ diffPaths ? \`[unrag:eval] Wrote diff: \${diffPaths.json} (+ \${diffPaths.md})\` : "",
2276
+ result.thresholdFailures.length > 0
2277
+ ? \`[unrag:eval] Threshold failures:\\n- \${result.thresholdFailures.join("\\n- ")}\`
2278
+ : \`[unrag:eval] Thresholds: pass\`,
2279
+ ]
2280
+ .filter(Boolean)
2281
+ .join("\\n")
2282
+ );
2283
+
2284
+ process.exitCode = result.exitCode;
2285
+ }
2286
+
2287
+ main().catch((err) => {
2288
+ const msg = err instanceof Error ? err.stack ?? err.message : String(err);
2289
+ console.error(\`[unrag:eval] Error: \${msg}\`);
2290
+ process.exitCode = 2;
2291
+ });
2292
+ `;
2293
+ if (await shouldWriteFile(datasetAbs, root, nonInteractive)) {
2294
+ await writeTextFile(datasetAbs, JSON.stringify(sampleDataset, null, 2) + `
2295
+ `);
2296
+ }
2297
+ if (await shouldWriteFile(configAbs, root, nonInteractive)) {
2298
+ await writeTextFile(configAbs, JSON.stringify(evalConfig, null, 2) + `
2299
+ `);
2300
+ }
2301
+ if (await shouldWriteFile(scriptAbs, root, nonInteractive)) {
2302
+ await writeTextFile(scriptAbs, script);
2303
+ }
2304
+ const scriptsToAdd = {
2305
+ "unrag:eval": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json`,
2306
+ "unrag:eval:ci": `bun run scripts/unrag-eval.ts -- --dataset .unrag/eval/datasets/sample.json --ci`
2307
+ };
2308
+ const scriptsResult = await addPackageJsonScripts({
2309
+ projectRoot: root,
2310
+ pkg,
2311
+ scripts: scriptsToAdd,
2312
+ nonInteractive
2313
+ });
2314
+ outro2([
2315
+ `Installed battery: ${battery}.`,
2316
+ "",
2317
+ `- Code: ${path7.join(config.installDir, "eval")}`,
2318
+ "",
2319
+ `- Dataset: ${path7.relative(root, datasetAbs)}`,
2320
+ `- Script: ${path7.relative(root, scriptAbs)}`,
2321
+ "",
2322
+ scriptsResult.added.length > 0 ? `Added scripts: ${scriptsResult.added.join(", ")}` : "Added scripts: none",
2323
+ scriptsResult.kept.length > 0 ? `Kept existing scripts: ${scriptsResult.kept.join(", ")}` : "",
2324
+ "",
2325
+ "Next:",
2326
+ " bun run unrag:eval",
2327
+ " bun run unrag:eval:ci"
2328
+ ].filter(Boolean).join(`
2329
+ `));
2330
+ return;
2331
+ }
1542
2332
  const wiringSnippet = battery === "reranker" ? [
1543
2333
  "",
1544
2334
  "Next steps:",
@@ -2776,14 +3566,15 @@ async function runDbChecks(state, options) {
2776
3566
  summary: `Using ${dbUrlResult.source}`,
2777
3567
  details: [redactConnectionString(dbUrlResult.url)]
2778
3568
  });
2779
- let client = null;
3569
+ let end;
2780
3570
  try {
2781
3571
  const pg = await import("pg");
2782
3572
  const Pool = pg.default?.Pool ?? pg.Pool;
2783
3573
  const pool = new Pool({ connectionString: dbUrlResult.url });
2784
- client = {
3574
+ end = () => pool.end();
3575
+ const client = {
2785
3576
  query: (sql, params) => pool.query(sql, params),
2786
- end: () => pool.end()
3577
+ end
2787
3578
  };
2788
3579
  const connectivityResult = await checkConnectivity(client);
2789
3580
  results.push(connectivityResult);
@@ -2796,6 +3587,10 @@ async function runDbChecks(state, options) {
2796
3587
  const tableNames = await inferTableNames(installDirFull ?? "", state.storeAdapter);
2797
3588
  const schemaResults = await checkSchema(client, options.schema, tableNames);
2798
3589
  results.push(...schemaResults);
3590
+ const uniquenessResult = await checkSourceIdUniqueness(client, options.schema, tableNames);
3591
+ results.push(uniquenessResult);
3592
+ const duplicatesResult = await checkDuplicateSourceIds(client, options.schema, tableNames);
3593
+ results.push(duplicatesResult);
2799
3594
  const indexResults = await checkIndexes(client, options.schema, tableNames);
2800
3595
  results.push(...indexResults);
2801
3596
  const dimensionResults = await checkDimensionConsistency(client, options.schema, tableNames, options.scope);
@@ -2814,9 +3609,8 @@ async function runDbChecks(state, options) {
2814
3609
  ]
2815
3610
  });
2816
3611
  } finally {
2817
- if (client) {
2818
- await client.end().catch(() => {});
2819
- }
3612
+ if (end)
3613
+ await end().catch(() => {});
2820
3614
  }
2821
3615
  return results;
2822
3616
  }
@@ -3083,6 +3877,135 @@ async function checkForeignKeys(client, schema, tableNames) {
3083
3877
  };
3084
3878
  }
3085
3879
  }
3880
/**
 * Doctor check: verifies that the documents table enforces uniqueness of
 * `source_id` on its own, either through a single-column UNIQUE constraint or
 * through a plain (non-partial, non-expression) unique index whose only key
 * column is `source_id`.
 *
 * @param client      Object exposing `query(sql, params)` that resolves to `{ rows }`.
 * @param schema      Schema name, matched exactly against `pg_namespace.nspname`.
 * @param tableNames  Table-name map; only `tableNames.documents` is read.
 * @returns A check result with id `db-sourceid-unique`: `pass` when a
 *          constraint or index is found, `fail` when none exists or when the
 *          catalog queries themselves throw.
 */
async function checkSourceIdUniqueness(client, schema, tableNames) {
  const id = "db-sourceid-unique";
  const title = "documents.source_id uniqueness";
  const lookupParams = [schema, tableNames.documents];
  try {
    // 1) A declared UNIQUE constraint covering exactly one column: source_id.
    const uniqueConstraintResult = await client.query(`SELECT con.conname as constraint_name
       FROM pg_constraint con
       JOIN pg_class t ON t.oid = con.conrelid
       JOIN pg_namespace n ON n.oid = t.relnamespace
       WHERE n.nspname = $1
         AND t.relname = $2
         AND con.contype = 'u'
         AND array_length(con.conkey, 1) = 1
         AND (
           SELECT a.attname
           FROM pg_attribute a
           WHERE a.attrelid = t.oid AND a.attnum = con.conkey[1]
         ) = 'source_id'`, lookupParams);
    if (uniqueConstraintResult.rows.length > 0) {
      return {
        id,
        title,
        status: "pass",
        summary: "UNIQUE constraint exists on documents.source_id.",
        details: [`Constraint: ${uniqueConstraintResult.rows[0].constraint_name}`]
      };
    }

    // 2) A unique index without a constraint (this also covers a primary key).
    //
    // The key-column list is recognised from pg_get_indexdef(), which prints
    // "... USING <method> (<key columns>) [INCLUDE (...)]". The pattern:
    //   - is anchored on "USING <method>" so that a column that only appears
    //     in an INCLUDE list is not mistaken for the key;
    //   - requires the list to be exactly (source_id), optionally quoted;
    //   - deliberately contains no backslashes. Backslash escapes are read
    //     differently depending on standard_conforming_strings, and the
    //     previous doubled escapes never matched under the default setting.
    const uniqueIndexResult = await client.query(`SELECT i.relname as indexname, pg_get_indexdef(i.oid) as indexdef
       FROM pg_index ix
       JOIN pg_class t ON t.oid = ix.indrelid
       JOIN pg_class i ON i.oid = ix.indexrelid
       JOIN pg_namespace n ON n.oid = t.relnamespace
       WHERE n.nspname = $1
         AND t.relname = $2
         AND ix.indisunique = true
         AND ix.indexprs IS NULL
         AND ix.indpred IS NULL
         AND pg_get_indexdef(i.oid) ~* 'USING[[:space:]]+[[:alnum:]_]+[[:space:]]*[(][[:space:]]*"?source_id"?[[:space:]]*[)]'`, lookupParams);
    if (uniqueIndexResult.rows.length > 0) {
      return {
        id,
        title,
        status: "pass",
        summary: "UNIQUE index exists on documents.source_id.",
        details: [`Index: ${uniqueIndexResult.rows[0].indexname}`]
      };
    }

    // Neither form of uniqueness was found.
    return {
      id,
      title,
      status: "fail",
      summary: "Missing UNIQUE constraint on documents.source_id.",
      details: [
        "Unrag requires a unique constraint on documents.source_id for idempotent ingestion.",
        "Without this constraint, concurrent ingests for the same sourceId may create duplicates."
      ],
      fixHints: [
        `ALTER TABLE ${schema}.${tableNames.documents} ADD CONSTRAINT ${tableNames.documents}_source_id_key UNIQUE (source_id);`,
        "-- Or create a unique index:",
        `CREATE UNIQUE INDEX ${tableNames.documents}_source_id_unique_idx ON ${schema}.${tableNames.documents}(source_id);`
      ],
      docsLink: docsUrl("/docs/getting-started/database#schema-requirements")
    };
  } catch (err) {
    // Any failure while inspecting the catalogs is reported as a failed check.
    const message = err instanceof Error ? err.message : String(err);
    return {
      id,
      title,
      status: "fail",
      summary: `Could not check uniqueness constraint: ${message}`
    };
  }
}
3951
/**
 * Doctor check: looks for `source_id` values that occur on more than one row
 * of the documents table, since such rows have to be cleaned up before a
 * UNIQUE constraint can be added.
 *
 * @param client      Object exposing `query(sql)` that resolves to `{ rows }`.
 * @param schema      Schema that contains the documents table.
 * @param tableNames  Table-name map; only `tableNames.documents` is read.
 * @returns A check result with id `db-sourceid-duplicates`: `pass` when no
 *          duplicates exist, `fail` with up to five samples when they do, and
 *          `warn` when the queries themselves throw.
 */
async function checkDuplicateSourceIds(client, schema, tableNames) {
  const id = "db-sourceid-duplicates";
  const title = "documents.source_id duplicates";
  const sampleLimit = 5;

  // Table names cannot be passed as bind parameters, so they are embedded as
  // quoted identifiers (embedded double quotes doubled). This keeps the
  // statement valid for mixed-case or special-character names and stops a
  // configured name from being parsed as SQL.
  const quoteIdent = (name) => `"${String(name).replace(/"/g, '""')}"`;
  const tableRef = `${quoteIdent(schema)}.${quoteIdent(tableNames.documents)}`;

  try {
    // NULL source_id rows are excluded: GROUP BY puts all NULLs in one group,
    // but a UNIQUE constraint treats NULLs as distinct by default, so they do
    // not block adding it.
    const countResult = await client.query(`SELECT COUNT(*) as duplicate_count
       FROM (
         SELECT source_id
         FROM ${tableRef}
         WHERE source_id IS NOT NULL
         GROUP BY source_id
         HAVING COUNT(*) > 1
       ) duplicates`);
    const duplicateCount = parseInt(countResult.rows[0]?.duplicate_count ?? "0", 10);
    if (duplicateCount === 0) {
      return {
        id,
        title,
        status: "pass",
        summary: "No duplicate source_id values found."
      };
    }

    // Fetch the worst offenders to show alongside the total.
    const sampleResult = await client.query(`SELECT source_id, COUNT(*) as count
       FROM ${tableRef}
       WHERE source_id IS NOT NULL
       GROUP BY source_id
       HAVING COUNT(*) > 1
       ORDER BY COUNT(*) DESC
       LIMIT ${sampleLimit}`);
    const samples = sampleResult.rows.map((r) => `"${r.source_id}" (${r.count} copies)`);
    const remaining = duplicateCount - samples.length;
    return {
      id,
      title,
      status: "fail",
      summary: `Found ${duplicateCount} source_id value(s) with duplicates.`,
      details: [
        "Duplicate source_id values must be resolved before adding a unique constraint.",
        "",
        "Sample duplicates:",
        ...samples,
        remaining > 0 ? `... and ${remaining} more` : ""
      ].filter(Boolean),
      fixHints: [
        "-- Find all duplicates:",
        `SELECT source_id, COUNT(*), array_agg(id) as document_ids`,
        `FROM ${schema}.${tableNames.documents}`,
        `WHERE source_id IS NOT NULL`,
        `GROUP BY source_id HAVING COUNT(*) > 1;`,
        "",
        "-- Resolve duplicates by deleting extra rows for a given source_id.",
        "-- (Exact strategy depends on your app; pick which document_id to keep and delete the rest.)"
      ],
      docsLink: docsUrl("/docs/getting-started/database#resolving-duplicates")
    };
  } catch (err) {
    // A failed scan is reported as a warning, not as a failed check.
    const message = err instanceof Error ? err.message : String(err);
    return {
      id,
      title,
      status: "warn",
      summary: `Could not check for duplicates: ${message}`
    };
  }
}
3086
4009
  async function checkIndexes(client, schema, tableNames) {
3087
4010
  const results = [];
3088
4011
  try {
@@ -3500,14 +4423,14 @@ function resolveConfigPath(projectRoot, configPath) {
3500
4423
  // cli/commands/doctor-setup.ts
3501
4424
  import path14 from "node:path";
3502
4425
  import {
3503
- cancel as cancel3,
3504
- confirm as confirm3,
3505
- isCancel as isCancel3,
4426
+ cancel as cancel4,
4427
+ confirm as confirm4,
4428
+ isCancel as isCancel4,
3506
4429
  multiselect,
3507
4430
  outro as outro3,
3508
- select as select2,
4431
+ select as select3,
3509
4432
  spinner,
3510
- text as text2
4433
+ text as text3
3511
4434
  } from "@clack/prompts";
3512
4435
  var DEFAULT_CONFIG_PATH = ".unrag/doctor.json";
3513
4436
  function parseSetupArgs(args) {
@@ -3582,7 +4505,7 @@ async function doctorSetupCommand(args) {
3582
4505
  });
3583
4506
  const tableNames = state.installDir ? await inferTableNames(path14.join(projectRoot, state.installDir), state.storeAdapter) : { documents: "documents", chunks: "chunks", embeddings: "embeddings" };
3584
4507
  s.stop("Configuration detected.");
3585
- const configPathAnswer = parsed.configPath ? parsed.configPath : nonInteractive ? DEFAULT_CONFIG_PATH : await text2({
4508
+ const configPathAnswer = parsed.configPath ? parsed.configPath : nonInteractive ? DEFAULT_CONFIG_PATH : await text3({
3586
4509
  message: "Config file path",
3587
4510
  initialValue: DEFAULT_CONFIG_PATH,
3588
4511
  validate: (v) => {
@@ -3593,20 +4516,20 @@ async function doctorSetupCommand(args) {
3593
4516
  return;
3594
4517
  }
3595
4518
  });
3596
- if (isCancel3(configPathAnswer)) {
3597
- cancel3("Cancelled.");
4519
+ if (isCancel4(configPathAnswer)) {
4520
+ cancel4("Cancelled.");
3598
4521
  return;
3599
4522
  }
3600
4523
  const configPath = String(configPathAnswer).trim();
3601
4524
  const configFullPath = path14.isAbsolute(configPath) ? configPath : path14.join(projectRoot, configPath);
3602
4525
  if (await exists(configFullPath)) {
3603
4526
  if (nonInteractive) {} else {
3604
- const overwrite = await confirm3({
4527
+ const overwrite = await confirm4({
3605
4528
  message: `Config file ${configPath} already exists. Overwrite?`,
3606
4529
  initialValue: false
3607
4530
  });
3608
- if (isCancel3(overwrite)) {
3609
- cancel3("Cancelled.");
4531
+ if (isCancel4(overwrite)) {
4532
+ cancel4("Cancelled.");
3610
4533
  return;
3611
4534
  }
3612
4535
  if (!overwrite) {
@@ -3615,7 +4538,7 @@ async function doctorSetupCommand(args) {
3615
4538
  }
3616
4539
  }
3617
4540
  }
3618
- const installDirAnswer = nonInteractive ? state.installDir ?? "lib/unrag" : await text2({
4541
+ const installDirAnswer = nonInteractive ? state.installDir ?? "lib/unrag" : await text3({
3619
4542
  message: "Unrag install directory",
3620
4543
  initialValue: state.installDir ?? "lib/unrag",
3621
4544
  validate: (v) => {
@@ -3624,8 +4547,8 @@ async function doctorSetupCommand(args) {
3624
4547
  return;
3625
4548
  }
3626
4549
  });
3627
- if (isCancel3(installDirAnswer)) {
3628
- cancel3("Cancelled.");
4550
+ if (isCancel4(installDirAnswer)) {
4551
+ cancel4("Cancelled.");
3629
4552
  return;
3630
4553
  }
3631
4554
  const installDir = String(installDirAnswer).trim();
@@ -3648,13 +4571,13 @@ async function doctorSetupCommand(args) {
3648
4571
  initialValues: DEFAULT_ENV_LOAD_FILES,
3649
4572
  required: false
3650
4573
  });
3651
- if (isCancel3(envFilesAnswer)) {
3652
- cancel3("Cancelled.");
4574
+ if (isCancel4(envFilesAnswer)) {
4575
+ cancel4("Cancelled.");
3653
4576
  return;
3654
4577
  }
3655
4578
  const envFiles = envFilesAnswer;
3656
4579
  const dbEnvVarDefault = state.inferredDbEnvVar ?? "DATABASE_URL";
3657
- const dbEnvVarAnswer = nonInteractive ? dbEnvVarDefault : await text2({
4580
+ const dbEnvVarAnswer = nonInteractive ? dbEnvVarDefault : await text3({
3658
4581
  message: "Database URL environment variable name",
3659
4582
  initialValue: dbEnvVarDefault,
3660
4583
  validate: (v) => {
@@ -3665,80 +4588,80 @@ async function doctorSetupCommand(args) {
3665
4588
  return;
3666
4589
  }
3667
4590
  });
3668
- if (isCancel3(dbEnvVarAnswer)) {
3669
- cancel3("Cancelled.");
4591
+ if (isCancel4(dbEnvVarAnswer)) {
4592
+ cancel4("Cancelled.");
3670
4593
  return;
3671
4594
  }
3672
4595
  const databaseUrlEnv = String(dbEnvVarAnswer).trim();
3673
- const schemaAnswer = nonInteractive ? "public" : await text2({
4596
+ const schemaAnswer = nonInteractive ? "public" : await text3({
3674
4597
  message: "Database schema name",
3675
4598
  initialValue: "public"
3676
4599
  });
3677
- if (isCancel3(schemaAnswer)) {
3678
- cancel3("Cancelled.");
4600
+ if (isCancel4(schemaAnswer)) {
4601
+ cancel4("Cancelled.");
3679
4602
  return;
3680
4603
  }
3681
4604
  const schema = String(schemaAnswer).trim() || "public";
3682
- const documentsTableAnswer = nonInteractive ? tableNames.documents : await text2({
4605
+ const documentsTableAnswer = nonInteractive ? tableNames.documents : await text3({
3683
4606
  message: "Documents table name",
3684
4607
  initialValue: tableNames.documents
3685
4608
  });
3686
- if (isCancel3(documentsTableAnswer)) {
3687
- cancel3("Cancelled.");
4609
+ if (isCancel4(documentsTableAnswer)) {
4610
+ cancel4("Cancelled.");
3688
4611
  return;
3689
4612
  }
3690
4613
  const documentsTable = String(documentsTableAnswer).trim() || "documents";
3691
- const chunksTableAnswer = nonInteractive ? tableNames.chunks : await text2({
4614
+ const chunksTableAnswer = nonInteractive ? tableNames.chunks : await text3({
3692
4615
  message: "Chunks table name",
3693
4616
  initialValue: tableNames.chunks
3694
4617
  });
3695
- if (isCancel3(chunksTableAnswer)) {
3696
- cancel3("Cancelled.");
4618
+ if (isCancel4(chunksTableAnswer)) {
4619
+ cancel4("Cancelled.");
3697
4620
  return;
3698
4621
  }
3699
4622
  const chunksTable = String(chunksTableAnswer).trim() || "chunks";
3700
- const embeddingsTableAnswer = nonInteractive ? tableNames.embeddings : await text2({
4623
+ const embeddingsTableAnswer = nonInteractive ? tableNames.embeddings : await text3({
3701
4624
  message: "Embeddings table name",
3702
4625
  initialValue: tableNames.embeddings
3703
4626
  });
3704
- if (isCancel3(embeddingsTableAnswer)) {
3705
- cancel3("Cancelled.");
4627
+ if (isCancel4(embeddingsTableAnswer)) {
4628
+ cancel4("Cancelled.");
3706
4629
  return;
3707
4630
  }
3708
4631
  const embeddingsTable = String(embeddingsTableAnswer).trim() || "embeddings";
3709
- const scopeAnswer = nonInteractive ? "" : await text2({
4632
+ const scopeAnswer = nonInteractive ? "" : await text3({
3710
4633
  message: "Default scope prefix for dimension checks (optional, press enter to skip)",
3711
4634
  initialValue: ""
3712
4635
  });
3713
- if (isCancel3(scopeAnswer)) {
3714
- cancel3("Cancelled.");
4636
+ if (isCancel4(scopeAnswer)) {
4637
+ cancel4("Cancelled.");
3715
4638
  return;
3716
4639
  }
3717
4640
  const defaultScope = String(scopeAnswer).trim() || null;
3718
- const strictAnswer = nonInteractive ? false : await confirm3({
4641
+ const strictAnswer = nonInteractive ? false : await confirm4({
3719
4642
  message: "Enable strict mode by default? (treat warnings as failures)",
3720
4643
  initialValue: false
3721
4644
  });
3722
- if (isCancel3(strictAnswer)) {
3723
- cancel3("Cancelled.");
4645
+ if (isCancel4(strictAnswer)) {
4646
+ cancel4("Cancelled.");
3724
4647
  return;
3725
4648
  }
3726
4649
  const strictDefault = Boolean(strictAnswer);
3727
- const ciIncludeDbAnswer = nonInteractive ? true : await confirm3({
4650
+ const ciIncludeDbAnswer = nonInteractive ? true : await confirm4({
3728
4651
  message: "Should CI script include database checks (--db)?",
3729
4652
  initialValue: true
3730
4653
  });
3731
- if (isCancel3(ciIncludeDbAnswer)) {
3732
- cancel3("Cancelled.");
4654
+ if (isCancel4(ciIncludeDbAnswer)) {
4655
+ cancel4("Cancelled.");
3733
4656
  return;
3734
4657
  }
3735
4658
  const ciIncludeDb = Boolean(ciIncludeDbAnswer);
3736
- const ciStrictAnswer = nonInteractive ? true : await confirm3({
4659
+ const ciStrictAnswer = nonInteractive ? true : await confirm4({
3737
4660
  message: "Should CI script use strict mode (--strict)?",
3738
4661
  initialValue: true
3739
4662
  });
3740
- if (isCancel3(ciStrictAnswer)) {
3741
- cancel3("Cancelled.");
4663
+ if (isCancel4(ciStrictAnswer)) {
4664
+ cancel4("Cancelled.");
3742
4665
  return;
3743
4666
  }
3744
4667
  const ciStrict = Boolean(ciStrictAnswer);
@@ -3791,7 +4714,7 @@ async function doctorSetupCommand(args) {
3791
4714
  let scriptsToAdd = scripts;
3792
4715
  if (conflictingScripts.length > 0 && !nonInteractive) {
3793
4716
  for (const scriptName of conflictingScripts) {
3794
- const action = await select2({
4717
+ const action = await select3({
3795
4718
  message: `Script "${scriptName}" already exists. What would you like to do?`,
3796
4719
  options: [
3797
4720
  { value: "keep", label: "Keep existing", hint: existingScripts[scriptName] },
@@ -3804,14 +4727,14 @@ async function doctorSetupCommand(args) {
3804
4727
  ],
3805
4728
  initialValue: "keep"
3806
4729
  });
3807
- if (isCancel3(action)) {
3808
- cancel3("Cancelled.");
4730
+ if (isCancel4(action)) {
4731
+ cancel4("Cancelled.");
3809
4732
  return;
3810
4733
  }
3811
4734
  if (action === "keep") {
3812
4735
  delete scriptsToAdd[scriptName];
3813
4736
  } else if (action === "rename") {
3814
- const newName = await text2({
4737
+ const newName = await text3({
3815
4738
  message: `New script name for ${scriptName}`,
3816
4739
  initialValue: `${scriptName}:new`,
3817
4740
  validate: (v) => {
@@ -3822,8 +4745,8 @@ async function doctorSetupCommand(args) {
3822
4745
  return;
3823
4746
  }
3824
4747
  });
3825
- if (isCancel3(newName)) {
3826
- cancel3("Cancelled.");
4748
+ if (isCancel4(newName)) {
4749
+ cancel4("Cancelled.");
3827
4750
  return;
3828
4751
  }
3829
4752
  const value = scriptsToAdd[scriptName];
@@ -4147,7 +5070,7 @@ function renderHelp() {
4147
5070
  " --alias <@name> Import alias base (e.g. @unrag)",
4148
5071
  " --preset <id|url> Install from a web-generated preset (non-interactive)",
4149
5072
  " --overwrite <mode> skip | force (when files already exist)",
4150
- " --rich-media Enable rich media setup (also enables multimodal embeddings)",
5073
+ " --rich-media Enable rich media setup (extractors + assetProcessing flags)",
4151
5074
  " --no-rich-media Disable rich media setup",
4152
5075
  " --extractors <list> Comma-separated extractors (implies --rich-media)",
4153
5076
  " --no-install Skip automatic dependency installation",