@remnic/cli 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,9 +1,24 @@
1
1
  import {
2
+ buildBenchmarkPublishFeed,
2
3
  checkRegression,
4
+ compareResults,
5
+ defaultBenchmarkBaselineDir,
6
+ defaultBenchmarkPublishPath,
7
+ deleteBenchmarkResults,
8
+ discoverAllProviders,
9
+ getBenchmarkLowerIsBetter,
10
+ listBenchmarkBaselines,
11
+ listBenchmarkResults,
3
12
  loadBaseline,
13
+ loadBenchmarkBaseline,
14
+ loadBenchmarkResult,
15
+ renderBenchmarkResultExport,
16
+ resolveBenchmarkResultReference,
4
17
  runBenchSuite,
5
- runExplain
6
- } from "./chunk-U4MQO3IF.js";
18
+ runExplain,
19
+ saveBenchmarkBaseline,
20
+ writeBenchmarkPublishFeed
21
+ } from "./chunk-GAZ3DFWX.js";
7
22
 
8
23
  // src/index.ts
9
24
  import fs from "fs";
@@ -87,6 +102,186 @@ import {
87
102
  resolveExtensionsRoot,
88
103
  coerceInstallExtension
89
104
  } from "@remnic/core";
105
+ import {
106
+ convertMemoriesToRecords,
107
+ getTrainingExportAdapter as getTrainingExportAdapter2,
108
+ listTrainingExportAdapters,
109
+ parseStrictCliDate
110
+ } from "@remnic/core";
111
+
112
+ // ../export-weclone/dist/index.js
113
+ import {
114
+ getTrainingExportAdapter,
115
+ registerTrainingExportAdapter
116
+ } from "@remnic/core";
117
/**
 * Training-export adapter registered under the name "weclone".
 * Output files use the ".json" extension.
 */
var wecloneExportAdapter = {
  name: "weclone",
  fileExtension: ".json",
  /**
   * Serialize records as a pretty-printed JSON array.
   * Only `instruction`, `input` and `output` are kept; any other record
   * fields are dropped from the output.
   * @param {Array<object>} records
   * @returns {string} JSON text indented with two spaces.
   */
  formatRecords(records) {
    const toAlpaca = ({ instruction, input, output }) => ({ instruction, input, output });
    const alpacaRecords = records.map(toAlpaca);
    return JSON.stringify(alpacaRecords, null, 2);
  }
};
129
/** Pairs generated per record when `options.maxPairsPerRecord` is not supplied. */
var DEFAULT_MAX_PAIRS = 1;
/** Question templates per template key; "{topic}" marks where the topic is inserted. */
var QUESTION_TEMPLATES = {
  preferences: [
    "What kind of {topic} do you like?",
    "What's your preference for {topic}?",
    "What are your favorite {topic}?"
  ],
  opinions: [
    "What do you think about {topic}?",
    "How do you feel about {topic}?",
    "What's your opinion on {topic}?"
  ],
  expertise: [
    "Tell me about {topic}.",
    "What do you know about {topic}?",
    "Can you explain {topic}?"
  ],
  personal: [
    "Can you tell me about your {topic}?",
    "Tell me about your {topic}.",
    "What can you share about your {topic}?"
  ]
};
/** Templates used when a record's category maps to no template key. */
var DEFAULT_TEMPLATES = [
  "Tell me about {topic}.",
  "What can you share about {topic}?"
];
/** Maps a lower-cased record category to a key of QUESTION_TEMPLATES. */
var CATEGORY_TO_TEMPLATE = {
  preference: "preferences",
  fact: "expertise",
  entity: "expertise",
  skill: "expertise",
  correction: "opinions",
  decision: "opinions",
  principle: "opinions",
  rule: "opinions",
  personal: "personal",
  relationship: "personal",
  commitment: "personal",
  moment: "personal"
};
/**
 * Expand each record into question/answer training pairs.
 *
 * For record `i`, up to `maxPairsPerRecord` templates are used (capped at the
 * number of templates available), starting at template index `i` and wrapping
 * around, so consecutive records rotate through the phrasing variants.
 *
 * @param {Array<object>} records Records with `instruction`, `output`, `category`,
 *   `confidence` and `sourceIds` fields.
 * @param {{maxPairsPerRecord?: number, styleMarkers?: {usesLowercase?: boolean}}} [options]
 * @returns {Array<object>} New records whose `instruction` is the synthesized
 *   question and whose `input` is always the empty string.
 */
function synthesizeTrainingPairs(records, options) {
  const maxPairs = options?.maxPairsPerRecord ?? DEFAULT_MAX_PAIRS;
  const style = options?.styleMarkers;
  const result = [];
  for (let i = 0; i < records.length; i++) {
    const record = records[i];
    const templateKey = resolveTemplateKey(record.category);
    const topic = extractTopic(record.instruction);
    const templates = QUESTION_TEMPLATES[templateKey] ?? DEFAULT_TEMPLATES;
    const pairCount = Math.min(maxPairs, templates.length);
    for (let j = 0; j < pairCount; j++) {
      const templateIndex = (i + j) % templates.length;
      // Use a replacer function: a plain replacement string would interpret
      // "$&", "$$", "$`" and "$'" inside the topic and corrupt the question.
      const question = templates[templateIndex].replace("{topic}", () => topic);
      let output = record.output;
      // Only lower-case real strings so a missing output does not crash the export.
      if (style?.usesLowercase && typeof output === "string") {
        output = output.toLowerCase();
      }
      result.push({
        instruction: question,
        input: "",
        output,
        category: record.category,
        confidence: record.confidence,
        sourceIds: record.sourceIds
      });
    }
  }
  return result;
}
/**
 * Map a record category to a QUESTION_TEMPLATES key.
 * @param {unknown} category
 * @returns {string} The template key, or "" when the category is missing,
 *   not a string, or unknown.
 */
function resolveTemplateKey(category) {
  if (typeof category !== "string" || category === "") return "";
  const key = category.toLowerCase();
  // Own-property check so inherited keys such as "constructor" are not
  // mistaken for a mapping.
  return Object.prototype.hasOwnProperty.call(CATEGORY_TO_TEMPLATE, key) ? CATEGORY_TO_TEMPLATE[key] : "";
}
/**
 * Extract the topic from the first parenthesized tag of an instruction.
 * @param {unknown} instruction
 * @returns {string} The trimmed, lower-cased tag text, or "this" when there is
 *   no usable tag.
 */
function extractTopic(instruction) {
  if (typeof instruction !== "string") return "this";
  const tagMatch = instruction.match(/\(([^()]+)\)/);
  const topic = tagMatch ? tagMatch[1].trim().toLowerCase() : "";
  // A whitespace-only tag would otherwise produce questions like "Tell me about ."
  return topic === "" ? "this" : topic;
}
210
/**
 * PII detectors, applied in this order to every scanned field.
 * All regexes are global, so `lastIndex` is reset around each use.
 */
var PII_PATTERNS = [
  // Email: user@domain.tld
  { name: "email", regex: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g },
  // SSN: 123-45-6789 (exactly 3-2-4 digit groups)
  { name: "ssn", regex: /\b\d{3}-\d{2}-\d{4}\b/g },
  // Credit card: 4 groups of 4 digits separated by dashes or spaces
  { name: "credit_card", regex: /\b\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}\b/g },
  // IP address: four octets 0-255
  { name: "ip_address", regex: /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b/g },
  // Phone: optional +1- prefix, then 3-3-4 with dashes, dots, or spaces
  // Also matches (555) 123-4567 format
  { name: "phone", regex: /(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{4}\b/g }
];
/** Record fields that are scanned for PII. */
var SCANNED_FIELDS = ["instruction", "input", "output"];
/**
 * Replace PII matches in the scanned fields of every record with "[REDACTED]".
 *
 * Input records are not mutated; each one is shallow-copied first.
 *
 * @param {Array<object>} records
 * @returns {{cleanRecords: Array<object>, redactedCount: number, redactionDetails: Array<{index: number, field: string, pattern: string}>}}
 *   `redactedCount` is the number of distinct records with at least one
 *   redaction; `redactionDetails` has one entry per (record, field, pattern) hit.
 */
function sweepPii(records) {
  const redactionDetails = [];
  const touchedRecords = new Set();
  // Runs every detector over one field value and records each hit.
  const scrub = (text, index, field) => {
    let current = text;
    PII_PATTERNS.forEach(({ name, regex }) => {
      regex.lastIndex = 0;
      const found = regex.test(current);
      regex.lastIndex = 0;
      if (!found) {
        return;
      }
      current = current.replace(regex, "[REDACTED]");
      touchedRecords.add(index);
      redactionDetails.push({ index, field, pattern: name });
    });
    return current;
  };
  const cleanRecords = records.map((record, index) => {
    const copy = { ...record };
    SCANNED_FIELDS.forEach((field) => {
      const raw = record[field];
      // Empty or missing fields are left exactly as copied.
      if (raw) {
        copy[field] = scrub(raw, index, field);
      }
    });
    return copy;
  });
  return {
    cleanRecords,
    redactedCount: touchedRecords.size,
    redactionDetails
  };
}
274
/**
 * Register `wecloneExportAdapter` unless an adapter with the same name is
 * already registered.
 * @returns {boolean} true when this call performed the registration, false
 *   when an adapter with that name already existed.
 */
function ensureWecloneExportAdapterRegistered() {
  const existing = getTrainingExportAdapter(wecloneExportAdapter.name);
  if (existing === undefined) {
    registerTrainingExportAdapter(wecloneExportAdapter);
    return true;
  }
  return false;
}
// Registration is attempted once at module load. Any error is deliberately
// ignored here so a registration failure cannot break loading the module.
try {
  ensureWecloneExportAdapterRegistered();
} catch {
}
90
285
 
91
286
  // src/service-candidates.ts
92
287
  function firstSuccessfulResult(candidates, attempt) {
@@ -143,7 +338,7 @@ function collectBenchmarks(argv) {
143
338
  const benchmarks = [];
144
339
  for (let index = 0; index < argv.length; index += 1) {
145
340
  const arg = argv[index];
146
- if (arg === "--dataset-dir") {
341
+ if (arg === "--dataset-dir" || arg === "--results-dir" || arg === "--baselines-dir" || arg === "--threshold" || arg === "--custom" || arg === "--format" || arg === "--output" || arg === "--target") {
147
342
  index += 1;
148
343
  continue;
149
344
  }
@@ -155,7 +350,7 @@ function collectBenchmarks(argv) {
155
350
  }
156
351
  function parseBenchActionArgs(argv) {
157
352
  const [first, ...rest] = argv;
158
- const action = first === "list" || first === "run" || first === "check" || first === "report" ? first : first === void 0 || first === "--help" || first === "-h" ? "help" : "run";
353
+ const action = first === "list" || first === "run" || first === "datasets" || first === "runs" || first === "compare" || first === "ui" || first === "results" || first === "baseline" || first === "export" || first === "providers" || first === "publish" || first === "check" || first === "report" ? first : first === void 0 || first === "--help" || first === "-h" ? "help" : "run";
159
354
  return {
160
355
  action,
161
356
  args: action === "run" && action !== first ? argv : rest
@@ -163,15 +358,72 @@ function parseBenchActionArgs(argv) {
163
358
  }
164
359
  function parseBenchArgs(argv) {
165
360
  const { action, args } = parseBenchActionArgs(argv);
166
- const benchmarks = collectBenchmarks(args);
361
+ const baselineAction = action === "baseline" ? args[0] === "save" || args[0] === "list" ? args[0] : void 0 : void 0;
362
+ const datasetAction = action === "datasets" ? args[0] === "download" || args[0] === "status" ? args[0] : void 0 : void 0;
363
+ const providerAction = action === "providers" ? args[0] === "discover" ? args[0] : void 0 : void 0;
364
+ const runAction = action === "runs" ? args[0] === "list" || args[0] === "show" || args[0] === "delete" ? args[0] : void 0 : void 0;
365
+ if (action === "baseline" && baselineAction === void 0) {
366
+ throw new Error("ERROR: baseline requires a subcommand: save or list.");
367
+ }
368
+ if (action === "datasets" && datasetAction === void 0) {
369
+ throw new Error("ERROR: datasets requires a subcommand: download or status.");
370
+ }
371
+ if (action === "providers" && providerAction === void 0) {
372
+ throw new Error("ERROR: providers requires a subcommand: discover.");
373
+ }
374
+ if (action === "runs" && runAction === void 0) {
375
+ throw new Error("ERROR: runs requires a subcommand: list, show, or delete.");
376
+ }
377
+ const benchmarkArgs = action === "baseline" || action === "datasets" || action === "providers" || action === "runs" ? args.slice(1) : args;
378
+ const benchmarks = collectBenchmarks(benchmarkArgs);
167
379
  const datasetDir = readBenchOptionValue(args, "--dataset-dir");
380
+ const resultsDir = readBenchOptionValue(args, "--results-dir");
381
+ const baselinesDir = readBenchOptionValue(args, "--baselines-dir");
382
+ const thresholdRaw = readBenchOptionValue(args, "--threshold");
383
+ const customRaw = readBenchOptionValue(args, "--custom");
384
+ const formatRaw = readBenchOptionValue(args, "--format");
385
+ const output = readBenchOptionValue(args, "--output");
386
+ const targetRaw = readBenchOptionValue(args, "--target");
387
+ let threshold;
388
+ if (thresholdRaw !== void 0) {
389
+ threshold = Number(thresholdRaw);
390
+ if (!Number.isFinite(threshold) || threshold < 0) {
391
+ throw new Error("ERROR: --threshold must be a non-negative number.");
392
+ }
393
+ }
394
+ let format;
395
+ if (formatRaw !== void 0) {
396
+ if (formatRaw !== "json" && formatRaw !== "csv" && formatRaw !== "html") {
397
+ throw new Error('ERROR: --format must be "json", "csv", or "html".');
398
+ }
399
+ format = formatRaw;
400
+ }
401
+ let target;
402
+ if (targetRaw !== void 0) {
403
+ if (targetRaw !== "remnic-ai") {
404
+ throw new Error('ERROR: --target must be "remnic-ai".');
405
+ }
406
+ target = targetRaw;
407
+ }
168
408
  return {
169
409
  action,
170
410
  benchmarks,
171
411
  quick: args.includes("--quick"),
172
412
  all: args.includes("--all"),
173
413
  json: args.includes("--json"),
174
- datasetDir: datasetDir ? path.resolve(expandTilde(datasetDir)) : void 0
414
+ detail: args.includes("--detail"),
415
+ datasetDir: datasetDir ? path.resolve(expandTilde(datasetDir)) : void 0,
416
+ resultsDir: resultsDir ? path.resolve(expandTilde(resultsDir)) : void 0,
417
+ baselinesDir: baselinesDir ? path.resolve(expandTilde(baselinesDir)) : void 0,
418
+ threshold,
419
+ custom: customRaw ? path.resolve(expandTilde(customRaw)) : void 0,
420
+ baselineAction,
421
+ datasetAction,
422
+ providerAction,
423
+ runAction,
424
+ format,
425
+ output: output ? path.resolve(expandTilde(output)) : void 0,
426
+ target
175
427
  };
176
428
  }
177
429
 
@@ -296,12 +548,29 @@ var BENCHMARK_CATALOG = [
296
548
  ];
297
549
  var BENCHMARK_IDS = new Set(BENCHMARK_CATALOG.map((entry) => entry.id));
298
550
  function getBenchUsageText() {
299
- return `Usage: remnic bench <list|run> [options] [benchmark...]
300
- remnic benchmark <list|run|check|report> [options] [benchmark...]
551
+ return `Usage: remnic bench <list|run|datasets|runs|compare|results|baseline|export|publish|ui|providers> [options] [benchmark...]
552
+ remnic benchmark <list|run|datasets|runs|compare|results|baseline|export|publish|ui|providers|check|report> [options] [benchmark...]
301
553
 
302
554
  Commands:
303
555
  list List published benchmark packs
304
556
  run [benchmark...] Run one or more benchmark packs
557
+ datasets download [benchmark...]
558
+ Download local datasets for supported published benchmarks
559
+ datasets status Show local dataset availability for supported benchmarks
560
+ runs list List stored benchmark runs
561
+ runs show <run> Show one stored benchmark run
562
+ runs delete <run...> Delete one or more stored benchmark runs
563
+ compare <base> <cand> Compare two stored benchmark runs by id or file path
564
+ results [run] List stored runs or inspect a stored run
565
+ baseline save <name> [run]
566
+ Save a stored run as a named baseline
567
+ baseline list List saved baselines
568
+ export <run> --format <json|csv|html>
569
+ Export one stored run as JSON, aggregate-metrics CSV, or static HTML
570
+ publish --target remnic-ai
571
+ Generate the Remnic.ai benchmark feed from stored runs
572
+ ui Launch the local benchmark overview UI
573
+ providers discover Auto-detect available local provider backends
305
574
  check Legacy latency regression gate (compatibility)
306
575
  report Legacy latency report generator (compatibility)
307
576
 
@@ -309,12 +578,36 @@ Options:
309
578
  --quick Run a lightweight quick pass (maps to --lightweight --limit 1)
310
579
  --all Run every published benchmark
311
580
  --dataset-dir <path> Override the benchmark dataset directory for full runs
581
+ --custom <path> Run a YAML-defined custom benchmark file
582
+ --results-dir <path> Override the stored benchmark results directory
583
+ --baselines-dir <path> Override the named baseline directory
584
+ --threshold <value> Regression threshold for compare (default: 0.05)
585
+ --detail Include per-task details for bench results
586
+ --format <json|csv|html> Output format for bench export
587
+ --output <path> Write bench export output to a file
588
+ --target <name> Publish target for bench publish (remnic-ai)
312
589
  --json Output JSON for \`list\`
313
590
 
314
591
  Examples:
315
592
  remnic bench list
593
+ remnic bench datasets status
594
+ remnic bench datasets download longmemeval
595
+ remnic bench datasets download --all
596
+ remnic bench runs list
597
+ remnic bench runs show candidate-run --detail
598
+ remnic bench runs delete candidate-run
316
599
  remnic bench run --quick longmemeval
317
600
  remnic bench run longmemeval --dataset-dir ~/datasets/longmemeval
601
+ remnic bench compare base-run candidate-run
602
+ remnic bench results
603
+ remnic bench results candidate-run --detail
604
+ remnic bench baseline save main candidate-run
605
+ remnic bench baseline list
606
+ remnic bench export candidate-run --format csv --output ./candidate.csv
607
+ remnic bench export candidate-run --format html --output ./report.html
608
+ remnic bench publish --target remnic-ai
609
+ remnic bench providers discover
610
+ remnic bench run --custom ./my-bench.yaml
318
611
  remnic benchmark run --quick longmemeval`;
319
612
  }
320
613
  function buildBenchRunnerArgs(parsed, benchmarkId) {
@@ -347,7 +640,7 @@ async function listBenchmarksFromPackage() {
347
640
  }
348
641
  async function loadBenchDefinitionsFromPackage() {
349
642
  try {
350
- const benchModule = await import("./dist-B67STFFX.js");
643
+ const benchModule = await import("./dist-7DCVQLUB.js");
351
644
  if (!benchModule.listBenchmarks) return void 0;
352
645
  const result = benchModule.listBenchmarks();
353
646
  return Array.isArray(result) ? result : void 0;
@@ -395,6 +688,154 @@ async function runBenchViaFallback(parsed, benchmarkId) {
395
688
  function resolveBenchOutputDir() {
396
689
  return path2.join(resolveHomeDir(), ".remnic", "bench", "results");
397
690
  }
691
/** Benchmark ids whose datasets can be fetched by the download script. */
var DOWNLOADABLE_BENCHMARK_DATASETS = [
  "ama-bench",
  "memory-arena",
  "amemgym",
  "longmemeval",
  "locomo"
];
/**
 * Per-benchmark evidence that a dataset directory is populated:
 * `anyOf` lists file names of which at least one must exist as a regular file;
 * `ext` requires at least one directory entry whose name ends with that suffix.
 */
var DOWNLOADED_DATASET_MARKERS = {
  "ama-bench": { anyOf: ["open_end_qa_set.jsonl"] },
  longmemeval: {
    anyOf: ["longmemeval_oracle.json", "longmemeval_s_cleaned.json", "longmemeval.json"]
  },
  amemgym: {
    anyOf: ["amemgym-v1-base.json", "amemgym-tasks.json", "data.json"]
  },
  locomo: { anyOf: ["locomo10.json", "locomo.json"] },
  "memory-arena": { ext: ".jsonl" }
};
/**
 * Report whether `datasetPath` looks like a downloaded dataset for `benchmarkId`.
 *
 * The path must be a directory. Benchmarks with a marker entry must satisfy
 * it; benchmarks without one only need a non-empty directory. Any filesystem
 * error is treated as "not downloaded".
 *
 * @param {string} datasetPath
 * @param {string} benchmarkId
 * @returns {boolean}
 */
function isDatasetDownloaded(datasetPath, benchmarkId) {
  // Evaluate a filesystem probe, mapping any thrown error to `false`.
  const attempt = (probe) => {
    try {
      return probe();
    } catch {
      return false;
    }
  };
  if (!attempt(() => fs.statSync(datasetPath).isDirectory())) {
    return false;
  }
  const marker = DOWNLOADED_DATASET_MARKERS[benchmarkId];
  if (!marker) {
    return attempt(() => fs.readdirSync(datasetPath).length > 0);
  }
  if (marker.anyOf) {
    return marker.anyOf.some(
      (fileName) => attempt(() => fs.statSync(path2.join(datasetPath, fileName)).isFile())
    );
  }
  if (marker.ext) {
    return attempt(
      () => fs.readdirSync(datasetPath).some((entry) => entry.endsWith(marker.ext))
    );
  }
  return false;
}
745
/**
 * Start the bench UI dev server from `<repo>/packages/bench-ui` and wait for
 * it to exit.
 *
 * Runs `pnpm exec vite --host 127.0.0.1` with the results directory exposed
 * to the child via the `REMNIC_BENCH_RESULTS_DIR` environment variable.
 * Prints an error and exits with code 1 when the bench-ui package.json is absent.
 *
 * @param {string} resultsDir
 * @returns {Promise<void>} Resolves on exit code 0 or termination by
 *   SIGINT/SIGTERM; rejects on spawn error or any other exit.
 */
async function launchBenchUi(resultsDir) {
  const isWindows = process.platform === "win32";
  const benchUiDir = path2.join(CLI_REPO_ROOT, "packages", "bench-ui");
  const pnpmCmd = isWindows ? "pnpm.cmd" : "pnpm";
  const manifestPath = path2.join(benchUiDir, "package.json");
  if (!fs.existsSync(manifestPath)) {
    console.error("ERROR: @remnic/bench-ui is not available in this checkout.");
    process.exit(1);
  }
  console.log(`Launching bench UI with results from ${resultsDir}`);
  console.log("Press Ctrl+C to stop the local server.");
  const spawnOptions = {
    cwd: benchUiDir,
    stdio: "inherit",
    shell: isWindows,
    env: { ...process.env, REMNIC_BENCH_RESULTS_DIR: resultsDir }
  };
  const child = childProcess.spawn(pnpmCmd, ["exec", "vite", "--host", "127.0.0.1"], spawnOptions);
  await new Promise((resolve, reject) => {
    child.on("error", reject);
    child.on("close", (code, signal) => {
      // Stopping the server with Ctrl+C / SIGTERM is a normal shutdown.
      const stoppedCleanly = code === 0 || signal === "SIGINT" || signal === "SIGTERM";
      if (!stoppedCleanly) {
        reject(new Error(`bench UI exited with code ${code ?? "unknown"}`));
        return;
      }
      resolve();
    });
  });
}
774
/**
 * Default directory for named benchmark baselines, as reported by
 * `defaultBenchmarkBaselineDir()`.
 */
function resolveBenchBaselineDir() {
  const baselineDir = defaultBenchmarkBaselineDir();
  return baselineDir;
}
777
/**
 * Root directory that holds one dataset sub-directory per benchmark.
 * @returns {string} `<repo>/evals/datasets` when running from a repo checkout
 *   (per `isRepoCheckout()`), otherwise `<home>/.remnic/bench/datasets`.
 */
function resolveRepoDatasetRoot() {
  return isRepoCheckout()
    ? path2.join(CLI_REPO_ROOT, "evals", "datasets")
    : path2.join(resolveHomeDir(), ".remnic", "bench", "datasets");
}
784
/**
 * List the benchmark ids with downloadable datasets.
 * @returns {string[]} A fresh copy, so callers cannot mutate the shared list.
 */
function listDownloadableBenchmarks() {
  return DOWNLOADABLE_BENCHMARK_DATASETS.slice();
}
787
/**
 * Locate `download-datasets.sh`.
 * @returns {string} The copy under `<module dir>/assets` when that file
 *   exists, otherwise the path `<repo>/evals/scripts/download-datasets.sh`
 *   (returned without checking that it exists).
 */
function resolveDatasetDownloadScriptPath() {
  const scriptName = "download-datasets.sh";
  const bundled = path2.join(CLI_MODULE_DIR, "assets", scriptName);
  return fs.existsSync(bundled)
    ? bundled
    : path2.join(CLI_REPO_ROOT, "evals", "scripts", scriptName);
}
794
/**
 * Detect whether the CLI is running from a repository checkout.
 * @returns {boolean} true only when both `pnpm-workspace.yaml` and
 *   `evals/scripts/download-datasets.sh` exist under `CLI_REPO_ROOT`.
 */
function isRepoCheckout() {
  const requiredFiles = [
    ["pnpm-workspace.yaml"],
    ["evals", "scripts", "download-datasets.sh"]
  ];
  return requiredFiles.every((segments) => fs.existsSync(path2.join(CLI_REPO_ROOT, ...segments)));
}
797
/**
 * Run the dataset download script synchronously for one benchmark.
 *
 * The script is invoked with `--benchmark <id>`, with `CLI_REPO_ROOT` as the
 * working directory and `DATASETS_DIR` set to `datasetRoot`. In JSON mode the
 * child's stdout is routed to this process's stderr; otherwise all stdio is
 * inherited. On Windows the script is run through `bash`, which must be
 * available on PATH.
 *
 * @param {string} scriptPath
 * @param {string} benchmarkId
 * @param {string} datasetRoot
 * @param {boolean} jsonMode
 * @throws {Error} On Windows when `bash --version` cannot be run successfully;
 *   also propagates any error thrown by `execFileSync`.
 */
function runDatasetDownloadScript(scriptPath, benchmarkId, datasetRoot, jsonMode) {
  const execOptions = {
    cwd: CLI_REPO_ROOT,
    stdio: jsonMode ? ["inherit", process.stderr, "inherit"] : "inherit",
    env: { ...process.env, DATASETS_DIR: datasetRoot }
  };
  const scriptArgs = ["--benchmark", benchmarkId];
  if (process.platform === "win32") {
    const bashProbe = childProcess.spawnSync("bash", ["--version"], { stdio: "ignore" });
    const bashMissing = bashProbe.error || bashProbe.status !== 0;
    if (bashMissing) {
      throw new Error(
        "bench datasets download requires bash on Windows (Git Bash or WSL). Install bash or run this command from a Unix shell."
      );
    }
    childProcess.execFileSync("bash", [scriptPath, ...scriptArgs], execOptions);
    return;
  }
  childProcess.execFileSync(scriptPath, scriptArgs, execOptions);
}
818
/**
 * Decide which datasets `datasets download` should fetch.
 *
 * `--all` selects every supported dataset. Otherwise the positional benchmark
 * ids are de-duplicated (first occurrence wins) and must all be supported.
 * Prints an error and exits with code 1 when nothing was requested or when an
 * unsupported id is present.
 *
 * @param {{all: boolean, benchmarks: string[]}} parsed
 * @returns {string[]}
 */
function resolveSelectedDatasetDownloads(parsed) {
  const supported = listDownloadableBenchmarks();
  if (parsed.all) {
    return supported;
  }
  if (parsed.benchmarks.length === 0) {
    console.error(
      "ERROR: datasets download requires at least one benchmark id or --all. Usage: remnic bench datasets download <benchmark...> [--all] [--json]"
    );
    process.exit(1);
  }
  const selected = Array.from(new Set(parsed.benchmarks));
  const unsupported = [];
  for (const benchmarkId of selected) {
    if (supported.indexOf(benchmarkId) === -1) {
      unsupported.push(benchmarkId);
    }
  }
  if (unsupported.length > 0) {
    console.error(
      `ERROR: unsupported downloadable benchmark dataset(s): ${unsupported.join(", ")}. Supported datasets: ${supported.join(", ")}.`
    );
    process.exit(1);
  }
  return selected;
}
398
839
  function resolveBenchDatasetDir(benchmarkId, quick, datasetDirOverride) {
399
840
  if (datasetDirOverride) {
400
841
  return datasetDirOverride;
@@ -402,14 +843,13 @@ function resolveBenchDatasetDir(benchmarkId, quick, datasetDirOverride) {
402
843
  if (quick) {
403
844
  return void 0;
404
845
  }
405
- const repoDatasetDir = path2.join(CLI_REPO_ROOT, "evals", "datasets", benchmarkId);
406
- try {
407
- return fs.statSync(repoDatasetDir).isDirectory() ? repoDatasetDir : void 0;
408
- } catch {
409
- return void 0;
846
+ const datasetDir = path2.join(resolveRepoDatasetRoot(), benchmarkId);
847
+ if (isDatasetDownloaded(datasetDir, benchmarkId)) {
848
+ return datasetDir;
410
849
  }
850
+ return void 0;
411
851
  }
412
- function printBenchPackageSummary(result, outputPath) {
852
+ function printBenchPackageSummary(result, outputPath, outputLabel = "Results saved") {
413
853
  console.log(`Benchmark: ${result.meta.benchmark}`);
414
854
  console.log(`Mode: ${result.meta.mode}`);
415
855
  console.log(`Tasks: ${result.results.tasks.length}`);
@@ -417,12 +857,426 @@ function printBenchPackageSummary(result, outputPath) {
417
857
  for (const [metric, aggregate] of Object.entries(result.results.aggregates).sort()) {
418
858
  console.log(` ${metric.padEnd(20)} ${aggregate.mean.toFixed(4)}`);
419
859
  }
420
- console.log(`Results saved: ${outputPath}`);
860
+ console.log(`${outputLabel}: ${outputPath}`);
861
+ }
862
/**
 * Print the summary of a stored run: the package summary labelled
 * "Stored result" with the stored file path, followed by the run id.
 * @param {object} result Loaded benchmark result.
 * @param {{id: string, path: string}} summary Stored-run descriptor.
 */
function printStoredBenchResultSummary(result, summary) {
  const { path: storedPath } = summary;
  printBenchPackageSummary(result, storedPath, "Stored result");
  console.log(`Run id: ${summary.id}`);
}
866
/**
 * Print the stored-run summary followed by one line per task showing its
 * latency and, when present, its scores sorted by metric name.
 * @param {object} result Loaded benchmark result (`result.results.tasks` is read).
 * @param {{id: string, path: string}} summary Stored-run descriptor.
 */
function printStoredBenchResultDetails(result, summary) {
  printStoredBenchResultSummary(result, summary);
  const { tasks } = result.results;
  if (tasks.length === 0) {
    console.log("Tasks: none");
    return;
  }
  console.log("Task breakdown:");
  for (const task of tasks) {
    const ordered = Object.entries(task.scores).sort(([left], [right]) => left.localeCompare(right));
    const parts = [];
    for (const [metric, value] of ordered) {
      parts.push(`${metric}=${value.toFixed(4)}`);
    }
    const scores = parts.join(", ");
    // The bracketed score list is omitted entirely for tasks without scores.
    const suffix = scores.length > 0 ? ` [${scores}]` : "";
    console.log(
      ` ${task.taskId}: ${task.latencyMs.toFixed(1)}ms${suffix}`
    );
  }
}
880
/**
 * Print a human-readable comparison of two stored benchmark runs.
 *
 * Emits the benchmark name, both run descriptors and the verdict, then one
 * line per metric (sorted by name) with baseline, candidate, signed delta,
 * percent change and effect size, plus the CI on the delta when present.
 *
 * @param {{benchmark: string, verdict: string, metricDeltas: Record<string, object>}} comparison
 * @param {{id: string, path: string}} baseline
 * @param {{id: string, path: string}} candidate
 */
function printBenchComparisonSummary(comparison, baseline, candidate) {
  console.log(`Benchmark: ${comparison.benchmark}`);
  console.log(`Baseline: ${baseline.id} (${baseline.path})`);
  console.log(`Candidate: ${candidate.id} (${candidate.path})`);
  console.log(`Verdict: ${comparison.verdict}`);
  const metrics = Object.entries(comparison.metricDeltas).sort(
    ([left], [right]) => left.localeCompare(right)
  );
  if (metrics.length === 0) {
    console.log("No overlapping metrics were found between the two results.");
    return;
  }
  console.log("Metrics:");
  for (const [metric, delta] of metrics) {
    const change = delta.percentChange;
    let percent;
    if (typeof change !== "number" || Number.isNaN(change)) {
      // An undefined ratio (e.g. 0 -> 0) is not a negative-infinite change;
      // without this guard NaN fell through to the "-Infinity%" branch.
      percent = "n/a";
    } else if (Number.isFinite(change)) {
      // `percentChange` is a fraction here; it is scaled to a percentage for display.
      percent = `${(change * 100).toFixed(2)}%`;
    } else {
      percent = change > 0 ? "+Infinity%" : "-Infinity%";
    }
    // Negative deltas already carry their own sign from toFixed().
    const direction = delta.delta >= 0 ? "+" : "";
    console.log(
      ` ${metric.padEnd(18)} ${delta.baseline.toFixed(4)} -> ${delta.candidate.toFixed(4)} (${direction}${delta.delta.toFixed(4)}, ${percent}, d=${delta.effectSize.cohensD.toFixed(3)} ${delta.effectSize.interpretation})`
    );
    if (delta.ciOnDelta) {
      console.log(
        ` CI95 delta: [${delta.ciOnDelta.lower.toFixed(4)}, ${delta.ciOnDelta.upper.toFixed(4)}]`
      );
    }
  }
}
906
/**
 * Implement `bench compare <baseline> <candidate>`.
 *
 * Resolves both references inside the results directory, loads the two runs,
 * requires them to belong to the same benchmark, then prints the comparison
 * as JSON (`--json`) or as a text summary. The regression threshold defaults
 * to 0.05 when `--threshold` was not given.
 *
 * Exits with code 1 on usage errors, unresolved references, benchmark
 * mismatch, or when the verdict is "regression".
 *
 * @param {object} parsed Output of `parseBenchArgs`.
 * @returns {Promise<void>}
 */
async function compareBenchPackageResults(parsed) {
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };
  const refs = parsed.benchmarks;
  if (refs.length !== 2) {
    fail(
      "ERROR: compare requires exactly two stored result references. Usage: remnic bench compare <baseline> <candidate> [--results-dir <path>] [--threshold <value>] [--json]"
    );
  }
  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  const [baselineRef, candidateRef] = refs;
  const baselineSummary = await resolveBenchmarkResultReference(resultsDir, baselineRef);
  const candidateSummary = await resolveBenchmarkResultReference(resultsDir, candidateRef);
  const resolved = [
    [baselineSummary, baselineRef],
    [candidateSummary, candidateRef]
  ];
  for (const [summary, reference] of resolved) {
    if (!summary) {
      fail(`ERROR: benchmark result not found: ${reference}`);
    }
  }
  const baseline = await loadBenchmarkResult(baselineSummary.path);
  const candidate = await loadBenchmarkResult(candidateSummary.path);
  const baselineBenchmark = baseline.meta.benchmark;
  const candidateBenchmark = candidate.meta.benchmark;
  if (baselineBenchmark !== candidateBenchmark) {
    fail(
      `ERROR: benchmark mismatch: ${baselineBenchmark} vs ${candidateBenchmark}. Compare runs from the same benchmark.`
    );
  }
  const threshold = parsed.threshold ?? 0.05;
  const comparison = compareResults(
    baseline,
    candidate,
    threshold,
    getBenchmarkLowerIsBetter(candidate.meta.benchmark)
  );
  if (!parsed.json) {
    printBenchComparisonSummary(comparison, baselineSummary, candidateSummary);
  } else {
    const payload = {
      benchmark: comparison.benchmark,
      baseline: baselineSummary,
      candidate: candidateSummary,
      comparison
    };
    console.log(JSON.stringify(payload, null, 2));
  }
  // A regression verdict fails the command after the report has been printed.
  if (comparison.verdict === "regression") {
    process.exit(1);
  }
}
954
/**
 * Implement `bench results [run]`.
 *
 * Without a reference, lists the stored runs in the results directory (JSON
 * with `--json`). With exactly one reference, prints that run: the raw result
 * as JSON with `--json`, the per-task breakdown with `--detail`, otherwise the
 * summary. Exits with code 1 on more than one reference or an unresolved one.
 *
 * @param {object} parsed Output of `parseBenchArgs`.
 * @returns {Promise<void>}
 */
async function showBenchPackageResults(parsed) {
  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  const references = parsed.benchmarks;
  // Listing mode: no run reference was given.
  const listStoredRuns = async () => {
    const summaries = await listBenchmarkResults(resultsDir);
    if (parsed.json) {
      console.log(JSON.stringify(summaries, null, 2));
      return;
    }
    if (summaries.length === 0) {
      console.log(`No stored benchmark runs found in ${resultsDir}`);
      return;
    }
    console.log("Stored benchmark runs:");
    summaries.forEach((entry) => {
      console.log(
        ` ${entry.id.padEnd(24)} ${entry.benchmark.padEnd(16)} ${entry.mode.padEnd(5)} ${entry.timestamp}`
      );
    });
  };
  if (references.length === 0) {
    await listStoredRuns();
    return;
  }
  if (references.length !== 1) {
    console.error(
      "ERROR: results accepts at most one stored result reference. Usage: remnic bench results [run] [--detail] [--results-dir <path>] [--json]"
    );
    process.exit(1);
  }
  const [reference] = references;
  const summary = await resolveBenchmarkResultReference(resultsDir, reference);
  if (!summary) {
    console.error(`ERROR: benchmark result not found: ${reference}`);
    process.exit(1);
  }
  const result = await loadBenchmarkResult(summary.path);
  if (parsed.json) {
    console.log(JSON.stringify(result, null, 2));
    return;
  }
  const printer = parsed.detail ? printStoredBenchResultDetails : printStoredBenchResultSummary;
  printer(result, summary);
}
997
/**
 * Implement `bench baseline list` and `bench baseline save <name> [run]`.
 *
 * `list` prints the saved baselines (JSON with `--json`). `save` stores a run
 * under `name`: the referenced run when one is given, otherwise the first
 * entry returned by `listBenchmarkResults`. Exits with code 1 on usage
 * errors, when no source run can be found, or when saving fails.
 *
 * @param {object} parsed Output of `parseBenchArgs`.
 * @returns {Promise<void>}
 */
async function manageBenchBaselines(parsed) {
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };
  const baselineDir = parsed.baselinesDir ?? resolveBenchBaselineDir();
  if (parsed.baselineAction === "list") {
    const baselines = await listBenchmarkBaselines(baselineDir);
    if (parsed.json) {
      console.log(JSON.stringify(baselines, null, 2));
      return;
    }
    if (baselines.length === 0) {
      console.log(`No saved baselines found in ${baselineDir}`);
      return;
    }
    console.log("Saved baselines:");
    baselines.forEach((entry) => {
      console.log(
        ` ${entry.name.padEnd(20)} ${entry.benchmark.padEnd(16)} ${entry.mode.padEnd(5)} ${entry.timestamp}`
      );
    });
    return;
  }
  if (parsed.baselineAction !== "save") {
    fail("ERROR: baseline requires a subcommand: save or list.");
  }
  const positionalCount = parsed.benchmarks.length;
  if (positionalCount < 1 || positionalCount > 2) {
    fail(
      "ERROR: baseline save requires a name and optionally one stored result reference. Usage: remnic bench baseline save <name> [run] [--results-dir <path>] [--baselines-dir <path>] [--json]"
    );
  }
  const [name, explicitReference] = parsed.benchmarks;
  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  let sourceSummary;
  if (explicitReference) {
    sourceSummary = await resolveBenchmarkResultReference(resultsDir, explicitReference);
  } else {
    // No run given: fall back to the first stored run in the listing.
    const storedRuns = await listBenchmarkResults(resultsDir);
    sourceSummary = storedRuns[0];
  }
  if (!sourceSummary) {
    fail(
      explicitReference ? `ERROR: benchmark result not found: ${explicitReference}` : `ERROR: no stored benchmark runs found in ${resultsDir}`
    );
  }
  const result = await loadBenchmarkResult(sourceSummary.path);
  let writtenPath;
  try {
    const source = { id: sourceSummary.id, path: sourceSummary.path };
    writtenPath = await saveBenchmarkBaseline(baselineDir, name, result, source);
  } catch (error) {
    fail(error instanceof Error ? error.message : String(error));
  }
  if (parsed.json) {
    // Re-read the baseline so the JSON reflects what was actually written.
    const saved = await loadBenchmarkBaseline(writtenPath);
    const payload = {
      name: saved.name,
      path: writtenPath,
      source: saved.source,
      benchmark: saved.result.meta.benchmark,
      timestamp: saved.savedAt
    };
    console.log(JSON.stringify(payload, null, 2));
    return;
  }
  console.log(`Saved baseline "${name}" to ${writtenPath}`);
  console.log(` Source run: ${sourceSummary.id}`);
  console.log(` Benchmark: ${result.meta.benchmark}`);
}
1064
/**
 * Implement `bench export <run> --format <json|csv|html>`.
 *
 * Renders one stored run with `renderBenchmarkResultExport`. With `--output`
 * the text is written to that file (parent directories are created) and a
 * confirmation line is printed; otherwise the text goes to stdout as-is.
 * Exits with code 1 when the reference count is not exactly one, when
 * `--format` is missing, or when the run cannot be resolved.
 *
 * @param {object} parsed Output of `parseBenchArgs`.
 * @returns {Promise<void>}
 */
async function exportBenchPackageResult(parsed) {
  const { benchmarks, format, output } = parsed;
  if (benchmarks.length !== 1) {
    console.error(
      "ERROR: export requires exactly one stored result reference. Usage: remnic bench export <run> --format <json|csv|html> [--output <path>] [--results-dir <path>]"
    );
    process.exit(1);
  }
  if (!format) {
    console.error("ERROR: export requires --format json, csv, or html.");
    process.exit(1);
  }
  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  const [reference] = benchmarks;
  const summary = await resolveBenchmarkResultReference(resultsDir, reference);
  if (!summary) {
    console.error(`ERROR: benchmark result not found: ${reference}`);
    process.exit(1);
  }
  const result = await loadBenchmarkResult(summary.path);
  const rendered = renderBenchmarkResultExport(result, format);
  if (!output) {
    process.stdout.write(rendered);
    return;
  }
  const targetDir = path2.dirname(output);
  fs.mkdirSync(targetDir, { recursive: true });
  fs.writeFileSync(output, rendered);
  console.log(`Exported ${summary.id} as ${format} to ${output}`);
}
1092
/**
 * Handle `remnic bench datasets <status|download>`.
 *
 * `status` reports, for every downloadable benchmark, whether its dataset
 * directory is present under the repo dataset root. `download` runs the
 * dataset download script once per selected benchmark.
 *
 * @param {object} parsed - Parsed bench arguments. Reads `datasetAction`,
 *   `benchmarks`, `all` and `json`.
 * @returns {Promise<void>}
 */
async function manageBenchDatasets(parsed) {
  const datasetRoot = resolveRepoDatasetRoot();
  const supported = listDownloadableBenchmarks();
  const action = parsed.datasetAction;

  if (action === "status") {
    const hasSelection = parsed.benchmarks.length > 0 || parsed.all;
    if (hasSelection) {
      console.error(
        "ERROR: datasets status does not accept benchmark names or --all. Usage: remnic bench datasets status [--json]"
      );
      process.exit(1);
    }

    const status = [];
    for (const benchmarkId of supported) {
      const datasetPath = path2.join(datasetRoot, benchmarkId);
      status.push({
        benchmark: benchmarkId,
        downloaded: isDatasetDownloaded(datasetPath, benchmarkId),
        path: datasetPath
      });
    }

    if (parsed.json) {
      console.log(JSON.stringify(status, null, 2));
      return;
    }

    console.log("Downloadable benchmark datasets:");
    status.forEach((entry) => {
      const state = entry.downloaded ? "downloaded" : "missing";
      console.log(` ${entry.benchmark.padEnd(16)} ${state} ${entry.path}`);
    });
    console.log("");
    console.log(
      "Only the script-backed published datasets are managed here. Other benchmark fixtures remain repo-managed or manual."
    );
    return;
  }

  if (action !== "download") {
    console.error("ERROR: datasets requires a subcommand: download or status.");
    process.exit(1);
  }

  const scriptPath = resolveDatasetDownloadScriptPath();
  if (!fs.existsSync(scriptPath)) {
    console.error(`ERROR: dataset download script not found: ${scriptPath}`);
    process.exit(1);
  }

  const downloaded = [];
  for (const benchmarkId of resolveSelectedDatasetDownloads(parsed)) {
    // The last argument is the strict `json === true` flag, passed through unchanged.
    runDatasetDownloadScript(scriptPath, benchmarkId, datasetRoot, parsed.json === true);
    downloaded.push({
      benchmark: benchmarkId,
      path: path2.join(datasetRoot, benchmarkId)
    });
  }

  if (parsed.json) {
    console.log(JSON.stringify(downloaded, null, 2));
    return;
  }

  console.log("Downloaded benchmark datasets:");
  downloaded.forEach((entry) => {
    console.log(` ${entry.benchmark} ${entry.path}`);
  });
}
1153
/**
 * Handle `remnic bench runs <list|show|delete>`.
 *
 * `list` and `show` delegate to `showBenchPackageResults`; `delete` removes
 * the referenced stored runs and exits with status 1 when any reference could
 * not be found.
 *
 * @param {object} parsed - Parsed bench arguments. Reads `runAction`,
 *   `benchmarks`, `all`, `resultsDir` and `json`.
 * @returns {Promise<void>}
 */
async function manageBenchRuns(parsed) {
  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  const referenceCount = parsed.benchmarks.length;

  switch (parsed.runAction) {
    case "list": {
      if (referenceCount > 0 || parsed.all) {
        console.error(
          "ERROR: runs list does not accept benchmark names or --all. Usage: remnic bench runs list [--results-dir <path>] [--json]"
        );
        process.exit(1);
      }
      // Listing is the "results" action with no explicit references.
      await showBenchPackageResults({ ...parsed, action: "results", benchmarks: [] });
      return;
    }

    case "show": {
      if (referenceCount !== 1 || parsed.all) {
        console.error(
          "ERROR: runs show requires exactly one stored result reference. Usage: remnic bench runs show <run> [--detail] [--results-dir <path>] [--json]"
        );
        process.exit(1);
      }
      await showBenchPackageResults(parsed);
      return;
    }

    case "delete": {
      if (referenceCount === 0 || parsed.all) {
        console.error(
          "ERROR: runs delete requires at least one stored result reference. Usage: remnic bench runs delete <run...> [--results-dir <path>] [--json]"
        );
        process.exit(1);
      }

      const outcome = await deleteBenchmarkResults(resultsDir, parsed.benchmarks);
      const anyMissing = outcome.missing.length > 0;

      if (parsed.json) {
        console.log(JSON.stringify(outcome, null, 2));
      } else {
        if (outcome.deleted.length > 0) {
          console.log("Deleted benchmark runs:");
          outcome.deleted.forEach((summary) => {
            console.log(` ${summary.id} ${summary.path}`);
          });
        } else {
          console.log("No benchmark runs were deleted.");
        }
        if (anyMissing) {
          console.log("Missing benchmark runs:");
          outcome.missing.forEach((reference) => {
            console.log(` ${reference}`);
          });
        }
      }

      // Unresolved references make the command fail, in both output modes.
      if (anyMissing) {
        process.exit(1);
      }
      return;
    }

    default: {
      console.error("ERROR: runs requires a subcommand: list, show, or delete.");
      process.exit(1);
    }
  }
}
1209
/**
 * Handle `remnic bench providers discover`: print the providers returned by
 * `discoverAllProviders()` together with their models, as JSON or as an
 * indented text listing.
 *
 * @param {object} parsed - Parsed bench arguments. Reads `benchmarks` and `json`.
 * @returns {Promise<void>}
 */
async function discoverBenchProviders(parsed) {
  if (parsed.benchmarks.length > 0) {
    console.error(
      "ERROR: providers discover does not accept positional arguments. Usage: remnic bench providers discover [--json]"
    );
    process.exit(1);
  }

  const discovered = await discoverAllProviders();

  if (parsed.json) {
    console.log(JSON.stringify(discovered, null, 2));
    return;
  }
  if (discovered.length === 0) {
    console.log("No local bench providers were discovered.");
    return;
  }

  console.log("Discovered bench providers:");
  for (const entry of discovered) {
    console.log(` ${entry.provider}`);
    for (const model of entry.models) {
      const capabilities = model.capabilities.join(", ");

      // Collect only the attributes that are actually populated for this model.
      const details = [];
      if (model.contextLength > 0) {
        details.push(`context=${model.contextLength}`);
      }
      if (model.parameterCount) {
        details.push(`params=${model.parameterCount}`);
      }
      if (model.quantization) {
        details.push(`quant=${model.quantization}`);
      }
      if (capabilities.length > 0) {
        details.push(`caps=${capabilities}`);
      }

      const suffix = details.length > 0 ? ` (${details.join(", ")})` : "";
      console.log(` - ${model.id}${suffix}`);
    }
  }
}
1242
/**
 * Handle `remnic bench publish --target remnic-ai`: build a publish feed from
 * the stored results directory and write it to the requested (or default)
 * output path.
 *
 * @param {object} parsed - Parsed bench arguments. Reads `benchmarks`,
 *   `target`, `resultsDir`, `output` and `json`.
 * @returns {Promise<void>}
 */
async function publishBenchPackageResults(parsed) {
  const exitWith = (message) => {
    console.error(message);
    process.exit(1);
  };

  if (parsed.benchmarks.length > 0) {
    exitWith(
      "ERROR: publish does not accept positional result references. Usage: remnic bench publish --target remnic-ai [--results-dir <path>] [--output <path>] [--json]"
    );
  }
  if (parsed.target !== "remnic-ai") {
    exitWith("ERROR: publish requires --target remnic-ai.");
  }

  const resultsDir = parsed.resultsDir ?? resolveBenchOutputDir();
  const feed = await buildBenchmarkPublishFeed(resultsDir, parsed.target);
  if (feed.benchmarks.length === 0) {
    exitWith(
      `ERROR: no publishable benchmark results found in ${resultsDir}. remnic-ai requires stored full runs for published benchmarks.`
    );
  }

  const destination = parsed.output ?? defaultBenchmarkPublishPath(parsed.target);
  const writtenPath = await writeBenchmarkPublishFeed(feed, destination);

  if (!parsed.json) {
    console.log(
      `Published ${feed.benchmarks.length} benchmark entries for ${parsed.target} to ${writtenPath}`
    );
    return;
  }

  const report = {
    target: parsed.target,
    outputPath: writtenPath,
    benchmarkCount: feed.benchmarks.length,
    feed
  };
  console.log(JSON.stringify(report, null, 2));
}
422
1276
  async function runBenchViaPackage(parsed, benchmarkId) {
423
1277
  let benchModule;
424
1278
  try {
425
- benchModule = await import("./dist-B67STFFX.js");
1279
+ benchModule = await import("./dist-7DCVQLUB.js");
426
1280
  } catch {
427
1281
  return false;
428
1282
  }
@@ -430,6 +1284,11 @@ async function runBenchViaPackage(parsed, benchmarkId) {
430
1284
  if (!definition?.runnerAvailable || !benchModule.runBenchmark || !benchModule.writeBenchmarkResult) {
431
1285
  return false;
432
1286
  }
1287
+ if (definition.meta?.category === "ingestion") {
1288
+ throw new Error(
1289
+ `Benchmark "${benchmarkId}" requires an ingestion adapter which is not yet available via the CLI. Run ingestion benchmarks programmatically by passing an ingestionAdapter to runBenchmark().`
1290
+ );
1291
+ }
433
1292
  const createAdapter = parsed.quick ? benchModule.createLightweightAdapter : benchModule.createRemnicAdapter;
434
1293
  if (!createAdapter) {
435
1294
  return false;
@@ -442,7 +1301,7 @@ async function runBenchViaPackage(parsed, benchmarkId) {
442
1301
  );
443
1302
  if (!parsed.quick && !datasetDir) {
444
1303
  throw new Error(
445
- `full benchmark runs for "${benchmarkId}" require dataset files. Pass --dataset-dir <path> or run from a Remnic repo checkout with evals/datasets/${benchmarkId}.`
1304
+ `full benchmark runs for "${benchmarkId}" require dataset files. Run "remnic bench datasets download ${benchmarkId}" or pass --dataset-dir <path>.`
446
1305
  );
447
1306
  }
448
1307
  const system = await createAdapter();
@@ -466,6 +1325,41 @@ async function runBenchViaPackage(parsed, benchmarkId) {
466
1325
  await system.destroy();
467
1326
  }
468
1327
  }
1328
/**
 * Run a user-supplied custom benchmark file through the bundled bench runtime.
 *
 * @param {object} parsed - Parsed bench arguments. Reads `custom`, `quick`
 *   and `json`.
 * @returns {Promise<boolean>} `true` when the runtime handled the run, `false`
 *   when the runtime chunk could not be imported or lacks a required export
 *   (the caller reports that case).
 */
async function runCustomBenchViaPackage(parsed) {
  // A failed import is reported to the caller as "not handled", not as an error.
  const benchModule = await import("./dist-7DCVQLUB.js").catch(() => null);
  if (!benchModule) {
    return false;
  }

  const runtimeReady = benchModule.runCustomBenchmarkFile && benchModule.writeBenchmarkResult;
  if (!runtimeReady) {
    return false;
  }

  const quick = parsed.quick;
  const createAdapter = quick ? benchModule.createLightweightAdapter : benchModule.createRemnicAdapter;
  if (!createAdapter) {
    return false;
  }

  const outputDir = resolveBenchOutputDir();
  const system = await createAdapter();
  try {
    const runOptions = {
      mode: quick ? "quick" : "full",
      outputDir,
      limit: quick ? 1 : void 0,
      adapterMode: quick ? "lightweight" : "direct",
      system
    };
    const result = await benchModule.runCustomBenchmarkFile(parsed.custom, runOptions);
    const writtenPath = await benchModule.writeBenchmarkResult(result, outputDir);

    if (parsed.json) {
      console.log(JSON.stringify(result, null, 2));
    } else {
      printBenchPackageSummary(result, writtenPath);
    }
    return true;
  } finally {
    // The adapter is torn down whether the run succeeded or threw.
    await system.destroy();
  }
}
469
1363
  function resolveConfigPath(cliPath) {
470
1364
  if (cliPath) return path2.resolve(cliPath);
471
1365
  const envPath = readCompatEnv("REMNIC_CONFIG_PATH", "ENGRAM_CONFIG_PATH");
@@ -628,6 +1522,7 @@ async function cmdQuery(queryText, json, explain) {
628
1522
  const config = parseConfig(remnicCfg);
629
1523
  const orchestrator = new Orchestrator(config);
630
1524
  await orchestrator.initialize();
1525
+ await orchestrator.deferredReady;
631
1526
  const service = new EngramAccessService(orchestrator);
632
1527
  if (explain) {
633
1528
  const result2 = await runExplain(service, queryText);
@@ -808,6 +1703,7 @@ async function cmdEnrich(rest) {
808
1703
  ];
809
1704
  const orchestrator2 = new Orchestrator(config);
810
1705
  await orchestrator2.initialize();
1706
+ await orchestrator2.deferredReady;
811
1707
  const searchBackend2 = orchestrator2.qmd;
812
1708
  const searchFn2 = searchBackend2.isAvailable() ? async (query) => {
813
1709
  const results2 = await searchBackend2.search(query, void 0, 10);
@@ -843,6 +1739,7 @@ Registered providers:`);
843
1739
  }
844
1740
  const orchestrator = new Orchestrator(config);
845
1741
  await orchestrator.initialize();
1742
+ await orchestrator.deferredReady;
846
1743
  const storage = await orchestrator.getStorage(config.defaultNamespace);
847
1744
  const entityFiles = await storage.readAllEntityFiles();
848
1745
  let targets = entityFiles;
@@ -1925,6 +2822,42 @@ async function cmdBench(rest) {
1925
2822
  await cmdLegacyBenchmark(parsed.action, benchAction.args, parsed.json);
1926
2823
  return;
1927
2824
  }
2825
+ if (parsed.action === "compare") {
2826
+ await compareBenchPackageResults(parsed);
2827
+ return;
2828
+ }
2829
+ if (parsed.action === "results") {
2830
+ await showBenchPackageResults(parsed);
2831
+ return;
2832
+ }
2833
+ if (parsed.action === "baseline") {
2834
+ await manageBenchBaselines(parsed);
2835
+ return;
2836
+ }
2837
+ if (parsed.action === "export") {
2838
+ await exportBenchPackageResult(parsed);
2839
+ return;
2840
+ }
2841
+ if (parsed.action === "datasets") {
2842
+ await manageBenchDatasets(parsed);
2843
+ return;
2844
+ }
2845
+ if (parsed.action === "runs") {
2846
+ await manageBenchRuns(parsed);
2847
+ return;
2848
+ }
2849
+ if (parsed.action === "publish") {
2850
+ await publishBenchPackageResults(parsed);
2851
+ return;
2852
+ }
2853
+ if (parsed.action === "ui") {
2854
+ await launchBenchUi(parsed.resultsDir ?? resolveBenchOutputDir());
2855
+ return;
2856
+ }
2857
+ if (parsed.action === "providers") {
2858
+ await discoverBenchProviders(parsed);
2859
+ return;
2860
+ }
1928
2861
  if (parsed.action === "list") {
1929
2862
  const catalog = await listBenchmarksFromPackage() ?? BENCHMARK_CATALOG;
1930
2863
  if (parsed.json) {
@@ -1937,6 +2870,20 @@ async function cmdBench(rest) {
1937
2870
  }
1938
2871
  return;
1939
2872
  }
2873
+ if (parsed.custom) {
2874
+ if (parsed.all || parsed.benchmarks.length > 0) {
2875
+ console.error("ERROR: --custom cannot be combined with benchmark names or --all.");
2876
+ process.exit(1);
2877
+ }
2878
+ const handledByPackage = await runCustomBenchViaPackage(parsed);
2879
+ if (!handledByPackage) {
2880
+ console.error(
2881
+ "Benchmark runner not found. Expected a phase-1 @remnic/bench runtime export for custom benchmarks."
2882
+ );
2883
+ process.exit(1);
2884
+ }
2885
+ return;
2886
+ }
1940
2887
  const selectedBenchmarks = parsed.all ? await resolveAllBenchmarks() : parsed.benchmarks;
1941
2888
  if (selectedBenchmarks.length === 0) {
1942
2889
  console.error(
@@ -2789,6 +3736,180 @@ Usage:
2789
3736
  break;
2790
3737
  }
2791
3738
  }
3739
/**
 * Read the value of a flag that must carry one whenever it is present.
 *
 * @param {string[]} args - Argument list to inspect.
 * @param {string} flag - Flag name, e.g. `--format`.
 * @returns {string|undefined} The flag's value, or `undefined` when the flag
 *   is absent from `args`.
 * @throws {Error} When the flag is present but `resolveFlagStrict` yields no value.
 */
function resolveRequiredValueFlag(args, flag) {
  if (hasFlag(args, flag)) {
    const resolved = resolveFlagStrict(args, flag);
    if (resolved !== void 0) {
      return resolved;
    }
    throw new Error(
      `${flag} requires a value. Provide it as \`${flag} <value>\`, not as a bare flag.`
    );
  }
  return void 0;
}
3749
/**
 * Parse the argument tail of `remnic training:export` into an options object.
 *
 * @param {string[]} rest - Arguments following the command name.
 * @param {string} defaultMemoryDir - Memory directory used when `--memory-dir`
 *   is not supplied.
 * @returns {{
 *   format: string,
 *   output: string,
 *   memoryDir: string,
 *   since: (string|undefined),
 *   until: (string|undefined),
 *   minConfidence: (number|undefined),
 *   categories: (string[]|undefined),
 *   includeEntities: boolean,
 *   synthesize: boolean,
 *   maxPairsPerRecord: (number|undefined),
 *   privacySweep: boolean,
 *   dryRun: boolean
 * }} Parsed options. `output` is the empty string when no output flag was
 *   given (only allowed together with `--dry-run`). `since`/`until` are
 *   returned as raw strings; they are not parsed here.
 * @throws {Error} When `--format` is missing, when no output path is given
 *   outside a dry run, when `--min-confidence` is blank or not a number in
 *   [0, 1], when `--max-pairs-per-record` is not a positive integer, or when
 *   a value flag is present without a value.
 */
function parseTrainingExportArgs(rest, defaultMemoryDir) {
  const format = resolveRequiredValueFlag(rest, "--format");
  if (!format) {
    throw new Error(
      "--format <name> is required. Run `remnic training:export --help` for the list of registered adapters."
    );
  }

  const dryRun = hasFlag(rest, "--dry-run");

  // `--output` wins; `--out` is only consulted when `--output` is absent.
  const outputRaw = resolveRequiredValueFlag(rest, "--output") ?? resolveRequiredValueFlag(rest, "--out");
  if (!outputRaw && !dryRun) {
    throw new Error(
      "--output <path> (or --out <path>) is required for training:export. Use --dry-run to print statistics without writing a file."
    );
  }
  const output = outputRaw ? expandTilde(outputRaw) : "";

  const memoryDirFlag = resolveRequiredValueFlag(rest, "--memory-dir");
  const memoryDir = expandTilde(memoryDirFlag ?? defaultMemoryDir);

  const since = resolveRequiredValueFlag(rest, "--since");
  const until = resolveRequiredValueFlag(rest, "--until");

  const minConfidenceRaw = resolveRequiredValueFlag(rest, "--min-confidence");
  let minConfidence;
  if (minConfidenceRaw !== void 0) {
    // Number("") and Number("   ") are 0, which would slip through the range
    // check below, so a blank value is treated as invalid explicitly.
    const isBlank = String(minConfidenceRaw).trim() === "";
    const n = isBlank ? Number.NaN : Number(minConfidenceRaw);
    if (!Number.isFinite(n) || n < 0 || n > 1) {
      throw new Error(
        `Invalid --min-confidence value "${minConfidenceRaw}": expected a number in [0, 1].`
      );
    }
    minConfidence = n;
  }

  const categoriesRaw = resolveRequiredValueFlag(rest, "--categories");
  // Empty segments (e.g. from "a,,b" or a trailing comma) are dropped.
  const categories = categoriesRaw
    ? categoriesRaw.split(",").map((c) => c.trim()).filter((c) => c.length > 0)
    : void 0;

  const maxPairsRaw = resolveRequiredValueFlag(rest, "--max-pairs-per-record");
  let maxPairsPerRecord;
  if (maxPairsRaw !== void 0) {
    const n = Number(maxPairsRaw);
    if (!Number.isInteger(n) || n < 1) {
      throw new Error(
        `Invalid --max-pairs-per-record value "${maxPairsRaw}": expected a positive integer.`
      );
    }
    maxPairsPerRecord = n;
  }

  const includeEntities = hasFlag(rest, "--include-entities");
  const synthesize = hasFlag(rest, "--synthesize");
  // The privacy sweep is on unless explicitly disabled.
  const privacySweep = !hasFlag(rest, "--no-privacy-sweep");

  return {
    format,
    output,
    memoryDir,
    since,
    until,
    minConfidence,
    categories,
    includeEntities,
    synthesize,
    maxPairsPerRecord,
    privacySweep,
    dryRun
  };
}
3810
/**
 * Execute a training-data export.
 *
 * Loads records via `convertMemoriesToRecords`, optionally expands them with
 * `synthesizeTrainingPairs`, optionally runs them through `sweepPii`, and then
 * either prints dry-run statistics or writes the adapter-formatted dataset to
 * `args.output`.
 *
 * @param {object} args - Export options (the shape produced by
 *   `parseTrainingExportArgs`). Reads `format`, `memoryDir`, `since`, `until`,
 *   `minConfidence`, `categories`, `includeEntities`, `synthesize`,
 *   `maxPairsPerRecord`, `privacySweep`, `dryRun` and `output`.
 * @param {{ write(chunk: string): unknown }} [stdout=process.stdout] - Sink
 *   for the human-readable progress and statistics text.
 * @returns {Promise<{recordsRead: number, recordsWritten: number, redactedCount: number, outputPath: (string|null)}>}
 *   `recordsWritten` is 0 and `outputPath` is `null` for a dry run.
 * @throws {Error} When the format has no registered adapter, when
 *   `args.memoryDir` does not exist or is not a directory, or when `output` is
 *   empty outside a dry run. Errors from date parsing, record conversion and
 *   the filesystem propagate unchanged.
 */
async function runTrainingExport(args, stdout = process.stdout) {
  ensureWecloneExportAdapterRegistered();

  const adapter = getTrainingExportAdapter2(args.format);
  if (!adapter) {
    const registered = listTrainingExportAdapters();
    const validList = registered.length > 0
      ? `Valid formats: [${registered.join(", ")}]`
      : "No adapters are currently registered.";
    throw new Error(
      `Unknown training-export format "${args.format}". ${validList}`
    );
  }

  if (!fs.existsSync(args.memoryDir)) {
    throw new Error(
      `--memory-dir "${args.memoryDir}" does not exist. Provide the path to an existing memory directory.`
    );
  }
  if (!fs.statSync(args.memoryDir).isDirectory()) {
    throw new Error(
      `--memory-dir "${args.memoryDir}" is not a directory. Provide the path to a memory directory, not a file.`
    );
  }

  let since;
  if (args.since) since = parseStrictCliDate(args.since, "--since");
  let until;
  if (args.until) until = parseStrictCliDate(args.until, "--until");

  const convertOptions = {
    memoryDir: args.memoryDir,
    since,
    until,
    minConfidence: args.minConfidence,
    categories: args.categories,
    includeEntities: args.includeEntities
  };
  let records = await convertMemoriesToRecords(convertOptions);
  // Captured before synthesis/sweeping so it reflects what was read, not what is emitted.
  const recordsRead = records.length;

  if (args.synthesize) {
    records = synthesizeTrainingPairs(records, {
      maxPairsPerRecord: args.maxPairsPerRecord
    });
  }

  let redactedCount = 0;
  if (args.privacySweep) {
    const swept = sweepPii(records);
    records = swept.cleanRecords;
    redactedCount = swept.redactedCount;
  }

  if (args.dryRun) {
    stdout.write("Training export dry run\n");
    stdout.write(`Format: ${adapter.name}\n`);
    stdout.write(`Records read: ${recordsRead}\n`);
    stdout.write(`Records to write: ${records.length}\n`);
    if (args.privacySweep) {
      stdout.write(`Redacted records: ${redactedCount}\n`);
    }

    // Per-category histogram, printed in alphabetical order.
    const cats = /* @__PURE__ */ new Map();
    for (const r of records) {
      const c = r.category ?? "unknown";
      cats.set(c, (cats.get(c) ?? 0) + 1);
    }
    const sortedCats = [...cats.entries()].sort(
      (a, b) => a[0].localeCompare(b[0])
    );
    for (const [cat, count] of sortedCats) {
      stdout.write(` ${cat}: ${count}\n`);
    }

    return {
      recordsRead,
      recordsWritten: 0,
      redactedCount,
      outputPath: null
    };
  }

  if (!args.output) {
    throw new Error(
      "runTrainingExport: `output` is required when dryRun is false. Pass dryRun: true to skip file I/O."
    );
  }

  const formatted = adapter.formatRecords(records);
  const outDir = path2.dirname(args.output);
  fs.mkdirSync(outDir, { recursive: true });

  // Write to a sibling temp file and rename it into place, so a failed write
  // never leaves a truncated file at the destination path.
  const tmpPath = `${args.output}.tmp-${process.pid}-${Date.now()}`;
  try {
    fs.writeFileSync(tmpPath, formatted, "utf-8");
    fs.renameSync(tmpPath, args.output);
  } catch (error) {
    // Do not leave the temp file behind when the write or rename fails.
    // Cleanup is best-effort: the original failure is what the caller needs.
    try {
      fs.rmSync(tmpPath, { force: true });
    } catch {
      // Ignored on purpose; the original error is rethrown below.
    }
    throw error;
  }

  stdout.write(
    `Exported ${records.length} records to ${args.output} (${adapter.name} format)\n`
  );
  if (args.privacySweep && redactedCount > 0) {
    stdout.write(`Privacy sweep redacted PII in ${redactedCount} record(s).\n`);
  }

  return {
    recordsRead,
    recordsWritten: records.length,
    redactedCount,
    outputPath: args.output
  };
}
2792
3913
  async function main(argv = process.argv.slice(2)) {
2793
3914
  const [command, ...rest] = argv;
2794
3915
  if (command !== "migrate") {
@@ -3047,6 +4168,51 @@ Options:
3047
4168
  await cmdExtensions(action, rest.slice(1));
3048
4169
  break;
3049
4170
  }
4171
+ case "training:export": {
4172
+ if (rest.includes("--help") || rest.includes("-h")) {
4173
+ console.log(`
4174
+ remnic training:export \u2014 Export Remnic memories as fine-tuning datasets (issue #459)
4175
+
4176
+ Usage:
4177
+ remnic training:export --format <name> --output <path> [options]
4178
+
4179
+ Required:
4180
+ --format <name> Registered adapter name (e.g. weclone)
4181
+ --output <path> | --out Path to write the dataset file
4182
+
4183
+ Filters:
4184
+ --memory-dir <path> Memory directory (defaults to resolved memoryDir)
4185
+ --since <YYYY-MM-DD[T...]> Only include memories created at or after this date
4186
+ --until <YYYY-MM-DD[T...]> Only include memories created before this date (exclusive)
4187
+ --min-confidence <0..1> Inclusive lower bound on memory confidence
4188
+ --categories <list> Comma-separated category filter (fact,preference,...)
4189
+ --include-entities Also read from entities/ (off by default)
4190
+
4191
+ Adapter options:
4192
+ --synthesize Generate conversational Q/A pairs (WeClone-optimised)
4193
+ --max-pairs-per-record <n> When --synthesize, max pairs emitted per memory
4194
+ --no-privacy-sweep Skip the final PII redaction pass (default: on)
4195
+
4196
+ Other:
4197
+ --dry-run Print statistics only; do not write the file
4198
+ `);
4199
+ break;
4200
+ }
4201
+ let parsed;
4202
+ try {
4203
+ parsed = parseTrainingExportArgs(rest, resolveMemoryDir());
4204
+ } catch (err) {
4205
+ console.error(err instanceof Error ? err.message : String(err));
4206
+ process.exit(1);
4207
+ }
4208
+ try {
4209
+ await runTrainingExport(parsed);
4210
+ } catch (err) {
4211
+ console.error(err instanceof Error ? err.message : String(err));
4212
+ process.exit(1);
4213
+ }
4214
+ break;
4215
+ }
3050
4216
  case "openclaw": {
3051
4217
  const subAction = rest[0] ?? "help";
3052
4218
  if (subAction === "install") {
@@ -3103,9 +4269,9 @@ Usage:
3103
4269
  remnic extensions <list|show|validate|reload> Manage memory extensions
3104
4270
  remnic space <list|switch|create|delete|push|pull|share|promote|audit> Manage spaces
3105
4271
  create accepts --parent <id> to set parent-child relationship
3106
- remnic bench <list|run> [benchmark...] [--quick] [--all] [--dataset-dir <path>] [--json]
4272
+ remnic bench <list|run|datasets|runs|compare|results|baseline|export|publish|ui|providers> [benchmark...] [--quick] [--all] [--dataset-dir <path>] [--results-dir <path>] [--baselines-dir <path>] [--threshold <value>] [--detail] [--format <json|csv|html>] [--output <path>] [--target remnic-ai] [--json]
3107
4273
  benchmark is kept as a compatibility alias. check/report remain under that alias.
3108
- remnic benchmark <list|run|check|report> [queries...] [--explain] [--baseline=<path>] [--report=<path>]
4274
+ remnic benchmark <list|run|datasets|runs|compare|results|baseline|export|publish|ui|providers|check|report> [queries...] [--explain] [--baseline=<path>] [--report=<path>]
3109
4275
  remnic briefing [--since <window>] [--focus <filter>] [--save] [--format markdown|json]
3110
4276
  Daily context briefing. Windows: yesterday, today, NNh, NNd, NNw.
3111
4277
  Focus: person:<name>, project:<name>, topic:<name>.
@@ -3126,6 +4292,9 @@ Usage:
3126
4292
  remnic enrich --dry-run Preview what would be enriched
3127
4293
  remnic enrich audit Show recent enrichment audit log
3128
4294
  remnic enrich providers List registered providers and their status
4295
+ remnic training:export --format <name> --output <path> [options]
4296
+ Export memories as a fine-tuning dataset (issue #459). Run
4297
+ 'remnic training:export --help' for the full option list.
3129
4298
 
3130
4299
  Options:
3131
4300
  --json Output in JSON format
@@ -3152,7 +4321,9 @@ export {
3152
4321
  main,
3153
4322
  parseBenchArgs,
3154
4323
  parseConnectorConfig,
4324
+ parseTrainingExportArgs,
3155
4325
  resolveFlag,
4326
+ runTrainingExport,
3156
4327
  stripConfigArgv,
3157
4328
  stripResolveFlags
3158
4329
  };