npm - monomind - Versions diffs - 1.7.0 → 1.9.0 - Mend

monomind 1.7.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (562) hide show

package/packages/@monomind/cli/dist/src/autopilot-state.js CHANGED Viewed

@@ -7,6 +7,10 @@
  * ADR-072: Autopilot Integration
  * Security: Addresses prototype pollution, NaN bypass, input validation
  */
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { randomUUID } from 'crypto';
 // ── Constants ─────────────────────────────────────────────────
 export const STATE_DIR = '.monomind/data';
 export const STATE_FILE = `${STATE_DIR}/autopilot-state.json`;
@@ -71,9 +75,8 @@ export function validateTaskSources(sources) {
 }
 // ── State Management ──────────────────────────────────────────
 export function getDefaultState() {
-    const crypto = require('crypto');
     return {
-        sessionId: crypto.randomUUID(),
+        sessionId: randomUUID(),
         enabled: false,
         startTime: Date.now(),
         iterations: 0,
@@ -85,8 +88,6 @@ export function getDefaultState() {
     };
 }
 export function loadState() {
-    const fs = require('fs');
-    const path = require('path');
     const filePath = path.resolve(STATE_FILE);
     const defaults = getDefaultState();
     try {
@@ -111,8 +112,6 @@ export function loadState() {
     return defaults;
 }
 export function saveState(state) {
-    const fs = require('fs');
-    const path = require('path');
     const dir = path.resolve(STATE_DIR);
     if (!fs.existsSync(dir))
         fs.mkdirSync(dir, { recursive: true });
@@ -120,43 +119,98 @@ export function saveState(state) {
     if (state.history.length > MAX_HISTORY_ENTRIES) {
         state.history = state.history.slice(-MAX_HISTORY_ENTRIES);
     }
-    const tmpFile = path.resolve(STATE_FILE) + '.tmp';
+    // Unique tmp filename — concurrent autopilot_enable/disable/reset calls
+    // must not collide on the same .tmp path.
+    const tmpFile = `${path.resolve(STATE_FILE)}.${process.pid}.${Date.now()}.tmp`;
     fs.writeFileSync(tmpFile, JSON.stringify(state, null, 2));
     fs.renameSync(tmpFile, path.resolve(STATE_FILE));
 }
 export function appendLog(entry) {
-    const fs = require('fs');
-    const path = require('path');
     const filePath = path.resolve(LOG_FILE);
     const dir = path.resolve(STATE_DIR);
     if (!fs.existsSync(dir))
         fs.mkdirSync(dir, { recursive: true });
-    let log = [];
+    // Append-only NDJSON: atomic at the OS level for individual lines, no
+    // read-modify-write race between concurrent MCP tool calls. Compaction is
+    // handled lazily by `compactLog()` which the daemon can call periodically.
+    // Previously this function did read → push → tmp-write → rename, which
+    // under concurrent autopilot_enable/disable/reset calls silently lost
+    // entries (last writer wins) and could truncate the log to a single entry
+    // if a peer crashed mid-write and the next caller's safeJsonParse threw.
     try {
-        if (fs.existsSync(filePath)) {
-            log = safeJsonParse(fs.readFileSync(filePath, 'utf-8'));
-            if (!Array.isArray(log))
-                log = [];
+        fs.appendFileSync(filePath, JSON.stringify(entry) + '\n', { flag: 'a' });
+    }
+    catch {
+        // Best-effort logging; do not throw from a non-critical observability path.
+    }
+    // Opportunistic compaction so the file doesn't grow without bound.
+    try {
+        const stat = fs.statSync(filePath);
+        if (stat.size > 4 * 1024 * 1024) {
+            compactLog(filePath);
         }
     }
+    catch { /* ignore */ }
+}
+/**
+ * Compact NDJSON log down to MAX_LOG_ENTRIES. On parse failure for any line,
+ * preserve the corrupt file aside (do not silently destroy data).
+ */
+function compactLog(filePath) {
+    let lines;
+    try {
+        lines = fs.readFileSync(filePath, 'utf-8').split('\n').filter(l => l.trim().length > 0);
+    }
     catch {
-        log = [];
+        return;
     }
-    log.push(entry);
-    if (log.length > MAX_LOG_ENTRIES)
-        log = log.slice(-MAX_LOG_ENTRIES);
-    const tmpFile = filePath + '.tmp';
-    fs.writeFileSync(tmpFile, JSON.stringify(log, null, 2));
-    fs.renameSync(tmpFile, filePath);
+    const entries = [];
+    let corrupt = 0;
+    for (const line of lines) {
+        try {
+            const e = safeJsonParse(line);
+            if (e && typeof e === 'object')
+                entries.push(e);
+        }
+        catch {
+            corrupt++;
+        }
+    }
+    if (corrupt > 0) {
+        try {
+            fs.copyFileSync(filePath, `${filePath}.corrupt-${Date.now()}`);
+        }
+        catch { /* ignore */ }
+    }
+    const trimmed = entries.length > MAX_LOG_ENTRIES ? entries.slice(-MAX_LOG_ENTRIES) : entries;
+    const tmp = `${filePath}.${process.pid}.${Date.now()}.tmp`;
+    fs.writeFileSync(tmp, trimmed.map(e => JSON.stringify(e)).join('\n') + '\n');
+    fs.renameSync(tmp, filePath);
 }
 export function loadLog() {
-    const fs = require('fs');
-    const path = require('path');
     const filePath = path.resolve(LOG_FILE);
     try {
         if (fs.existsSync(filePath)) {
-            const result = safeJsonParse(fs.readFileSync(filePath, 'utf-8'));
-            return Array.isArray(result) ? result : [];
+            const raw = fs.readFileSync(filePath, 'utf-8');
+            // Backward compatible: support both old JSON-array form and the new
+            // append-only NDJSON form. Prefer NDJSON if the file looks line-based.
+            const trimmed = raw.trim();
+            if (trimmed.startsWith('[')) {
+                const result = safeJsonParse(raw);
+                return Array.isArray(result) ? result : [];
+            }
+            const out = [];
+            for (const line of trimmed.split('\n')) {
+                if (!line)
+                    continue;
+                try {
+                    const entry = safeJsonParse(line);
+                    if (entry && typeof entry === 'object')
+                        out.push(entry);
+                }
+                catch { /* skip corrupt line */ }
+            }
+            return out;
         }
     }
     catch {
@@ -166,9 +220,6 @@ export function loadLog() {
 }
 // ── Task Discovery ────────────────────────────────────────────
 export function discoverTasks(sources) {
-    const fs = require('fs');
-    const path = require('path');
-    const os = require('os');
     const tasks = [];
     // Only process valid sources
     const validSources = sources.filter(s => VALID_TASK_SOURCES.has(s));

package/packages/@monomind/cli/dist/src/benchmarks/benchmark-runner.d.ts CHANGED Viewed

@@ -2,7 +2,11 @@
  * Benchmark Runner for Regression Testing (Task 34)
  * Loads benchmark definitions, evaluates quality metrics, and detects regressions.
  */
-import type { BenchmarkDefinition, BenchmarkResult, BenchmarkBaseline, QualityMetric, MetricResult } from '@monoes/shared';
+type BenchmarkDefinition = any;
+type BenchmarkResult = any;
+type BenchmarkBaseline = any;
+type QualityMetric = any;
+type MetricResult = any;
 export declare class BenchmarkRunner {
     private baselines;
     /**
@@ -34,7 +38,7 @@ export declare class BenchmarkRunner {
     private evaluateSingleMetric;
 }
 /** Minimal local type aliases so SwarmBench doesn't depend on the broken
- *  @monoes/shared exports at the top of this file. */
+ *  @monomind/shared exports at the top of this file. */
 export interface SwarmBenchTask {
     /** Unique task ID */
     id: string;
@@ -79,4 +83,5 @@ export declare class SwarmBenchRunner {
     /** Expose underlying BenchmarkRunner for general benchmarks. */
     get benchmarkRunner(): BenchmarkRunner;
 }
+export {};
 //# sourceMappingURL=benchmark-runner.d.ts.map

package/packages/@monomind/cli/dist/src/benchmarks/benchmark-runner.js CHANGED Viewed

@@ -14,19 +14,31 @@ export class BenchmarkRunner {
      */
     loadBenchmarks(dir) {
         const benchmarks = [];
-        if (!fs.existsSync(dir)) {
+        // Safe-root constraint: reject any path that escapes the working directory
+        const safeRoot = path.resolve(process.cwd());
+        const resolved = path.resolve(dir);
+        const rel = path.relative(safeRoot, resolved);
+        if (rel.startsWith('..') || path.isAbsolute(rel))
+            return [];
+        if (!fs.existsSync(resolved)) {
             return benchmarks;
         }
-        const files = fs.readdirSync(dir).filter((f) => f.endsWith('.json'));
+        const files = fs.readdirSync(resolved).filter((f) => f.endsWith('.json'));
         for (const file of files) {
-            const filePath = path.join(dir, file);
+            const filePath = path.join(resolved, file);
             const raw = fs.readFileSync(filePath, 'utf-8');
-            const parsed = JSON.parse(raw);
-            if (Array.isArray(parsed)) {
-                benchmarks.push(...parsed);
+            try {
+                const parsed = JSON.parse(raw);
+                if (Array.isArray(parsed)) {
+                    benchmarks.push(...parsed);
+                }
+                else {
+                    benchmarks.push(parsed);
+                }
             }
-            else {
-                benchmarks.push(parsed);
+            catch {
+                // skip malformed file
+                continue;
             }
         }
         return benchmarks;

package/packages/@monomind/cli/dist/src/benchmarks/metric-evaluators.d.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Metric Evaluators for Benchmark Runner (Task 34)
  * Individual metric evaluation functions for quality assessment.
  */
-import type { MetricResult } from '@monoes/shared';
+type MetricResult = any;
 /**
  * Checks whether the output contains the expected substring.
  */
@@ -32,4 +32,5 @@ export declare function jsonValid(output: string): MetricResult;
 export declare function customRegex(output: string, config: {
     pattern: string;
 }): MetricResult;
+export {};
 //# sourceMappingURL=metric-evaluators.d.ts.map

package/packages/@monomind/cli/dist/src/benchmarks/metric-evaluators.js CHANGED Viewed

@@ -76,12 +76,35 @@ export function jsonValid(output) {
  * Checks whether the output matches a custom regex pattern.
  */
 export function customRegex(output, config) {
+    // Reject overly long patterns and those with nested/repeated quantifiers
+    // (catastrophic backtracking — a malicious benchmark definition could
+    // pin CI runners with `^(a+)+$` against a long output string).
+    if (typeof config.pattern !== 'string' || config.pattern.length > 200) {
+        return {
+            type: 'custom_regex',
+            passed: false,
+            actual: null,
+            expected: config.pattern,
+            message: 'Pattern rejected: too long or invalid',
+        };
+    }
+    if (/(\(.*[+*?].*\)|[+*?]){2,}|\{[0-9,]+\}.*[+*?]|\([^)]*\|[^)]*\)[+*?{]/.test(config.pattern)) {
+        return {
+            type: 'custom_regex',
+            passed: false,
+            actual: null,
+            expected: config.pattern,
+            message: 'Pattern rejected: nested quantifiers risk catastrophic backtracking',
+        };
+    }
+    // Cap output length so even slow patterns can't burn unlimited CPU
+    const boundedOutput = output.length > 1024 * 1024 ? output.slice(0, 1024 * 1024) : output;
     const regex = new RegExp(config.pattern);
-    const match = regex.test(output);
+    const match = regex.test(boundedOutput);
     return {
         type: 'custom_regex',
         passed: match,
-        actual: match ? output.match(regex)?.[0] ?? null : null,
+        actual: match ? boundedOutput.match(regex)?.[0] ?? null : null,
         expected: config.pattern,
         message: match
             ? `Output matches pattern /${config.pattern}/`

package/packages/@monomind/cli/dist/src/commands/agent.js CHANGED Viewed

@@ -34,7 +34,9 @@ function updateSwarmActivityMetrics(agentCountDelta) {
         swarm.coordination_active = newCount > 0;
         data.swarm = swarm;
         data.timestamp = new Date().toISOString();
-        fs.writeFileSync(activityPath, JSON.stringify(data, null, 2));
+        const tmpPath = activityPath + '.tmp';
+        fs.writeFileSync(tmpPath, JSON.stringify(data, null, 2));
+        fs.renameSync(tmpPath, activityPath);
     }
     catch {
         // Non-critical — don't fail the command if metrics update fails
@@ -125,7 +127,7 @@ const spawnCommand = {
         const taskDescription = ctx.flags.task;
         if (!agentType && taskDescription) {
             try {
-                const { RouteLayer, ALL_ROUTES } = await import('@monoes/routing');
+                const { RouteLayer, ALL_ROUTES } = await import('@monomind/routing');
                 const layer = new RouteLayer({ routes: ALL_ROUTES });
                 const routeResult = await layer.route(taskDescription);
                 agentType = routeResult.agentSlug;
@@ -154,7 +156,7 @@ const spawnCommand = {
                     model: ctx.flags.model,
                     task: ctx.flags.task,
                     timeout: ctx.flags.timeout,
-                    autoTools: ctx.flags.autoTools,
+                    autoTools: ctx.flags['auto-tools'],
                 },
                 priority: 'normal',
                 metadata: {
@@ -602,7 +604,7 @@ const poolCommand = {
                 size: ctx.flags.size,
                 min: ctx.flags.min,
                 max: ctx.flags.max,
-                autoScale: ctx.flags.autoScale ?? true,
+                autoScale: ctx.flags['auto-scale'] ?? true,
             });
             if (ctx.flags.format === 'json') {
                 output.printJson(result);

package/packages/@monomind/cli/dist/src/commands/appliance-advanced.js CHANGED Viewed

@@ -3,6 +3,7 @@
  * Sign, publish, and hot-patch RVFA appliances.
  */
 import { output } from '../output.js';
+import * as path from 'node:path';
 function fmtSize(bytes) {
     if (bytes < 1024)
         return `${bytes} B`;
@@ -15,6 +16,16 @@ function fmtSize(bytes) {
 function errMsg(err) {
     return err instanceof Error ? err.message : String(err);
 }
+function checkRelativePath(userPath) {
+    if (!path.isAbsolute(userPath)) {
+        const resolved = path.resolve(userPath);
+        const rel = path.relative(process.cwd(), resolved);
+        if (rel.startsWith('..')) {
+            return `Relative path '${userPath}' escapes the working directory.`;
+        }
+    }
+    return null;
+}
 const fail = (msg, detail) => {
     output.printError(msg, detail);
     return { success: false, exitCode: 1 };
@@ -69,6 +80,9 @@ export const signCommand = {
             hdr('Signing RVFA Appliance');
             let privateKey;
             if (keyPath) {
+                const pathErr = checkRelativePath(keyPath);
+                if (pathErr)
+                    return fail(pathErr);
                 const fs = await import('fs');
                 privateKey = fs.readFileSync(keyPath);
             }
@@ -162,12 +176,18 @@ export const updateAppCommand = {
             output.writeln();
             let patchBuf;
             if (patchPath) {
+                const patchPathErr = checkRelativePath(patchPath);
+                if (patchPathErr)
+                    return fail(patchPathErr);
                 if (!(await requireFile(patchPath)))
                     return { success: false, exitCode: 1 };
                 patchBuf = fs.readFileSync(patchPath);
                 output.printInfo(`Patch file: ${patchPath} (${fmtSize(patchBuf.length)})`);
             }
             else {
+                const dataPathErr = checkRelativePath(dataPath);
+                if (dataPathErr)
+                    return fail(dataPathErr);
                 if (!(await requireFile(dataPath)))
                     return { success: false, exitCode: 1 };
                 const newData = fs.readFileSync(dataPath);
@@ -186,6 +206,9 @@ export const updateAppCommand = {
             let pubKey;
             if (ctx.flags['public-key']) {
                 const pkPath = ctx.flags['public-key'];
+                const pkPathErr = checkRelativePath(pkPath);
+                if (pkPathErr)
+                    return fail(pkPathErr);
                 if (!(await requireFile(pkPath)))
                     return { success: false, exitCode: 1 };
                 pubKey = fs.readFileSync(pkPath);

package/packages/@monomind/cli/dist/src/commands/autopilot.js CHANGED Viewed

@@ -6,6 +6,8 @@
  */
 import { output } from '../output.js';
 import { loadState, saveState, appendLog, loadLog, discoverTasks, getProgress, calculateReward, tryLoadLearning, validateNumber, validateTaskSources, LOG_FILE, } from '../autopilot-state.js';
+import { writeFileSync } from 'node:fs';
+import { resolve } from 'node:path';
 // ── Check Handler (for Stop hook) ─────────────────────────────
 export async function autopilotCheck() {
     const state = loadState();
@@ -182,10 +184,8 @@ const logCommand = {
     ],
     action: async (ctx) => {
         if (ctx.flags?.clear) {
-            const fs = require('fs');
-            const path = require('path');
             try {
-                fs.writeFileSync(path.resolve(LOG_FILE), '[]');
+                writeFileSync(resolve(LOG_FILE), '[]');
             }
             catch { /* ignore */ }
             output.writeln('Autopilot log cleared');

package/packages/@monomind/cli/dist/src/commands/benchmark.js CHANGED Viewed

@@ -5,8 +5,9 @@
  * @module v1/cli/commands/benchmark
  */
 import { output } from '../output.js';
-import { writeFileSync, existsSync, mkdirSync } from 'node:fs';
+import { writeFileSync, renameSync, readFileSync, existsSync, mkdirSync } from 'node:fs';
 import { join } from 'node:path';
+import { BenchmarkRunner } from '../benchmarks/benchmark-runner.js';
 // ============================================================================
 // Pretrain Benchmark Subcommand
 // ============================================================================
@@ -50,7 +51,9 @@ const pretrainCommand = {
                     mkdirSync(resultsDir, { recursive: true });
                 }
                 const savePath = saveFile.startsWith('/') ? saveFile : join(resultsDir, saveFile);
-                writeFileSync(savePath, JSON.stringify(results, null, 2));
+                const saveTmp = savePath + '.tmp';
+                writeFileSync(saveTmp, JSON.stringify(results, null, 2));
+                renameSync(saveTmp, savePath);
                 output.writeln(output.success(`Results saved to ${savePath}`));
             }
             const allPassed = results.results.every(r => r.targetMet);
@@ -276,7 +279,7 @@ const memoryCommand = {
                 searchEntries = memory.searchEntries;
             }
             catch {
-                // @monoes/memory not available — return null metrics instead of fake numbers
+                // @monomind/memory not available — return null metrics instead of fake numbers
                 storeEntry = async () => ({ success: true });
                 searchEntries = async () => ({ results: [], searchTime: 0 }); // 0 = no-op fallback, not a real benchmark
             }
@@ -410,17 +413,121 @@ const allCommand = {
                 mkdirSync(resultsDir, { recursive: true });
             }
             const savePath = saveFile.startsWith('/') ? saveFile : join(resultsDir, saveFile);
-            writeFileSync(savePath, JSON.stringify({
+            const saveTmp2 = savePath + '.tmp';
+            writeFileSync(saveTmp2, JSON.stringify({
                 timestamp: new Date().toISOString(),
                 duration: totalDuration,
                 results: allResults,
             }, null, 2));
+            renameSync(saveTmp2, savePath);
             output.writeln(output.success(`Results saved to ${savePath}`));
         }
         return { success: true, message: 'All benchmarks complete' };
     },
 };
 // ============================================================================
+// Regression Benchmark Subcommand
+// ============================================================================
+const regressionCommand = {
+    name: 'regression',
+    description: 'Quality regression testing using benchmark definitions and baselines',
+    options: [
+        { name: 'suite', short: 's', type: 'string', description: 'Path to benchmark definitions directory', default: '.monomind/benchmarks/definitions' },
+        { name: 'benchmark-id', short: 'b', type: 'string', description: 'Run a specific benchmark by ID' },
+        { name: 'agent-output', short: 'a', type: 'string', description: 'Path to file containing agent output to evaluate' },
+        { name: 'pin-baseline', type: 'boolean', description: 'Save current results as the new baseline', default: 'false' },
+        { name: 'output', short: 'o', type: 'string', description: 'Output format: text, json', default: 'text' },
+    ],
+    examples: [
+        { command: 'monomind benchmark regression', description: 'List all benchmark definitions' },
+        { command: 'monomind benchmark regression -b agent-spawn -a output.txt', description: 'Evaluate agent output against a benchmark' },
+        { command: 'monomind benchmark regression -b agent-spawn -a output.txt --pin-baseline', description: 'Evaluate and pin results as new baseline' },
+    ],
+    action: async (ctx) => {
+        const suiteDir = ctx.flags.suite || '.monomind/benchmarks/definitions';
+        const benchmarkId = ctx.flags['benchmark-id'];
+        const agentOutputFile = ctx.flags['agent-output'];
+        const pinBaseline = ctx.flags['pin-baseline'] === true;
+        const outputFormat = ctx.flags.output || 'text';
+        const runner = new BenchmarkRunner();
+        const baselinesDir = join(process.cwd(), '.monomind', 'benchmarks', 'baselines');
+        const definitions = runner.loadBenchmarks(join(process.cwd(), suiteDir));
+        if (definitions.length === 0) {
+            output.writeln(output.dim(`No benchmark definitions found in ${suiteDir}`));
+            output.writeln(output.dim('Create JSON files there to define quality benchmarks.'));
+            return { success: true, message: 'No benchmarks defined' };
+        }
+        // List mode — no agent output provided
+        if (!agentOutputFile) {
+            output.writeln();
+            output.writeln(output.bold('Benchmark Definitions'));
+            output.writeln(output.dim('─'.repeat(50)));
+            for (const def of definitions) {
+                output.writeln(`  ${output.highlight(def.benchmarkId)}  ${output.dim(def.agentSlug)}  — ${def.qualityMetrics?.length ?? 0} metrics`);
+            }
+            output.writeln();
+            output.writeln(output.dim('Use --agent-output <file> to evaluate against a benchmark.'));
+            return { success: true, message: `${definitions.length} benchmarks loaded` };
+        }
+        // Evaluation mode
+        if (!existsSync(agentOutputFile)) {
+            output.writeln(output.error(`Agent output file not found: ${agentOutputFile}`));
+            return { success: false, message: 'Agent output file not found' };
+        }
+        const agentOutput = readFileSync(agentOutputFile, 'utf-8');
+        const targetDefs = benchmarkId
+            ? definitions.filter((d) => d.benchmarkId === benchmarkId)
+            : definitions;
+        if (targetDefs.length === 0) {
+            output.writeln(output.error(`No benchmark found with id: ${benchmarkId}`));
+            return { success: false, message: 'Benchmark not found' };
+        }
+        const results = targetDefs.map((def) => runner.runBenchmark(def, agentOutput));
+        if (outputFormat === 'json') {
+            output.writeln(JSON.stringify(results, null, 2));
+        }
+        else {
+            output.writeln();
+            output.writeln(output.bold('Regression Results'));
+            output.writeln(output.dim('─'.repeat(50)));
+            for (const result of results) {
+                const status = result.passed ? output.success('PASS') : output.error('FAIL');
+                output.writeln(`  ${status}  ${result.benchmarkId}  ${output.dim(`${result.durationMs}ms`)}`);
+                for (const m of result.metricResults) {
+                    const mStatus = m.passed ? '  ✓' : '  ✗';
+                    output.writeln(`    ${mStatus}  ${m.type}`);
+                }
+            }
+            output.writeln();
+        }
+        // Baseline comparison
+        const baselinePath = join(baselinesDir, `${benchmarkId ?? 'all'}.json`);
+        if (existsSync(baselinePath)) {
+            const baseline = JSON.parse(readFileSync(baselinePath, 'utf-8'));
+            const hasRegression = runner.detectRegression(results, baseline);
+            if (hasRegression) {
+                output.writeln(output.error(`Regression detected — pass rate dropped below baseline (${(baseline.passRate * 100).toFixed(0)}%)`));
+            }
+            else {
+                output.writeln(output.success('No regression detected vs baseline'));
+            }
+        }
+        // Pin baseline
+        if (pinBaseline) {
+            if (!existsSync(baselinesDir))
+                mkdirSync(baselinesDir, { recursive: true });
+            const id = benchmarkId ?? 'all';
+            const baseline = runner.pinBaseline(id, results);
+            const baselineTmp = baselinePath + '.tmp';
+            writeFileSync(baselineTmp, JSON.stringify(baseline, null, 2));
+            renameSync(baselineTmp, baselinePath);
+            output.writeln(output.success(`Baseline pinned: ${(baseline.passRate * 100).toFixed(0)}% pass rate`));
+        }
+        const allPassed = results.every((r) => r.passed);
+        return { success: allPassed, message: allPassed ? 'All metrics passed' : 'Some metrics failed' };
+    },
+};
+// ============================================================================
 // Main Benchmark Command
 // ============================================================================
 export const benchmarkCommand = {
@@ -431,12 +538,15 @@ export const benchmarkCommand = {
         neuralCommand,
         memoryCommand,
         allCommand,
+        regressionCommand,
     ],
     examples: [
         { command: 'monomind benchmark pretrain', description: 'Benchmark pre-training system' },
         { command: 'monomind benchmark neural', description: 'Benchmark neural operations' },
         { command: 'monomind benchmark memory', description: 'Benchmark memory operations' },
         { command: 'monomind benchmark all', description: 'Run all benchmarks' },
+        { command: 'monomind benchmark regression', description: 'List quality regression benchmarks' },
+        { command: 'monomind benchmark regression -b my-bench -a output.txt', description: 'Evaluate agent output against a benchmark' },
     ],
     action: async (_ctx) => {
         output.writeln();
@@ -444,10 +554,11 @@ export const benchmarkCommand = {
         output.writeln(output.dim('─'.repeat(50)));
         output.writeln();
         output.writeln('Available subcommands:');
-        output.writeln(`  ${output.highlight('pretrain')}  - Benchmark self-learning pre-training (SONA, EWC++, MoE)`);
-        output.writeln(`  ${output.highlight('neural')}    - Benchmark neural operations (embeddings, WASM)`);
-        output.writeln(`  ${output.highlight('memory')}    - Benchmark memory operations (HNSW, store, search)`);
-        output.writeln(`  ${output.highlight('all')}       - Run all benchmark suites`);
+        output.writeln(`  ${output.highlight('pretrain')}    - Benchmark self-learning pre-training (SONA, EWC++, MoE)`);
+        output.writeln(`  ${output.highlight('neural')}      - Benchmark neural operations (embeddings, WASM)`);
+        output.writeln(`  ${output.highlight('memory')}      - Benchmark memory operations (HNSW, store, search)`);
+        output.writeln(`  ${output.highlight('all')}         - Run all benchmark suites`);
+        output.writeln(`  ${output.highlight('regression')}  - Quality regression testing with baselines`);
         output.writeln();
         output.writeln('Examples:');
         output.writeln('  monomind benchmark pretrain -i 200');