@pellux/goodvibes-agent 0.1.54 → 0.1.55

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  All notable changes to GoodVibes Agent will be recorded here.
4
4
 
5
+ ## 0.1.55 - 2026-05-31
6
+
7
+ - d8f4eee Remove copied developer audit surfaces
8
+
5
9
  ## 0.1.54 - 2026-05-31
6
10
 
7
11
  - dc1a290 Keep release docs version-neutral
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pellux/goodvibes-agent",
3
- "version": "0.1.54",
3
+ "version": "0.1.55",
4
4
  "private": false,
5
5
  "description": "Near-fork GoodVibes operator assistant with the GoodVibes TUI shell, renderer, input, fullscreen workspace, and daemon-connected Agent product brain.",
6
6
  "type": "module",
@@ -65,10 +65,7 @@
65
65
  "architecture:check": "bun run scripts/check-architecture.ts",
66
66
  "foundation:artifacts": "bun run scripts/export-foundation-artifacts.ts",
67
67
  "verification:ledger": "bun run scripts/verification-ledger.ts",
68
- "verification:live": "bun run scripts/verify-live.ts",
69
- "eval:gate": "bun run scripts/eval-gate.ts",
70
- "eval:gate:verbose": "bun run scripts/eval-gate.ts --verbose",
71
- "eval:baseline": "bun run scripts/eval-gate.ts --save-baseline"
68
+ "verification:live": "bun run scripts/verify-live.ts"
72
69
  },
73
70
  "license": "MIT",
74
71
  "repository": {
@@ -177,7 +177,6 @@ export interface CommandOpsServices
177
177
  export interface CommandExtensionRegistryServices {
178
178
  readonly toolRegistry: ToolRegistry;
179
179
  readonly mcpRegistry: McpRegistry;
180
- readonly evalRegistry?: import('../panels/eval-panel.ts').EvalRegistry;
181
180
  }
182
181
 
183
182
  export interface CommandExtensionServices
@@ -67,7 +67,7 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
67
67
  registry.register({
68
68
  name: 'tools',
69
69
  aliases: ['t'],
70
- description: 'List available tools and review compact native tool capability surfaces',
70
+ description: 'List available tools and review tool safety/status',
71
71
  usage: '[review|panel]',
72
72
  handler(args, ctx) {
73
73
  const sub = (args[0] ?? '').toLowerCase();
@@ -79,12 +79,11 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
79
79
  }
80
80
  if (sub === 'review') {
81
81
  ctx.print([
82
- 'Tool Surface Review',
83
- ' Native file tools stay compact by default.',
84
- ' Read/write/edit/notebook capabilities are available through the native tool stack, with detail routed to the tools panel and approval surfaces instead of transcript bloat.',
85
- ' Shell and native tool approvals classify work into read, mutation, destructive, dependency, config, notebook, network, remote, and lifecycle risk families.',
86
- ' Use /tools panel to inspect risk class, output-policy actions, spill posture, compact summaries, and approval posture for recent calls.',
87
- ' Use /approval review shell or /approval review file when you need the action-specific why-prompted posture.',
82
+ 'Tool Status',
83
+ ' Tools are available for the main Agent conversation.',
84
+ ' Read-only actions can run directly; writes, destructive changes, network effects, service changes, and external side effects require explicit user intent or approval.',
85
+ ' Recent tool activity and approval posture are available in the tools and approvals views.',
86
+ ' Build/fix/review work should be delegated explicitly with /delegate.',
88
87
  ].join('\n'));
89
88
  }
90
89
  return;
@@ -1,6 +1,5 @@
1
1
  import type { CommandRegistry } from '../command-registry.ts';
2
2
  import type { ProfileData } from '@pellux/goodvibes-sdk/platform/profiles';
3
- import { ToolContractVerifier } from '@/runtime/index.ts';
4
3
  import type { ReplaySnapshotInput } from '@/runtime/index.ts';
5
4
  import { logger } from '@pellux/goodvibes-sdk/platform/utils';
6
5
  import { registerOperatorPanelCommand } from './operator-panel-runtime.ts';
@@ -283,55 +282,6 @@ export function registerOperatorRuntimeCommands(registry: CommandRegistry): void
283
282
  },
284
283
  });
285
284
 
286
- registry.register({
287
- name: 'tool',
288
- description: 'Tool contract verification — verify registered tool contracts',
289
- usage: 'verify <name> | verify-all | contract show <name>',
290
- argsHint: 'verify <name> | verify-all | contract show <name>',
291
- handler(args, ctx) {
292
- const sub = args[0];
293
- if (sub === 'verify' && args[1]) {
294
- const result = ctx.extensions.toolRegistry.verifyContract(args[1]);
295
- if (!result) {
296
- ctx.print(`[tool verify] Tool '${args[1]}' is not registered.`);
297
- return;
298
- }
299
- ctx.print(ToolContractVerifier.formatResult(result));
300
- return;
301
- }
302
- if (sub === 'verify-all') {
303
- ctx.print(ToolContractVerifier.formatAllResults(ctx.extensions.toolRegistry.verifyAllContracts()));
304
- return;
305
- }
306
- if (sub === 'contract' && args[1] === 'show' && args[2]) {
307
- const toolName = args[2];
308
- const result = ctx.extensions.toolRegistry.verifyContract(toolName);
309
- if (!result) {
310
- ctx.print(`[tool contract show] Tool '${toolName}' is not registered.`);
311
- return;
312
- }
313
- const lines: string[] = [ToolContractVerifier.formatResult(result)];
314
- const tool = ctx.extensions.toolRegistry.list().find((t) => t.definition.name === toolName);
315
- if (tool) {
316
- lines.push('');
317
- lines.push('Tool Definition:');
318
- lines.push(` Name: ${tool.definition.name}`);
319
- lines.push(` Description: ${tool.definition.description}`);
320
- lines.push(` Parameters: ${JSON.stringify(tool.definition.parameters, null, 2).replace(/\n/g, '\n ')}`);
321
- }
322
- ctx.print(lines.join('\n'));
323
- return;
324
- }
325
-
326
- ctx.print(
327
- 'Usage: /tool <subcommand>\n'
328
- + ' /tool verify <name> — verify contract for a specific registered tool\n'
329
- + ' /tool verify-all — verify contracts for all registered tools\n'
330
- + ' /tool contract show <name> — show full contract details for a tool'
331
- );
332
- },
333
- });
334
-
335
285
  registry.register({
336
286
  name: 'forensics',
337
287
  aliases: ['foren'],
@@ -1,9 +1,7 @@
1
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
- import { dirname, join, resolve } from 'node:path';
1
+ import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { dirname } from 'node:path';
3
3
  import type { CommandContext, CommandRegistry } from '../command-registry.ts';
4
- import { listInstalledEcosystemEntries, loadEcosystemCatalog } from '@/runtime/index.ts';
5
- import { BUILTIN_SUITES } from '@/runtime/index.ts';
6
- import { requireEcosystemCatalogPaths, requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
4
+ import { requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
7
5
  import { requireYesFlag, stripYesFlag } from './confirmation.ts';
8
6
 
9
7
  interface TrustReviewBundle {
@@ -29,29 +27,6 @@ interface TrustReviewBundle {
29
27
  };
30
28
  }
31
29
 
32
- interface ReleaseBundle {
33
- readonly version: 1;
34
- readonly capturedAt: number;
35
- readonly runtime: {
36
- readonly provider: string;
37
- readonly model: string;
38
- readonly sessionId: string;
39
- };
40
- readonly evalSuites: readonly string[];
41
- readonly incidentCount: number;
42
- readonly remote: {
43
- readonly pools: number;
44
- readonly contracts: number;
45
- readonly artifacts: number;
46
- };
47
- readonly ecosystem: {
48
- readonly pluginCatalog: number;
49
- readonly skillCatalog: number;
50
- readonly installedPlugins: number;
51
- readonly installedSkills: number;
52
- };
53
- }
54
-
55
30
  function countByMode<T extends string>(values: readonly T[], mode: T): number {
56
31
  return values.filter((value) => value === mode).length;
57
32
  }
@@ -112,46 +87,6 @@ function inspectTrustBundle(path: string): string {
112
87
  ].join('\n');
113
88
  }
114
89
 
115
- function buildReleaseBundle(ctx: Parameters<NonNullable<CommandRegistry['register']>>[0]['handler'] extends (args: string[], context: infer C) => unknown ? C : never): ReleaseBundle {
116
- const remoteRuntime = ctx.ops.remoteRuntime;
117
- const incidents = ctx.extensions.forensicsRegistry?.getAll() ?? [];
118
- const ecosystemPaths = requireEcosystemCatalogPaths(ctx);
119
- return {
120
- version: 1,
121
- capturedAt: Date.now(),
122
- runtime: {
123
- provider: ctx.session.runtime.provider,
124
- model: ctx.session.runtime.model,
125
- sessionId: ctx.session.runtime.sessionId,
126
- },
127
- evalSuites: Object.keys(BUILTIN_SUITES),
128
- incidentCount: incidents.length,
129
- remote: {
130
- pools: remoteRuntime?.listPools().length ?? 0,
131
- contracts: remoteRuntime?.listContracts().length ?? 0,
132
- artifacts: remoteRuntime?.listArtifacts().length ?? 0,
133
- },
134
- ecosystem: {
135
- pluginCatalog: loadEcosystemCatalog('plugin', ecosystemPaths).length,
136
- skillCatalog: loadEcosystemCatalog('skill', ecosystemPaths).length,
137
- installedPlugins: listInstalledEcosystemEntries('plugin', ecosystemPaths).length,
138
- installedSkills: listInstalledEcosystemEntries('skill', ecosystemPaths).length,
139
- },
140
- };
141
- }
142
-
143
- function inspectReleaseBundle(path: string): string {
144
- const parsed = JSON.parse(readFileSync(path, 'utf-8')) as ReleaseBundle;
145
- return [
146
- 'Release Bundle Review',
147
- ` provider/model: ${parsed.runtime.provider || '(unset)'}/${parsed.runtime.model || '(unset)'}`,
148
- ` eval suites: ${parsed.evalSuites.length}`,
149
- ` incidents: ${parsed.incidentCount}`,
150
- ` remote pools/contracts/artifacts: ${parsed.remote.pools}/${parsed.remote.contracts}/${parsed.remote.artifacts}`,
151
- ` ecosystem catalog plugins/skills: ${parsed.ecosystem.pluginCatalog}/${parsed.ecosystem.skillCatalog}`,
152
- ].join('\n');
153
- }
154
-
155
90
  export function registerProductRuntimeCommands(registry: CommandRegistry): void {
156
91
  registry.register({
157
92
  name: 'trust',
@@ -314,65 +249,4 @@ export function registerProductRuntimeCommands(registry: CommandRegistry): void
314
249
  },
315
250
  });
316
251
 
317
- registry.register({
318
- name: 'release',
319
- description: 'Package certification and release-readiness operations',
320
- usage: '[review|checklist|bundle export <path> --yes|bundle inspect <path>]',
321
- handler(args, ctx) {
322
- const parsed = stripYesFlag(args);
323
- const commandArgs = [...parsed.rest];
324
- const shellPaths = requireShellPaths(ctx);
325
- const sub = commandArgs[0] ?? 'review';
326
- if (sub === 'review') {
327
- const bundle = buildReleaseBundle(ctx);
328
- ctx.print([
329
- 'Release Review',
330
- ` provider/model: ${bundle.runtime.provider || '(unset)'}/${bundle.runtime.model || '(unset)'}`,
331
- ` eval suites: ${bundle.evalSuites.length}`,
332
- ` incidents: ${bundle.incidentCount}`,
333
- ` remote pools/contracts/artifacts: ${bundle.remote.pools}/${bundle.remote.contracts}/${bundle.remote.artifacts}`,
334
- ` ecosystem catalog plugins/skills: ${bundle.ecosystem.pluginCatalog}/${bundle.ecosystem.skillCatalog}`,
335
- ` installed plugins/skills: ${bundle.ecosystem.installedPlugins}/${bundle.ecosystem.installedSkills}`,
336
- ].join('\n'));
337
- return;
338
- }
339
- if (sub === 'checklist') {
340
- ctx.print([
341
- 'Release Checklist',
342
- ' 1. Run /setup review and /setup doctor',
343
- ' 2. Run /security review and /trust review',
344
- ' 3. Run /policy preflight and /policy simulate',
345
- ' 4. Run /eval gate <suite> --yes for required certification suites',
346
- ' 5. Review /incident latest and /bridge status',
347
- ' 6. Export /release bundle export <path> --yes for release evidence',
348
- ].join('\n'));
349
- return;
350
- }
351
- if (sub === 'bundle') {
352
- const mode = commandArgs[1];
353
- const pathArg = commandArgs[2];
354
- if ((mode === 'export' || mode === 'inspect') && !pathArg) {
355
- ctx.print(`Usage: /release bundle ${mode} <path>${mode === 'export' ? ' --yes' : ''}`);
356
- return;
357
- }
358
- if (mode === 'export') {
359
- if (!parsed.yes) {
360
- requireYesFlag(ctx, `export release bundle to ${pathArg}`, '/release bundle export <path> --yes');
361
- return;
362
- }
363
- const bundle = buildReleaseBundle(ctx);
364
- const targetPath = shellPaths.resolveWorkspacePath(pathArg!);
365
- mkdirSync(dirname(targetPath), { recursive: true });
366
- writeFileSync(targetPath, JSON.stringify(bundle, null, 2) + '\n', 'utf-8');
367
- ctx.print(`Release bundle exported to ${targetPath}`);
368
- return;
369
- }
370
- if (mode === 'inspect') {
371
- ctx.print(inspectReleaseBundle(shellPaths.resolveWorkspacePath(pathArg!)));
372
- return;
373
- }
374
- }
375
- ctx.print('Usage: /release [review|checklist|bundle export <path> --yes|bundle inspect <path>]');
376
- },
377
- });
378
252
  }
@@ -1,7 +1,6 @@
1
1
  import type { CommandRegistry } from './command-registry.ts';
2
2
  import { policyCommand } from './commands/policy.ts';
3
3
  import { providerCommand } from './commands/provider.ts';
4
- import { evalCommand } from './commands/eval.ts';
5
4
  import { sessionCommand } from './commands/session.ts';
6
5
  import { recallCommand } from './commands/memory.ts';
7
6
  import { knowledgeCommand } from './commands/knowledge.ts';
@@ -126,9 +125,6 @@ export function registerBuiltinCommands(registry: CommandRegistry): void {
126
125
  // ── /provider ─────────────────────────────────────────────────────────────
127
126
  registry.register(providerCommand);
128
127
 
129
- // ── /eval ─────────────────────────────────────────────────────────────────
130
- registry.register(evalCommand);
131
-
132
128
  // ── /session ─────────────────────────────────────────────────────────────
133
129
  registry.register(sessionCommand);
134
130
 
@@ -27,7 +27,6 @@ import { DebugPanel } from '../debug-panel.ts';
27
27
  import { IncidentReviewPanel } from '../incident-review-panel.ts';
28
28
  import { ForensicsPanel } from '../forensics-panel.ts';
29
29
  import { PolicyPanel } from '../policy-panel.ts';
30
- import { EvalPanel } from '../eval-panel.ts';
31
30
  import { createProviderAccountSnapshotQuery } from '../provider-account-snapshot.ts';
32
31
  import {
33
32
  createEnvironmentVariableQuery,
@@ -335,15 +334,4 @@ export function registerOperationsPanels(manager: PanelManager, deps: ResolvedBu
335
334
  factory: () => new PolicyPanel(deps.policyRuntimeState),
336
335
  });
337
336
 
338
- if (deps.evalRegistry) {
339
- const { evalRegistry } = deps;
340
- manager.registerType({
341
- id: 'eval',
342
- name: 'Eval',
343
- icon: 'Y',
344
- category: 'monitoring',
345
- description: 'Evaluation harness: benchmark suite results, scorecards, and regression gates',
346
- factory: () => new EvalPanel(evalRegistry),
347
- });
348
- }
349
337
  }
@@ -62,8 +62,6 @@ export interface BuiltinPanelDeps {
62
62
  dismissPlanning?: () => void;
63
63
  /** ForensicsRegistry for the Forensics panel. */
64
64
  forensicsRegistry?: import('@/runtime/index.ts').ForensicsRegistry;
65
- /** EvalRegistry for the Eval panel. */
66
- evalRegistry?: import('../eval-panel.ts').EvalRegistry;
67
65
  /** MemoryRegistry for the Memory panel. */
68
66
  memoryRegistry?: MemoryRegistry;
69
67
  /** Isolated Agent Knowledge service for the Agent Knowledge panel. */
package/src/version.ts CHANGED
@@ -6,7 +6,7 @@ import { join } from 'node:path';
6
6
  // The prebuild script updates the fallback value before compilation.
7
7
  // Uses import.meta.dir (Bun) to locate package.json relative to this file,
8
8
  // which is correct regardless of the process working directory.
9
- let _version = '0.1.54';
9
+ let _version = '0.1.55';
10
10
  let _sdkVersion = '0.33.35';
11
11
  try {
12
12
  const pkg = JSON.parse(readFileSync(join(import.meta.dir, '..', 'package.json'), 'utf-8')) as {
@@ -1,217 +0,0 @@
1
- /**
2
- * /eval command handler.
3
- *
4
- * Implements the Evaluation Harness commands:
5
- *
6
- * /eval list — List all available eval suites
7
- * /eval run <suite> --yes — Run a named suite (or 'all')
8
- * /eval compare <baseline-file> — Compare last run against a baseline file
9
- * /eval gate <suite> --yes — Run suite and apply CI gate (exits 1 on regression)
10
- */
11
-
12
- import type { SlashCommand, CommandContext } from '../command-registry.ts';
13
- import { EvalRunner } from '@/runtime/index.ts';
14
- import { BUILTIN_SUITES } from '@/runtime/index.ts';
15
- import { formatScorecard } from '@/runtime/index.ts';
16
- import { loadBaseline, captureBaseline, formatBaselineComparison, writeBaseline } from '@/runtime/index.ts';
17
- import type { EvalRegistry } from '../../panels/eval-panel.ts';
18
- import { formatSuiteResult, formatGateResult } from '@/runtime/index.ts';
19
- import { requireShellPaths } from './runtime-services.ts';
20
- import { summarizeError } from '@pellux/goodvibes-sdk/platform/utils';
21
- import { requireYesFlag, stripYesFlag } from './confirmation.ts';
22
-
23
- // ── Subcommand helpers ────────────────────────────────────────────────────────
24
-
25
- function printSuiteList(context: CommandContext): void {
26
- context.print('[eval] Available suites:');
27
- for (const [name, scenarios] of Object.entries(BUILTIN_SUITES)) {
28
- context.print(` ${name} (${scenarios.length} scenarios)`);
29
- for (const s of scenarios) {
30
- context.print(` - ${s.id}: ${s.name}`);
31
- }
32
- }
33
- context.print('[eval] Usage: /eval run <suite> --yes or /eval run all --yes');
34
- }
35
-
36
- function getRegistry(context: CommandContext): EvalRegistry | undefined {
37
- return context.extensions.evalRegistry;
38
- }
39
-
40
- // ── /eval list ────────────────────────────────────────────────────────────────
41
-
42
- function handleList(_args: string[], context: CommandContext): void {
43
- printSuiteList(context);
44
- }
45
-
46
- // ── /eval run ────────────────────────────────────────────────────────────────
47
-
48
- async function handleRun(args: string[], context: CommandContext): Promise<void> {
49
- const { rest, yes } = stripYesFlag(args);
50
- const suiteName = rest[0] ?? 'all';
51
- const registry = getRegistry(context);
52
-
53
- const suitesToRun =
54
- suiteName === 'all'
55
- ? Object.keys(BUILTIN_SUITES)
56
- : BUILTIN_SUITES[suiteName]
57
- ? [suiteName]
58
- : null;
59
-
60
- if (!suitesToRun) {
61
- context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
62
- return;
63
- }
64
- if (!yes) {
65
- requireYesFlag(context, `run eval suite ${suiteName}`, '/eval run <suite|all> --yes');
66
- return;
67
- }
68
-
69
- const runner = new EvalRunner();
70
- registry?.setRunning(true);
71
-
72
- for (const name of suitesToRun) {
73
- const scenarios = BUILTIN_SUITES[name];
74
- if (!scenarios) continue;
75
-
76
- context.print(`[eval] Running suite: ${name} (${scenarios.length} scenarios)...`);
77
- const result = await runner.runSuite(name, scenarios);
78
- registry?.push(result);
79
-
80
- context.print(formatSuiteResult(result));
81
-
82
- for (const r of result.results) {
83
- context.print(formatScorecard(r.scorecard));
84
- }
85
- }
86
-
87
- registry?.setRunning(false);
88
- }
89
-
90
- // ── /eval compare ─────────────────────────────────────────────────────────────
91
-
92
- async function handleCompare(args: string[], context: CommandContext): Promise<void> {
93
- const baselineFile = args[0] ?? '.goodvibes/eval/baseline.json';
94
- const registry = getRegistry(context);
95
- const projectRoot = requireShellPaths(context).workingDirectory;
96
- const suiteResults = registry?.getSuiteResults() ?? [];
97
-
98
- if (suiteResults.length === 0) {
99
- context.print('[eval] No suite results to compare. Run /eval run <suite> --yes first.');
100
- return;
101
- }
102
-
103
- const baseline = await loadBaseline(baselineFile, projectRoot);
104
- if (!baseline) {
105
- context.print(`[eval] Baseline file not found: ${baselineFile}`);
106
- context.print('[eval] Tip: run /eval gate <suite> [baseline-file] --save-baseline --yes to create a baseline.');
107
- return;
108
- }
109
-
110
- for (const result of suiteResults) {
111
- context.print(formatBaselineComparison(baseline, result));
112
- }
113
- }
114
-
115
- // ── /eval gate ────────────────────────────────────────────────────────────────
116
-
117
- async function handleGate(args: string[], context: CommandContext): Promise<void> {
118
- const { rest, yes } = stripYesFlag(args);
119
- const positional = rest.filter((arg) => arg !== '--save-baseline');
120
- const suiteName = positional[0];
121
- const baselineFile = positional[1] ?? '.goodvibes/eval/baseline.json';
122
- const saveFlag = rest.includes('--save-baseline');
123
- const projectRoot = requireShellPaths(context).workingDirectory;
124
-
125
- if (!suiteName) {
126
- context.print('[eval] Usage: /eval gate <suite> [baseline-file] [--save-baseline] --yes');
127
- return;
128
- }
129
-
130
- const scenarios = BUILTIN_SUITES[suiteName];
131
- if (!scenarios) {
132
- context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
133
- return;
134
- }
135
- if (!yes) {
136
- requireYesFlag(context, `run eval gate ${suiteName}`, '/eval gate <suite> [baseline-file] [--save-baseline] --yes');
137
- return;
138
- }
139
-
140
- const registry = getRegistry(context);
141
- const runner = new EvalRunner();
142
-
143
- context.print(`[eval] Gate: running suite "${suiteName}"...`);
144
- registry?.setRunning(true);
145
- const fresh = await runner.runSuite(suiteName, scenarios);
146
- registry?.push(fresh);
147
- registry?.setRunning(false);
148
-
149
- const baseline = await loadBaseline(baselineFile, projectRoot);
150
- const gate = runner.evaluateGate(fresh, baseline);
151
- registry?.pushGate(gate);
152
-
153
- context.print(formatGateResult(gate));
154
-
155
- if (saveFlag || !baseline) {
156
- const label = suiteName ?? 'latest';
157
- const newBaseline = captureBaseline(label, [fresh]);
158
- try {
159
- await writeBaseline(baselineFile, newBaseline, projectRoot);
160
- context.print(`[eval] Baseline saved to ${baselineFile}`);
161
- } catch (err) {
162
- context.print(`[eval] Warning: could not save baseline: ${summarizeError(err)}`);
163
- }
164
- }
165
-
166
- if (!gate.passed) {
167
- context.print(`[eval] Gate FAILED: ${gate.regressions.length} regression(s) detected.`);
168
- } else {
169
- context.print('[eval] Gate PASSED.');
170
- }
171
- }
172
-
173
- // ── Top-level command ─────────────────────────────────────────────────────────
174
-
175
- export const evalCommand: SlashCommand = {
176
- name: 'eval',
177
- description: 'Evaluation harness: run benchmark suites, compare baselines, and gate regressions.',
178
- usage: '<subcommand> [args]',
179
- argsHint: 'list|run <suite> --yes|compare <baseline>|gate <suite> --yes',
180
- handler: async (args: string[], context: CommandContext): Promise<void> => {
181
- const [sub, ...rest] = args;
182
-
183
- switch (sub) {
184
- case 'list':
185
- case 'ls':
186
- handleList(rest, context);
187
- break;
188
-
189
- case 'run':
190
- await handleRun(rest, context);
191
- break;
192
-
193
- case 'compare':
194
- case 'cmp':
195
- await handleCompare(rest, context);
196
- break;
197
-
198
- case 'gate':
199
- await handleGate(rest, context);
200
- break;
201
-
202
- default: {
203
- const usage = [
204
- 'Usage: /eval <subcommand>',
205
- ' list — List all available eval suites',
206
- ' run <suite|all> --yes — Run a named suite (or all suites)',
207
- ' compare [baseline-file] — Compare last results against baseline',
208
- ' gate <suite> [baseline-file] --yes',
209
- ' — Run suite and apply regression gate',
210
- ' --save-baseline — Save fresh run as new baseline',
211
- ].join('\n');
212
- context.print(usage);
213
- break;
214
- }
215
- }
216
- },
217
- };
@@ -1,399 +0,0 @@
1
- /**
2
- * Eval Panel — renders evaluation harness results in list and detail modes.
3
- *
4
- * Displays suite run summaries, per-scenario scorecards, and regression
5
- * indicators. Wired with an EvalRegistry that holds the latest run results.
6
- */
7
-
8
- import { BasePanel } from './base-panel.ts';
9
- import type { Line } from '../types/grid.ts';
10
- import { createEmptyLine } from '../types/grid.ts';
11
- import {
12
- buildEmptyState,
13
- buildPanelLine,
14
- buildPanelWorkspace,
15
- resolveScrollablePanelSection,
16
- DEFAULT_PANEL_PALETTE,
17
- } from './polish.ts';
18
-
19
- // ── EvalRegistry ─────────────────────────────────────────────────────────────
20
-
21
- import type {
22
- EvalSuiteResult,
23
- EvalResult,
24
- EvalGateResult,
25
- EvalDimension,
26
- } from '@/runtime/index.ts';
27
-
28
- /**
29
- * Holds the latest eval run state for display in EvalPanel.
30
- * Created externally, injected into the panel.
31
- */
32
- export class EvalRegistry {
33
- private _suiteResults: EvalSuiteResult[] = [];
34
- private _gateResults: EvalGateResult[] = [];
35
- private _running = false;
36
- private _lastRunAt: number | null = null;
37
- private readonly _subscribers = new Set<() => void>();
38
-
39
- push(result: EvalSuiteResult): void {
40
- const idx = this._suiteResults.findIndex((r) => r.suite === result.suite);
41
- if (idx >= 0) {
42
- this._suiteResults[idx] = result;
43
- } else {
44
- this._suiteResults.push(result);
45
- }
46
- this._lastRunAt = Date.now();
47
- this._notify();
48
- }
49
-
50
- pushGate(gate: EvalGateResult): void {
51
- const idx = this._gateResults.findIndex((g) => g.suite === gate.suite);
52
- if (idx >= 0) {
53
- this._gateResults[idx] = gate;
54
- } else {
55
- this._gateResults.push(gate);
56
- }
57
- this._notify();
58
- }
59
-
60
- setRunning(running: boolean): void {
61
- this._running = running;
62
- this._notify();
63
- }
64
-
65
- isRunning(): boolean { return this._running; }
66
- getLastRunAt(): number | null { return this._lastRunAt; }
67
- getSuiteResults(): EvalSuiteResult[] { return this._suiteResults; }
68
- getGateResults(): EvalGateResult[] { return this._gateResults; }
69
-
70
- subscribe(cb: () => void): () => void {
71
- this._subscribers.add(cb);
72
- return () => this._subscribers.delete(cb);
73
- }
74
-
75
- private _notify(): void {
76
- for (const cb of this._subscribers) cb();
77
- }
78
- }
79
-
80
- // ── Colour palette (hex fg colours for createStyledCell) ─────────────────────
81
-
82
- const C = {
83
- ...DEFAULT_PANEL_PALETTE,
84
- header: '#94a3b8',
85
- headerBg: '#1e293b',
86
- cyan: '#38bdf8',
87
- green: '#22c55e',
88
- yellow: '#eab308',
89
- red: '#ef4444',
90
- dim: '#4b5563',
91
- label: '#64748b',
92
- value: '#e2e8f0',
93
- selected: '#f1f5f9',
94
- sep: '#1e293b',
95
- white: '#cbd5e1',
96
- selectBg: '#0f172a',
97
- } as const;
98
-
99
- // ── Helpers ───────────────────────────────────────────────────────────────────
100
-
101
- function scoreColor(score: number): string {
102
- if (score >= 80) return C.green;
103
- if (score >= 60) return C.yellow;
104
- return C.red;
105
- }
106
-
107
- function fmtTime(ms: number): string {
108
- if (ms < 1000) return `${ms.toFixed(0)}ms`;
109
- return `${(ms / 1000).toFixed(1)}s`;
110
- }
111
-
112
- const DIMENSION_ORDER: EvalDimension[] = ['safety', 'quality', 'latency', 'cost', 'recovery'];
113
-
114
- // ── EvalPanel ─────────────────────────────────────────────────────────────────
115
-
116
- export class EvalPanel extends BasePanel {
117
- private readonly _registry: EvalRegistry;
118
- private _mode: 'list' | 'detail' = 'list';
119
- private _selectedSuiteIdx = 0;
120
- private _selectedScenarioIdx = 0;
121
- private _scrollOffset = 0;
122
- private _unsub: (() => void) | null = null;
123
-
124
- public constructor(registry: EvalRegistry) {
125
- super('eval', 'Eval', 'V', 'monitoring');
126
- this._registry = registry;
127
- }
128
-
129
- public override onActivate(): void {
130
- this._unsub = this._registry.subscribe(() => this.markDirty());
131
- this.markDirty();
132
- }
133
-
134
- public override onDestroy(): void {
135
- this._unsub?.();
136
- this._unsub = null;
137
- }
138
-
139
- public handleInput(key: string): boolean {
140
- const suites = this._registry.getSuiteResults();
141
-
142
- if (this._mode === 'list') {
143
- if (key === 'ArrowUp' || key === 'k') {
144
- this._selectedSuiteIdx = Math.max(0, this._selectedSuiteIdx - 1);
145
- this.markDirty();
146
- return true;
147
- }
148
- if (key === 'ArrowDown' || key === 'j') {
149
- this._selectedSuiteIdx = Math.min(suites.length - 1, this._selectedSuiteIdx + 1);
150
- this.markDirty();
151
- return true;
152
- }
153
- if ((key === 'Enter' || key === 'Return' || key === 'l') && suites.length > 0) {
154
- this._mode = 'detail';
155
- this._selectedScenarioIdx = 0;
156
- this._scrollOffset = 0;
157
- this.markDirty();
158
- return true;
159
- }
160
- return false;
161
- }
162
-
163
- // detail mode
164
- if (key === 'Escape' || key === 'q' || key === 'h') {
165
- this._mode = 'list';
166
- this.markDirty();
167
- return true;
168
- }
169
- if (key === 'ArrowUp' || key === 'k') {
170
- const suite = suites[this._selectedSuiteIdx];
171
- if (suite) {
172
- this._selectedScenarioIdx = Math.max(0, this._selectedScenarioIdx - 1);
173
- this._scrollOffset = 0;
174
- this.markDirty();
175
- }
176
- return true;
177
- }
178
- if (key === 'ArrowDown' || key === 'j') {
179
- const suite = suites[this._selectedSuiteIdx];
180
- if (suite) {
181
- this._selectedScenarioIdx = Math.min(
182
- suite.results.length - 1,
183
- this._selectedScenarioIdx + 1,
184
- );
185
- this._scrollOffset = 0;
186
- this.markDirty();
187
- }
188
- return true;
189
- }
190
- if (key === 'PageUp') {
191
- this._scrollOffset = Math.max(0, this._scrollOffset - 5);
192
- this.markDirty();
193
- return true;
194
- }
195
- if (key === 'PageDown') {
196
- this._scrollOffset += 5;
197
- this.markDirty();
198
- return true;
199
- }
200
- return false;
201
- }
202
-
203
- public render(width: number, height: number): Line[] {
204
- this.needsRender = false;
205
- const suites = this._registry.getSuiteResults();
206
- const gates = this._registry.getGateResults();
207
- const intro = 'Evaluation harness runs, gates, scenario scorecards, and regression indicators for model and product validation.';
208
-
209
- const running = this._registry.isRunning();
210
- const lastRun = this._registry.getLastRunAt();
211
- const summaryLine = buildPanelLine(width, [
212
- [' state: ', C.label],
213
- [running ? 'running' : 'idle', running ? C.yellow : C.dim],
214
- [' last: ', C.label],
215
- [lastRun ? new Date(lastRun).toLocaleTimeString() : 'n/a', C.dim],
216
- ]);
217
-
218
- if (suites.length === 0) {
219
- const workspace = buildPanelWorkspace(width, height, {
220
- title: 'Eval Harness',
221
- intro,
222
- sections: [{
223
- title: 'Status',
224
- lines: [
225
- summaryLine,
226
- ...buildEmptyState(
227
- width,
228
- ' No results yet.',
229
- 'Run an eval suite to populate this workspace with suite scores, gate results, and per-scenario detail.',
230
- [{ command: '/eval run <suite>', summary: 'start a suite such as core-performance, safety-baseline, or cost-tokens' }],
231
- C,
232
- ),
233
- ],
234
- }],
235
- palette: C,
236
- });
237
- while (workspace.length < height) workspace.push(createEmptyLine(width));
238
- return workspace;
239
- }
240
-
241
- const lines: Line[] = [];
242
- if (this._mode === 'list') {
243
- this._renderList(lines, suites, gates, width, height, intro, summaryLine);
244
- } else {
245
- const suite = suites[this._selectedSuiteIdx];
246
- if (suite) {
247
- this._renderDetail(lines, suite, width, height, intro, summaryLine);
248
- }
249
- }
250
-
251
- return lines;
252
- }
253
-
254
- // ── List view ────────────────────────────────────────────────────────────────
255
-
256
- private _renderList(
257
- lines: Line[],
258
- suites: EvalSuiteResult[],
259
- gates: EvalGateResult[],
260
- width: number,
261
- _height: number,
262
- intro: string,
263
- summaryLine: Line,
264
- ): void {
265
- const gateMap = new Map(gates.map((g) => [g.suite, g]));
266
- const sectionLines: Line[] = [
267
- summaryLine,
268
- buildPanelLine(width, [
269
- ['Suite'.padEnd(28), C.header],
270
- ['Score'.padEnd(8), C.header],
271
- ['Pass'.padEnd(6), C.header],
272
- ['Gate'.padEnd(6), C.header],
273
- ['Duration', C.header],
274
- ]),
275
- ];
276
-
277
- suites.forEach((suite, idx) => {
278
- const selected = idx === this._selectedSuiteIdx;
279
- const gate = gateMap.get(suite.suite);
280
- const gateStr = gate ? (gate.passed ? 'ok' : 'FAIL') : '-';
281
- const gateColor = gate ? (gate.passed ? C.green : C.red) : C.dim;
282
- const durationMs = suite.finishedAt - suite.startedAt;
283
- const scoreC = scoreColor(suite.meanScore);
284
- const passC = suite.passed ? C.green : C.red;
285
- const nameColor = selected ? C.selected : C.white;
286
- const bg = selected ? C.selectBg : undefined;
287
- const prefix = selected ? '▸ ' : ' ';
288
- const name = suite.suite.slice(0, 24).padEnd(26);
289
-
290
- sectionLines.push(buildPanelLine(width, [
291
- [prefix + name, nameColor, bg],
292
- [suite.meanScore.toFixed(1).padEnd(8), scoreC, bg],
293
- [(suite.passed ? 'PASS' : 'FAIL').padEnd(6), passC, bg],
294
- [gateStr.padEnd(6), gateColor, bg],
295
- [fmtTime(durationMs), C.dim, bg],
296
- ]));
297
- });
298
-
299
- sectionLines.push(buildPanelLine(width, [[' Enter/l: detail j/k: navigate', C.dim]]));
300
- lines.push(...buildPanelWorkspace(width, _height, {
301
- title: 'Eval Harness',
302
- intro,
303
- sections: [{ title: 'Suites', lines: sectionLines }],
304
- palette: C,
305
- }));
306
- }
307
-
308
- // ── Detail view ──────────────────────────────────────────────────────────────
309
-
310
- private _renderDetail(
311
- lines: Line[],
312
- suite: EvalSuiteResult,
313
- width: number,
314
- height: number,
315
- intro: string,
316
- summaryLine: Line,
317
- ): void {
318
- const sectionLines: Line[] = [
319
- summaryLine,
320
- buildPanelLine(width, [
321
- [`Suite: ${suite.suite}`, C.cyan],
322
- [' mean=', C.label],
323
- [suite.meanScore.toFixed(1), scoreColor(suite.meanScore)],
324
- [' ', C.label],
325
- [suite.passed ? 'PASS' : 'FAIL', suite.passed ? C.green : C.red],
326
- ]),
327
- ];
328
-
329
- const allDetailLines: Line[] = [];
330
- suite.results.forEach((result, idx) => {
331
- const selected = idx === this._selectedScenarioIdx;
332
- this._renderScenarioBlock(allDetailLines, result, selected, width);
333
- });
334
-
335
- const detailSection = resolveScrollablePanelSection(width, height, {
336
- intro,
337
- palette: C,
338
- beforeSections: [{ title: 'Scenario Detail', lines: sectionLines }],
339
- section: {
340
- scrollableLines: allDetailLines,
341
- scrollOffset: this._scrollOffset,
342
- minRows: 1,
343
- },
344
- });
345
- this._scrollOffset = detailSection.scrollOffset;
346
- sectionLines.push(...detailSection.section.lines);
347
- sectionLines.push(buildPanelLine(width, [[' Esc/q: back j/k: scenario PgUp/PgDn: scroll', C.dim]]));
348
- lines.push(...buildPanelWorkspace(width, height, {
349
- title: 'Eval Harness',
350
- intro,
351
- sections: [{ title: 'Scenario Detail', lines: sectionLines }],
352
- palette: C,
353
- }));
354
- }
355
-
356
- private _renderScenarioBlock(
357
- lines: Line[],
358
- result: EvalResult,
359
- selected: boolean,
360
- width: number,
361
- ): void {
362
- const sc = result.scorecard;
363
- const prefix = selected ? '▸ ' : ' ';
364
- const nameColor = selected ? C.selected : C.white;
365
- const scoreC = scoreColor(sc.compositeScore);
366
- const passC = sc.passed ? C.green : C.red;
367
- const nameLen = Math.max(1, width - 22);
368
-
369
- lines.push(buildPanelLine(width, [
370
- [prefix + result.scenario.name.slice(0, nameLen).padEnd(nameLen + 2), nameColor, selected ? C.selectBg : undefined],
371
- [sc.compositeScore.toFixed(1).padStart(5), scoreC, selected ? C.selectBg : undefined],
372
- [' ', C.label, selected ? C.selectBg : undefined],
373
- [sc.passed ? 'PASS' : 'FAIL', passC, selected ? C.selectBg : undefined],
374
- ]));
375
-
376
- if (selected) {
377
- for (const dim of DIMENSION_ORDER) {
378
- const d = sc.dimensions.find((x) => x.dimension === dim);
379
- if (!d) continue;
380
- const filled = Math.round(d.score / 10);
381
- const bar = '#'.repeat(filled) + '.'.repeat(10 - filled);
382
- lines.push(buildPanelLine(width, [
383
- [' ' + dim.padEnd(10) + ' ', C.label],
384
- [bar, scoreColor(d.score)],
385
- [` ${d.score.toFixed(0).padStart(3)}/100`, C.value],
386
- ]));
387
- }
388
-
389
- if (sc.notes && sc.notes.length > 0) {
390
- for (const note of sc.notes) {
391
- lines.push(buildPanelLine(width, [
392
- [' ! ', C.yellow],
393
- [note.slice(0, width - 6), C.yellow],
394
- ]));
395
- }
396
- }
397
- }
398
- }
399
- }