@pellux/goodvibes-agent 0.1.54 → 0.1.56

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  All notable changes to GoodVibes Agent will be recorded here.
4
4
 
5
+ ## 0.1.56 - 2026-05-31
6
+
7
+ - a0b54e8 Document Bun global PATH setup
8
+ - f845cca Rename onboarding capabilities step
9
+ - 4462e52 Clarify Bun-only install path
10
+
11
+ ## 0.1.55 - 2026-05-31
12
+
13
+ - d8f4eee Remove copied developer audit surfaces
14
+
5
15
  ## 0.1.54 - 2026-05-31
6
16
 
7
17
  - dc1a290 Keep release docs version-neutral
package/README.md CHANGED
@@ -19,6 +19,13 @@ goodvibes-agent profiles templates
19
19
  goodvibes-agent knowledge status
20
20
  ```
21
21
 
22
+ If `goodvibes-agent` is not found after installation, add Bun's global bin directory to `PATH`:
23
+
24
+ ```sh
25
+ export PATH="$(bun pm bin -g):$PATH"
26
+ goodvibes-agent --help
27
+ ```
28
+
22
29
  If Bun reports untrusted lifecycle dependencies, trust only the package and dependencies required by this package:
23
30
 
24
31
  ```sh
@@ -18,6 +18,13 @@ goodvibes-agent --help
18
18
  goodvibes-agent status
19
19
  ```
20
20
 
21
+ If the installed command is not found, add Bun's global bin directory to `PATH`:
22
+
23
+ ```sh
24
+ export PATH="$(bun pm bin -g):$PATH"
25
+ goodvibes-agent --help
26
+ ```
27
+
21
28
  If Bun requires lifecycle trust:
22
29
 
23
30
  ```sh
@@ -4,13 +4,22 @@ GoodVibes Agent's current installable public alpha version is recorded in `packa
4
4
 
5
5
  ## Package Identity
6
6
 
7
- - npm package: `@pellux/goodvibes-agent`
7
+ - registry package: `@pellux/goodvibes-agent`
8
8
  - executable: `goodvibes-agent`
9
9
  - SDK dependency: exact pin to `@pellux/goodvibes-sdk@0.33.35`
10
10
  - runtime: Bun
11
11
  - source language: TypeScript
12
12
  - daemon ownership: external only
13
13
 
14
+ End users install and run GoodVibes Agent with Bun:
15
+
16
+ ```sh
17
+ bun add -g @pellux/goodvibes-agent
18
+ goodvibes-agent --help
19
+ ```
20
+
21
+ Do not add non-Bun install instructions for this product. The package is hosted on the npm registry, but the supported install and smoke path is Bun.
22
+
14
23
  ## Required Gates
15
24
 
16
25
  Before any release candidate:
@@ -25,7 +34,7 @@ bun pm pack --dry-run
25
34
  git diff --check
26
35
  ```
27
36
 
28
- `bun run publish:package` publishes from a staged package directory. If `NPM_CONFIG_USERCONFIG` is already set, npm uses it. Otherwise the script creates a temporary 0600 npm userconfig from `NODE_AUTH_TOKEN` or `NPM_TOKEN`, uses it for that publish command, and removes it with the staging directory.
37
+ `bun run publish:package` publishes from a staged package directory to the package registry. If `NPM_CONFIG_USERCONFIG` is already set, the registry publish command uses it. Otherwise the script creates a temporary 0600 registry userconfig from `NODE_AUTH_TOKEN` or `NPM_TOKEN`, uses it for that publish command, and removes it with the staging directory.
29
38
 
30
39
  Also run the package install smoke from a packed artifact. It must prove:
31
40
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pellux/goodvibes-agent",
3
- "version": "0.1.54",
3
+ "version": "0.1.56",
4
4
  "private": false,
5
5
  "description": "Near-fork GoodVibes operator assistant with the GoodVibes TUI shell, renderer, input, fullscreen workspace, and daemon-connected Agent product brain.",
6
6
  "type": "module",
@@ -65,10 +65,7 @@
65
65
  "architecture:check": "bun run scripts/check-architecture.ts",
66
66
  "foundation:artifacts": "bun run scripts/export-foundation-artifacts.ts",
67
67
  "verification:ledger": "bun run scripts/verification-ledger.ts",
68
- "verification:live": "bun run scripts/verify-live.ts",
69
- "eval:gate": "bun run scripts/eval-gate.ts",
70
- "eval:gate:verbose": "bun run scripts/eval-gate.ts --verbose",
71
- "eval:baseline": "bun run scripts/eval-gate.ts --save-baseline"
68
+ "verification:live": "bun run scripts/verify-live.ts"
72
69
  },
73
70
  "license": "MIT",
74
71
  "repository": {
@@ -5,9 +5,13 @@ if ! command -v bun >/dev/null 2>&1; then
5
5
  cat >&2 <<'EOF'
6
6
  goodvibes-agent requires Bun.
7
7
 
8
- Install Bun first, then install GoodVibes Agent from the npm registry with:
8
+ Install Bun first, then install GoodVibes Agent with:
9
9
 
10
10
  bun add -g @pellux/goodvibes-agent
11
+
12
+ If the installed command is not found, add Bun's global bin directory to PATH:
13
+
14
+ export PATH="$(bun pm bin -g):$PATH"
11
15
  EOF
12
16
  exit 1
13
17
  fi
@@ -177,7 +177,6 @@ export interface CommandOpsServices
177
177
  export interface CommandExtensionRegistryServices {
178
178
  readonly toolRegistry: ToolRegistry;
179
179
  readonly mcpRegistry: McpRegistry;
180
- readonly evalRegistry?: import('../panels/eval-panel.ts').EvalRegistry;
181
180
  }
182
181
 
183
182
  export interface CommandExtensionServices
@@ -67,7 +67,7 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
67
67
  registry.register({
68
68
  name: 'tools',
69
69
  aliases: ['t'],
70
- description: 'List available tools and review compact native tool capability surfaces',
70
+ description: 'List available tools and review tool safety/status',
71
71
  usage: '[review|panel]',
72
72
  handler(args, ctx) {
73
73
  const sub = (args[0] ?? '').toLowerCase();
@@ -79,12 +79,11 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
79
79
  }
80
80
  if (sub === 'review') {
81
81
  ctx.print([
82
- 'Tool Surface Review',
83
- ' Native file tools stay compact by default.',
84
- ' Read/write/edit/notebook capabilities are available through the native tool stack, with detail routed to the tools panel and approval surfaces instead of transcript bloat.',
85
- ' Shell and native tool approvals classify work into read, mutation, destructive, dependency, config, notebook, network, remote, and lifecycle risk families.',
86
- ' Use /tools panel to inspect risk class, output-policy actions, spill posture, compact summaries, and approval posture for recent calls.',
87
- ' Use /approval review shell or /approval review file when you need the action-specific why-prompted posture.',
82
+ 'Tool Status',
83
+ ' Tools are available for the main Agent conversation.',
84
+ ' Read-only actions can run directly; writes, destructive changes, network effects, service changes, and external side effects require explicit user intent or approval.',
85
+ ' Recent tool activity and approval posture are available in the tools and approvals views.',
86
+ ' Build/fix/review work should be delegated explicitly with /delegate.',
88
87
  ].join('\n'));
89
88
  }
90
89
  return;
@@ -1,6 +1,5 @@
1
1
  import type { CommandRegistry } from '../command-registry.ts';
2
2
  import type { ProfileData } from '@pellux/goodvibes-sdk/platform/profiles';
3
- import { ToolContractVerifier } from '@/runtime/index.ts';
4
3
  import type { ReplaySnapshotInput } from '@/runtime/index.ts';
5
4
  import { logger } from '@pellux/goodvibes-sdk/platform/utils';
6
5
  import { registerOperatorPanelCommand } from './operator-panel-runtime.ts';
@@ -283,55 +282,6 @@ export function registerOperatorRuntimeCommands(registry: CommandRegistry): void
283
282
  },
284
283
  });
285
284
 
286
- registry.register({
287
- name: 'tool',
288
- description: 'Tool contract verification — verify registered tool contracts',
289
- usage: 'verify <name> | verify-all | contract show <name>',
290
- argsHint: 'verify <name> | verify-all | contract show <name>',
291
- handler(args, ctx) {
292
- const sub = args[0];
293
- if (sub === 'verify' && args[1]) {
294
- const result = ctx.extensions.toolRegistry.verifyContract(args[1]);
295
- if (!result) {
296
- ctx.print(`[tool verify] Tool '${args[1]}' is not registered.`);
297
- return;
298
- }
299
- ctx.print(ToolContractVerifier.formatResult(result));
300
- return;
301
- }
302
- if (sub === 'verify-all') {
303
- ctx.print(ToolContractVerifier.formatAllResults(ctx.extensions.toolRegistry.verifyAllContracts()));
304
- return;
305
- }
306
- if (sub === 'contract' && args[1] === 'show' && args[2]) {
307
- const toolName = args[2];
308
- const result = ctx.extensions.toolRegistry.verifyContract(toolName);
309
- if (!result) {
310
- ctx.print(`[tool contract show] Tool '${toolName}' is not registered.`);
311
- return;
312
- }
313
- const lines: string[] = [ToolContractVerifier.formatResult(result)];
314
- const tool = ctx.extensions.toolRegistry.list().find((t) => t.definition.name === toolName);
315
- if (tool) {
316
- lines.push('');
317
- lines.push('Tool Definition:');
318
- lines.push(` Name: ${tool.definition.name}`);
319
- lines.push(` Description: ${tool.definition.description}`);
320
- lines.push(` Parameters: ${JSON.stringify(tool.definition.parameters, null, 2).replace(/\n/g, '\n ')}`);
321
- }
322
- ctx.print(lines.join('\n'));
323
- return;
324
- }
325
-
326
- ctx.print(
327
- 'Usage: /tool <subcommand>\n'
328
- + ' /tool verify <name> — verify contract for a specific registered tool\n'
329
- + ' /tool verify-all — verify contracts for all registered tools\n'
330
- + ' /tool contract show <name> — show full contract details for a tool'
331
- );
332
- },
333
- });
334
-
335
285
  registry.register({
336
286
  name: 'forensics',
337
287
  aliases: ['foren'],
@@ -1,9 +1,7 @@
1
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
- import { dirname, join, resolve } from 'node:path';
1
+ import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { dirname } from 'node:path';
3
3
  import type { CommandContext, CommandRegistry } from '../command-registry.ts';
4
- import { listInstalledEcosystemEntries, loadEcosystemCatalog } from '@/runtime/index.ts';
5
- import { BUILTIN_SUITES } from '@/runtime/index.ts';
6
- import { requireEcosystemCatalogPaths, requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
4
+ import { requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
7
5
  import { requireYesFlag, stripYesFlag } from './confirmation.ts';
8
6
 
9
7
  interface TrustReviewBundle {
@@ -29,29 +27,6 @@ interface TrustReviewBundle {
29
27
  };
30
28
  }
31
29
 
32
- interface ReleaseBundle {
33
- readonly version: 1;
34
- readonly capturedAt: number;
35
- readonly runtime: {
36
- readonly provider: string;
37
- readonly model: string;
38
- readonly sessionId: string;
39
- };
40
- readonly evalSuites: readonly string[];
41
- readonly incidentCount: number;
42
- readonly remote: {
43
- readonly pools: number;
44
- readonly contracts: number;
45
- readonly artifacts: number;
46
- };
47
- readonly ecosystem: {
48
- readonly pluginCatalog: number;
49
- readonly skillCatalog: number;
50
- readonly installedPlugins: number;
51
- readonly installedSkills: number;
52
- };
53
- }
54
-
55
30
  function countByMode<T extends string>(values: readonly T[], mode: T): number {
56
31
  return values.filter((value) => value === mode).length;
57
32
  }
@@ -112,46 +87,6 @@ function inspectTrustBundle(path: string): string {
112
87
  ].join('\n');
113
88
  }
114
89
 
115
- function buildReleaseBundle(ctx: Parameters<NonNullable<CommandRegistry['register']>>[0]['handler'] extends (args: string[], context: infer C) => unknown ? C : never): ReleaseBundle {
116
- const remoteRuntime = ctx.ops.remoteRuntime;
117
- const incidents = ctx.extensions.forensicsRegistry?.getAll() ?? [];
118
- const ecosystemPaths = requireEcosystemCatalogPaths(ctx);
119
- return {
120
- version: 1,
121
- capturedAt: Date.now(),
122
- runtime: {
123
- provider: ctx.session.runtime.provider,
124
- model: ctx.session.runtime.model,
125
- sessionId: ctx.session.runtime.sessionId,
126
- },
127
- evalSuites: Object.keys(BUILTIN_SUITES),
128
- incidentCount: incidents.length,
129
- remote: {
130
- pools: remoteRuntime?.listPools().length ?? 0,
131
- contracts: remoteRuntime?.listContracts().length ?? 0,
132
- artifacts: remoteRuntime?.listArtifacts().length ?? 0,
133
- },
134
- ecosystem: {
135
- pluginCatalog: loadEcosystemCatalog('plugin', ecosystemPaths).length,
136
- skillCatalog: loadEcosystemCatalog('skill', ecosystemPaths).length,
137
- installedPlugins: listInstalledEcosystemEntries('plugin', ecosystemPaths).length,
138
- installedSkills: listInstalledEcosystemEntries('skill', ecosystemPaths).length,
139
- },
140
- };
141
- }
142
-
143
- function inspectReleaseBundle(path: string): string {
144
- const parsed = JSON.parse(readFileSync(path, 'utf-8')) as ReleaseBundle;
145
- return [
146
- 'Release Bundle Review',
147
- ` provider/model: ${parsed.runtime.provider || '(unset)'}/${parsed.runtime.model || '(unset)'}`,
148
- ` eval suites: ${parsed.evalSuites.length}`,
149
- ` incidents: ${parsed.incidentCount}`,
150
- ` remote pools/contracts/artifacts: ${parsed.remote.pools}/${parsed.remote.contracts}/${parsed.remote.artifacts}`,
151
- ` ecosystem catalog plugins/skills: ${parsed.ecosystem.pluginCatalog}/${parsed.ecosystem.skillCatalog}`,
152
- ].join('\n');
153
- }
154
-
155
90
  export function registerProductRuntimeCommands(registry: CommandRegistry): void {
156
91
  registry.register({
157
92
  name: 'trust',
@@ -314,65 +249,4 @@ export function registerProductRuntimeCommands(registry: CommandRegistry): void
314
249
  },
315
250
  });
316
251
 
317
- registry.register({
318
- name: 'release',
319
- description: 'Package certification and release-readiness operations',
320
- usage: '[review|checklist|bundle export <path> --yes|bundle inspect <path>]',
321
- handler(args, ctx) {
322
- const parsed = stripYesFlag(args);
323
- const commandArgs = [...parsed.rest];
324
- const shellPaths = requireShellPaths(ctx);
325
- const sub = commandArgs[0] ?? 'review';
326
- if (sub === 'review') {
327
- const bundle = buildReleaseBundle(ctx);
328
- ctx.print([
329
- 'Release Review',
330
- ` provider/model: ${bundle.runtime.provider || '(unset)'}/${bundle.runtime.model || '(unset)'}`,
331
- ` eval suites: ${bundle.evalSuites.length}`,
332
- ` incidents: ${bundle.incidentCount}`,
333
- ` remote pools/contracts/artifacts: ${bundle.remote.pools}/${bundle.remote.contracts}/${bundle.remote.artifacts}`,
334
- ` ecosystem catalog plugins/skills: ${bundle.ecosystem.pluginCatalog}/${bundle.ecosystem.skillCatalog}`,
335
- ` installed plugins/skills: ${bundle.ecosystem.installedPlugins}/${bundle.ecosystem.installedSkills}`,
336
- ].join('\n'));
337
- return;
338
- }
339
- if (sub === 'checklist') {
340
- ctx.print([
341
- 'Release Checklist',
342
- ' 1. Run /setup review and /setup doctor',
343
- ' 2. Run /security review and /trust review',
344
- ' 3. Run /policy preflight and /policy simulate',
345
- ' 4. Run /eval gate <suite> --yes for required certification suites',
346
- ' 5. Review /incident latest and /bridge status',
347
- ' 6. Export /release bundle export <path> --yes for release evidence',
348
- ].join('\n'));
349
- return;
350
- }
351
- if (sub === 'bundle') {
352
- const mode = commandArgs[1];
353
- const pathArg = commandArgs[2];
354
- if ((mode === 'export' || mode === 'inspect') && !pathArg) {
355
- ctx.print(`Usage: /release bundle ${mode} <path>${mode === 'export' ? ' --yes' : ''}`);
356
- return;
357
- }
358
- if (mode === 'export') {
359
- if (!parsed.yes) {
360
- requireYesFlag(ctx, `export release bundle to ${pathArg}`, '/release bundle export <path> --yes');
361
- return;
362
- }
363
- const bundle = buildReleaseBundle(ctx);
364
- const targetPath = shellPaths.resolveWorkspacePath(pathArg!);
365
- mkdirSync(dirname(targetPath), { recursive: true });
366
- writeFileSync(targetPath, JSON.stringify(bundle, null, 2) + '\n', 'utf-8');
367
- ctx.print(`Release bundle exported to ${targetPath}`);
368
- return;
369
- }
370
- if (mode === 'inspect') {
371
- ctx.print(inspectReleaseBundle(shellPaths.resolveWorkspacePath(pathArg!)));
372
- return;
373
- }
374
- }
375
- ctx.print('Usage: /release [review|checklist|bundle export <path> --yes|bundle inspect <path>]');
376
- },
377
- });
378
252
  }
@@ -1,7 +1,6 @@
1
1
  import type { CommandRegistry } from './command-registry.ts';
2
2
  import { policyCommand } from './commands/policy.ts';
3
3
  import { providerCommand } from './commands/provider.ts';
4
- import { evalCommand } from './commands/eval.ts';
5
4
  import { sessionCommand } from './commands/session.ts';
6
5
  import { recallCommand } from './commands/memory.ts';
7
6
  import { knowledgeCommand } from './commands/knowledge.ts';
@@ -126,9 +125,6 @@ export function registerBuiltinCommands(registry: CommandRegistry): void {
126
125
  // ── /provider ─────────────────────────────────────────────────────────────
127
126
  registry.register(providerCommand);
128
127
 
129
- // ── /eval ─────────────────────────────────────────────────────────────────
130
- registry.register(evalCommand);
131
-
132
128
  // ── /session ─────────────────────────────────────────────────────────────
133
129
  registry.register(sessionCommand);
134
130
 
@@ -130,12 +130,12 @@ export function buildCapabilitiesStep(controller: OnboardingWizardController): O
130
130
 
131
131
  return {
132
132
  id: 'capabilities',
133
- title: 'Choose GoodVibes surfaces',
134
- shortLabel: 'Capabilities',
133
+ title: 'Choose Agent surfaces',
134
+ shortLabel: 'Surfaces',
135
135
  description: 'Choose what Agent should prepare locally. Daemon-backed surfaces are reviewed as external dependencies; Agent does not enable service mode or autostart.',
136
136
  summaryTitle: 'Selected surfaces',
137
137
  summaryLines: [
138
- `${selectedCount}/${capabilities.length} option(s) selected`,
138
+ `${selectedCount}/${capabilities.length} surface option(s) selected`,
139
139
  `Mode: ${controller.mode === 'edit' ? 'edit existing shell state' : controller.mode === 'reopen' ? 'reopen review flow' : 'new setup'}`,
140
140
  controller.runtimeSnapshot?.collectionIssues.length
141
141
  ? `${controller.runtimeSnapshot.collectionIssues.length} runtime collection issue(s)`
@@ -27,7 +27,6 @@ import { DebugPanel } from '../debug-panel.ts';
27
27
  import { IncidentReviewPanel } from '../incident-review-panel.ts';
28
28
  import { ForensicsPanel } from '../forensics-panel.ts';
29
29
  import { PolicyPanel } from '../policy-panel.ts';
30
- import { EvalPanel } from '../eval-panel.ts';
31
30
  import { createProviderAccountSnapshotQuery } from '../provider-account-snapshot.ts';
32
31
  import {
33
32
  createEnvironmentVariableQuery,
@@ -335,15 +334,4 @@ export function registerOperationsPanels(manager: PanelManager, deps: ResolvedBu
335
334
  factory: () => new PolicyPanel(deps.policyRuntimeState),
336
335
  });
337
336
 
338
- if (deps.evalRegistry) {
339
- const { evalRegistry } = deps;
340
- manager.registerType({
341
- id: 'eval',
342
- name: 'Eval',
343
- icon: 'Y',
344
- category: 'monitoring',
345
- description: 'Evaluation harness: benchmark suite results, scorecards, and regression gates',
346
- factory: () => new EvalPanel(evalRegistry),
347
- });
348
- }
349
337
  }
@@ -62,8 +62,6 @@ export interface BuiltinPanelDeps {
62
62
  dismissPlanning?: () => void;
63
63
  /** ForensicsRegistry for the Forensics panel. */
64
64
  forensicsRegistry?: import('@/runtime/index.ts').ForensicsRegistry;
65
- /** EvalRegistry for the Eval panel. */
66
- evalRegistry?: import('../eval-panel.ts').EvalRegistry;
67
65
  /** MemoryRegistry for the Memory panel. */
68
66
  memoryRegistry?: MemoryRegistry;
69
67
  /** Isolated Agent Knowledge service for the Agent Knowledge panel. */
package/src/version.ts CHANGED
@@ -6,7 +6,7 @@ import { join } from 'node:path';
6
6
  // The prebuild script updates the fallback value before compilation.
7
7
  // Uses import.meta.dir (Bun) to locate package.json relative to this file,
8
8
  // which is correct regardless of the process working directory.
9
- let _version = '0.1.54';
9
+ let _version = '0.1.56';
10
10
  let _sdkVersion = '0.33.35';
11
11
  try {
12
12
  const pkg = JSON.parse(readFileSync(join(import.meta.dir, '..', 'package.json'), 'utf-8')) as {
@@ -1,217 +0,0 @@
1
- /**
2
- * /eval command handler.
3
- *
4
- * Implements the Evaluation Harness commands:
5
- *
6
- * /eval list — List all available eval suites
7
- * /eval run <suite> --yes — Run a named suite (or 'all')
8
- * /eval compare <baseline-file> — Compare last run against a baseline file
9
- * /eval gate <suite> --yes — Run suite and apply CI gate (exits 1 on regression)
10
- */
11
-
12
- import type { SlashCommand, CommandContext } from '../command-registry.ts';
13
- import { EvalRunner } from '@/runtime/index.ts';
14
- import { BUILTIN_SUITES } from '@/runtime/index.ts';
15
- import { formatScorecard } from '@/runtime/index.ts';
16
- import { loadBaseline, captureBaseline, formatBaselineComparison, writeBaseline } from '@/runtime/index.ts';
17
- import type { EvalRegistry } from '../../panels/eval-panel.ts';
18
- import { formatSuiteResult, formatGateResult } from '@/runtime/index.ts';
19
- import { requireShellPaths } from './runtime-services.ts';
20
- import { summarizeError } from '@pellux/goodvibes-sdk/platform/utils';
21
- import { requireYesFlag, stripYesFlag } from './confirmation.ts';
22
-
23
- // ── Subcommand helpers ────────────────────────────────────────────────────────
24
-
25
- function printSuiteList(context: CommandContext): void {
26
- context.print('[eval] Available suites:');
27
- for (const [name, scenarios] of Object.entries(BUILTIN_SUITES)) {
28
- context.print(` ${name} (${scenarios.length} scenarios)`);
29
- for (const s of scenarios) {
30
- context.print(` - ${s.id}: ${s.name}`);
31
- }
32
- }
33
- context.print('[eval] Usage: /eval run <suite> --yes or /eval run all --yes');
34
- }
35
-
36
- function getRegistry(context: CommandContext): EvalRegistry | undefined {
37
- return context.extensions.evalRegistry;
38
- }
39
-
40
- // ── /eval list ────────────────────────────────────────────────────────────────
41
-
42
- function handleList(_args: string[], context: CommandContext): void {
43
- printSuiteList(context);
44
- }
45
-
46
- // ── /eval run ────────────────────────────────────────────────────────────────
47
-
48
- async function handleRun(args: string[], context: CommandContext): Promise<void> {
49
- const { rest, yes } = stripYesFlag(args);
50
- const suiteName = rest[0] ?? 'all';
51
- const registry = getRegistry(context);
52
-
53
- const suitesToRun =
54
- suiteName === 'all'
55
- ? Object.keys(BUILTIN_SUITES)
56
- : BUILTIN_SUITES[suiteName]
57
- ? [suiteName]
58
- : null;
59
-
60
- if (!suitesToRun) {
61
- context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
62
- return;
63
- }
64
- if (!yes) {
65
- requireYesFlag(context, `run eval suite ${suiteName}`, '/eval run <suite|all> --yes');
66
- return;
67
- }
68
-
69
- const runner = new EvalRunner();
70
- registry?.setRunning(true);
71
-
72
- for (const name of suitesToRun) {
73
- const scenarios = BUILTIN_SUITES[name];
74
- if (!scenarios) continue;
75
-
76
- context.print(`[eval] Running suite: ${name} (${scenarios.length} scenarios)...`);
77
- const result = await runner.runSuite(name, scenarios);
78
- registry?.push(result);
79
-
80
- context.print(formatSuiteResult(result));
81
-
82
- for (const r of result.results) {
83
- context.print(formatScorecard(r.scorecard));
84
- }
85
- }
86
-
87
- registry?.setRunning(false);
88
- }
89
-
90
- // ── /eval compare ─────────────────────────────────────────────────────────────
91
-
92
- async function handleCompare(args: string[], context: CommandContext): Promise<void> {
93
- const baselineFile = args[0] ?? '.goodvibes/eval/baseline.json';
94
- const registry = getRegistry(context);
95
- const projectRoot = requireShellPaths(context).workingDirectory;
96
- const suiteResults = registry?.getSuiteResults() ?? [];
97
-
98
- if (suiteResults.length === 0) {
99
- context.print('[eval] No suite results to compare. Run /eval run <suite> --yes first.');
100
- return;
101
- }
102
-
103
- const baseline = await loadBaseline(baselineFile, projectRoot);
104
- if (!baseline) {
105
- context.print(`[eval] Baseline file not found: ${baselineFile}`);
106
- context.print('[eval] Tip: run /eval gate <suite> [baseline-file] --save-baseline --yes to create a baseline.');
107
- return;
108
- }
109
-
110
- for (const result of suiteResults) {
111
- context.print(formatBaselineComparison(baseline, result));
112
- }
113
- }
114
-
115
- // ── /eval gate ────────────────────────────────────────────────────────────────
116
-
117
- async function handleGate(args: string[], context: CommandContext): Promise<void> {
118
- const { rest, yes } = stripYesFlag(args);
119
- const positional = rest.filter((arg) => arg !== '--save-baseline');
120
- const suiteName = positional[0];
121
- const baselineFile = positional[1] ?? '.goodvibes/eval/baseline.json';
122
- const saveFlag = rest.includes('--save-baseline');
123
- const projectRoot = requireShellPaths(context).workingDirectory;
124
-
125
- if (!suiteName) {
126
- context.print('[eval] Usage: /eval gate <suite> [baseline-file] [--save-baseline] --yes');
127
- return;
128
- }
129
-
130
- const scenarios = BUILTIN_SUITES[suiteName];
131
- if (!scenarios) {
132
- context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
133
- return;
134
- }
135
- if (!yes) {
136
- requireYesFlag(context, `run eval gate ${suiteName}`, '/eval gate <suite> [baseline-file] [--save-baseline] --yes');
137
- return;
138
- }
139
-
140
- const registry = getRegistry(context);
141
- const runner = new EvalRunner();
142
-
143
- context.print(`[eval] Gate: running suite "${suiteName}"...`);
144
- registry?.setRunning(true);
145
- const fresh = await runner.runSuite(suiteName, scenarios);
146
- registry?.push(fresh);
147
- registry?.setRunning(false);
148
-
149
- const baseline = await loadBaseline(baselineFile, projectRoot);
150
- const gate = runner.evaluateGate(fresh, baseline);
151
- registry?.pushGate(gate);
152
-
153
- context.print(formatGateResult(gate));
154
-
155
- if (saveFlag || !baseline) {
156
- const label = suiteName ?? 'latest';
157
- const newBaseline = captureBaseline(label, [fresh]);
158
- try {
159
- await writeBaseline(baselineFile, newBaseline, projectRoot);
160
- context.print(`[eval] Baseline saved to ${baselineFile}`);
161
- } catch (err) {
162
- context.print(`[eval] Warning: could not save baseline: ${summarizeError(err)}`);
163
- }
164
- }
165
-
166
- if (!gate.passed) {
167
- context.print(`[eval] Gate FAILED: ${gate.regressions.length} regression(s) detected.`);
168
- } else {
169
- context.print('[eval] Gate PASSED.');
170
- }
171
- }
172
-
173
- // ── Top-level command ─────────────────────────────────────────────────────────
174
-
175
- export const evalCommand: SlashCommand = {
176
- name: 'eval',
177
- description: 'Evaluation harness: run benchmark suites, compare baselines, and gate regressions.',
178
- usage: '<subcommand> [args]',
179
- argsHint: 'list|run <suite> --yes|compare <baseline>|gate <suite> --yes',
180
- handler: async (args: string[], context: CommandContext): Promise<void> => {
181
- const [sub, ...rest] = args;
182
-
183
- switch (sub) {
184
- case 'list':
185
- case 'ls':
186
- handleList(rest, context);
187
- break;
188
-
189
- case 'run':
190
- await handleRun(rest, context);
191
- break;
192
-
193
- case 'compare':
194
- case 'cmp':
195
- await handleCompare(rest, context);
196
- break;
197
-
198
- case 'gate':
199
- await handleGate(rest, context);
200
- break;
201
-
202
- default: {
203
- const usage = [
204
- 'Usage: /eval <subcommand>',
205
- ' list — List all available eval suites',
206
- ' run <suite|all> --yes — Run a named suite (or all suites)',
207
- ' compare [baseline-file] — Compare last results against baseline',
208
- ' gate <suite> [baseline-file] --yes',
209
- ' — Run suite and apply regression gate',
210
- ' --save-baseline — Save fresh run as new baseline',
211
- ].join('\n');
212
- context.print(usage);
213
- break;
214
- }
215
- }
216
- },
217
- };
@@ -1,399 +0,0 @@
1
- /**
2
- * Eval Panel — renders evaluation harness results in list and detail modes.
3
- *
4
- * Displays suite run summaries, per-scenario scorecards, and regression
5
- * indicators. Wired with an EvalRegistry that holds the latest run results.
6
- */
7
-
8
- import { BasePanel } from './base-panel.ts';
9
- import type { Line } from '../types/grid.ts';
10
- import { createEmptyLine } from '../types/grid.ts';
11
- import {
12
- buildEmptyState,
13
- buildPanelLine,
14
- buildPanelWorkspace,
15
- resolveScrollablePanelSection,
16
- DEFAULT_PANEL_PALETTE,
17
- } from './polish.ts';
18
-
19
- // ── EvalRegistry ─────────────────────────────────────────────────────────────
20
-
21
- import type {
22
- EvalSuiteResult,
23
- EvalResult,
24
- EvalGateResult,
25
- EvalDimension,
26
- } from '@/runtime/index.ts';
27
-
28
- /**
29
- * Holds the latest eval run state for display in EvalPanel.
30
- * Created externally, injected into the panel.
31
- */
32
- export class EvalRegistry {
33
- private _suiteResults: EvalSuiteResult[] = [];
34
- private _gateResults: EvalGateResult[] = [];
35
- private _running = false;
36
- private _lastRunAt: number | null = null;
37
- private readonly _subscribers = new Set<() => void>();
38
-
39
- push(result: EvalSuiteResult): void {
40
- const idx = this._suiteResults.findIndex((r) => r.suite === result.suite);
41
- if (idx >= 0) {
42
- this._suiteResults[idx] = result;
43
- } else {
44
- this._suiteResults.push(result);
45
- }
46
- this._lastRunAt = Date.now();
47
- this._notify();
48
- }
49
-
50
- pushGate(gate: EvalGateResult): void {
51
- const idx = this._gateResults.findIndex((g) => g.suite === gate.suite);
52
- if (idx >= 0) {
53
- this._gateResults[idx] = gate;
54
- } else {
55
- this._gateResults.push(gate);
56
- }
57
- this._notify();
58
- }
59
-
60
- setRunning(running: boolean): void {
61
- this._running = running;
62
- this._notify();
63
- }
64
-
65
- isRunning(): boolean { return this._running; }
66
- getLastRunAt(): number | null { return this._lastRunAt; }
67
- getSuiteResults(): EvalSuiteResult[] { return this._suiteResults; }
68
- getGateResults(): EvalGateResult[] { return this._gateResults; }
69
-
70
- subscribe(cb: () => void): () => void {
71
- this._subscribers.add(cb);
72
- return () => this._subscribers.delete(cb);
73
- }
74
-
75
- private _notify(): void {
76
- for (const cb of this._subscribers) cb();
77
- }
78
- }
79
-
80
- // ── Colour palette (hex fg colours for createStyledCell) ─────────────────────
81
-
82
- const C = {
83
- ...DEFAULT_PANEL_PALETTE,
84
- header: '#94a3b8',
85
- headerBg: '#1e293b',
86
- cyan: '#38bdf8',
87
- green: '#22c55e',
88
- yellow: '#eab308',
89
- red: '#ef4444',
90
- dim: '#4b5563',
91
- label: '#64748b',
92
- value: '#e2e8f0',
93
- selected: '#f1f5f9',
94
- sep: '#1e293b',
95
- white: '#cbd5e1',
96
- selectBg: '#0f172a',
97
- } as const;
98
-
99
- // ── Helpers ───────────────────────────────────────────────────────────────────
100
-
101
- function scoreColor(score: number): string {
102
- if (score >= 80) return C.green;
103
- if (score >= 60) return C.yellow;
104
- return C.red;
105
- }
106
-
107
- function fmtTime(ms: number): string {
108
- if (ms < 1000) return `${ms.toFixed(0)}ms`;
109
- return `${(ms / 1000).toFixed(1)}s`;
110
- }
111
-
112
- const DIMENSION_ORDER: EvalDimension[] = ['safety', 'quality', 'latency', 'cost', 'recovery'];
113
-
114
- // ── EvalPanel ─────────────────────────────────────────────────────────────────
115
-
116
- export class EvalPanel extends BasePanel {
117
- private readonly _registry: EvalRegistry;
118
- private _mode: 'list' | 'detail' = 'list';
119
- private _selectedSuiteIdx = 0;
120
- private _selectedScenarioIdx = 0;
121
- private _scrollOffset = 0;
122
- private _unsub: (() => void) | null = null;
123
-
124
- public constructor(registry: EvalRegistry) {
125
- super('eval', 'Eval', 'V', 'monitoring');
126
- this._registry = registry;
127
- }
128
-
129
- public override onActivate(): void {
130
- this._unsub = this._registry.subscribe(() => this.markDirty());
131
- this.markDirty();
132
- }
133
-
134
- public override onDestroy(): void {
135
- this._unsub?.();
136
- this._unsub = null;
137
- }
138
-
139
- public handleInput(key: string): boolean {
140
- const suites = this._registry.getSuiteResults();
141
-
142
- if (this._mode === 'list') {
143
- if (key === 'ArrowUp' || key === 'k') {
144
- this._selectedSuiteIdx = Math.max(0, this._selectedSuiteIdx - 1);
145
- this.markDirty();
146
- return true;
147
- }
148
- if (key === 'ArrowDown' || key === 'j') {
149
- this._selectedSuiteIdx = Math.min(suites.length - 1, this._selectedSuiteIdx + 1);
150
- this.markDirty();
151
- return true;
152
- }
153
- if ((key === 'Enter' || key === 'Return' || key === 'l') && suites.length > 0) {
154
- this._mode = 'detail';
155
- this._selectedScenarioIdx = 0;
156
- this._scrollOffset = 0;
157
- this.markDirty();
158
- return true;
159
- }
160
- return false;
161
- }
162
-
163
- // detail mode
164
- if (key === 'Escape' || key === 'q' || key === 'h') {
165
- this._mode = 'list';
166
- this.markDirty();
167
- return true;
168
- }
169
- if (key === 'ArrowUp' || key === 'k') {
170
- const suite = suites[this._selectedSuiteIdx];
171
- if (suite) {
172
- this._selectedScenarioIdx = Math.max(0, this._selectedScenarioIdx - 1);
173
- this._scrollOffset = 0;
174
- this.markDirty();
175
- }
176
- return true;
177
- }
178
- if (key === 'ArrowDown' || key === 'j') {
179
- const suite = suites[this._selectedSuiteIdx];
180
- if (suite) {
181
- this._selectedScenarioIdx = Math.min(
182
- suite.results.length - 1,
183
- this._selectedScenarioIdx + 1,
184
- );
185
- this._scrollOffset = 0;
186
- this.markDirty();
187
- }
188
- return true;
189
- }
190
- if (key === 'PageUp') {
191
- this._scrollOffset = Math.max(0, this._scrollOffset - 5);
192
- this.markDirty();
193
- return true;
194
- }
195
- if (key === 'PageDown') {
196
- this._scrollOffset += 5;
197
- this.markDirty();
198
- return true;
199
- }
200
- return false;
201
- }
202
-
203
- public render(width: number, height: number): Line[] {
204
- this.needsRender = false;
205
- const suites = this._registry.getSuiteResults();
206
- const gates = this._registry.getGateResults();
207
- const intro = 'Evaluation harness runs, gates, scenario scorecards, and regression indicators for model and product validation.';
208
-
209
- const running = this._registry.isRunning();
210
- const lastRun = this._registry.getLastRunAt();
211
- const summaryLine = buildPanelLine(width, [
212
- [' state: ', C.label],
213
- [running ? 'running' : 'idle', running ? C.yellow : C.dim],
214
- [' last: ', C.label],
215
- [lastRun ? new Date(lastRun).toLocaleTimeString() : 'n/a', C.dim],
216
- ]);
217
-
218
- if (suites.length === 0) {
219
- const workspace = buildPanelWorkspace(width, height, {
220
- title: 'Eval Harness',
221
- intro,
222
- sections: [{
223
- title: 'Status',
224
- lines: [
225
- summaryLine,
226
- ...buildEmptyState(
227
- width,
228
- ' No results yet.',
229
- 'Run an eval suite to populate this workspace with suite scores, gate results, and per-scenario detail.',
230
- [{ command: '/eval run <suite>', summary: 'start a suite such as core-performance, safety-baseline, or cost-tokens' }],
231
- C,
232
- ),
233
- ],
234
- }],
235
- palette: C,
236
- });
237
- while (workspace.length < height) workspace.push(createEmptyLine(width));
238
- return workspace;
239
- }
240
-
241
- const lines: Line[] = [];
242
- if (this._mode === 'list') {
243
- this._renderList(lines, suites, gates, width, height, intro, summaryLine);
244
- } else {
245
- const suite = suites[this._selectedSuiteIdx];
246
- if (suite) {
247
- this._renderDetail(lines, suite, width, height, intro, summaryLine);
248
- }
249
- }
250
-
251
- return lines;
252
- }
253
-
254
- // ── List view ────────────────────────────────────────────────────────────────
255
-
256
- private _renderList(
257
- lines: Line[],
258
- suites: EvalSuiteResult[],
259
- gates: EvalGateResult[],
260
- width: number,
261
- _height: number,
262
- intro: string,
263
- summaryLine: Line,
264
- ): void {
265
- const gateMap = new Map(gates.map((g) => [g.suite, g]));
266
- const sectionLines: Line[] = [
267
- summaryLine,
268
- buildPanelLine(width, [
269
- ['Suite'.padEnd(28), C.header],
270
- ['Score'.padEnd(8), C.header],
271
- ['Pass'.padEnd(6), C.header],
272
- ['Gate'.padEnd(6), C.header],
273
- ['Duration', C.header],
274
- ]),
275
- ];
276
-
277
- suites.forEach((suite, idx) => {
278
- const selected = idx === this._selectedSuiteIdx;
279
- const gate = gateMap.get(suite.suite);
280
- const gateStr = gate ? (gate.passed ? 'ok' : 'FAIL') : '-';
281
- const gateColor = gate ? (gate.passed ? C.green : C.red) : C.dim;
282
- const durationMs = suite.finishedAt - suite.startedAt;
283
- const scoreC = scoreColor(suite.meanScore);
284
- const passC = suite.passed ? C.green : C.red;
285
- const nameColor = selected ? C.selected : C.white;
286
- const bg = selected ? C.selectBg : undefined;
287
- const prefix = selected ? '▸ ' : ' ';
288
- const name = suite.suite.slice(0, 24).padEnd(26);
289
-
290
- sectionLines.push(buildPanelLine(width, [
291
- [prefix + name, nameColor, bg],
292
- [suite.meanScore.toFixed(1).padEnd(8), scoreC, bg],
293
- [(suite.passed ? 'PASS' : 'FAIL').padEnd(6), passC, bg],
294
- [gateStr.padEnd(6), gateColor, bg],
295
- [fmtTime(durationMs), C.dim, bg],
296
- ]));
297
- });
298
-
299
- sectionLines.push(buildPanelLine(width, [[' Enter/l: detail j/k: navigate', C.dim]]));
300
- lines.push(...buildPanelWorkspace(width, _height, {
301
- title: 'Eval Harness',
302
- intro,
303
- sections: [{ title: 'Suites', lines: sectionLines }],
304
- palette: C,
305
- }));
306
- }
307
-
308
- // ── Detail view ──────────────────────────────────────────────────────────────
309
-
310
- private _renderDetail(
311
- lines: Line[],
312
- suite: EvalSuiteResult,
313
- width: number,
314
- height: number,
315
- intro: string,
316
- summaryLine: Line,
317
- ): void {
318
- const sectionLines: Line[] = [
319
- summaryLine,
320
- buildPanelLine(width, [
321
- [`Suite: ${suite.suite}`, C.cyan],
322
- [' mean=', C.label],
323
- [suite.meanScore.toFixed(1), scoreColor(suite.meanScore)],
324
- [' ', C.label],
325
- [suite.passed ? 'PASS' : 'FAIL', suite.passed ? C.green : C.red],
326
- ]),
327
- ];
328
-
329
- const allDetailLines: Line[] = [];
330
- suite.results.forEach((result, idx) => {
331
- const selected = idx === this._selectedScenarioIdx;
332
- this._renderScenarioBlock(allDetailLines, result, selected, width);
333
- });
334
-
335
- const detailSection = resolveScrollablePanelSection(width, height, {
336
- intro,
337
- palette: C,
338
- beforeSections: [{ title: 'Scenario Detail', lines: sectionLines }],
339
- section: {
340
- scrollableLines: allDetailLines,
341
- scrollOffset: this._scrollOffset,
342
- minRows: 1,
343
- },
344
- });
345
- this._scrollOffset = detailSection.scrollOffset;
346
- sectionLines.push(...detailSection.section.lines);
347
- sectionLines.push(buildPanelLine(width, [[' Esc/q: back j/k: scenario PgUp/PgDn: scroll', C.dim]]));
348
- lines.push(...buildPanelWorkspace(width, height, {
349
- title: 'Eval Harness',
350
- intro,
351
- sections: [{ title: 'Scenario Detail', lines: sectionLines }],
352
- palette: C,
353
- }));
354
- }
355
-
356
- private _renderScenarioBlock(
357
- lines: Line[],
358
- result: EvalResult,
359
- selected: boolean,
360
- width: number,
361
- ): void {
362
- const sc = result.scorecard;
363
- const prefix = selected ? '▸ ' : ' ';
364
- const nameColor = selected ? C.selected : C.white;
365
- const scoreC = scoreColor(sc.compositeScore);
366
- const passC = sc.passed ? C.green : C.red;
367
- const nameLen = Math.max(1, width - 22);
368
-
369
- lines.push(buildPanelLine(width, [
370
- [prefix + result.scenario.name.slice(0, nameLen).padEnd(nameLen + 2), nameColor, selected ? C.selectBg : undefined],
371
- [sc.compositeScore.toFixed(1).padStart(5), scoreC, selected ? C.selectBg : undefined],
372
- [' ', C.label, selected ? C.selectBg : undefined],
373
- [sc.passed ? 'PASS' : 'FAIL', passC, selected ? C.selectBg : undefined],
374
- ]));
375
-
376
- if (selected) {
377
- for (const dim of DIMENSION_ORDER) {
378
- const d = sc.dimensions.find((x) => x.dimension === dim);
379
- if (!d) continue;
380
- const filled = Math.round(d.score / 10);
381
- const bar = '#'.repeat(filled) + '.'.repeat(10 - filled);
382
- lines.push(buildPanelLine(width, [
383
- [' ' + dim.padEnd(10) + ' ', C.label],
384
- [bar, scoreColor(d.score)],
385
- [` ${d.score.toFixed(0).padStart(3)}/100`, C.value],
386
- ]));
387
- }
388
-
389
- if (sc.notes && sc.notes.length > 0) {
390
- for (const note of sc.notes) {
391
- lines.push(buildPanelLine(width, [
392
- [' ! ', C.yellow],
393
- [note.slice(0, width - 6), C.yellow],
394
- ]));
395
- }
396
- }
397
- }
398
- }
399
- }