@pellux/goodvibes-agent 0.1.54 → 0.1.55
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/package.json +2 -5
- package/src/input/command-registry.ts +0 -1
- package/src/input/commands/local-runtime.ts +6 -7
- package/src/input/commands/operator-runtime.ts +0 -50
- package/src/input/commands/product-runtime.ts +3 -129
- package/src/input/commands.ts +0 -4
- package/src/panels/builtin/operations.ts +0 -12
- package/src/panels/builtin/shared.ts +0 -2
- package/src/version.ts +1 -1
- package/src/input/commands/eval.ts +0 -217
- package/src/panels/eval-panel.ts +0 -399
package/CHANGELOG.md
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pellux/goodvibes-agent",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.55",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Near-fork GoodVibes operator assistant with the GoodVibes TUI shell, renderer, input, fullscreen workspace, and daemon-connected Agent product brain.",
|
|
6
6
|
"type": "module",
|
|
@@ -65,10 +65,7 @@
|
|
|
65
65
|
"architecture:check": "bun run scripts/check-architecture.ts",
|
|
66
66
|
"foundation:artifacts": "bun run scripts/export-foundation-artifacts.ts",
|
|
67
67
|
"verification:ledger": "bun run scripts/verification-ledger.ts",
|
|
68
|
-
"verification:live": "bun run scripts/verify-live.ts"
|
|
69
|
-
"eval:gate": "bun run scripts/eval-gate.ts",
|
|
70
|
-
"eval:gate:verbose": "bun run scripts/eval-gate.ts --verbose",
|
|
71
|
-
"eval:baseline": "bun run scripts/eval-gate.ts --save-baseline"
|
|
68
|
+
"verification:live": "bun run scripts/verify-live.ts"
|
|
72
69
|
},
|
|
73
70
|
"license": "MIT",
|
|
74
71
|
"repository": {
|
|
@@ -177,7 +177,6 @@ export interface CommandOpsServices
|
|
|
177
177
|
export interface CommandExtensionRegistryServices {
|
|
178
178
|
readonly toolRegistry: ToolRegistry;
|
|
179
179
|
readonly mcpRegistry: McpRegistry;
|
|
180
|
-
readonly evalRegistry?: import('../panels/eval-panel.ts').EvalRegistry;
|
|
181
180
|
}
|
|
182
181
|
|
|
183
182
|
export interface CommandExtensionServices
|
|
@@ -67,7 +67,7 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
|
|
|
67
67
|
registry.register({
|
|
68
68
|
name: 'tools',
|
|
69
69
|
aliases: ['t'],
|
|
70
|
-
description: 'List available tools and review
|
|
70
|
+
description: 'List available tools and review tool safety/status',
|
|
71
71
|
usage: '[review|panel]',
|
|
72
72
|
handler(args, ctx) {
|
|
73
73
|
const sub = (args[0] ?? '').toLowerCase();
|
|
@@ -79,12 +79,11 @@ export function registerLocalRuntimeCommands(registry: CommandRegistry): void {
|
|
|
79
79
|
}
|
|
80
80
|
if (sub === 'review') {
|
|
81
81
|
ctx.print([
|
|
82
|
-
'Tool
|
|
83
|
-
'
|
|
84
|
-
' Read
|
|
85
|
-
'
|
|
86
|
-
'
|
|
87
|
-
' Use /approval review shell or /approval review file when you need the action-specific why-prompted posture.',
|
|
82
|
+
'Tool Status',
|
|
83
|
+
' Tools are available for the main Agent conversation.',
|
|
84
|
+
' Read-only actions can run directly; writes, destructive changes, network effects, service changes, and external side effects require explicit user intent or approval.',
|
|
85
|
+
' Recent tool activity and approval posture are available in the tools and approvals views.',
|
|
86
|
+
' Build/fix/review work should be delegated explicitly with /delegate.',
|
|
88
87
|
].join('\n'));
|
|
89
88
|
}
|
|
90
89
|
return;
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import type { CommandRegistry } from '../command-registry.ts';
|
|
2
2
|
import type { ProfileData } from '@pellux/goodvibes-sdk/platform/profiles';
|
|
3
|
-
import { ToolContractVerifier } from '@/runtime/index.ts';
|
|
4
3
|
import type { ReplaySnapshotInput } from '@/runtime/index.ts';
|
|
5
4
|
import { logger } from '@pellux/goodvibes-sdk/platform/utils';
|
|
6
5
|
import { registerOperatorPanelCommand } from './operator-panel-runtime.ts';
|
|
@@ -283,55 +282,6 @@ export function registerOperatorRuntimeCommands(registry: CommandRegistry): void
|
|
|
283
282
|
},
|
|
284
283
|
});
|
|
285
284
|
|
|
286
|
-
registry.register({
|
|
287
|
-
name: 'tool',
|
|
288
|
-
description: 'Tool contract verification — verify registered tool contracts',
|
|
289
|
-
usage: 'verify <name> | verify-all | contract show <name>',
|
|
290
|
-
argsHint: 'verify <name> | verify-all | contract show <name>',
|
|
291
|
-
handler(args, ctx) {
|
|
292
|
-
const sub = args[0];
|
|
293
|
-
if (sub === 'verify' && args[1]) {
|
|
294
|
-
const result = ctx.extensions.toolRegistry.verifyContract(args[1]);
|
|
295
|
-
if (!result) {
|
|
296
|
-
ctx.print(`[tool verify] Tool '${args[1]}' is not registered.`);
|
|
297
|
-
return;
|
|
298
|
-
}
|
|
299
|
-
ctx.print(ToolContractVerifier.formatResult(result));
|
|
300
|
-
return;
|
|
301
|
-
}
|
|
302
|
-
if (sub === 'verify-all') {
|
|
303
|
-
ctx.print(ToolContractVerifier.formatAllResults(ctx.extensions.toolRegistry.verifyAllContracts()));
|
|
304
|
-
return;
|
|
305
|
-
}
|
|
306
|
-
if (sub === 'contract' && args[1] === 'show' && args[2]) {
|
|
307
|
-
const toolName = args[2];
|
|
308
|
-
const result = ctx.extensions.toolRegistry.verifyContract(toolName);
|
|
309
|
-
if (!result) {
|
|
310
|
-
ctx.print(`[tool contract show] Tool '${toolName}' is not registered.`);
|
|
311
|
-
return;
|
|
312
|
-
}
|
|
313
|
-
const lines: string[] = [ToolContractVerifier.formatResult(result)];
|
|
314
|
-
const tool = ctx.extensions.toolRegistry.list().find((t) => t.definition.name === toolName);
|
|
315
|
-
if (tool) {
|
|
316
|
-
lines.push('');
|
|
317
|
-
lines.push('Tool Definition:');
|
|
318
|
-
lines.push(` Name: ${tool.definition.name}`);
|
|
319
|
-
lines.push(` Description: ${tool.definition.description}`);
|
|
320
|
-
lines.push(` Parameters: ${JSON.stringify(tool.definition.parameters, null, 2).replace(/\n/g, '\n ')}`);
|
|
321
|
-
}
|
|
322
|
-
ctx.print(lines.join('\n'));
|
|
323
|
-
return;
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
ctx.print(
|
|
327
|
-
'Usage: /tool <subcommand>\n'
|
|
328
|
-
+ ' /tool verify <name> — verify contract for a specific registered tool\n'
|
|
329
|
-
+ ' /tool verify-all — verify contracts for all registered tools\n'
|
|
330
|
-
+ ' /tool contract show <name> — show full contract details for a tool'
|
|
331
|
-
);
|
|
332
|
-
},
|
|
333
|
-
});
|
|
334
|
-
|
|
335
285
|
registry.register({
|
|
336
286
|
name: 'forensics',
|
|
337
287
|
aliases: ['foren'],
|
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { dirname
|
|
1
|
+
import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { dirname } from 'node:path';
|
|
3
3
|
import type { CommandContext, CommandRegistry } from '../command-registry.ts';
|
|
4
|
-
import {
|
|
5
|
-
import { BUILTIN_SUITES } from '@/runtime/index.ts';
|
|
6
|
-
import { requireEcosystemCatalogPaths, requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
|
|
4
|
+
import { requireReadModels, requireSecretsManager, requireServiceRegistry, requireShellPaths } from './runtime-services.ts';
|
|
7
5
|
import { requireYesFlag, stripYesFlag } from './confirmation.ts';
|
|
8
6
|
|
|
9
7
|
interface TrustReviewBundle {
|
|
@@ -29,29 +27,6 @@ interface TrustReviewBundle {
|
|
|
29
27
|
};
|
|
30
28
|
}
|
|
31
29
|
|
|
32
|
-
interface ReleaseBundle {
|
|
33
|
-
readonly version: 1;
|
|
34
|
-
readonly capturedAt: number;
|
|
35
|
-
readonly runtime: {
|
|
36
|
-
readonly provider: string;
|
|
37
|
-
readonly model: string;
|
|
38
|
-
readonly sessionId: string;
|
|
39
|
-
};
|
|
40
|
-
readonly evalSuites: readonly string[];
|
|
41
|
-
readonly incidentCount: number;
|
|
42
|
-
readonly remote: {
|
|
43
|
-
readonly pools: number;
|
|
44
|
-
readonly contracts: number;
|
|
45
|
-
readonly artifacts: number;
|
|
46
|
-
};
|
|
47
|
-
readonly ecosystem: {
|
|
48
|
-
readonly pluginCatalog: number;
|
|
49
|
-
readonly skillCatalog: number;
|
|
50
|
-
readonly installedPlugins: number;
|
|
51
|
-
readonly installedSkills: number;
|
|
52
|
-
};
|
|
53
|
-
}
|
|
54
|
-
|
|
55
30
|
function countByMode<T extends string>(values: readonly T[], mode: T): number {
|
|
56
31
|
return values.filter((value) => value === mode).length;
|
|
57
32
|
}
|
|
@@ -112,46 +87,6 @@ function inspectTrustBundle(path: string): string {
|
|
|
112
87
|
].join('\n');
|
|
113
88
|
}
|
|
114
89
|
|
|
115
|
-
function buildReleaseBundle(ctx: Parameters<NonNullable<CommandRegistry['register']>>[0]['handler'] extends (args: string[], context: infer C) => unknown ? C : never): ReleaseBundle {
|
|
116
|
-
const remoteRuntime = ctx.ops.remoteRuntime;
|
|
117
|
-
const incidents = ctx.extensions.forensicsRegistry?.getAll() ?? [];
|
|
118
|
-
const ecosystemPaths = requireEcosystemCatalogPaths(ctx);
|
|
119
|
-
return {
|
|
120
|
-
version: 1,
|
|
121
|
-
capturedAt: Date.now(),
|
|
122
|
-
runtime: {
|
|
123
|
-
provider: ctx.session.runtime.provider,
|
|
124
|
-
model: ctx.session.runtime.model,
|
|
125
|
-
sessionId: ctx.session.runtime.sessionId,
|
|
126
|
-
},
|
|
127
|
-
evalSuites: Object.keys(BUILTIN_SUITES),
|
|
128
|
-
incidentCount: incidents.length,
|
|
129
|
-
remote: {
|
|
130
|
-
pools: remoteRuntime?.listPools().length ?? 0,
|
|
131
|
-
contracts: remoteRuntime?.listContracts().length ?? 0,
|
|
132
|
-
artifacts: remoteRuntime?.listArtifacts().length ?? 0,
|
|
133
|
-
},
|
|
134
|
-
ecosystem: {
|
|
135
|
-
pluginCatalog: loadEcosystemCatalog('plugin', ecosystemPaths).length,
|
|
136
|
-
skillCatalog: loadEcosystemCatalog('skill', ecosystemPaths).length,
|
|
137
|
-
installedPlugins: listInstalledEcosystemEntries('plugin', ecosystemPaths).length,
|
|
138
|
-
installedSkills: listInstalledEcosystemEntries('skill', ecosystemPaths).length,
|
|
139
|
-
},
|
|
140
|
-
};
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
function inspectReleaseBundle(path: string): string {
|
|
144
|
-
const parsed = JSON.parse(readFileSync(path, 'utf-8')) as ReleaseBundle;
|
|
145
|
-
return [
|
|
146
|
-
'Release Bundle Review',
|
|
147
|
-
` provider/model: ${parsed.runtime.provider || '(unset)'}/${parsed.runtime.model || '(unset)'}`,
|
|
148
|
-
` eval suites: ${parsed.evalSuites.length}`,
|
|
149
|
-
` incidents: ${parsed.incidentCount}`,
|
|
150
|
-
` remote pools/contracts/artifacts: ${parsed.remote.pools}/${parsed.remote.contracts}/${parsed.remote.artifacts}`,
|
|
151
|
-
` ecosystem catalog plugins/skills: ${parsed.ecosystem.pluginCatalog}/${parsed.ecosystem.skillCatalog}`,
|
|
152
|
-
].join('\n');
|
|
153
|
-
}
|
|
154
|
-
|
|
155
90
|
export function registerProductRuntimeCommands(registry: CommandRegistry): void {
|
|
156
91
|
registry.register({
|
|
157
92
|
name: 'trust',
|
|
@@ -314,65 +249,4 @@ export function registerProductRuntimeCommands(registry: CommandRegistry): void
|
|
|
314
249
|
},
|
|
315
250
|
});
|
|
316
251
|
|
|
317
|
-
registry.register({
|
|
318
|
-
name: 'release',
|
|
319
|
-
description: 'Package certification and release-readiness operations',
|
|
320
|
-
usage: '[review|checklist|bundle export <path> --yes|bundle inspect <path>]',
|
|
321
|
-
handler(args, ctx) {
|
|
322
|
-
const parsed = stripYesFlag(args);
|
|
323
|
-
const commandArgs = [...parsed.rest];
|
|
324
|
-
const shellPaths = requireShellPaths(ctx);
|
|
325
|
-
const sub = commandArgs[0] ?? 'review';
|
|
326
|
-
if (sub === 'review') {
|
|
327
|
-
const bundle = buildReleaseBundle(ctx);
|
|
328
|
-
ctx.print([
|
|
329
|
-
'Release Review',
|
|
330
|
-
` provider/model: ${bundle.runtime.provider || '(unset)'}/${bundle.runtime.model || '(unset)'}`,
|
|
331
|
-
` eval suites: ${bundle.evalSuites.length}`,
|
|
332
|
-
` incidents: ${bundle.incidentCount}`,
|
|
333
|
-
` remote pools/contracts/artifacts: ${bundle.remote.pools}/${bundle.remote.contracts}/${bundle.remote.artifacts}`,
|
|
334
|
-
` ecosystem catalog plugins/skills: ${bundle.ecosystem.pluginCatalog}/${bundle.ecosystem.skillCatalog}`,
|
|
335
|
-
` installed plugins/skills: ${bundle.ecosystem.installedPlugins}/${bundle.ecosystem.installedSkills}`,
|
|
336
|
-
].join('\n'));
|
|
337
|
-
return;
|
|
338
|
-
}
|
|
339
|
-
if (sub === 'checklist') {
|
|
340
|
-
ctx.print([
|
|
341
|
-
'Release Checklist',
|
|
342
|
-
' 1. Run /setup review and /setup doctor',
|
|
343
|
-
' 2. Run /security review and /trust review',
|
|
344
|
-
' 3. Run /policy preflight and /policy simulate',
|
|
345
|
-
' 4. Run /eval gate <suite> --yes for required certification suites',
|
|
346
|
-
' 5. Review /incident latest and /bridge status',
|
|
347
|
-
' 6. Export /release bundle export <path> --yes for release evidence',
|
|
348
|
-
].join('\n'));
|
|
349
|
-
return;
|
|
350
|
-
}
|
|
351
|
-
if (sub === 'bundle') {
|
|
352
|
-
const mode = commandArgs[1];
|
|
353
|
-
const pathArg = commandArgs[2];
|
|
354
|
-
if ((mode === 'export' || mode === 'inspect') && !pathArg) {
|
|
355
|
-
ctx.print(`Usage: /release bundle ${mode} <path>${mode === 'export' ? ' --yes' : ''}`);
|
|
356
|
-
return;
|
|
357
|
-
}
|
|
358
|
-
if (mode === 'export') {
|
|
359
|
-
if (!parsed.yes) {
|
|
360
|
-
requireYesFlag(ctx, `export release bundle to ${pathArg}`, '/release bundle export <path> --yes');
|
|
361
|
-
return;
|
|
362
|
-
}
|
|
363
|
-
const bundle = buildReleaseBundle(ctx);
|
|
364
|
-
const targetPath = shellPaths.resolveWorkspacePath(pathArg!);
|
|
365
|
-
mkdirSync(dirname(targetPath), { recursive: true });
|
|
366
|
-
writeFileSync(targetPath, JSON.stringify(bundle, null, 2) + '\n', 'utf-8');
|
|
367
|
-
ctx.print(`Release bundle exported to ${targetPath}`);
|
|
368
|
-
return;
|
|
369
|
-
}
|
|
370
|
-
if (mode === 'inspect') {
|
|
371
|
-
ctx.print(inspectReleaseBundle(shellPaths.resolveWorkspacePath(pathArg!)));
|
|
372
|
-
return;
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
ctx.print('Usage: /release [review|checklist|bundle export <path> --yes|bundle inspect <path>]');
|
|
376
|
-
},
|
|
377
|
-
});
|
|
378
252
|
}
|
package/src/input/commands.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import type { CommandRegistry } from './command-registry.ts';
|
|
2
2
|
import { policyCommand } from './commands/policy.ts';
|
|
3
3
|
import { providerCommand } from './commands/provider.ts';
|
|
4
|
-
import { evalCommand } from './commands/eval.ts';
|
|
5
4
|
import { sessionCommand } from './commands/session.ts';
|
|
6
5
|
import { recallCommand } from './commands/memory.ts';
|
|
7
6
|
import { knowledgeCommand } from './commands/knowledge.ts';
|
|
@@ -126,9 +125,6 @@ export function registerBuiltinCommands(registry: CommandRegistry): void {
|
|
|
126
125
|
// ── /provider ─────────────────────────────────────────────────────────────
|
|
127
126
|
registry.register(providerCommand);
|
|
128
127
|
|
|
129
|
-
// ── /eval ─────────────────────────────────────────────────────────────────
|
|
130
|
-
registry.register(evalCommand);
|
|
131
|
-
|
|
132
128
|
// ── /session ─────────────────────────────────────────────────────────────
|
|
133
129
|
registry.register(sessionCommand);
|
|
134
130
|
|
|
@@ -27,7 +27,6 @@ import { DebugPanel } from '../debug-panel.ts';
|
|
|
27
27
|
import { IncidentReviewPanel } from '../incident-review-panel.ts';
|
|
28
28
|
import { ForensicsPanel } from '../forensics-panel.ts';
|
|
29
29
|
import { PolicyPanel } from '../policy-panel.ts';
|
|
30
|
-
import { EvalPanel } from '../eval-panel.ts';
|
|
31
30
|
import { createProviderAccountSnapshotQuery } from '../provider-account-snapshot.ts';
|
|
32
31
|
import {
|
|
33
32
|
createEnvironmentVariableQuery,
|
|
@@ -335,15 +334,4 @@ export function registerOperationsPanels(manager: PanelManager, deps: ResolvedBu
|
|
|
335
334
|
factory: () => new PolicyPanel(deps.policyRuntimeState),
|
|
336
335
|
});
|
|
337
336
|
|
|
338
|
-
if (deps.evalRegistry) {
|
|
339
|
-
const { evalRegistry } = deps;
|
|
340
|
-
manager.registerType({
|
|
341
|
-
id: 'eval',
|
|
342
|
-
name: 'Eval',
|
|
343
|
-
icon: 'Y',
|
|
344
|
-
category: 'monitoring',
|
|
345
|
-
description: 'Evaluation harness: benchmark suite results, scorecards, and regression gates',
|
|
346
|
-
factory: () => new EvalPanel(evalRegistry),
|
|
347
|
-
});
|
|
348
|
-
}
|
|
349
337
|
}
|
|
@@ -62,8 +62,6 @@ export interface BuiltinPanelDeps {
|
|
|
62
62
|
dismissPlanning?: () => void;
|
|
63
63
|
/** ForensicsRegistry for the Forensics panel. */
|
|
64
64
|
forensicsRegistry?: import('@/runtime/index.ts').ForensicsRegistry;
|
|
65
|
-
/** EvalRegistry for the Eval panel. */
|
|
66
|
-
evalRegistry?: import('../eval-panel.ts').EvalRegistry;
|
|
67
65
|
/** MemoryRegistry for the Memory panel. */
|
|
68
66
|
memoryRegistry?: MemoryRegistry;
|
|
69
67
|
/** Isolated Agent Knowledge service for the Agent Knowledge panel. */
|
package/src/version.ts
CHANGED
|
@@ -6,7 +6,7 @@ import { join } from 'node:path';
|
|
|
6
6
|
// The prebuild script updates the fallback value before compilation.
|
|
7
7
|
// Uses import.meta.dir (Bun) to locate package.json relative to this file,
|
|
8
8
|
// which is correct regardless of the process working directory.
|
|
9
|
-
let _version = '0.1.
|
|
9
|
+
let _version = '0.1.55';
|
|
10
10
|
let _sdkVersion = '0.33.35';
|
|
11
11
|
try {
|
|
12
12
|
const pkg = JSON.parse(readFileSync(join(import.meta.dir, '..', 'package.json'), 'utf-8')) as {
|
|
@@ -1,217 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* /eval command handler.
|
|
3
|
-
*
|
|
4
|
-
* Implements the Evaluation Harness commands:
|
|
5
|
-
*
|
|
6
|
-
* /eval list — List all available eval suites
|
|
7
|
-
* /eval run <suite> --yes — Run a named suite (or 'all')
|
|
8
|
-
* /eval compare <baseline-file> — Compare last run against a baseline file
|
|
9
|
-
* /eval gate <suite> --yes — Run suite and apply CI gate (exits 1 on regression)
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
import type { SlashCommand, CommandContext } from '../command-registry.ts';
|
|
13
|
-
import { EvalRunner } from '@/runtime/index.ts';
|
|
14
|
-
import { BUILTIN_SUITES } from '@/runtime/index.ts';
|
|
15
|
-
import { formatScorecard } from '@/runtime/index.ts';
|
|
16
|
-
import { loadBaseline, captureBaseline, formatBaselineComparison, writeBaseline } from '@/runtime/index.ts';
|
|
17
|
-
import type { EvalRegistry } from '../../panels/eval-panel.ts';
|
|
18
|
-
import { formatSuiteResult, formatGateResult } from '@/runtime/index.ts';
|
|
19
|
-
import { requireShellPaths } from './runtime-services.ts';
|
|
20
|
-
import { summarizeError } from '@pellux/goodvibes-sdk/platform/utils';
|
|
21
|
-
import { requireYesFlag, stripYesFlag } from './confirmation.ts';
|
|
22
|
-
|
|
23
|
-
// ── Subcommand helpers ────────────────────────────────────────────────────────
|
|
24
|
-
|
|
25
|
-
function printSuiteList(context: CommandContext): void {
|
|
26
|
-
context.print('[eval] Available suites:');
|
|
27
|
-
for (const [name, scenarios] of Object.entries(BUILTIN_SUITES)) {
|
|
28
|
-
context.print(` ${name} (${scenarios.length} scenarios)`);
|
|
29
|
-
for (const s of scenarios) {
|
|
30
|
-
context.print(` - ${s.id}: ${s.name}`);
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
|
-
context.print('[eval] Usage: /eval run <suite> --yes or /eval run all --yes');
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
function getRegistry(context: CommandContext): EvalRegistry | undefined {
|
|
37
|
-
return context.extensions.evalRegistry;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// ── /eval list ────────────────────────────────────────────────────────────────
|
|
41
|
-
|
|
42
|
-
function handleList(_args: string[], context: CommandContext): void {
|
|
43
|
-
printSuiteList(context);
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
// ── /eval run ────────────────────────────────────────────────────────────────
|
|
47
|
-
|
|
48
|
-
async function handleRun(args: string[], context: CommandContext): Promise<void> {
|
|
49
|
-
const { rest, yes } = stripYesFlag(args);
|
|
50
|
-
const suiteName = rest[0] ?? 'all';
|
|
51
|
-
const registry = getRegistry(context);
|
|
52
|
-
|
|
53
|
-
const suitesToRun =
|
|
54
|
-
suiteName === 'all'
|
|
55
|
-
? Object.keys(BUILTIN_SUITES)
|
|
56
|
-
: BUILTIN_SUITES[suiteName]
|
|
57
|
-
? [suiteName]
|
|
58
|
-
: null;
|
|
59
|
-
|
|
60
|
-
if (!suitesToRun) {
|
|
61
|
-
context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
|
|
62
|
-
return;
|
|
63
|
-
}
|
|
64
|
-
if (!yes) {
|
|
65
|
-
requireYesFlag(context, `run eval suite ${suiteName}`, '/eval run <suite|all> --yes');
|
|
66
|
-
return;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
const runner = new EvalRunner();
|
|
70
|
-
registry?.setRunning(true);
|
|
71
|
-
|
|
72
|
-
for (const name of suitesToRun) {
|
|
73
|
-
const scenarios = BUILTIN_SUITES[name];
|
|
74
|
-
if (!scenarios) continue;
|
|
75
|
-
|
|
76
|
-
context.print(`[eval] Running suite: ${name} (${scenarios.length} scenarios)...`);
|
|
77
|
-
const result = await runner.runSuite(name, scenarios);
|
|
78
|
-
registry?.push(result);
|
|
79
|
-
|
|
80
|
-
context.print(formatSuiteResult(result));
|
|
81
|
-
|
|
82
|
-
for (const r of result.results) {
|
|
83
|
-
context.print(formatScorecard(r.scorecard));
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
registry?.setRunning(false);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
// ── /eval compare ─────────────────────────────────────────────────────────────
|
|
91
|
-
|
|
92
|
-
async function handleCompare(args: string[], context: CommandContext): Promise<void> {
|
|
93
|
-
const baselineFile = args[0] ?? '.goodvibes/eval/baseline.json';
|
|
94
|
-
const registry = getRegistry(context);
|
|
95
|
-
const projectRoot = requireShellPaths(context).workingDirectory;
|
|
96
|
-
const suiteResults = registry?.getSuiteResults() ?? [];
|
|
97
|
-
|
|
98
|
-
if (suiteResults.length === 0) {
|
|
99
|
-
context.print('[eval] No suite results to compare. Run /eval run <suite> --yes first.');
|
|
100
|
-
return;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
const baseline = await loadBaseline(baselineFile, projectRoot);
|
|
104
|
-
if (!baseline) {
|
|
105
|
-
context.print(`[eval] Baseline file not found: ${baselineFile}`);
|
|
106
|
-
context.print('[eval] Tip: run /eval gate <suite> [baseline-file] --save-baseline --yes to create a baseline.');
|
|
107
|
-
return;
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
for (const result of suiteResults) {
|
|
111
|
-
context.print(formatBaselineComparison(baseline, result));
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
// ── /eval gate ────────────────────────────────────────────────────────────────
|
|
116
|
-
|
|
117
|
-
async function handleGate(args: string[], context: CommandContext): Promise<void> {
|
|
118
|
-
const { rest, yes } = stripYesFlag(args);
|
|
119
|
-
const positional = rest.filter((arg) => arg !== '--save-baseline');
|
|
120
|
-
const suiteName = positional[0];
|
|
121
|
-
const baselineFile = positional[1] ?? '.goodvibes/eval/baseline.json';
|
|
122
|
-
const saveFlag = rest.includes('--save-baseline');
|
|
123
|
-
const projectRoot = requireShellPaths(context).workingDirectory;
|
|
124
|
-
|
|
125
|
-
if (!suiteName) {
|
|
126
|
-
context.print('[eval] Usage: /eval gate <suite> [baseline-file] [--save-baseline] --yes');
|
|
127
|
-
return;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
const scenarios = BUILTIN_SUITES[suiteName];
|
|
131
|
-
if (!scenarios) {
|
|
132
|
-
context.print(`[eval] Unknown suite: "${suiteName}". Run /eval list to see available suites.`);
|
|
133
|
-
return;
|
|
134
|
-
}
|
|
135
|
-
if (!yes) {
|
|
136
|
-
requireYesFlag(context, `run eval gate ${suiteName}`, '/eval gate <suite> [baseline-file] [--save-baseline] --yes');
|
|
137
|
-
return;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
const registry = getRegistry(context);
|
|
141
|
-
const runner = new EvalRunner();
|
|
142
|
-
|
|
143
|
-
context.print(`[eval] Gate: running suite "${suiteName}"...`);
|
|
144
|
-
registry?.setRunning(true);
|
|
145
|
-
const fresh = await runner.runSuite(suiteName, scenarios);
|
|
146
|
-
registry?.push(fresh);
|
|
147
|
-
registry?.setRunning(false);
|
|
148
|
-
|
|
149
|
-
const baseline = await loadBaseline(baselineFile, projectRoot);
|
|
150
|
-
const gate = runner.evaluateGate(fresh, baseline);
|
|
151
|
-
registry?.pushGate(gate);
|
|
152
|
-
|
|
153
|
-
context.print(formatGateResult(gate));
|
|
154
|
-
|
|
155
|
-
if (saveFlag || !baseline) {
|
|
156
|
-
const label = suiteName ?? 'latest';
|
|
157
|
-
const newBaseline = captureBaseline(label, [fresh]);
|
|
158
|
-
try {
|
|
159
|
-
await writeBaseline(baselineFile, newBaseline, projectRoot);
|
|
160
|
-
context.print(`[eval] Baseline saved to ${baselineFile}`);
|
|
161
|
-
} catch (err) {
|
|
162
|
-
context.print(`[eval] Warning: could not save baseline: ${summarizeError(err)}`);
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
if (!gate.passed) {
|
|
167
|
-
context.print(`[eval] Gate FAILED: ${gate.regressions.length} regression(s) detected.`);
|
|
168
|
-
} else {
|
|
169
|
-
context.print('[eval] Gate PASSED.');
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
// ── Top-level command ─────────────────────────────────────────────────────────
|
|
174
|
-
|
|
175
|
-
export const evalCommand: SlashCommand = {
|
|
176
|
-
name: 'eval',
|
|
177
|
-
description: 'Evaluation harness: run benchmark suites, compare baselines, and gate regressions.',
|
|
178
|
-
usage: '<subcommand> [args]',
|
|
179
|
-
argsHint: 'list|run <suite> --yes|compare <baseline>|gate <suite> --yes',
|
|
180
|
-
handler: async (args: string[], context: CommandContext): Promise<void> => {
|
|
181
|
-
const [sub, ...rest] = args;
|
|
182
|
-
|
|
183
|
-
switch (sub) {
|
|
184
|
-
case 'list':
|
|
185
|
-
case 'ls':
|
|
186
|
-
handleList(rest, context);
|
|
187
|
-
break;
|
|
188
|
-
|
|
189
|
-
case 'run':
|
|
190
|
-
await handleRun(rest, context);
|
|
191
|
-
break;
|
|
192
|
-
|
|
193
|
-
case 'compare':
|
|
194
|
-
case 'cmp':
|
|
195
|
-
await handleCompare(rest, context);
|
|
196
|
-
break;
|
|
197
|
-
|
|
198
|
-
case 'gate':
|
|
199
|
-
await handleGate(rest, context);
|
|
200
|
-
break;
|
|
201
|
-
|
|
202
|
-
default: {
|
|
203
|
-
const usage = [
|
|
204
|
-
'Usage: /eval <subcommand>',
|
|
205
|
-
' list — List all available eval suites',
|
|
206
|
-
' run <suite|all> --yes — Run a named suite (or all suites)',
|
|
207
|
-
' compare [baseline-file] — Compare last results against baseline',
|
|
208
|
-
' gate <suite> [baseline-file] --yes',
|
|
209
|
-
' — Run suite and apply regression gate',
|
|
210
|
-
' --save-baseline — Save fresh run as new baseline',
|
|
211
|
-
].join('\n');
|
|
212
|
-
context.print(usage);
|
|
213
|
-
break;
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
},
|
|
217
|
-
};
|
package/src/panels/eval-panel.ts
DELETED
|
@@ -1,399 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Eval Panel — renders evaluation harness results in list and detail modes.
|
|
3
|
-
*
|
|
4
|
-
* Displays suite run summaries, per-scenario scorecards, and regression
|
|
5
|
-
* indicators. Wired with an EvalRegistry that holds the latest run results.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { BasePanel } from './base-panel.ts';
|
|
9
|
-
import type { Line } from '../types/grid.ts';
|
|
10
|
-
import { createEmptyLine } from '../types/grid.ts';
|
|
11
|
-
import {
|
|
12
|
-
buildEmptyState,
|
|
13
|
-
buildPanelLine,
|
|
14
|
-
buildPanelWorkspace,
|
|
15
|
-
resolveScrollablePanelSection,
|
|
16
|
-
DEFAULT_PANEL_PALETTE,
|
|
17
|
-
} from './polish.ts';
|
|
18
|
-
|
|
19
|
-
// ── EvalRegistry ─────────────────────────────────────────────────────────────
|
|
20
|
-
|
|
21
|
-
import type {
|
|
22
|
-
EvalSuiteResult,
|
|
23
|
-
EvalResult,
|
|
24
|
-
EvalGateResult,
|
|
25
|
-
EvalDimension,
|
|
26
|
-
} from '@/runtime/index.ts';
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
* Holds the latest eval run state for display in EvalPanel.
|
|
30
|
-
* Created externally, injected into the panel.
|
|
31
|
-
*/
|
|
32
|
-
export class EvalRegistry {
|
|
33
|
-
private _suiteResults: EvalSuiteResult[] = [];
|
|
34
|
-
private _gateResults: EvalGateResult[] = [];
|
|
35
|
-
private _running = false;
|
|
36
|
-
private _lastRunAt: number | null = null;
|
|
37
|
-
private readonly _subscribers = new Set<() => void>();
|
|
38
|
-
|
|
39
|
-
push(result: EvalSuiteResult): void {
|
|
40
|
-
const idx = this._suiteResults.findIndex((r) => r.suite === result.suite);
|
|
41
|
-
if (idx >= 0) {
|
|
42
|
-
this._suiteResults[idx] = result;
|
|
43
|
-
} else {
|
|
44
|
-
this._suiteResults.push(result);
|
|
45
|
-
}
|
|
46
|
-
this._lastRunAt = Date.now();
|
|
47
|
-
this._notify();
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
pushGate(gate: EvalGateResult): void {
|
|
51
|
-
const idx = this._gateResults.findIndex((g) => g.suite === gate.suite);
|
|
52
|
-
if (idx >= 0) {
|
|
53
|
-
this._gateResults[idx] = gate;
|
|
54
|
-
} else {
|
|
55
|
-
this._gateResults.push(gate);
|
|
56
|
-
}
|
|
57
|
-
this._notify();
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
setRunning(running: boolean): void {
|
|
61
|
-
this._running = running;
|
|
62
|
-
this._notify();
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
isRunning(): boolean { return this._running; }
|
|
66
|
-
getLastRunAt(): number | null { return this._lastRunAt; }
|
|
67
|
-
getSuiteResults(): EvalSuiteResult[] { return this._suiteResults; }
|
|
68
|
-
getGateResults(): EvalGateResult[] { return this._gateResults; }
|
|
69
|
-
|
|
70
|
-
subscribe(cb: () => void): () => void {
|
|
71
|
-
this._subscribers.add(cb);
|
|
72
|
-
return () => this._subscribers.delete(cb);
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
private _notify(): void {
|
|
76
|
-
for (const cb of this._subscribers) cb();
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
// ── Colour palette (hex fg colours for createStyledCell) ─────────────────────
|
|
81
|
-
|
|
82
|
-
const C = {
|
|
83
|
-
...DEFAULT_PANEL_PALETTE,
|
|
84
|
-
header: '#94a3b8',
|
|
85
|
-
headerBg: '#1e293b',
|
|
86
|
-
cyan: '#38bdf8',
|
|
87
|
-
green: '#22c55e',
|
|
88
|
-
yellow: '#eab308',
|
|
89
|
-
red: '#ef4444',
|
|
90
|
-
dim: '#4b5563',
|
|
91
|
-
label: '#64748b',
|
|
92
|
-
value: '#e2e8f0',
|
|
93
|
-
selected: '#f1f5f9',
|
|
94
|
-
sep: '#1e293b',
|
|
95
|
-
white: '#cbd5e1',
|
|
96
|
-
selectBg: '#0f172a',
|
|
97
|
-
} as const;
|
|
98
|
-
|
|
99
|
-
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
100
|
-
|
|
101
|
-
function scoreColor(score: number): string {
|
|
102
|
-
if (score >= 80) return C.green;
|
|
103
|
-
if (score >= 60) return C.yellow;
|
|
104
|
-
return C.red;
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
function fmtTime(ms: number): string {
|
|
108
|
-
if (ms < 1000) return `${ms.toFixed(0)}ms`;
|
|
109
|
-
return `${(ms / 1000).toFixed(1)}s`;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
const DIMENSION_ORDER: EvalDimension[] = ['safety', 'quality', 'latency', 'cost', 'recovery'];
|
|
113
|
-
|
|
114
|
-
// ── EvalPanel ─────────────────────────────────────────────────────────────────
|
|
115
|
-
|
|
116
|
-
/**
 * Panel that displays eval suite results read from an {@link EvalRegistry}.
 *
 * The panel has two modes:
 * - `list`: one row per suite with mean score, pass/fail, gate result and duration.
 * - `detail`: the scenarios of the selected suite; the selected scenario is
 *   expanded with one bar per scorecard dimension plus any scorecard notes.
 */
export class EvalPanel extends BasePanel {
  private readonly _registry: EvalRegistry;
  private _mode: 'list' | 'detail' = 'list';
  private _selectedSuiteIdx = 0;
  private _selectedScenarioIdx = 0;
  private _scrollOffset = 0;
  /** Unsubscribe callback returned by the registry, or `null` when not subscribed. */
  private _unsub: (() => void) | null = null;

  public constructor(registry: EvalRegistry) {
    super('eval', 'Eval', 'V', 'monitoring');
    this._registry = registry;
  }

  /** Subscribes to registry changes so the panel is marked dirty when they fire. */
  public override onActivate(): void {
    // Release any earlier subscription first so repeated activation does not
    // leave more than one listener registered.
    this._unsub?.();
    this._unsub = this._registry.subscribe(() => this.markDirty());
    this.markDirty();
  }

  /** Releases the registry subscription, if any. */
  public override onDestroy(): void {
    this._unsub?.();
    this._unsub = null;
  }

  /**
   * Handles a key press for the current mode.
   *
   * List mode: `ArrowUp`/`k` and `ArrowDown`/`j` move the suite selection;
   * `Enter`/`Return`/`l` opens the detail view when at least one suite exists.
   * Detail mode: `Escape`/`q`/`h` returns to the list, `ArrowUp`/`k` and
   * `ArrowDown`/`j` move the scenario selection, `PageUp`/`PageDown` scroll by 5.
   *
   * @param key - Name of the pressed key.
   * @returns `true` when the key was consumed, `false` otherwise.
   */
  public handleInput(key: string): boolean {
    const suites = this._registry.getSuiteResults();
    // The registry contents may have changed since the last key press; keep the
    // suite selection inside the current bounds before acting on it.
    this._selectedSuiteIdx = EvalPanel._clampIndex(this._selectedSuiteIdx, suites.length);

    if (this._mode === 'list') {
      if (key === 'ArrowUp' || key === 'k') {
        this._selectedSuiteIdx = EvalPanel._clampIndex(this._selectedSuiteIdx - 1, suites.length);
        this.markDirty();
        return true;
      }
      if (key === 'ArrowDown' || key === 'j') {
        // Clamped so an empty suite list leaves the index at 0 rather than -1.
        this._selectedSuiteIdx = EvalPanel._clampIndex(this._selectedSuiteIdx + 1, suites.length);
        this.markDirty();
        return true;
      }
      if ((key === 'Enter' || key === 'Return' || key === 'l') && suites.length > 0) {
        this._mode = 'detail';
        this._selectedScenarioIdx = 0;
        this._scrollOffset = 0;
        this.markDirty();
        return true;
      }
      return false;
    }

    // detail mode
    if (key === 'Escape' || key === 'q' || key === 'h') {
      this._mode = 'list';
      this.markDirty();
      return true;
    }
    if (key === 'ArrowUp' || key === 'k') {
      const suite = suites[this._selectedSuiteIdx];
      if (suite) {
        this._selectedScenarioIdx = EvalPanel._clampIndex(
          this._selectedScenarioIdx - 1,
          suite.results.length,
        );
        this._scrollOffset = 0;
        this.markDirty();
      }
      return true;
    }
    if (key === 'ArrowDown' || key === 'j') {
      const suite = suites[this._selectedSuiteIdx];
      if (suite) {
        // Clamped so a suite without results leaves the index at 0 rather than -1.
        this._selectedScenarioIdx = EvalPanel._clampIndex(
          this._selectedScenarioIdx + 1,
          suite.results.length,
        );
        this._scrollOffset = 0;
        this.markDirty();
      }
      return true;
    }
    if (key === 'PageUp') {
      this._scrollOffset = Math.max(0, this._scrollOffset - 5);
      this.markDirty();
      return true;
    }
    if (key === 'PageDown') {
      // Upper bound is applied during render, which stores back the offset
      // returned by resolveScrollablePanelSection.
      this._scrollOffset += 5;
      this.markDirty();
      return true;
    }
    return false;
  }

  /**
   * Renders the panel for the given viewport.
   *
   * With no suite results an empty-state workspace is returned, padded with
   * empty lines up to `height`. Otherwise the list or detail view is rendered
   * depending on the current mode.
   *
   * @param width - Available width passed through to the panel line builders.
   * @param height - Available height passed through to the workspace builders.
   * @returns The rendered lines.
   */
  public render(width: number, height: number): Line[] {
    this.needsRender = false;
    const suites = this._registry.getSuiteResults();
    const gates = this._registry.getGateResults();
    const intro = 'Evaluation harness runs, gates, scenario scorecards, and regression indicators for model and product validation.';

    const running = this._registry.isRunning();
    const lastRun = this._registry.getLastRunAt();
    const summaryLine = buildPanelLine(width, [
      [' state: ', C.label],
      [running ? 'running' : 'idle', running ? C.yellow : C.dim],
      [' last: ', C.label],
      [lastRun ? new Date(lastRun).toLocaleTimeString() : 'n/a', C.dim],
    ]);

    if (suites.length === 0) {
      const workspace = buildPanelWorkspace(width, height, {
        title: 'Eval Harness',
        intro,
        sections: [{
          title: 'Status',
          lines: [
            summaryLine,
            ...buildEmptyState(
              width,
              ' No results yet.',
              'Run an eval suite to populate this workspace with suite scores, gate results, and per-scenario detail.',
              [{ command: '/eval run <suite>', summary: 'start a suite such as core-performance, safety-baseline, or cost-tokens' }],
              C,
            ),
          ],
        }],
        palette: C,
      });
      while (workspace.length < height) workspace.push(createEmptyLine(width));
      return workspace;
    }

    // Results may have shrunk since the selection was made; re-clamp so the
    // highlighted row (and the suite shown in detail mode) always exists.
    this._selectedSuiteIdx = EvalPanel._clampIndex(this._selectedSuiteIdx, suites.length);

    const lines: Line[] = [];
    const detailSuite = this._mode === 'detail' ? suites[this._selectedSuiteIdx] : undefined;
    if (detailSuite) {
      this._renderDetail(lines, detailSuite, width, height, intro, summaryLine);
    } else {
      // Also the fallback if the detail suite cannot be resolved, so the panel
      // never returns an empty frame.
      this._renderList(lines, suites, gates, width, height, intro, summaryLine);
    }

    return lines;
  }

  /**
   * Clamps `idx` into `[0, length - 1]`, returning 0 when `length` is 0.
   */
  private static _clampIndex(idx: number, length: number): number {
    return Math.max(0, Math.min(length - 1, idx));
  }

  // ── List view ────────────────────────────────────────────────────────────────

  /**
   * Appends the suite list workspace to `lines`: the summary line, a header
   * row, one row per suite and a key-hint footer.
   *
   * @param lines - Output array the rendered workspace lines are appended to.
   * @param suites - Suite results to list.
   * @param gates - Gate results, matched to suites by suite name.
   * @param width - Available width.
   * @param height - Available height.
   * @param intro - Intro text for the workspace.
   * @param summaryLine - Pre-built state/last-run summary line.
   */
  private _renderList(
    lines: Line[],
    suites: EvalSuiteResult[],
    gates: EvalGateResult[],
    width: number,
    height: number,
    intro: string,
    summaryLine: Line,
  ): void {
    const gateMap = new Map(gates.map((g) => [g.suite, g]));
    const sectionLines: Line[] = [
      summaryLine,
      buildPanelLine(width, [
        ['Suite'.padEnd(28), C.header],
        ['Score'.padEnd(8), C.header],
        ['Pass'.padEnd(6), C.header],
        ['Gate'.padEnd(6), C.header],
        ['Duration', C.header],
      ]),
    ];

    suites.forEach((suite, idx) => {
      const selected = idx === this._selectedSuiteIdx;
      const gate = gateMap.get(suite.suite);
      // '-' marks suites that have no gate result at all.
      const gateStr = gate ? (gate.passed ? 'ok' : 'FAIL') : '-';
      const gateColor = gate ? (gate.passed ? C.green : C.red) : C.dim;
      const durationMs = suite.finishedAt - suite.startedAt;
      const scoreC = scoreColor(suite.meanScore);
      const passC = suite.passed ? C.green : C.red;
      const nameColor = selected ? C.selected : C.white;
      const bg = selected ? C.selectBg : undefined;
      const prefix = selected ? '▸ ' : ' ';
      const name = suite.suite.slice(0, 24).padEnd(26);

      sectionLines.push(buildPanelLine(width, [
        [prefix + name, nameColor, bg],
        [suite.meanScore.toFixed(1).padEnd(8), scoreC, bg],
        [(suite.passed ? 'PASS' : 'FAIL').padEnd(6), passC, bg],
        [gateStr.padEnd(6), gateColor, bg],
        [fmtTime(durationMs), C.dim, bg],
      ]));
    });

    sectionLines.push(buildPanelLine(width, [[' Enter/l: detail j/k: navigate', C.dim]]));
    lines.push(...buildPanelWorkspace(width, height, {
      title: 'Eval Harness',
      intro,
      sections: [{ title: 'Suites', lines: sectionLines }],
      palette: C,
    }));
  }

  // ── Detail view ──────────────────────────────────────────────────────────────

  /**
   * Appends the detail workspace for one suite to `lines`: the summary line, a
   * suite header, the scrollable scenario blocks and a key-hint footer.
   *
   * The scroll offset returned by `resolveScrollablePanelSection` is stored
   * back into the panel state.
   *
   * @param lines - Output array the rendered workspace lines are appended to.
   * @param suite - Suite whose scenarios are shown.
   * @param width - Available width.
   * @param height - Available height.
   * @param intro - Intro text for the workspace.
   * @param summaryLine - Pre-built state/last-run summary line.
   */
  private _renderDetail(
    lines: Line[],
    suite: EvalSuiteResult,
    width: number,
    height: number,
    intro: string,
    summaryLine: Line,
  ): void {
    // Keep the scenario selection valid if this suite's results changed.
    this._selectedScenarioIdx = EvalPanel._clampIndex(
      this._selectedScenarioIdx,
      suite.results.length,
    );

    const sectionLines: Line[] = [
      summaryLine,
      buildPanelLine(width, [
        [`Suite: ${suite.suite}`, C.cyan],
        [' mean=', C.label],
        [suite.meanScore.toFixed(1), scoreColor(suite.meanScore)],
        [' ', C.label],
        [suite.passed ? 'PASS' : 'FAIL', suite.passed ? C.green : C.red],
      ]),
    ];

    const allDetailLines: Line[] = [];
    suite.results.forEach((result, idx) => {
      const selected = idx === this._selectedScenarioIdx;
      this._renderScenarioBlock(allDetailLines, result, selected, width);
    });

    const detailSection = resolveScrollablePanelSection(width, height, {
      intro,
      palette: C,
      beforeSections: [{ title: 'Scenario Detail', lines: sectionLines }],
      section: {
        scrollableLines: allDetailLines,
        scrollOffset: this._scrollOffset,
        minRows: 1,
      },
    });
    this._scrollOffset = detailSection.scrollOffset;
    sectionLines.push(...detailSection.section.lines);
    sectionLines.push(buildPanelLine(width, [[' Esc/q: back j/k: scenario PgUp/PgDn: scroll', C.dim]]));
    lines.push(...buildPanelWorkspace(width, height, {
      title: 'Eval Harness',
      intro,
      sections: [{ title: 'Scenario Detail', lines: sectionLines }],
      palette: C,
    }));
  }

  /**
   * Appends the lines for one scenario to `lines`: a header row with name,
   * composite score and pass/fail and, when `selected`, one bar per scorecard
   * dimension (in `DIMENSION_ORDER`) followed by the scorecard notes.
   *
   * @param lines - Output array the scenario lines are appended to.
   * @param result - Scenario result to render.
   * @param selected - Whether this scenario is the current selection.
   * @param width - Available width.
   */
  private _renderScenarioBlock(
    lines: Line[],
    result: EvalResult,
    selected: boolean,
    width: number,
  ): void {
    const sc = result.scorecard;
    const prefix = selected ? '▸ ' : ' ';
    const nameColor = selected ? C.selected : C.white;
    const scoreC = scoreColor(sc.compositeScore);
    const passC = sc.passed ? C.green : C.red;
    const rowBg = selected ? C.selectBg : undefined;
    const nameLen = Math.max(1, width - 22);

    lines.push(buildPanelLine(width, [
      [prefix + result.scenario.name.slice(0, nameLen).padEnd(nameLen + 2), nameColor, rowBg],
      [sc.compositeScore.toFixed(1).padStart(5), scoreC, rowBg],
      [' ', C.label, rowBg],
      [sc.passed ? 'PASS' : 'FAIL', passC, rowBg],
    ]));

    if (!selected) return;

    for (const dim of DIMENSION_ORDER) {
      const d = sc.dimensions.find((x) => x.dimension === dim);
      if (!d) continue;
      // Clamp to the 10-cell bar: String.prototype.repeat throws a RangeError
      // on a negative count, which an out-of-range score would otherwise cause.
      const filled = Math.min(10, Math.max(0, Math.round(d.score / 10)));
      const bar = '#'.repeat(filled) + '.'.repeat(10 - filled);
      lines.push(buildPanelLine(width, [
        [' ' + dim.padEnd(10) + ' ', C.label],
        [bar, scoreColor(d.score)],
        [` ${d.score.toFixed(0).padStart(3)}/100`, C.value],
      ]));
    }

    if (sc.notes && sc.notes.length > 0) {
      // A negative end index would make slice() count from the end of the
      // string instead of truncating, so never let the budget drop below 0.
      const noteBudget = Math.max(0, width - 6);
      for (const note of sc.notes) {
        lines.push(buildPanelLine(width, [
          [' ! ', C.yellow],
          [note.slice(0, noteBudget), C.yellow],
        ]));
      }
    }
  }
}
|