cclaw-cli 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -2
- package/dist/cli.js +388 -18
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +14 -1
- package/dist/eval/agents/with-tools.js +22 -16
- package/dist/eval/agents/workflow.d.ts +31 -0
- package/dist/eval/agents/workflow.js +135 -0
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +52 -19
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +64 -0
- package/dist/eval/diff.js +323 -0
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +36 -1
- package/dist/eval/runner.d.ts +37 -8
- package/dist/eval/runner.js +351 -42
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +158 -15
- package/dist/eval/types.js +39 -7
- package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
- package/dist/eval/verifiers/workflow-consistency.js +225 -0
- package/dist/eval/workflow-corpus.d.ts +7 -0
- package/dist/eval/workflow-corpus.js +207 -0
- package/package.json +1 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/** Name of the directory, beneath the evals root, in which every run gets its own folder. */
export declare const RUNS_DIR = "runs";
|
|
2
|
+
/**
 * Status metadata persisted as `run.json` for one backgrounded eval run.
 *
 * - `id`: run id; also the name of the run's folder.
 * - `startedAt` / `endedAt`: start and (once finished) end timestamps as ISO strings.
 * - `pid`: process id used by `isRunAlive` to probe liveness.
 * - `argv` / `cwd`: command line and working directory recorded for the run.
 * - `exitCode`: exit code, absent until the run has ended.
 * - `state`: lifecycle marker; `"running"` may be stale, see `isRunAlive`.
 */
export interface EvalRunStatus {
|
|
3
|
+
id: string;
|
|
4
|
+
startedAt: string;
|
|
5
|
+
endedAt?: string;
|
|
6
|
+
pid: number;
|
|
7
|
+
argv: string[];
|
|
8
|
+
cwd: string;
|
|
9
|
+
exitCode?: number;
|
|
10
|
+
state: "running" | "succeeded" | "failed";
|
|
11
|
+
}
|
|
12
|
+
/** Path of the directory that contains every run folder (project root + evals root + `RUNS_DIR`). */
export declare function runsRoot(projectRoot: string): string;
|
|
13
|
+
/** Path of the folder belonging to run `id`, i.e. `runsRoot(projectRoot)` joined with `id`. */
export declare function runDir(projectRoot: string, id: string): string;
|
|
14
|
+
/** Path of the `run.log` file inside the folder of run `id`. */
export declare function runLogPath(projectRoot: string, id: string): string;
|
|
15
|
+
/** Path of the `run.json` status file inside the folder of run `id`. */
export declare function runStatusPath(projectRoot: string, id: string): string;
|
|
16
|
+
/**
|
|
17
|
+
* Generate a short, lexicographically-sortable run id. The timestamp
|
|
18
|
+
* prefix means `ls -1` already returns the runs in chronological order
|
|
19
|
+
* which keeps the `runs list` subcommand trivial.
|
|
20
|
+
*/
|
|
21
|
+
export declare function generateRunId(now?: Date): string;
|
|
22
|
+
/** Create the folder for run `id` (including missing parents) and resolve to its path. */
export declare function ensureRunDir(projectRoot: string, id: string): Promise<string>;
|
|
23
|
+
/** Ensure the run folder for `status.id` exists, then write `status` to its `run.json` as indented JSON. */
export declare function writeRunStatus(projectRoot: string, status: EvalRunStatus): Promise<void>;
|
|
24
|
+
/** Read and parse the `run.json` of run `id`. Resolves to `null` when the file is missing or cannot be read or parsed. */
export declare function readRunStatus(projectRoot: string, id: string): Promise<EvalRunStatus | null>;
|
|
25
|
+
/**
|
|
26
|
+
* List run statuses under `.cclaw/evals/runs/`, most recent first. Directory
|
|
27
|
+
* entries that don't contain a `run.json` are skipped (half-initialized
|
|
28
|
+
* or manually mkdir'd folders).
|
|
29
|
+
*/
|
|
30
|
+
export declare function listRuns(projectRoot: string): Promise<EvalRunStatus[]>;
|
|
31
|
+
/**
|
|
32
|
+
* Resolve `"latest"` (or undefined) to the most recent run id.
|
|
33
|
+
* Returns `null` when there are no runs.
|
|
34
|
+
*/
|
|
35
|
+
export declare function resolveRunId(projectRoot: string, hint: string | undefined): Promise<string | null>;
|
|
36
|
+
/**
|
|
37
|
+
* Cheap liveness probe for an EvalRunStatus. A `run.json` can be stale
|
|
38
|
+
* (process crashed mid-commit), so we double-check with `kill(pid, 0)`
|
|
39
|
+
* before trusting the `state: "running"` field.
|
|
40
|
+
*/
|
|
41
|
+
export declare function isRunAlive(status: EvalRunStatus): boolean;
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Run bookkeeping for backgrounded `cclaw eval` invocations.
|
|
3
|
+
*
|
|
4
|
+
* A backgrounded run writes three artifacts under `.cclaw/evals/runs/<id>/`:
|
|
5
|
+
*
|
|
6
|
+
* - `run.json` — status metadata (pid, started/ended ISO timestamps,
|
|
7
|
+
* exit code, argv, cwd). Updated at start and at exit.
|
|
8
|
+
* - `run.log` — combined stdout+stderr of the child process. This is
|
|
9
|
+
* what `cclaw eval runs tail` streams.
|
|
10
|
+
* - `run.pid` — just the pid, written atomically so `runs status`
|
|
11
|
+
* can probe liveness without parsing JSON.
|
|
12
|
+
*
|
|
13
|
+
* The `id` is a compact UTC timestamp followed by a 6-char random hex suffix,
|
|
14
|
+
* chosen so sorting directory entries by name produces a chronological
|
|
15
|
+
* listing without any extra work.
|
|
16
|
+
*/
|
|
17
|
+
import { randomBytes } from "node:crypto";
|
|
18
|
+
import fs from "node:fs/promises";
|
|
19
|
+
import path from "node:path";
|
|
20
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
21
|
+
import { exists } from "../fs-utils.js";
|
|
22
|
+
/** Name of the directory, beneath the evals root, in which every run gets its own folder. */
export const RUNS_DIR = "runs";
|
|
23
|
+
/**
 * Locate the directory that holds one folder per eval run.
 *
 * @param projectRoot Root directory of the project.
 * @returns `projectRoot` joined with `EVALS_ROOT` and `RUNS_DIR`.
 */
export function runsRoot(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, RUNS_DIR];
    return path.join(...segments);
}
|
|
26
|
+
/**
 * Locate the folder that belongs to a single run.
 *
 * @param projectRoot Root directory of the project.
 * @param id Run id; used verbatim as the folder name.
 * @returns The runs root joined with `id`.
 */
export function runDir(projectRoot, id) {
    const parent = runsRoot(projectRoot);
    return path.join(parent, id);
}
|
|
29
|
+
/**
 * Locate the log file of a run.
 *
 * @param projectRoot Root directory of the project.
 * @param id Run id.
 * @returns Path of `run.log` inside the run's folder.
 */
export function runLogPath(projectRoot, id) {
    const folder = runDir(projectRoot, id);
    return path.join(folder, "run.log");
}
|
|
32
|
+
/**
 * Locate the status file of a run.
 *
 * @param projectRoot Root directory of the project.
 * @param id Run id.
 * @returns Path of `run.json` inside the run's folder.
 */
export function runStatusPath(projectRoot, id) {
    const folder = runDir(projectRoot, id);
    return path.join(folder, "run.json");
}
|
|
35
|
+
/**
 * Build a run id of the form `<compact UTC timestamp>-<6 hex chars>`,
 * for example `20240102T030405Z-a1b2c3`.
 *
 * The timestamp comes first so that ids sort by name in chronological
 * order; the random suffix distinguishes runs started within the same
 * second.
 *
 * @param now Moment to stamp the id with; defaults to the current time.
 * @returns The generated id.
 */
export function generateRunId(now = new Date()) {
    const iso = now.toISOString();
    // Drop the date/time separators first, then the fractional seconds.
    const withoutSeparators = iso.replace(/[-:]/g, "");
    const stamp = withoutSeparators.replace(/\.\d+Z$/, "Z");
    const randomPart = randomBytes(3).toString("hex");
    return [stamp, randomPart].join("-");
}
|
|
45
|
+
/**
 * Make sure the folder for a run exists.
 *
 * @param projectRoot Root directory of the project.
 * @param id Run id.
 * @returns The path of the run folder.
 */
export async function ensureRunDir(projectRoot, id) {
    const target = path.join(runsRoot(projectRoot), id);
    // `recursive: true` also creates missing parent directories.
    await fs.mkdir(target, { recursive: true });
    return target;
}
|
|
50
|
+
/**
 * Persist a run's status record to its `run.json`.
 *
 * The run folder is created first if needed. The record is written as
 * 2-space indented JSON followed by a newline.
 *
 * @param projectRoot Root directory of the project.
 * @param status Status record; `status.id` selects the run folder.
 */
export async function writeRunStatus(projectRoot, status) {
    const { id } = status;
    await ensureRunDir(projectRoot, id);
    const destination = runStatusPath(projectRoot, id);
    const payload = `${JSON.stringify(status, null, 2)}\n`;
    await fs.writeFile(destination, payload, "utf8");
}
|
|
54
|
+
/**
 * Load the status record stored in a run's `run.json`.
 *
 * This is a best-effort read: a missing file, a read error, invalid
 * JSON, or JSON that is not a plain object (e.g. `42`, `"x"`, `[]`) all
 * yield `null`, so callers never receive a non-object as a status.
 *
 * @param projectRoot Root directory of the project.
 * @param id Run id.
 * @returns The parsed status object, or `null` when none can be loaded.
 */
export async function readRunStatus(projectRoot, id) {
    const file = runStatusPath(projectRoot, id);
    if (!(await exists(file)))
        return null;
    let parsed;
    try {
        parsed = JSON.parse(await fs.readFile(file, "utf8"));
    }
    catch {
        // Unreadable or unparsable file: treat the run as having no status.
        return null;
    }
    // JSON.parse accepts any JSON value; only an object can be a status record.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed))
        return null;
    return parsed;
}
|
|
66
|
+
/**
 * List the status records of the runs under the runs root, most recent
 * first (ordered by `startedAt`, compared as strings).
 *
 * Only directories are considered, and a directory is skipped when
 * `readRunStatus` yields nothing for it (half-initialized or manually
 * mkdir'd folders without a usable `run.json`).
 *
 * @param projectRoot Root directory of the project.
 * @returns Status records, newest first; empty when the runs root does not exist.
 */
export async function listRuns(projectRoot) {
    const root = runsRoot(projectRoot);
    if (!(await exists(root)))
        return [];
    const entries = await fs.readdir(root, { withFileTypes: true });
    const out = [];
    for (const entry of entries) {
        if (!entry.isDirectory())
            continue;
        const status = await readRunStatus(projectRoot, entry.name);
        if (status)
            out.push(status);
    }
    // Descending by start time. Equal keys must compare as 0 so the
    // comparator stays consistent, as Array.prototype.sort requires.
    out.sort((a, b) => {
        if (a.startedAt < b.startedAt)
            return 1;
        if (a.startedAt > b.startedAt)
            return -1;
        return 0;
    });
    return out;
}
|
|
87
|
+
/**
 * Turn a user-supplied run hint into a concrete run id.
 *
 * - `"latest"`, `undefined` or an empty hint resolve to the id of the
 *   most recent run, or `null` when there are no runs.
 * - Any other hint is treated as a run id and returned unchanged when a
 *   status can be read for it, otherwise `null`.
 *
 * @param projectRoot Root directory of the project.
 * @param hint Run id, `"latest"`, or nothing.
 * @returns The resolved run id, or `null`.
 */
export async function resolveRunId(projectRoot, hint) {
    const wantsLatest = !hint || hint === "latest";
    if (wantsLatest) {
        const [newest] = await listRuns(projectRoot);
        return newest?.id ?? null;
    }
    const known = await readRunStatus(projectRoot, hint);
    return known ? hint : null;
}
|
|
99
|
+
/**
 * Cheap liveness probe for an EvalRunStatus. A `run.json` can be stale
 * (process crashed mid-commit), so the `state: "running"` field is only
 * trusted after a `kill(pid, 0)` probe succeeds.
 *
 * @param status Status record to probe.
 * @returns `true` only when `state` is `"running"`, `pid` is a positive
 *   integer, and signalling that pid with signal 0 does not throw.
 */
export function isRunAlive(status) {
    if (status.state !== "running")
        return false;
    const { pid } = status;
    // A pid of 0 or below addresses a process group (or every process)
    // rather than one process, so the probe would succeed for a corrupt
    // record. Only a positive integer can identify the run.
    if (!Number.isInteger(pid) || pid <= 0)
        return false;
    try {
        // Signal 0 performs the existence check without delivering a signal.
        process.kill(pid, 0);
        return true;
    }
    catch {
        // Any failure of the probe is treated as "not alive".
        return false;
    }
}
|
package/dist/eval/sandbox.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Per-case sandbox for the
|
|
2
|
+
* Per-case sandbox for the with-tools agent (agent/workflow mode).
|
|
3
3
|
*
|
|
4
4
|
* Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
|
|
5
5
|
* `contextFiles` the case declares are copied in relative to the project
|
package/dist/eval/tools/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Registry of sandbox-confined tools used by the
|
|
2
|
+
* Registry of sandbox-confined tools used by the with-tools agent (agent/workflow mode).
|
|
3
3
|
*
|
|
4
4
|
* The registry order defines the advertised schema order in the
|
|
5
5
|
* function-calling payload. Keeping it stable means judges reading
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Shared types for
|
|
2
|
+
* Shared types for sandbox-confined tools (agent/workflow mode).
|
|
3
3
|
*
|
|
4
4
|
* Tools are plain async functions: they take validated arguments and a
|
|
5
5
|
* sandbox handle and return a structured result. The runner serializes
|
package/dist/eval/types.d.ts
CHANGED
|
@@ -11,21 +11,37 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import type { FlowStage } from "../types.js";
|
|
13
13
|
/**
|
|
14
|
-
*
|
|
14
|
+
* Evaluation mode — what the agent-under-test actually does.
|
|
15
15
|
*
|
|
16
|
-
* - `
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
16
|
+
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
17
|
+
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
18
|
+
* the API). Cheapest mode.
|
|
19
|
+
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
20
|
+
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
21
|
+
* previous single-shot path entirely.
|
|
22
|
+
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
23
|
+
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
24
|
+
*
|
|
25
|
+
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
26
|
+
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
27
|
+
*/
|
|
28
|
+
export declare const EVAL_MODES: readonly ["fixture", "agent", "workflow"];
|
|
29
|
+
export type EvalMode = (typeof EVAL_MODES)[number];
|
|
30
|
+
/**
|
|
31
|
+
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
32
|
+
* keep parsing. New code should always use `EvalMode`.
|
|
33
|
+
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
21
34
|
*/
|
|
22
35
|
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
|
|
36
|
+
/** @deprecated use `EvalMode`. */
|
|
23
37
|
export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
24
38
|
/**
|
|
25
39
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
26
40
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
41
|
+
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
42
|
+
* operates over multiple artifacts at once).
|
|
27
43
|
*/
|
|
28
|
-
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
|
|
44
|
+
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
|
|
29
45
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
46
|
/**
|
|
31
47
|
* Structural expectations — deterministic, LLM-free checks against a single
|
|
@@ -118,7 +134,7 @@ export interface TraceabilityExpected {
|
|
|
118
134
|
* LLM-judge expectations — Step 3.
|
|
119
135
|
*
|
|
120
136
|
* When present, the judge runs against the resolved artifact (live-agent
|
|
121
|
-
* output in
|
|
137
|
+
* output in agent/workflow mode, or the pre-generated fixture when `--judge` is
|
|
122
138
|
* combined with `--schema-only` for smoke tests). Every field below is
|
|
123
139
|
* optional; the case-level hint overlays the stage-level rubric loaded
|
|
124
140
|
* from `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
@@ -159,7 +175,7 @@ export interface EvalCase {
|
|
|
159
175
|
id: string;
|
|
160
176
|
stage: FlowStage;
|
|
161
177
|
inputPrompt: string;
|
|
162
|
-
/** Project files copied into the
|
|
178
|
+
/** Project files copied into the agent/workflow sandbox before the agent runs. */
|
|
163
179
|
contextFiles?: string[];
|
|
164
180
|
/**
|
|
165
181
|
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
@@ -194,11 +210,17 @@ export interface VerifierResult {
|
|
|
194
210
|
export interface EvalCaseResult {
|
|
195
211
|
caseId: string;
|
|
196
212
|
stage: FlowStage;
|
|
197
|
-
|
|
213
|
+
mode: EvalMode;
|
|
198
214
|
passed: boolean;
|
|
199
215
|
durationMs: number;
|
|
200
216
|
costUsd?: number;
|
|
201
217
|
verifierResults: VerifierResult[];
|
|
218
|
+
/**
|
|
219
|
+
* Only populated in `workflow` mode: per-stage breakdown collected by
|
|
220
|
+
* the workflow orchestrator. Unset for `fixture` / `agent` modes so the
|
|
221
|
+
* on-disk JSON stays small.
|
|
222
|
+
*/
|
|
223
|
+
workflow?: WorkflowRunSummary;
|
|
202
224
|
}
|
|
203
225
|
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
|
|
204
226
|
export interface EvalReport {
|
|
@@ -208,7 +230,7 @@ export interface EvalReport {
|
|
|
208
230
|
cclawVersion: string;
|
|
209
231
|
provider: string;
|
|
210
232
|
model: string;
|
|
211
|
-
|
|
233
|
+
mode: EvalMode;
|
|
212
234
|
stages: FlowStage[];
|
|
213
235
|
cases: EvalCaseResult[];
|
|
214
236
|
summary: {
|
|
@@ -238,8 +260,8 @@ export interface EvalConfig {
|
|
|
238
260
|
model: string;
|
|
239
261
|
/** Optional separate model for the judge role. Defaults to `model`. */
|
|
240
262
|
judgeModel?: string;
|
|
241
|
-
/** Default
|
|
242
|
-
|
|
263
|
+
/** Default mode when `--mode` is not supplied. */
|
|
264
|
+
defaultMode: EvalMode;
|
|
243
265
|
/** Optional hard stop on estimated USD spend per day. Unset = no cap. */
|
|
244
266
|
dailyUsdCap?: number;
|
|
245
267
|
/** Regression thresholds for CI gates. */
|
|
@@ -270,7 +292,7 @@ export interface EvalConfig {
|
|
|
270
292
|
tokenPricing?: Record<string, TokenPricing>;
|
|
271
293
|
/**
|
|
272
294
|
* Maximum assistant turns (tool_calls → tool result cycles) allowed by
|
|
273
|
-
* the
|
|
295
|
+
* the with-tools agent loop (agent/workflow mode). Defaults to 8. Runs that
|
|
274
296
|
* exceed the cap fail with a `MaxTurnsExceededError` and surface as a
|
|
275
297
|
* workflow verifier result.
|
|
276
298
|
*/
|
|
@@ -286,6 +308,14 @@ export interface EvalConfig {
|
|
|
286
308
|
* marker so the model sees the cutoff.
|
|
287
309
|
*/
|
|
288
310
|
toolMaxResultBytes?: number;
|
|
311
|
+
/**
|
|
312
|
+
* Maximum total turns a single workflow-mode case may consume
|
|
313
|
+
* across all stages combined. Defaults to 40 (stages × toolMaxTurns).
|
|
314
|
+
* Runs that exceed the cap fail the current stage with a
|
|
315
|
+
* `MaxTurnsExceededError` propagated from the underlying with-tools
|
|
316
|
+
* loop rather than a dedicated workflow-level error.
|
|
317
|
+
*/
|
|
318
|
+
workflowMaxTotalTurns?: number;
|
|
289
319
|
}
|
|
290
320
|
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
291
321
|
export interface TokenPricing {
|
|
@@ -310,6 +340,18 @@ export interface BaselineSnapshot {
|
|
|
310
340
|
cclawVersion: string;
|
|
311
341
|
/** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
|
|
312
342
|
cases: Record<string, BaselineCaseEntry>;
|
|
343
|
+
/**
|
|
344
|
+
* Tamper-evident signature computed as sha256 over the canonical JSON
|
|
345
|
+
* encoding of `{ schemaVersion, stage, cases }`. Present on files
|
|
346
|
+
* written by cclaw >= 0.28.0; older baselines load with `signature`
|
|
347
|
+
* absent and the loader skips verification.
|
|
348
|
+
*/
|
|
349
|
+
signature?: {
|
|
350
|
+
algorithm: "sha256";
|
|
351
|
+
digest: string;
|
|
352
|
+
/** ISO timestamp of when the digest was computed. */
|
|
353
|
+
signedAt: string;
|
|
354
|
+
};
|
|
313
355
|
}
|
|
314
356
|
export interface BaselineCaseEntry {
|
|
315
357
|
passed: boolean;
|
|
@@ -400,7 +442,7 @@ export interface JudgeInvocation {
|
|
|
400
442
|
durationMs: number;
|
|
401
443
|
}
|
|
402
444
|
/**
|
|
403
|
-
* Tool-use summary produced by the
|
|
445
|
+
* Tool-use summary produced by the with-tools agent loop. Captured so
|
|
404
446
|
* the runner can surface per-case tool metrics in the markdown report
|
|
405
447
|
* (number of calls, depth, error rate, denied paths).
|
|
406
448
|
*/
|
|
@@ -416,3 +458,104 @@ export interface ToolUseSummary {
|
|
|
416
458
|
/** Per-tool call counts, keyed by tool name. */
|
|
417
459
|
byTool: Record<string, number>;
|
|
418
460
|
}
|
|
461
|
+
/**
|
|
462
|
+
* Cross-stage consistency expectations for a workflow-mode case. Every
|
|
463
|
+
* sub-check is optional so authors can opt in incrementally; an empty
|
|
464
|
+
* block produces zero verifier results.
|
|
465
|
+
*/
|
|
466
|
+
export interface WorkflowConsistencyExpected {
|
|
467
|
+
/**
|
|
468
|
+
* For each rule, every id extracted from the `from` stage must appear in
|
|
469
|
+
* every listed `to` stage. Typical entry: `{ idPattern: "D-\\d+", from:
|
|
470
|
+
* "scope", to: ["plan"] }`. Guards the "decisions flow downstream" rule.
|
|
471
|
+
*/
|
|
472
|
+
idsFlow?: Array<{
|
|
473
|
+
idPattern: string;
|
|
474
|
+
idFlags?: string;
|
|
475
|
+
from: WorkflowStageName;
|
|
476
|
+
to: WorkflowStageName[];
|
|
477
|
+
}>;
|
|
478
|
+
/**
|
|
479
|
+
* Stages that must not contain any of the listed case-insensitive
|
|
480
|
+
* phrases. Defaults to `["TBD", "TODO", "placeholder"]` when set to an
|
|
481
|
+
* empty array; omit entirely to skip the check.
|
|
482
|
+
*/
|
|
483
|
+
placeholderFree?: {
|
|
484
|
+
stages: WorkflowStageName[];
|
|
485
|
+
phrases?: string[];
|
|
486
|
+
};
|
|
487
|
+
/**
|
|
488
|
+
* Free-form substring pairs: for every entry, if `must` appears in the
|
|
489
|
+
* named stage, `forbid` must NOT appear anywhere in the listed
|
|
490
|
+
* `stages`. Useful for "v1 decided in scope, plan must not say v2".
|
|
491
|
+
*/
|
|
492
|
+
noContradictions?: Array<{
|
|
493
|
+
stage: WorkflowStageName;
|
|
494
|
+
must: string;
|
|
495
|
+
forbid: string;
|
|
496
|
+
stages: WorkflowStageName[];
|
|
497
|
+
}>;
|
|
498
|
+
}
|
|
499
|
+
/**
|
|
500
|
+
* A single stage step inside a workflow-mode case. The stage's
|
|
501
|
+
* `inputPrompt` is handed to the with-tools agent loop with prior-stage
|
|
502
|
+
* artifacts seeded into the sandbox under `stages/<name>.md`.
|
|
503
|
+
*/
|
|
504
|
+
export interface WorkflowStageStep {
|
|
505
|
+
name: WorkflowStageName;
|
|
506
|
+
inputPrompt: string;
|
|
507
|
+
/** Per-stage rubric id override (defaults to the stage name). */
|
|
508
|
+
rubric?: string;
|
|
509
|
+
/** Per-stage required rubric check ids (mirror of JudgeExpected.requiredChecks). */
|
|
510
|
+
requiredChecks?: string[];
|
|
511
|
+
/** Per-stage minimum rubric scores (mirror of JudgeExpected.minimumScores). */
|
|
512
|
+
minimumScores?: Record<string, number>;
|
|
513
|
+
}
|
|
514
|
+
/**
|
|
515
|
+
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
516
|
+
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
517
|
+
* are out of scope (they require real code execution).
|
|
518
|
+
*/
|
|
519
|
+
export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
|
|
520
|
+
export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
|
|
521
|
+
/**
|
|
522
|
+
* A workflow-mode case. Lives under
|
|
523
|
+
* `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
|
|
524
|
+
* through the with-tools agent.
|
|
525
|
+
*/
|
|
526
|
+
export interface WorkflowCase {
|
|
527
|
+
id: string;
|
|
528
|
+
/** Short human-readable description (rendered in reports). */
|
|
529
|
+
description?: string;
|
|
530
|
+
/** Project files seeded into the sandbox before stage 1 runs. */
|
|
531
|
+
contextFiles?: string[];
|
|
532
|
+
/** Ordered list of stages to run. Must be non-empty. */
|
|
533
|
+
stages: WorkflowStageStep[];
|
|
534
|
+
/** Cross-stage consistency checks (workflow-mode verifier family). */
|
|
535
|
+
consistency?: WorkflowConsistencyExpected;
|
|
536
|
+
}
|
|
537
|
+
/** Per-stage record inside a workflow-mode run. */
|
|
538
|
+
export interface WorkflowStageResult {
|
|
539
|
+
stage: WorkflowStageName;
|
|
540
|
+
artifact: string;
|
|
541
|
+
durationMs: number;
|
|
542
|
+
usageUsd: number;
|
|
543
|
+
toolUse: ToolUseSummary;
|
|
544
|
+
attempts: number;
|
|
545
|
+
model: string;
|
|
546
|
+
promptTokens: number;
|
|
547
|
+
completionTokens: number;
|
|
548
|
+
/** True when the judge (when requested) produced `ok:true` for every required check. */
|
|
549
|
+
judgeOk?: boolean;
|
|
550
|
+
/** Per-rubric-check medians keyed by check id (for the report). */
|
|
551
|
+
judgeMedians?: Record<string, number>;
|
|
552
|
+
}
|
|
553
|
+
/** Workflow-mode orchestration output collected by the runner. */
|
|
554
|
+
export interface WorkflowRunSummary {
|
|
555
|
+
caseId: string;
|
|
556
|
+
stages: WorkflowStageResult[];
|
|
557
|
+
totalUsageUsd: number;
|
|
558
|
+
totalDurationMs: number;
|
|
559
|
+
/** True when every stage judge was ok (or judge was skipped everywhere). */
|
|
560
|
+
allJudgeOk: boolean;
|
|
561
|
+
}
|
package/dist/eval/types.js
CHANGED
|
@@ -1,15 +1,47 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Evaluation mode — what the agent-under-test actually does.
|
|
3
3
|
*
|
|
4
|
-
* - `
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
5
|
+
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
6
|
+
* the API). Cheapest mode.
|
|
7
|
+
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
8
|
+
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
9
|
+
* previous single-shot path entirely.
|
|
10
|
+
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
11
|
+
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
12
|
+
*
|
|
13
|
+
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
14
|
+
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
15
|
+
*/
|
|
16
|
+
export const EVAL_MODES = ["fixture", "agent", "workflow"];
|
|
17
|
+
/**
|
|
18
|
+
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
19
|
+
* keep parsing. New code should always use `EvalMode`.
|
|
20
|
+
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
9
21
|
*/
|
|
10
22
|
export const EVAL_TIERS = ["A", "B", "C"];
|
|
11
23
|
/**
|
|
12
24
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
13
25
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
26
|
+
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
27
|
+
* operates over multiple artifacts at once).
|
|
28
|
+
*/
|
|
29
|
+
export const VERIFIER_KINDS = [
|
|
30
|
+
"structural",
|
|
31
|
+
"rules",
|
|
32
|
+
"judge",
|
|
33
|
+
"workflow",
|
|
34
|
+
"consistency"
|
|
35
|
+
];
|
|
36
|
+
/**
|
|
37
|
+
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
38
|
+
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
39
|
+
* are out of scope (they require real code execution).
|
|
14
40
|
*/
|
|
15
|
-
export const
|
|
41
|
+
export const WORKFLOW_STAGES = [
|
|
42
|
+
"brainstorm",
|
|
43
|
+
"scope",
|
|
44
|
+
"design",
|
|
45
|
+
"spec",
|
|
46
|
+
"plan"
|
|
47
|
+
];
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-artifact consistency verifier for workflow mode.
|
|
3
|
+
*
|
|
4
|
+
* Operates over a `{ stage → artifact }` map produced by the workflow
|
|
5
|
+
* agent and emits deterministic verifier results for:
|
|
6
|
+
*
|
|
7
|
+
* - `ids_flow`: every id extracted from `from` must appear in every
|
|
8
|
+
* `to` stage. Typical use — `D-\d+` from scope must all land in plan.
|
|
9
|
+
* - `placeholder_free`: none of the listed phrases
|
|
10
|
+
* (default `TBD`/`TODO`/`placeholder`) appear in any of the named
|
|
11
|
+
* stages.
|
|
12
|
+
* - `no_contradictions`: for each entry, if `must` is present in the
|
|
13
|
+
* declaring stage, `forbid` must not appear in any of the listed
|
|
14
|
+
* `stages`.
|
|
15
|
+
*
|
|
16
|
+
* Each sub-check contributes zero or more `VerifierResult`s with
|
|
17
|
+
* `kind: "consistency"`. An empty `WorkflowConsistencyExpected` produces
|
|
18
|
+
* zero results so authors can opt in incrementally.
|
|
19
|
+
*/
|
|
20
|
+
import type { VerifierResult, WorkflowConsistencyExpected, WorkflowStageName } from "../types.js";
|
|
21
|
+
export declare function verifyWorkflowConsistency(artifacts: Map<WorkflowStageName, string>, expected: WorkflowConsistencyExpected | undefined): VerifierResult[];
|