cclaw-cli 0.49.0 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. package/README.md +54 -82
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +51 -9
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +2 -0
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +31 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/learnings.d.ts +3 -4
  28. package/dist/content/learnings.js +24 -50
  29. package/dist/content/meta-skill.js +31 -21
  30. package/dist/content/next-command.js +38 -38
  31. package/dist/content/node-hooks.js +17 -343
  32. package/dist/content/opencode-plugin.js +2 -100
  33. package/dist/content/research-playbooks.js +14 -14
  34. package/dist/content/review-loop.d.ts +2 -0
  35. package/dist/content/review-loop.js +8 -0
  36. package/dist/content/session-hooks.js +14 -46
  37. package/dist/content/skills.d.ts +0 -5
  38. package/dist/content/skills.js +53 -128
  39. package/dist/content/stage-common-guidance.d.ts +0 -1
  40. package/dist/content/stage-common-guidance.js +15 -14
  41. package/dist/content/stage-schema.d.ts +26 -1
  42. package/dist/content/stage-schema.js +121 -40
  43. package/dist/content/stages/_lint-metadata/index.js +9 -15
  44. package/dist/content/stages/brainstorm.js +22 -43
  45. package/dist/content/stages/design.js +37 -57
  46. package/dist/content/stages/plan.js +22 -13
  47. package/dist/content/stages/review.js +24 -27
  48. package/dist/content/stages/scope.js +34 -46
  49. package/dist/content/stages/ship.js +7 -4
  50. package/dist/content/stages/spec.js +20 -9
  51. package/dist/content/stages/tdd.js +64 -44
  52. package/dist/content/start-command.js +10 -12
  53. package/dist/content/status-command.d.ts +2 -7
  54. package/dist/content/status-command.js +19 -146
  55. package/dist/content/subagents.d.ts +0 -5
  56. package/dist/content/subagents.js +47 -28
  57. package/dist/content/templates.d.ts +1 -1
  58. package/dist/content/templates.js +126 -135
  59. package/dist/content/track-render-context.d.ts +17 -0
  60. package/dist/content/track-render-context.js +44 -0
  61. package/dist/content/tree-command.d.ts +1 -2
  62. package/dist/content/tree-command.js +4 -87
  63. package/dist/content/utility-skills.d.ts +2 -29
  64. package/dist/content/utility-skills.js +2 -1534
  65. package/dist/content/view-command.js +29 -11
  66. package/dist/delegation.d.ts +1 -1
  67. package/dist/delegation.js +5 -15
  68. package/dist/doctor-registry.js +20 -21
  69. package/dist/doctor.js +88 -344
  70. package/dist/flow-state.d.ts +3 -0
  71. package/dist/flow-state.js +2 -0
  72. package/dist/harness-adapters.d.ts +1 -1
  73. package/dist/harness-adapters.js +48 -57
  74. package/dist/install.js +128 -358
  75. package/dist/internal/advance-stage.js +3 -9
  76. package/dist/internal/compound-readiness.d.ts +1 -1
  77. package/dist/internal/compound-readiness.js +1 -1
  78. package/dist/internal/tdd-loop-status.d.ts +1 -1
  79. package/dist/internal/tdd-loop-status.js +1 -1
  80. package/dist/knowledge-store.d.ts +16 -10
  81. package/dist/knowledge-store.js +51 -15
  82. package/dist/policy.js +16 -105
  83. package/dist/run-archive.d.ts +4 -6
  84. package/dist/run-archive.js +15 -20
  85. package/dist/run-persistence.d.ts +2 -2
  86. package/dist/run-persistence.js +3 -9
  87. package/package.json +1 -2
  88. package/dist/content/archive-command.d.ts +0 -2
  89. package/dist/content/archive-command.js +0 -124
  90. package/dist/content/compound-command.d.ts +0 -5
  91. package/dist/content/compound-command.js +0 -193
  92. package/dist/content/contexts.d.ts +0 -18
  93. package/dist/content/contexts.js +0 -24
  94. package/dist/content/contracts.d.ts +0 -2
  95. package/dist/content/contracts.js +0 -51
  96. package/dist/content/doctor-references.d.ts +0 -2
  97. package/dist/content/doctor-references.js +0 -150
  98. package/dist/content/eval-scaffold.d.ts +0 -15
  99. package/dist/content/eval-scaffold.js +0 -370
  100. package/dist/content/feature-command.d.ts +0 -2
  101. package/dist/content/feature-command.js +0 -123
  102. package/dist/content/flow-map.d.ts +0 -23
  103. package/dist/content/flow-map.js +0 -134
  104. package/dist/content/harness-doc.d.ts +0 -2
  105. package/dist/content/harness-doc.js +0 -202
  106. package/dist/content/harness-playbooks.d.ts +0 -24
  107. package/dist/content/harness-playbooks.js +0 -393
  108. package/dist/content/harness-tool-refs.d.ts +0 -20
  109. package/dist/content/harness-tool-refs.js +0 -268
  110. package/dist/content/ops-command.d.ts +0 -2
  111. package/dist/content/ops-command.js +0 -71
  112. package/dist/content/protocols.d.ts +0 -7
  113. package/dist/content/protocols.js +0 -215
  114. package/dist/content/retro-command.d.ts +0 -2
  115. package/dist/content/retro-command.js +0 -165
  116. package/dist/content/rewind-command.d.ts +0 -2
  117. package/dist/content/rewind-command.js +0 -106
  118. package/dist/content/tdd-log-command.d.ts +0 -2
  119. package/dist/content/tdd-log-command.js +0 -85
  120. package/dist/eval/agents/single-shot.d.ts +0 -27
  121. package/dist/eval/agents/single-shot.js +0 -79
  122. package/dist/eval/agents/with-tools.d.ts +0 -44
  123. package/dist/eval/agents/with-tools.js +0 -261
  124. package/dist/eval/agents/workflow.d.ts +0 -31
  125. package/dist/eval/agents/workflow.js +0 -155
  126. package/dist/eval/baseline.d.ts +0 -38
  127. package/dist/eval/baseline.js +0 -282
  128. package/dist/eval/config-loader.d.ts +0 -14
  129. package/dist/eval/config-loader.js +0 -395
  130. package/dist/eval/corpus.d.ts +0 -30
  131. package/dist/eval/corpus.js +0 -330
  132. package/dist/eval/cost-guard.d.ts +0 -102
  133. package/dist/eval/cost-guard.js +0 -190
  134. package/dist/eval/diff.d.ts +0 -64
  135. package/dist/eval/diff.js +0 -323
  136. package/dist/eval/llm-client.d.ts +0 -176
  137. package/dist/eval/llm-client.js +0 -267
  138. package/dist/eval/mode.d.ts +0 -28
  139. package/dist/eval/mode.js +0 -61
  140. package/dist/eval/progress.d.ts +0 -83
  141. package/dist/eval/progress.js +0 -59
  142. package/dist/eval/report.d.ts +0 -11
  143. package/dist/eval/report.js +0 -181
  144. package/dist/eval/rubric-loader.d.ts +0 -20
  145. package/dist/eval/rubric-loader.js +0 -143
  146. package/dist/eval/runner.d.ts +0 -81
  147. package/dist/eval/runner.js +0 -746
  148. package/dist/eval/runs.d.ts +0 -41
  149. package/dist/eval/runs.js +0 -114
  150. package/dist/eval/sandbox.d.ts +0 -38
  151. package/dist/eval/sandbox.js +0 -137
  152. package/dist/eval/tools/glob.d.ts +0 -2
  153. package/dist/eval/tools/glob.js +0 -163
  154. package/dist/eval/tools/grep.d.ts +0 -2
  155. package/dist/eval/tools/grep.js +0 -152
  156. package/dist/eval/tools/index.d.ts +0 -7
  157. package/dist/eval/tools/index.js +0 -35
  158. package/dist/eval/tools/read.d.ts +0 -2
  159. package/dist/eval/tools/read.js +0 -122
  160. package/dist/eval/tools/types.d.ts +0 -49
  161. package/dist/eval/tools/types.js +0 -41
  162. package/dist/eval/tools/write.d.ts +0 -2
  163. package/dist/eval/tools/write.js +0 -92
  164. package/dist/eval/types.d.ts +0 -561
  165. package/dist/eval/types.js +0 -47
  166. package/dist/eval/verifiers/judge.d.ts +0 -40
  167. package/dist/eval/verifiers/judge.js +0 -256
  168. package/dist/eval/verifiers/rules.d.ts +0 -24
  169. package/dist/eval/verifiers/rules.js +0 -218
  170. package/dist/eval/verifiers/structural.d.ts +0 -14
  171. package/dist/eval/verifiers/structural.js +0 -171
  172. package/dist/eval/verifiers/traceability.d.ts +0 -23
  173. package/dist/eval/verifiers/traceability.js +0 -84
  174. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  175. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  176. package/dist/eval/workflow-corpus.d.ts +0 -7
  177. package/dist/eval/workflow-corpus.js +0 -207
  178. package/dist/feature-system.d.ts +0 -42
  179. package/dist/feature-system.js +0 -432
  180. package/dist/internal/knowledge-digest.d.ts +0 -7
  181. package/dist/internal/knowledge-digest.js +0 -93
package/dist/eval/diff.d.ts DELETED
@@ -1,64 +0,0 @@
1
- import type { EvalReport } from "./types.js";
2
- export interface EvalDiffInput {
3
- projectRoot: string;
4
- /** Version string, filename, or "latest". */
5
- old: string;
6
- /** Version string, filename, or "latest". */
7
- new: string;
8
- }
9
- export interface EvalDiffCaseEntry {
10
- caseId: string;
11
- stage: string;
12
- /** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
13
- transition: "same" | "regressed" | "recovered" | "added" | "removed";
14
- previousPassed?: boolean;
15
- currentPassed?: boolean;
16
- durationDeltaMs?: number;
17
- costDeltaUsd?: number;
18
- verifierDeltas: EvalDiffVerifierEntry[];
19
- stageDeltas?: EvalDiffStageEntry[];
20
- }
21
- export interface EvalDiffVerifierEntry {
22
- verifierId: string;
23
- kind: string;
24
- transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
25
- previousScore?: number;
26
- currentScore?: number;
27
- previousOk?: boolean;
28
- currentOk?: boolean;
29
- }
30
- export interface EvalDiffStageEntry {
31
- stage: string;
32
- durationDeltaMs: number;
33
- costDeltaUsd: number;
34
- turnsDelta: number;
35
- callsDelta: number;
36
- }
37
- export interface EvalDiffReport {
38
- old: EvalDiffReportMeta;
39
- new: EvalDiffReportMeta;
40
- summaryDelta: {
41
- totalCasesDelta: number;
42
- passedDelta: number;
43
- failedDelta: number;
44
- skippedDelta: number;
45
- totalCostUsdDelta: number;
46
- totalDurationMsDelta: number;
47
- };
48
- cases: EvalDiffCaseEntry[];
49
- /** True when any case regressed or any verifier dropped. */
50
- regressed: boolean;
51
- }
52
- export interface EvalDiffReportMeta {
53
- runId: string;
54
- cclawVersion: string;
55
- generatedAt: string;
56
- mode: string;
57
- model: string;
58
- sourcePath: string;
59
- }
60
- export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
61
- export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
62
- export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
63
- /** Render the diff as a terse human-readable Markdown block. */
64
- export declare function formatDiffMarkdown(diff: EvalDiffReport): string;
package/dist/eval/diff.js DELETED
@@ -1,323 +0,0 @@
1
- /**
2
- * `cclaw eval diff <old> <new>` — side-by-side report comparison.
3
- *
4
- * Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
5
- * explicit filename) and emits a compact human-readable + JSON diff:
6
- *
7
- * - summary-level deltas (passed/failed/cost/duration)
8
- * - per-case pass/fail transitions
9
- * - per-verifier score drops (only the drops — new passes are noted in
10
- * the summary line, not repeated per verifier)
11
- * - Workflow-mode stage-level cost & duration deltas when both reports
12
- * carry a `workflow` summary for the same case id
13
- *
14
- * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
15
- *
16
- * 1. A bare version string (`0.26.0`) — matched against any report JSON
17
- * whose `cclawVersion` field equals the string.
18
- * 2. A full or relative filename (`eval-2026-04-17T...-abc123.json`).
19
- * 3. The literal `latest` — picks the most recent report on disk by
20
- * mtime.
21
- *
22
- * The diff is deterministic: sorted by case id, then verifier id. Missing
23
- * cases in one report show up as `added` or `removed` so callers can see
24
- * which corpus changes slipped in between versions.
25
- */
26
- import fs from "node:fs/promises";
27
- import path from "node:path";
28
- import { EVALS_ROOT } from "../constants.js";
29
- import { exists } from "../fs-utils.js";
30
- const SCORE_DROP_EPSILON = 0.0001;
31
- export async function resolveReportPath(projectRoot, selector) {
32
- const dir = path.join(projectRoot, EVALS_ROOT, "reports");
33
- if (!(await exists(dir))) {
34
- throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
35
- `Run \`cclaw eval\` at least once before comparing reports.`);
36
- }
37
- const trimmed = selector.trim();
38
- if (trimmed.length === 0) {
39
- throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
40
- }
41
- // 1. Explicit filename (absolute or relative).
42
- const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
43
- if (await exists(asPath))
44
- return asPath;
45
- if (trimmed.endsWith(".json") && (await exists(asPath)))
46
- return asPath;
47
- const entries = await fs.readdir(dir, { withFileTypes: true });
48
- const jsonFiles = entries
49
- .filter((e) => e.isFile() && e.name.endsWith(".json"))
50
- .map((e) => path.join(dir, e.name));
51
- if (jsonFiles.length === 0) {
52
- throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
53
- }
54
- if (trimmed === "latest") {
55
- let latest = jsonFiles[0];
56
- let latestMtime = (await fs.stat(latest)).mtimeMs;
57
- for (const f of jsonFiles.slice(1)) {
58
- const stat = await fs.stat(f);
59
- if (stat.mtimeMs > latestMtime) {
60
- latest = f;
61
- latestMtime = stat.mtimeMs;
62
- }
63
- }
64
- return latest;
65
- }
66
- // 3. Version match — pick most recent by mtime among matches.
67
- const matches = [];
68
- for (const file of jsonFiles) {
69
- try {
70
- const raw = await fs.readFile(file, "utf8");
71
- const parsed = JSON.parse(raw);
72
- if (parsed.cclawVersion === trimmed) {
73
- const stat = await fs.stat(file);
74
- matches.push({ file, mtimeMs: stat.mtimeMs });
75
- }
76
- }
77
- catch {
78
- continue;
79
- }
80
- }
81
- if (matches.length === 0) {
82
- throw new Error(`No report matched selector "${selector}". ` +
83
- `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
84
- }
85
- matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
86
- return matches[0].file;
87
- }
88
- async function loadReport(filePath) {
89
- const raw = await fs.readFile(filePath, "utf8");
90
- const parsed = JSON.parse(raw);
91
- if (parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
92
- throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
93
- }
94
- return parsed;
95
- }
96
- function meta(report, sourcePath) {
97
- return {
98
- runId: report.runId,
99
- cclawVersion: report.cclawVersion,
100
- generatedAt: report.generatedAt,
101
- mode: report.mode,
102
- model: report.model,
103
- sourcePath
104
- };
105
- }
106
- function verifierMap(results) {
107
- const out = new Map();
108
- for (const v of results)
109
- out.set(v.id, v);
110
- return out;
111
- }
112
- function diffCase(caseId, previous, current) {
113
- const stage = (current ?? previous).stage;
114
- if (!previous) {
115
- return {
116
- caseId,
117
- stage,
118
- transition: "added",
119
- currentPassed: current?.passed,
120
- verifierDeltas: []
121
- };
122
- }
123
- if (!current) {
124
- return {
125
- caseId,
126
- stage,
127
- transition: "removed",
128
- previousPassed: previous.passed,
129
- verifierDeltas: []
130
- };
131
- }
132
- const transition = previous.passed === current.passed
133
- ? "same"
134
- : previous.passed && !current.passed
135
- ? "regressed"
136
- : "recovered";
137
- const prevMap = verifierMap(previous.verifierResults);
138
- const currMap = verifierMap(current.verifierResults);
139
- const verifierDeltas = [];
140
- const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
141
- for (const id of [...allIds].sort((a, b) => a.localeCompare(b))) {
142
- const p = prevMap.get(id);
143
- const c = currMap.get(id);
144
- const kind = (c ?? p).kind;
145
- if (!p && c) {
146
- verifierDeltas.push({
147
- verifierId: id,
148
- kind,
149
- transition: "added",
150
- currentOk: c.ok,
151
- ...(c.score !== undefined ? { currentScore: c.score } : {})
152
- });
153
- continue;
154
- }
155
- if (p && !c) {
156
- verifierDeltas.push({
157
- verifierId: id,
158
- kind,
159
- transition: "removed",
160
- previousOk: p.ok,
161
- ...(p.score !== undefined ? { previousScore: p.score } : {})
162
- });
163
- continue;
164
- }
165
- if (!p || !c)
166
- continue;
167
- const okChanged = p.ok !== c.ok;
168
- const scoreChanged = typeof p.score === "number" &&
169
- typeof c.score === "number" &&
170
- Math.abs(p.score - c.score) > SCORE_DROP_EPSILON;
171
- if (!okChanged && !scoreChanged)
172
- continue;
173
- const entry = {
174
- verifierId: id,
175
- kind,
176
- transition: okChanged
177
- ? p.ok
178
- ? "regressed"
179
- : "recovered"
180
- : typeof p.score === "number" &&
181
- typeof c.score === "number" &&
182
- c.score < p.score
183
- ? "score-drop"
184
- : "same",
185
- previousOk: p.ok,
186
- currentOk: c.ok
187
- };
188
- if (typeof p.score === "number")
189
- entry.previousScore = p.score;
190
- if (typeof c.score === "number")
191
- entry.currentScore = c.score;
192
- if (entry.transition !== "same")
193
- verifierDeltas.push(entry);
194
- }
195
- const caseEntry = {
196
- caseId,
197
- stage,
198
- transition,
199
- previousPassed: previous.passed,
200
- currentPassed: current.passed,
201
- durationDeltaMs: current.durationMs - previous.durationMs,
202
- verifierDeltas
203
- };
204
- const costDelta = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
205
- if (Math.abs(costDelta) > SCORE_DROP_EPSILON) {
206
- caseEntry.costDeltaUsd = Number(costDelta.toFixed(6));
207
- }
208
- if (previous.workflow && current.workflow) {
209
- const prevStages = new Map();
210
- for (const s of previous.workflow.stages)
211
- prevStages.set(s.stage, s);
212
- const stageDeltas = [];
213
- for (const curStage of current.workflow.stages) {
214
- const prevStage = prevStages.get(curStage.stage);
215
- if (!prevStage)
216
- continue;
217
- stageDeltas.push({
218
- stage: curStage.stage,
219
- durationDeltaMs: curStage.durationMs - prevStage.durationMs,
220
- costDeltaUsd: Number((curStage.usageUsd - prevStage.usageUsd).toFixed(6)),
221
- turnsDelta: curStage.toolUse.turns - prevStage.toolUse.turns,
222
- callsDelta: curStage.toolUse.calls - prevStage.toolUse.calls
223
- });
224
- }
225
- if (stageDeltas.length > 0)
226
- caseEntry.stageDeltas = stageDeltas;
227
- }
228
- return caseEntry;
229
- }
230
- export function diffReports(previous, current, prevPath, currPath) {
231
- const prevMap = new Map();
232
- const currMap = new Map();
233
- for (const c of previous.cases)
234
- prevMap.set(c.caseId, c);
235
- for (const c of current.cases)
236
- currMap.set(c.caseId, c);
237
- const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
238
- const cases = [...allIds]
239
- .sort((a, b) => a.localeCompare(b))
240
- .map((id) => diffCase(id, prevMap.get(id), currMap.get(id)));
241
- const regressed = cases.some((c) => c.transition === "regressed" ||
242
- c.transition === "removed" ||
243
- c.verifierDeltas.some((v) => v.transition === "regressed" || v.transition === "score-drop"));
244
- return {
245
- old: meta(previous, prevPath),
246
- new: meta(current, currPath),
247
- summaryDelta: {
248
- totalCasesDelta: current.summary.totalCases - previous.summary.totalCases,
249
- passedDelta: current.summary.passed - previous.summary.passed,
250
- failedDelta: current.summary.failed - previous.summary.failed,
251
- skippedDelta: current.summary.skipped - previous.summary.skipped,
252
- totalCostUsdDelta: Number((current.summary.totalCostUsd - previous.summary.totalCostUsd).toFixed(6)),
253
- totalDurationMsDelta: current.summary.totalDurationMs - previous.summary.totalDurationMs
254
- },
255
- cases,
256
- regressed
257
- };
258
- }
259
- export async function runEvalDiff(input) {
260
- const [oldPath, newPath] = await Promise.all([
261
- resolveReportPath(input.projectRoot, input.old),
262
- resolveReportPath(input.projectRoot, input.new)
263
- ]);
264
- const [oldReport, newReport] = await Promise.all([
265
- loadReport(oldPath),
266
- loadReport(newPath)
267
- ]);
268
- return diffReports(oldReport, newReport, oldPath, newPath);
269
- }
270
- /** Render the diff as a terse human-readable Markdown block. */
271
- export function formatDiffMarkdown(diff) {
272
- const lines = [];
273
- lines.push(`# cclaw eval diff`);
274
- lines.push(``);
275
- lines.push(`- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`);
276
- lines.push(`- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`);
277
- lines.push(`- regressed: ${diff.regressed ? "yes" : "no"}`);
278
- lines.push(``);
279
- lines.push(`## Summary delta`);
280
- lines.push(``);
281
- const sd = diff.summaryDelta;
282
- lines.push(`| metric | delta |`);
283
- lines.push(`| --- | --- |`);
284
- lines.push(`| total cases | ${sd.totalCasesDelta >= 0 ? "+" : ""}${sd.totalCasesDelta} |`);
285
- lines.push(`| passed | ${sd.passedDelta >= 0 ? "+" : ""}${sd.passedDelta} |`);
286
- lines.push(`| failed | ${sd.failedDelta >= 0 ? "+" : ""}${sd.failedDelta} |`);
287
- lines.push(`| skipped | ${sd.skippedDelta >= 0 ? "+" : ""}${sd.skippedDelta} |`);
288
- lines.push(`| cost (USD) | ${sd.totalCostUsdDelta >= 0 ? "+" : ""}${sd.totalCostUsdDelta.toFixed(4)} |`);
289
- lines.push(`| duration (ms) | ${sd.totalDurationMsDelta >= 0 ? "+" : ""}${sd.totalDurationMsDelta} |`);
290
- lines.push(``);
291
- const noisyCases = diff.cases.filter((c) => c.transition !== "same" || c.verifierDeltas.length > 0);
292
- if (noisyCases.length === 0) {
293
- lines.push(`No case-level changes.`);
294
- lines.push(``);
295
- return `${lines.join("\n")}\n`;
296
- }
297
- lines.push(`## Case changes`);
298
- lines.push(``);
299
- lines.push(`| case id | stage | transition | prev | curr |`);
300
- lines.push(`| --- | --- | --- | --- | --- |`);
301
- for (const c of noisyCases) {
302
- const prev = c.previousPassed === undefined ? "-" : c.previousPassed ? "pass" : "fail";
303
- const curr = c.currentPassed === undefined ? "-" : c.currentPassed ? "pass" : "fail";
304
- lines.push(`| ${c.caseId} | ${c.stage} | ${c.transition} | ${prev} | ${curr} |`);
305
- }
306
- lines.push(``);
307
- const withVerifiers = noisyCases.filter((c) => c.verifierDeltas.length > 0);
308
- if (withVerifiers.length > 0) {
309
- lines.push(`## Verifier changes`);
310
- lines.push(``);
311
- lines.push(`| case id | verifier | kind | transition | prev score | curr score |`);
312
- lines.push(`| --- | --- | --- | --- | --- | --- |`);
313
- for (const c of withVerifiers) {
314
- for (const v of c.verifierDeltas) {
315
- const prev = v.previousScore !== undefined ? v.previousScore.toFixed(2) : "-";
316
- const curr = v.currentScore !== undefined ? v.currentScore.toFixed(2) : "-";
317
- lines.push(`| ${c.caseId} | ${v.verifierId} | ${v.kind} | ${v.transition} | ${prev} | ${curr} |`);
318
- }
319
- }
320
- lines.push(``);
321
- }
322
- return `${lines.join("\n")}\n`;
323
- }
package/dist/eval/llm-client.d.ts DELETED
@@ -1,176 +0,0 @@
1
- import type { ClientOptions } from "openai";
2
- import type { ResolvedEvalConfig } from "./types.js";
3
- export interface ChatMessage {
4
- role: "system" | "user" | "assistant" | "tool";
5
- content: string;
6
- name?: string;
7
- toolCallId?: string;
8
- /**
9
- * OpenAI-style tool calls carried on a preceding assistant message.
10
- * Populated by the with-tools loop so the wire transcript stays
11
- * consistent (assistant message → tool responses).
12
- */
13
- toolCalls?: Array<{
14
- id: string;
15
- name: string;
16
- arguments: string;
17
- }>;
18
- }
19
- export interface ChatRequest {
20
- model: string;
21
- messages: ChatMessage[];
22
- maxTokens?: number;
23
- temperature?: number;
24
- /** Per-call timeout override. Falls back to `config.timeoutMs`. */
25
- timeoutMs?: number;
26
- /**
27
- * Ask the provider for a JSON-object response. The judge pipeline sets
28
- * this; the agent-under-test usually leaves it unset.
29
- */
30
- responseFormatJson?: boolean;
31
- /**
32
- * Optional deterministic sampling seed. Providers that don't implement
33
- * `seed` simply ignore it.
34
- */
35
- seed?: number;
36
- /**
37
- * Tool/function-calling definitions in OpenAI wire format. Populated only
38
- * by agent/workflow modes. Ignored by the single-shot path.
39
- */
40
- tools?: unknown[];
41
- toolChoice?: "auto" | "none";
42
- }
43
- export interface ChatUsage {
44
- promptTokens: number;
45
- completionTokens: number;
46
- totalTokens: number;
47
- }
48
- export interface ChatResponse {
49
- content: string;
50
- toolCalls?: Array<{
51
- id: string;
52
- name: string;
53
- arguments: string;
54
- }>;
55
- usage: ChatUsage;
56
- finishReason: "stop" | "length" | "tool_calls" | "content_filter";
57
- model: string;
58
- attempts: number;
59
- }
60
- /** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
61
- export declare class EvalLlmError extends Error {
62
- readonly retryable: boolean;
63
- readonly status?: number;
64
- constructor(message: string, opts: {
65
- retryable: boolean;
66
- status?: number;
67
- cause?: unknown;
68
- });
69
- }
70
- export declare class EvalLlmAuthError extends EvalLlmError {
71
- constructor(cause: unknown);
72
- }
73
- export declare class EvalLlmConfigError extends EvalLlmError {
74
- constructor(message: string, cause?: unknown);
75
- }
76
- export declare class EvalLlmTimeoutError extends EvalLlmError {
77
- constructor(timeoutMs: number);
78
- }
79
- export declare class EvalLlmRateLimitedError extends EvalLlmError {
80
- constructor(cause: unknown);
81
- }
82
- export declare class EvalLlmTransportError extends EvalLlmError {
83
- constructor(cause: unknown, status?: number);
84
- }
85
- export declare class EvalLlmInvalidResponseError extends EvalLlmError {
86
- constructor(message: string, details?: Record<string, unknown>);
87
- }
88
- export declare class EvalLlmNotConfiguredError extends EvalLlmError {
89
- constructor();
90
- }
91
- /** Lightweight client abstraction shared across eval runners. */
92
- export interface EvalLlmClient {
93
- chat(request: ChatRequest): Promise<ChatResponse>;
94
- }
95
- /**
96
- * Deprecated shim preserved so older wiring keeps compiling. Prefer
97
- * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
98
- * key" case.
99
- */
100
- export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
101
- }
102
- /** `createEvalClient` options — mostly for tests to inject a fake transport. */
103
- export interface CreateEvalClientOptions {
104
- /** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
105
- openaiFactory?: (opts: ClientOptions) => OpenAILike;
106
- /**
107
- * Override the default retry/backoff policy. Honored by the internal
108
- * retry loop; transport errors still fall back to the defaults when
109
- * unset.
110
- */
111
- retryPolicy?: RetryPolicy;
112
- /** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
113
- sleep?: (ms: number) => Promise<void>;
114
- /**
115
- * Observer invoked when a chat() call is about to sleep before the next
116
- * retry attempt. Use this to surface "we are retrying" status via the
117
- * progress logger so long, silent backoff windows become visible.
118
- */
119
- onRetry?: (event: {
120
- attempt: number;
121
- maxAttempts: number;
122
- waitMs: number;
123
- error: EvalLlmError;
124
- }) => void;
125
- }
126
- export interface RetryPolicy {
127
- /** Max retries *on top of* the initial attempt. 0 = single attempt. */
128
- maxRetries: number;
129
- /** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
130
- initialBackoffMs: number;
131
- /** Upper bound for a single sleep between attempts. */
132
- maxBackoffMs: number;
133
- }
134
- export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
135
- /**
136
- * Minimal OpenAI-SDK surface we depend on, declared here so tests can
137
- * substitute a plain object without pulling the real SDK into the test
138
- * runtime.
139
- */
140
- export interface OpenAILike {
141
- chat: {
142
- completions: {
143
- create(body: Record<string, unknown>, options: {
144
- signal: AbortSignal;
145
- }): Promise<OpenAILikeChatResponse>;
146
- };
147
- };
148
- }
149
- interface OpenAILikeChatResponse {
150
- model?: string;
151
- choices: Array<{
152
- message?: {
153
- content?: string | null;
154
- tool_calls?: Array<{
155
- id: string;
156
- function: {
157
- name: string;
158
- arguments: string;
159
- };
160
- }>;
161
- };
162
- finish_reason?: string | null;
163
- }>;
164
- usage?: {
165
- prompt_tokens?: number;
166
- completion_tokens?: number;
167
- total_tokens?: number;
168
- };
169
- }
170
- /**
171
- * Build a real client pointed at the configured endpoint. Throws
172
- * `EvalLlmNotConfiguredError` at call time (not construction time) when no
173
- * API key is available, so CLI help and dry-run paths stay offline-safe.
174
- */
175
- export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
176
- export {};