cclaw-cli 0.49.0 → 0.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +57 -84
  2. package/dist/artifact-linter.d.ts +4 -0
  3. package/dist/artifact-linter.js +24 -3
  4. package/dist/cli.d.ts +1 -19
  5. package/dist/cli.js +49 -491
  6. package/dist/constants.d.ts +2 -13
  7. package/dist/constants.js +1 -43
  8. package/dist/content/closeout-guidance.d.ts +14 -0
  9. package/dist/content/closeout-guidance.js +42 -0
  10. package/dist/content/core-agents.js +55 -17
  11. package/dist/content/decision-protocol.d.ts +12 -0
  12. package/dist/content/decision-protocol.js +20 -0
  13. package/dist/content/diff-command.d.ts +1 -2
  14. package/dist/content/diff-command.js +8 -94
  15. package/dist/content/examples.d.ts +4 -10
  16. package/dist/content/examples.js +10 -20
  17. package/dist/content/hook-events.js +2 -2
  18. package/dist/content/hook-inline-snippets.d.ts +5 -2
  19. package/dist/content/hook-inline-snippets.js +33 -1
  20. package/dist/content/hook-manifest.d.ts +3 -4
  21. package/dist/content/hook-manifest.js +11 -12
  22. package/dist/content/hooks.js +44 -21
  23. package/dist/content/ideate-command.d.ts +2 -0
  24. package/dist/content/ideate-command.js +34 -25
  25. package/dist/content/iron-laws.d.ts +5 -5
  26. package/dist/content/iron-laws.js +5 -5
  27. package/dist/content/language-policy.d.ts +2 -0
  28. package/dist/content/language-policy.js +13 -0
  29. package/dist/content/learnings.d.ts +3 -4
  30. package/dist/content/learnings.js +26 -50
  31. package/dist/content/meta-skill.js +33 -22
  32. package/dist/content/next-command.js +41 -38
  33. package/dist/content/node-hooks.js +17 -345
  34. package/dist/content/opencode-plugin.js +5 -103
  35. package/dist/content/research-playbooks.js +14 -14
  36. package/dist/content/review-loop.d.ts +2 -0
  37. package/dist/content/review-loop.js +8 -0
  38. package/dist/content/session-hooks.js +15 -47
  39. package/dist/content/skills.d.ts +0 -5
  40. package/dist/content/skills.js +55 -128
  41. package/dist/content/stage-common-guidance.d.ts +0 -1
  42. package/dist/content/stage-common-guidance.js +17 -14
  43. package/dist/content/stage-schema.d.ts +26 -1
  44. package/dist/content/stage-schema.js +121 -40
  45. package/dist/content/stages/_lint-metadata/index.js +9 -15
  46. package/dist/content/stages/brainstorm.js +22 -43
  47. package/dist/content/stages/design.js +37 -57
  48. package/dist/content/stages/plan.js +22 -13
  49. package/dist/content/stages/review.js +24 -27
  50. package/dist/content/stages/scope.js +34 -46
  51. package/dist/content/stages/ship.js +7 -4
  52. package/dist/content/stages/spec.js +20 -9
  53. package/dist/content/stages/tdd.js +64 -44
  54. package/dist/content/start-command.js +13 -12
  55. package/dist/content/status-command.d.ts +2 -7
  56. package/dist/content/status-command.js +19 -146
  57. package/dist/content/subagents.d.ts +0 -5
  58. package/dist/content/subagents.js +51 -28
  59. package/dist/content/templates.d.ts +1 -1
  60. package/dist/content/templates.js +126 -135
  61. package/dist/content/track-render-context.d.ts +17 -0
  62. package/dist/content/track-render-context.js +44 -0
  63. package/dist/content/tree-command.d.ts +1 -2
  64. package/dist/content/tree-command.js +4 -87
  65. package/dist/content/utility-skills.d.ts +2 -29
  66. package/dist/content/utility-skills.js +2 -1534
  67. package/dist/content/view-command.js +31 -11
  68. package/dist/delegation.d.ts +1 -1
  69. package/dist/delegation.js +5 -15
  70. package/dist/doctor-registry.js +20 -21
  71. package/dist/doctor.js +88 -344
  72. package/dist/flow-state.d.ts +3 -0
  73. package/dist/flow-state.js +2 -0
  74. package/dist/harness-adapters.d.ts +1 -1
  75. package/dist/harness-adapters.js +51 -58
  76. package/dist/install.js +128 -358
  77. package/dist/internal/advance-stage.js +3 -9
  78. package/dist/internal/compound-readiness.d.ts +1 -1
  79. package/dist/internal/compound-readiness.js +1 -1
  80. package/dist/internal/tdd-loop-status.d.ts +1 -1
  81. package/dist/internal/tdd-loop-status.js +1 -1
  82. package/dist/knowledge-store.d.ts +16 -10
  83. package/dist/knowledge-store.js +51 -15
  84. package/dist/policy.js +16 -105
  85. package/dist/run-archive.d.ts +4 -6
  86. package/dist/run-archive.js +15 -20
  87. package/dist/run-persistence.d.ts +2 -2
  88. package/dist/run-persistence.js +3 -9
  89. package/package.json +1 -2
  90. package/dist/content/archive-command.d.ts +0 -2
  91. package/dist/content/archive-command.js +0 -124
  92. package/dist/content/compound-command.d.ts +0 -5
  93. package/dist/content/compound-command.js +0 -193
  94. package/dist/content/contexts.d.ts +0 -18
  95. package/dist/content/contexts.js +0 -24
  96. package/dist/content/contracts.d.ts +0 -2
  97. package/dist/content/contracts.js +0 -51
  98. package/dist/content/doctor-references.d.ts +0 -2
  99. package/dist/content/doctor-references.js +0 -150
  100. package/dist/content/eval-scaffold.d.ts +0 -15
  101. package/dist/content/eval-scaffold.js +0 -370
  102. package/dist/content/feature-command.d.ts +0 -2
  103. package/dist/content/feature-command.js +0 -123
  104. package/dist/content/flow-map.d.ts +0 -23
  105. package/dist/content/flow-map.js +0 -134
  106. package/dist/content/harness-doc.d.ts +0 -2
  107. package/dist/content/harness-doc.js +0 -202
  108. package/dist/content/harness-playbooks.d.ts +0 -24
  109. package/dist/content/harness-playbooks.js +0 -393
  110. package/dist/content/harness-tool-refs.d.ts +0 -20
  111. package/dist/content/harness-tool-refs.js +0 -268
  112. package/dist/content/ops-command.d.ts +0 -2
  113. package/dist/content/ops-command.js +0 -71
  114. package/dist/content/protocols.d.ts +0 -7
  115. package/dist/content/protocols.js +0 -215
  116. package/dist/content/retro-command.d.ts +0 -2
  117. package/dist/content/retro-command.js +0 -165
  118. package/dist/content/rewind-command.d.ts +0 -2
  119. package/dist/content/rewind-command.js +0 -106
  120. package/dist/content/tdd-log-command.d.ts +0 -2
  121. package/dist/content/tdd-log-command.js +0 -85
  122. package/dist/eval/agents/single-shot.d.ts +0 -27
  123. package/dist/eval/agents/single-shot.js +0 -79
  124. package/dist/eval/agents/with-tools.d.ts +0 -44
  125. package/dist/eval/agents/with-tools.js +0 -261
  126. package/dist/eval/agents/workflow.d.ts +0 -31
  127. package/dist/eval/agents/workflow.js +0 -155
  128. package/dist/eval/baseline.d.ts +0 -38
  129. package/dist/eval/baseline.js +0 -282
  130. package/dist/eval/config-loader.d.ts +0 -14
  131. package/dist/eval/config-loader.js +0 -395
  132. package/dist/eval/corpus.d.ts +0 -30
  133. package/dist/eval/corpus.js +0 -330
  134. package/dist/eval/cost-guard.d.ts +0 -102
  135. package/dist/eval/cost-guard.js +0 -190
  136. package/dist/eval/diff.d.ts +0 -64
  137. package/dist/eval/diff.js +0 -323
  138. package/dist/eval/llm-client.d.ts +0 -176
  139. package/dist/eval/llm-client.js +0 -267
  140. package/dist/eval/mode.d.ts +0 -28
  141. package/dist/eval/mode.js +0 -61
  142. package/dist/eval/progress.d.ts +0 -83
  143. package/dist/eval/progress.js +0 -59
  144. package/dist/eval/report.d.ts +0 -11
  145. package/dist/eval/report.js +0 -181
  146. package/dist/eval/rubric-loader.d.ts +0 -20
  147. package/dist/eval/rubric-loader.js +0 -143
  148. package/dist/eval/runner.d.ts +0 -81
  149. package/dist/eval/runner.js +0 -746
  150. package/dist/eval/runs.d.ts +0 -41
  151. package/dist/eval/runs.js +0 -114
  152. package/dist/eval/sandbox.d.ts +0 -38
  153. package/dist/eval/sandbox.js +0 -137
  154. package/dist/eval/tools/glob.d.ts +0 -2
  155. package/dist/eval/tools/glob.js +0 -163
  156. package/dist/eval/tools/grep.d.ts +0 -2
  157. package/dist/eval/tools/grep.js +0 -152
  158. package/dist/eval/tools/index.d.ts +0 -7
  159. package/dist/eval/tools/index.js +0 -35
  160. package/dist/eval/tools/read.d.ts +0 -2
  161. package/dist/eval/tools/read.js +0 -122
  162. package/dist/eval/tools/types.d.ts +0 -49
  163. package/dist/eval/tools/types.js +0 -41
  164. package/dist/eval/tools/write.d.ts +0 -2
  165. package/dist/eval/tools/write.js +0 -92
  166. package/dist/eval/types.d.ts +0 -561
  167. package/dist/eval/types.js +0 -47
  168. package/dist/eval/verifiers/judge.d.ts +0 -40
  169. package/dist/eval/verifiers/judge.js +0 -256
  170. package/dist/eval/verifiers/rules.d.ts +0 -24
  171. package/dist/eval/verifiers/rules.js +0 -218
  172. package/dist/eval/verifiers/structural.d.ts +0 -14
  173. package/dist/eval/verifiers/structural.js +0 -171
  174. package/dist/eval/verifiers/traceability.d.ts +0 -23
  175. package/dist/eval/verifiers/traceability.js +0 -84
  176. package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
  177. package/dist/eval/verifiers/workflow-consistency.js +0 -225
  178. package/dist/eval/workflow-corpus.d.ts +0 -7
  179. package/dist/eval/workflow-corpus.js +0 -207
  180. package/dist/feature-system.d.ts +0 -42
  181. package/dist/feature-system.js +0 -432
  182. package/dist/internal/knowledge-digest.d.ts +0 -7
  183. package/dist/internal/knowledge-digest.js +0 -93
@@ -1,64 +0,0 @@
1
- import type { EvalReport } from "./types.js";
2
- export interface EvalDiffInput {
3
- projectRoot: string;
4
- /** Version string, filename, or "latest". */
5
- old: string;
6
- /** Version string, filename, or "latest". */
7
- new: string;
8
- }
9
- export interface EvalDiffCaseEntry {
10
- caseId: string;
11
- stage: string;
12
- /** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
13
- transition: "same" | "regressed" | "recovered" | "added" | "removed";
14
- previousPassed?: boolean;
15
- currentPassed?: boolean;
16
- durationDeltaMs?: number;
17
- costDeltaUsd?: number;
18
- verifierDeltas: EvalDiffVerifierEntry[];
19
- stageDeltas?: EvalDiffStageEntry[];
20
- }
21
- export interface EvalDiffVerifierEntry {
22
- verifierId: string;
23
- kind: string;
24
- transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
25
- previousScore?: number;
26
- currentScore?: number;
27
- previousOk?: boolean;
28
- currentOk?: boolean;
29
- }
30
- export interface EvalDiffStageEntry {
31
- stage: string;
32
- durationDeltaMs: number;
33
- costDeltaUsd: number;
34
- turnsDelta: number;
35
- callsDelta: number;
36
- }
37
- export interface EvalDiffReport {
38
- old: EvalDiffReportMeta;
39
- new: EvalDiffReportMeta;
40
- summaryDelta: {
41
- totalCasesDelta: number;
42
- passedDelta: number;
43
- failedDelta: number;
44
- skippedDelta: number;
45
- totalCostUsdDelta: number;
46
- totalDurationMsDelta: number;
47
- };
48
- cases: EvalDiffCaseEntry[];
49
- /** True when any case regressed or any verifier dropped. */
50
- regressed: boolean;
51
- }
52
- export interface EvalDiffReportMeta {
53
- runId: string;
54
- cclawVersion: string;
55
- generatedAt: string;
56
- mode: string;
57
- model: string;
58
- sourcePath: string;
59
- }
60
- export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
61
- export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
62
- export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
63
- /** Render the diff as a terse human-readable Markdown block. */
64
- export declare function formatDiffMarkdown(diff: EvalDiffReport): string;
package/dist/eval/diff.js DELETED
@@ -1,323 +0,0 @@
1
- /**
2
- * `cclaw eval diff <old> <new>` — side-by-side report comparison.
3
- *
4
- * Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
5
- * explicit filename) and emits a compact human-readable + JSON diff:
6
- *
7
- * - summary-level deltas (passed/failed/cost/duration)
8
- * - per-case pass/fail transitions
9
- * - per-verifier score drops (only the drops — new passes are noted in
10
- * the summary line, not repeated per verifier)
11
- * - Workflow-mode stage-level cost & duration deltas when both reports
12
- * carry a `workflow` summary for the same case id
13
- *
14
- * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
15
- *
16
- * 1. A bare version string (`0.26.0`) — matched against any report JSON
17
- * whose `cclawVersion` field equals the string.
18
- * 2. A full or relative filename (`eval-2026-04-17T...-abc123.json`).
19
- * 3. The literal `latest` — picks the most recent report on disk by
20
- * mtime.
21
- *
22
- * The diff is deterministic: sorted by case id, then verifier id. Missing
23
- * cases in one report show up as `added` or `removed` so callers can see
24
- * which corpus changes slipped in between versions.
25
- */
26
- import fs from "node:fs/promises";
27
- import path from "node:path";
28
- import { EVALS_ROOT } from "../constants.js";
29
- import { exists } from "../fs-utils.js";
30
- const SCORE_DROP_EPSILON = 0.0001;
31
- export async function resolveReportPath(projectRoot, selector) {
32
- const dir = path.join(projectRoot, EVALS_ROOT, "reports");
33
- if (!(await exists(dir))) {
34
- throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
35
- `Run \`cclaw eval\` at least once before comparing reports.`);
36
- }
37
- const trimmed = selector.trim();
38
- if (trimmed.length === 0) {
39
- throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
40
- }
41
- // 1. Explicit filename (absolute or relative).
42
- const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
43
- if (await exists(asPath))
44
- return asPath;
45
- if (trimmed.endsWith(".json") && (await exists(asPath)))
46
- return asPath;
47
- const entries = await fs.readdir(dir, { withFileTypes: true });
48
- const jsonFiles = entries
49
- .filter((e) => e.isFile() && e.name.endsWith(".json"))
50
- .map((e) => path.join(dir, e.name));
51
- if (jsonFiles.length === 0) {
52
- throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
53
- }
54
- if (trimmed === "latest") {
55
- let latest = jsonFiles[0];
56
- let latestMtime = (await fs.stat(latest)).mtimeMs;
57
- for (const f of jsonFiles.slice(1)) {
58
- const stat = await fs.stat(f);
59
- if (stat.mtimeMs > latestMtime) {
60
- latest = f;
61
- latestMtime = stat.mtimeMs;
62
- }
63
- }
64
- return latest;
65
- }
66
- // 3. Version match — pick most recent by mtime among matches.
67
- const matches = [];
68
- for (const file of jsonFiles) {
69
- try {
70
- const raw = await fs.readFile(file, "utf8");
71
- const parsed = JSON.parse(raw);
72
- if (parsed.cclawVersion === trimmed) {
73
- const stat = await fs.stat(file);
74
- matches.push({ file, mtimeMs: stat.mtimeMs });
75
- }
76
- }
77
- catch {
78
- continue;
79
- }
80
- }
81
- if (matches.length === 0) {
82
- throw new Error(`No report matched selector "${selector}". ` +
83
- `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
84
- }
85
- matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
86
- return matches[0].file;
87
- }
88
- async function loadReport(filePath) {
89
- const raw = await fs.readFile(filePath, "utf8");
90
- const parsed = JSON.parse(raw);
91
- if (parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
92
- throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
93
- }
94
- return parsed;
95
- }
96
- function meta(report, sourcePath) {
97
- return {
98
- runId: report.runId,
99
- cclawVersion: report.cclawVersion,
100
- generatedAt: report.generatedAt,
101
- mode: report.mode,
102
- model: report.model,
103
- sourcePath
104
- };
105
- }
106
- function verifierMap(results) {
107
- const out = new Map();
108
- for (const v of results)
109
- out.set(v.id, v);
110
- return out;
111
- }
112
- function diffCase(caseId, previous, current) {
113
- const stage = (current ?? previous).stage;
114
- if (!previous) {
115
- return {
116
- caseId,
117
- stage,
118
- transition: "added",
119
- currentPassed: current?.passed,
120
- verifierDeltas: []
121
- };
122
- }
123
- if (!current) {
124
- return {
125
- caseId,
126
- stage,
127
- transition: "removed",
128
- previousPassed: previous.passed,
129
- verifierDeltas: []
130
- };
131
- }
132
- const transition = previous.passed === current.passed
133
- ? "same"
134
- : previous.passed && !current.passed
135
- ? "regressed"
136
- : "recovered";
137
- const prevMap = verifierMap(previous.verifierResults);
138
- const currMap = verifierMap(current.verifierResults);
139
- const verifierDeltas = [];
140
- const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
141
- for (const id of [...allIds].sort((a, b) => a.localeCompare(b))) {
142
- const p = prevMap.get(id);
143
- const c = currMap.get(id);
144
- const kind = (c ?? p).kind;
145
- if (!p && c) {
146
- verifierDeltas.push({
147
- verifierId: id,
148
- kind,
149
- transition: "added",
150
- currentOk: c.ok,
151
- ...(c.score !== undefined ? { currentScore: c.score } : {})
152
- });
153
- continue;
154
- }
155
- if (p && !c) {
156
- verifierDeltas.push({
157
- verifierId: id,
158
- kind,
159
- transition: "removed",
160
- previousOk: p.ok,
161
- ...(p.score !== undefined ? { previousScore: p.score } : {})
162
- });
163
- continue;
164
- }
165
- if (!p || !c)
166
- continue;
167
- const okChanged = p.ok !== c.ok;
168
- const scoreChanged = typeof p.score === "number" &&
169
- typeof c.score === "number" &&
170
- Math.abs(p.score - c.score) > SCORE_DROP_EPSILON;
171
- if (!okChanged && !scoreChanged)
172
- continue;
173
- const entry = {
174
- verifierId: id,
175
- kind,
176
- transition: okChanged
177
- ? p.ok
178
- ? "regressed"
179
- : "recovered"
180
- : typeof p.score === "number" &&
181
- typeof c.score === "number" &&
182
- c.score < p.score
183
- ? "score-drop"
184
- : "same",
185
- previousOk: p.ok,
186
- currentOk: c.ok
187
- };
188
- if (typeof p.score === "number")
189
- entry.previousScore = p.score;
190
- if (typeof c.score === "number")
191
- entry.currentScore = c.score;
192
- if (entry.transition !== "same")
193
- verifierDeltas.push(entry);
194
- }
195
- const caseEntry = {
196
- caseId,
197
- stage,
198
- transition,
199
- previousPassed: previous.passed,
200
- currentPassed: current.passed,
201
- durationDeltaMs: current.durationMs - previous.durationMs,
202
- verifierDeltas
203
- };
204
- const costDelta = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
205
- if (Math.abs(costDelta) > SCORE_DROP_EPSILON) {
206
- caseEntry.costDeltaUsd = Number(costDelta.toFixed(6));
207
- }
208
- if (previous.workflow && current.workflow) {
209
- const prevStages = new Map();
210
- for (const s of previous.workflow.stages)
211
- prevStages.set(s.stage, s);
212
- const stageDeltas = [];
213
- for (const curStage of current.workflow.stages) {
214
- const prevStage = prevStages.get(curStage.stage);
215
- if (!prevStage)
216
- continue;
217
- stageDeltas.push({
218
- stage: curStage.stage,
219
- durationDeltaMs: curStage.durationMs - prevStage.durationMs,
220
- costDeltaUsd: Number((curStage.usageUsd - prevStage.usageUsd).toFixed(6)),
221
- turnsDelta: curStage.toolUse.turns - prevStage.toolUse.turns,
222
- callsDelta: curStage.toolUse.calls - prevStage.toolUse.calls
223
- });
224
- }
225
- if (stageDeltas.length > 0)
226
- caseEntry.stageDeltas = stageDeltas;
227
- }
228
- return caseEntry;
229
- }
230
- export function diffReports(previous, current, prevPath, currPath) {
231
- const prevMap = new Map();
232
- const currMap = new Map();
233
- for (const c of previous.cases)
234
- prevMap.set(c.caseId, c);
235
- for (const c of current.cases)
236
- currMap.set(c.caseId, c);
237
- const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
238
- const cases = [...allIds]
239
- .sort((a, b) => a.localeCompare(b))
240
- .map((id) => diffCase(id, prevMap.get(id), currMap.get(id)));
241
- const regressed = cases.some((c) => c.transition === "regressed" ||
242
- c.transition === "removed" ||
243
- c.verifierDeltas.some((v) => v.transition === "regressed" || v.transition === "score-drop"));
244
- return {
245
- old: meta(previous, prevPath),
246
- new: meta(current, currPath),
247
- summaryDelta: {
248
- totalCasesDelta: current.summary.totalCases - previous.summary.totalCases,
249
- passedDelta: current.summary.passed - previous.summary.passed,
250
- failedDelta: current.summary.failed - previous.summary.failed,
251
- skippedDelta: current.summary.skipped - previous.summary.skipped,
252
- totalCostUsdDelta: Number((current.summary.totalCostUsd - previous.summary.totalCostUsd).toFixed(6)),
253
- totalDurationMsDelta: current.summary.totalDurationMs - previous.summary.totalDurationMs
254
- },
255
- cases,
256
- regressed
257
- };
258
- }
259
- export async function runEvalDiff(input) {
260
- const [oldPath, newPath] = await Promise.all([
261
- resolveReportPath(input.projectRoot, input.old),
262
- resolveReportPath(input.projectRoot, input.new)
263
- ]);
264
- const [oldReport, newReport] = await Promise.all([
265
- loadReport(oldPath),
266
- loadReport(newPath)
267
- ]);
268
- return diffReports(oldReport, newReport, oldPath, newPath);
269
- }
270
- /** Render the diff as a terse human-readable Markdown block. */
271
- export function formatDiffMarkdown(diff) {
272
- const lines = [];
273
- lines.push(`# cclaw eval diff`);
274
- lines.push(``);
275
- lines.push(`- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`);
276
- lines.push(`- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`);
277
- lines.push(`- regressed: ${diff.regressed ? "yes" : "no"}`);
278
- lines.push(``);
279
- lines.push(`## Summary delta`);
280
- lines.push(``);
281
- const sd = diff.summaryDelta;
282
- lines.push(`| metric | delta |`);
283
- lines.push(`| --- | --- |`);
284
- lines.push(`| total cases | ${sd.totalCasesDelta >= 0 ? "+" : ""}${sd.totalCasesDelta} |`);
285
- lines.push(`| passed | ${sd.passedDelta >= 0 ? "+" : ""}${sd.passedDelta} |`);
286
- lines.push(`| failed | ${sd.failedDelta >= 0 ? "+" : ""}${sd.failedDelta} |`);
287
- lines.push(`| skipped | ${sd.skippedDelta >= 0 ? "+" : ""}${sd.skippedDelta} |`);
288
- lines.push(`| cost (USD) | ${sd.totalCostUsdDelta >= 0 ? "+" : ""}${sd.totalCostUsdDelta.toFixed(4)} |`);
289
- lines.push(`| duration (ms) | ${sd.totalDurationMsDelta >= 0 ? "+" : ""}${sd.totalDurationMsDelta} |`);
290
- lines.push(``);
291
- const noisyCases = diff.cases.filter((c) => c.transition !== "same" || c.verifierDeltas.length > 0);
292
- if (noisyCases.length === 0) {
293
- lines.push(`No case-level changes.`);
294
- lines.push(``);
295
- return `${lines.join("\n")}\n`;
296
- }
297
- lines.push(`## Case changes`);
298
- lines.push(``);
299
- lines.push(`| case id | stage | transition | prev | curr |`);
300
- lines.push(`| --- | --- | --- | --- | --- |`);
301
- for (const c of noisyCases) {
302
- const prev = c.previousPassed === undefined ? "-" : c.previousPassed ? "pass" : "fail";
303
- const curr = c.currentPassed === undefined ? "-" : c.currentPassed ? "pass" : "fail";
304
- lines.push(`| ${c.caseId} | ${c.stage} | ${c.transition} | ${prev} | ${curr} |`);
305
- }
306
- lines.push(``);
307
- const withVerifiers = noisyCases.filter((c) => c.verifierDeltas.length > 0);
308
- if (withVerifiers.length > 0) {
309
- lines.push(`## Verifier changes`);
310
- lines.push(``);
311
- lines.push(`| case id | verifier | kind | transition | prev score | curr score |`);
312
- lines.push(`| --- | --- | --- | --- | --- | --- |`);
313
- for (const c of withVerifiers) {
314
- for (const v of c.verifierDeltas) {
315
- const prev = v.previousScore !== undefined ? v.previousScore.toFixed(2) : "-";
316
- const curr = v.currentScore !== undefined ? v.currentScore.toFixed(2) : "-";
317
- lines.push(`| ${c.caseId} | ${v.verifierId} | ${v.kind} | ${v.transition} | ${prev} | ${curr} |`);
318
- }
319
- }
320
- lines.push(``);
321
- }
322
- return `${lines.join("\n")}\n`;
323
- }
@@ -1,176 +0,0 @@
1
- import type { ClientOptions } from "openai";
2
- import type { ResolvedEvalConfig } from "./types.js";
3
- export interface ChatMessage {
4
- role: "system" | "user" | "assistant" | "tool";
5
- content: string;
6
- name?: string;
7
- toolCallId?: string;
8
- /**
9
- * OpenAI-style tool calls carried on a preceding assistant message.
10
- * Populated by the with-tools loop so the wire transcript stays
11
- * consistent (assistant message → tool responses).
12
- */
13
- toolCalls?: Array<{
14
- id: string;
15
- name: string;
16
- arguments: string;
17
- }>;
18
- }
19
- export interface ChatRequest {
20
- model: string;
21
- messages: ChatMessage[];
22
- maxTokens?: number;
23
- temperature?: number;
24
- /** Per-call timeout override. Falls back to `config.timeoutMs`. */
25
- timeoutMs?: number;
26
- /**
27
- * Ask the provider for a JSON-object response. The judge pipeline sets
28
- * this; the agent-under-test usually leaves it unset.
29
- */
30
- responseFormatJson?: boolean;
31
- /**
32
- * Optional deterministic sampling seed. Providers that don't implement
33
- * `seed` simply ignore it.
34
- */
35
- seed?: number;
36
- /**
37
- * Tool/function-calling definitions in OpenAI wire format. Populated only
38
- * by agent/workflow modes. Ignored by the single-shot path.
39
- */
40
- tools?: unknown[];
41
- toolChoice?: "auto" | "none";
42
- }
43
- export interface ChatUsage {
44
- promptTokens: number;
45
- completionTokens: number;
46
- totalTokens: number;
47
- }
48
- export interface ChatResponse {
49
- content: string;
50
- toolCalls?: Array<{
51
- id: string;
52
- name: string;
53
- arguments: string;
54
- }>;
55
- usage: ChatUsage;
56
- finishReason: "stop" | "length" | "tool_calls" | "content_filter";
57
- model: string;
58
- attempts: number;
59
- }
60
- /** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
61
- export declare class EvalLlmError extends Error {
62
- readonly retryable: boolean;
63
- readonly status?: number;
64
- constructor(message: string, opts: {
65
- retryable: boolean;
66
- status?: number;
67
- cause?: unknown;
68
- });
69
- }
70
- export declare class EvalLlmAuthError extends EvalLlmError {
71
- constructor(cause: unknown);
72
- }
73
- export declare class EvalLlmConfigError extends EvalLlmError {
74
- constructor(message: string, cause?: unknown);
75
- }
76
- export declare class EvalLlmTimeoutError extends EvalLlmError {
77
- constructor(timeoutMs: number);
78
- }
79
- export declare class EvalLlmRateLimitedError extends EvalLlmError {
80
- constructor(cause: unknown);
81
- }
82
- export declare class EvalLlmTransportError extends EvalLlmError {
83
- constructor(cause: unknown, status?: number);
84
- }
85
- export declare class EvalLlmInvalidResponseError extends EvalLlmError {
86
- constructor(message: string, details?: Record<string, unknown>);
87
- }
88
- export declare class EvalLlmNotConfiguredError extends EvalLlmError {
89
- constructor();
90
- }
91
- /** Lightweight client abstraction shared across eval runners. */
92
- export interface EvalLlmClient {
93
- chat(request: ChatRequest): Promise<ChatResponse>;
94
- }
95
- /**
96
- * Deprecated shim preserved so older wiring keeps compiling. Prefer
97
- * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
98
- * key" case.
99
- */
100
- export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
101
- }
102
- /** `createEvalClient` options — mostly for tests to inject a fake transport. */
103
- export interface CreateEvalClientOptions {
104
- /** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
105
- openaiFactory?: (opts: ClientOptions) => OpenAILike;
106
- /**
107
- * Override the default retry/backoff policy. Honored by the internal
108
- * retry loop; transport errors still fall back to the defaults when
109
- * unset.
110
- */
111
- retryPolicy?: RetryPolicy;
112
- /** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
113
- sleep?: (ms: number) => Promise<void>;
114
- /**
115
- * Observer invoked when a chat() call is about to sleep before the next
116
- * retry attempt. Use this to surface "we are retrying" status via the
117
- * progress logger so long, silent backoff windows become visible.
118
- */
119
- onRetry?: (event: {
120
- attempt: number;
121
- maxAttempts: number;
122
- waitMs: number;
123
- error: EvalLlmError;
124
- }) => void;
125
- }
126
- export interface RetryPolicy {
127
- /** Max retries *on top of* the initial attempt. 0 = single attempt. */
128
- maxRetries: number;
129
- /** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
130
- initialBackoffMs: number;
131
- /** Upper bound for a single sleep between attempts. */
132
- maxBackoffMs: number;
133
- }
134
- export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
135
- /**
136
- * Minimal OpenAI-SDK surface we depend on, declared here so tests can
137
- * substitute a plain object without pulling the real SDK into the test
138
- * runtime.
139
- */
140
- export interface OpenAILike {
141
- chat: {
142
- completions: {
143
- create(body: Record<string, unknown>, options: {
144
- signal: AbortSignal;
145
- }): Promise<OpenAILikeChatResponse>;
146
- };
147
- };
148
- }
149
- interface OpenAILikeChatResponse {
150
- model?: string;
151
- choices: Array<{
152
- message?: {
153
- content?: string | null;
154
- tool_calls?: Array<{
155
- id: string;
156
- function: {
157
- name: string;
158
- arguments: string;
159
- };
160
- }>;
161
- };
162
- finish_reason?: string | null;
163
- }>;
164
- usage?: {
165
- prompt_tokens?: number;
166
- completion_tokens?: number;
167
- total_tokens?: number;
168
- };
169
- }
170
- /**
171
- * Build a real client pointed at the configured endpoint. Throws
172
- * `EvalLlmNotConfiguredError` at call time (not construction time) when no
173
- * API key is available, so CLI help and dry-run paths stay offline-safe.
174
- */
175
- export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
176
- export {};