cclaw-cli 0.48.35 → 0.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -82
- package/dist/artifact-linter.d.ts +4 -0
- package/dist/artifact-linter.js +24 -3
- package/dist/cli.d.ts +1 -19
- package/dist/cli.js +49 -495
- package/dist/constants.d.ts +2 -13
- package/dist/constants.js +1 -46
- package/dist/content/closeout-guidance.d.ts +14 -0
- package/dist/content/closeout-guidance.js +42 -0
- package/dist/content/core-agents.js +51 -9
- package/dist/content/decision-protocol.d.ts +12 -0
- package/dist/content/decision-protocol.js +20 -0
- package/dist/content/diff-command.d.ts +1 -2
- package/dist/content/diff-command.js +8 -94
- package/dist/content/examples.d.ts +4 -10
- package/dist/content/examples.js +10 -20
- package/dist/content/hook-events.js +2 -2
- package/dist/content/hook-inline-snippets.d.ts +5 -2
- package/dist/content/hook-inline-snippets.js +33 -1
- package/dist/content/hook-manifest.d.ts +3 -4
- package/dist/content/hook-manifest.js +11 -12
- package/dist/content/hooks.js +2 -0
- package/dist/content/ideate-command.d.ts +2 -0
- package/dist/content/ideate-command.js +31 -25
- package/dist/content/iron-laws.d.ts +5 -5
- package/dist/content/iron-laws.js +5 -5
- package/dist/content/learnings.d.ts +3 -4
- package/dist/content/learnings.js +24 -50
- package/dist/content/meta-skill.js +31 -24
- package/dist/content/next-command.js +38 -38
- package/dist/content/node-hooks.js +17 -343
- package/dist/content/opencode-plugin.js +2 -100
- package/dist/content/research-playbooks.js +14 -14
- package/dist/content/review-loop.d.ts +2 -0
- package/dist/content/review-loop.js +8 -0
- package/dist/content/session-hooks.js +14 -46
- package/dist/content/skills.d.ts +0 -5
- package/dist/content/skills.js +53 -128
- package/dist/content/stage-common-guidance.d.ts +0 -1
- package/dist/content/stage-common-guidance.js +15 -14
- package/dist/content/stage-schema.d.ts +26 -1
- package/dist/content/stage-schema.js +121 -40
- package/dist/content/stages/_lint-metadata/index.js +9 -15
- package/dist/content/stages/brainstorm.js +22 -43
- package/dist/content/stages/design.js +37 -57
- package/dist/content/stages/plan.js +22 -13
- package/dist/content/stages/review.js +24 -27
- package/dist/content/stages/scope.js +34 -46
- package/dist/content/stages/ship.js +7 -4
- package/dist/content/stages/spec.js +20 -9
- package/dist/content/stages/tdd.js +64 -44
- package/dist/content/start-command.js +10 -12
- package/dist/content/status-command.d.ts +2 -7
- package/dist/content/status-command.js +19 -146
- package/dist/content/subagents.d.ts +0 -5
- package/dist/content/subagents.js +47 -28
- package/dist/content/templates.d.ts +1 -1
- package/dist/content/templates.js +126 -135
- package/dist/content/track-render-context.d.ts +17 -0
- package/dist/content/track-render-context.js +44 -0
- package/dist/content/tree-command.d.ts +1 -2
- package/dist/content/tree-command.js +4 -87
- package/dist/content/utility-skills.d.ts +2 -29
- package/dist/content/utility-skills.js +2 -1533
- package/dist/content/view-command.js +29 -11
- package/dist/delegation.d.ts +1 -1
- package/dist/delegation.js +5 -15
- package/dist/doctor-registry.js +20 -21
- package/dist/doctor.js +88 -408
- package/dist/flow-state.d.ts +3 -0
- package/dist/flow-state.js +2 -0
- package/dist/harness-adapters.d.ts +1 -1
- package/dist/harness-adapters.js +48 -57
- package/dist/install.js +128 -520
- package/dist/internal/advance-stage.js +3 -9
- package/dist/internal/compound-readiness.d.ts +1 -1
- package/dist/internal/compound-readiness.js +1 -1
- package/dist/internal/tdd-loop-status.d.ts +1 -1
- package/dist/internal/tdd-loop-status.js +1 -1
- package/dist/knowledge-store.d.ts +16 -10
- package/dist/knowledge-store.js +51 -15
- package/dist/policy.js +16 -109
- package/dist/run-archive.d.ts +4 -6
- package/dist/run-archive.js +15 -20
- package/dist/run-persistence.d.ts +2 -2
- package/dist/run-persistence.js +3 -9
- package/package.json +1 -2
- package/dist/content/archive-command.d.ts +0 -2
- package/dist/content/archive-command.js +0 -124
- package/dist/content/compound-command.d.ts +0 -5
- package/dist/content/compound-command.js +0 -193
- package/dist/content/contexts.d.ts +0 -9
- package/dist/content/contexts.js +0 -65
- package/dist/content/contracts.d.ts +0 -2
- package/dist/content/contracts.js +0 -51
- package/dist/content/doctor-references.d.ts +0 -2
- package/dist/content/doctor-references.js +0 -150
- package/dist/content/eval-scaffold.d.ts +0 -15
- package/dist/content/eval-scaffold.js +0 -370
- package/dist/content/feature-command.d.ts +0 -2
- package/dist/content/feature-command.js +0 -123
- package/dist/content/flow-map.d.ts +0 -23
- package/dist/content/flow-map.js +0 -134
- package/dist/content/harness-doc.d.ts +0 -2
- package/dist/content/harness-doc.js +0 -202
- package/dist/content/harness-playbooks.d.ts +0 -24
- package/dist/content/harness-playbooks.js +0 -393
- package/dist/content/harness-tool-refs.d.ts +0 -20
- package/dist/content/harness-tool-refs.js +0 -268
- package/dist/content/ops-command.d.ts +0 -2
- package/dist/content/ops-command.js +0 -71
- package/dist/content/protocols.d.ts +0 -7
- package/dist/content/protocols.js +0 -215
- package/dist/content/retro-command.d.ts +0 -2
- package/dist/content/retro-command.js +0 -165
- package/dist/content/rewind-command.d.ts +0 -2
- package/dist/content/rewind-command.js +0 -106
- package/dist/content/tdd-log-command.d.ts +0 -2
- package/dist/content/tdd-log-command.js +0 -85
- package/dist/eval/agents/single-shot.d.ts +0 -27
- package/dist/eval/agents/single-shot.js +0 -79
- package/dist/eval/agents/with-tools.d.ts +0 -44
- package/dist/eval/agents/with-tools.js +0 -261
- package/dist/eval/agents/workflow.d.ts +0 -31
- package/dist/eval/agents/workflow.js +0 -155
- package/dist/eval/baseline.d.ts +0 -38
- package/dist/eval/baseline.js +0 -282
- package/dist/eval/config-loader.d.ts +0 -14
- package/dist/eval/config-loader.js +0 -395
- package/dist/eval/corpus.d.ts +0 -30
- package/dist/eval/corpus.js +0 -330
- package/dist/eval/cost-guard.d.ts +0 -102
- package/dist/eval/cost-guard.js +0 -190
- package/dist/eval/diff.d.ts +0 -64
- package/dist/eval/diff.js +0 -323
- package/dist/eval/llm-client.d.ts +0 -176
- package/dist/eval/llm-client.js +0 -267
- package/dist/eval/mode.d.ts +0 -28
- package/dist/eval/mode.js +0 -61
- package/dist/eval/progress.d.ts +0 -83
- package/dist/eval/progress.js +0 -59
- package/dist/eval/report.d.ts +0 -11
- package/dist/eval/report.js +0 -181
- package/dist/eval/rubric-loader.d.ts +0 -20
- package/dist/eval/rubric-loader.js +0 -143
- package/dist/eval/runner.d.ts +0 -81
- package/dist/eval/runner.js +0 -746
- package/dist/eval/runs.d.ts +0 -41
- package/dist/eval/runs.js +0 -114
- package/dist/eval/sandbox.d.ts +0 -38
- package/dist/eval/sandbox.js +0 -137
- package/dist/eval/tools/glob.d.ts +0 -2
- package/dist/eval/tools/glob.js +0 -163
- package/dist/eval/tools/grep.d.ts +0 -2
- package/dist/eval/tools/grep.js +0 -152
- package/dist/eval/tools/index.d.ts +0 -7
- package/dist/eval/tools/index.js +0 -35
- package/dist/eval/tools/read.d.ts +0 -2
- package/dist/eval/tools/read.js +0 -122
- package/dist/eval/tools/types.d.ts +0 -49
- package/dist/eval/tools/types.js +0 -41
- package/dist/eval/tools/write.d.ts +0 -2
- package/dist/eval/tools/write.js +0 -92
- package/dist/eval/types.d.ts +0 -561
- package/dist/eval/types.js +0 -47
- package/dist/eval/verifiers/judge.d.ts +0 -40
- package/dist/eval/verifiers/judge.js +0 -256
- package/dist/eval/verifiers/rules.d.ts +0 -24
- package/dist/eval/verifiers/rules.js +0 -218
- package/dist/eval/verifiers/structural.d.ts +0 -14
- package/dist/eval/verifiers/structural.js +0 -171
- package/dist/eval/verifiers/traceability.d.ts +0 -23
- package/dist/eval/verifiers/traceability.js +0 -84
- package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
- package/dist/eval/verifiers/workflow-consistency.js +0 -225
- package/dist/eval/workflow-corpus.d.ts +0 -7
- package/dist/eval/workflow-corpus.js +0 -207
- package/dist/feature-system.d.ts +0 -42
- package/dist/feature-system.js +0 -432
- package/dist/internal/knowledge-digest.d.ts +0 -7
- package/dist/internal/knowledge-digest.js +0 -93
|
@@ -1,261 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Multi-turn with-tools agent (agent mode, reused by workflow mode).
|
|
3
|
-
*
|
|
4
|
-
* Multi-turn loop with OpenAI-style function-calling over a set of
|
|
5
|
-
* sandbox-confined tools. The AUT is given:
|
|
6
|
-
*
|
|
7
|
-
* - System prompt = stage SKILL.md (same contract as the single-shot path
|
|
8
|
-
* so the baseline is comparable).
|
|
9
|
-
* - User prompt = task description + a short "tools available" hint
|
|
10
|
-
* that names the sandbox root and the four built-in tools.
|
|
11
|
-
* - Tools = `read_file`, `write_file`, `glob`, `grep` (see
|
|
12
|
-
* `src/eval/tools/`).
|
|
13
|
-
*
|
|
14
|
-
* The loop runs up to `config.toolMaxTurns` turns (default 8). Each
|
|
15
|
-
* turn:
|
|
16
|
-
*
|
|
17
|
-
* 1. Send the current transcript to the model with tools enabled.
|
|
18
|
-
* 2. Commit token usage against the wrapped client (cost guard sees
|
|
19
|
-
* every call).
|
|
20
|
-
* 3. If the model returned tool_calls, execute each sandbox tool and
|
|
21
|
-
* append a `role: "tool"` message with the JSON-serialized result.
|
|
22
|
-
* 4. If the model returned no tool_calls (a terminal assistant message),
|
|
23
|
-
* treat that as the artifact and exit.
|
|
24
|
-
*
|
|
25
|
-
* When the turn budget is exhausted without a terminal stop, the agent
|
|
26
|
-
* throws `MaxTurnsExceededError`. The runner surfaces the error as a
|
|
27
|
-
* failed workflow verifier so the case counts as a regression.
|
|
28
|
-
*
|
|
29
|
-
* Artifact resolution: the final assistant content is the artifact. If
|
|
30
|
-
* the model used `write_file` to stage the artifact at
|
|
31
|
-
* `artifact.md` (or another root-level `artifact.*` file), we prefer that file — it
|
|
32
|
-
* mirrors workflow mode where writes are the deliverable. The
|
|
33
|
-
* fallback is the terminal assistant message so prompts that don't
|
|
34
|
-
* call write_file still produce something judgable.
|
|
35
|
-
*/
|
|
36
|
-
import fs from "node:fs/promises";
|
|
37
|
-
import path from "node:path";
|
|
38
|
-
import { computeUsageUsd } from "../cost-guard.js";
|
|
39
|
-
import { createSandbox } from "../sandbox.js";
|
|
40
|
-
import { BUILTIN_TOOLS, toolsByName, toolsForRequest, truncatePayload } from "../tools/index.js";
|
|
41
|
-
import { loadStageSkill } from "./single-shot.js";
|
|
42
|
-
/**
 * Error raised when the agent loop spends its whole turn budget without the
 * model producing a terminal reply.
 */
export class MaxTurnsExceededError extends Error {
    /** The turn budget that was exhausted. */
    turns;
    /**
     * @param turns Number of turns the loop was allowed to use.
     */
    constructor(turns) {
        const detail = `Agent loop exceeded the ${turns}-turn budget without a terminal stop.`;
        super(detail);
        this.name = "MaxTurnsExceededError";
        this.turns = turns;
    }
}
|
|
50
|
-
/** Turn budget used when `config.toolMaxTurns` is missing or invalid. */
const DEFAULT_MAX_TURNS = 8;
/** Cap (64 KiB) on the UTF-8 size of a single tool call's arguments. */
const DEFAULT_MAX_ARG_BYTES = 1024 * 64;
/** Result-size budget (32 KiB) handed to each tool as `maxResultBytes`. */
const DEFAULT_MAX_RESULT_BYTES = 1024 * 32;
/** Root-level file names probed, in this order, for a model-written artifact. */
const ARTIFACT_CANDIDATES = [
    "artifact.md",
    "artifact.txt",
    "ARTIFACT.md"
];
|
|
54
|
-
/**
 * Drive the multi-turn tool-calling loop for a single case.
 *
 * Each turn sends the transcript to `client.chat` with the tool schemas
 * attached and accumulates token usage. If the reply carries tool calls, each
 * one is executed and answered with a `role: "tool"` message; otherwise the
 * artifact is resolved and the run ends.
 *
 * @param input Run options: `caseEntry`, `config`, `projectRoot`, `client`,
 *   plus the optional `tools`, `loadSkill`, `createSandboxFn`,
 *   `externalSandbox` and `promptPreamble` overrides.
 * @returns The object built by `finalize` (artifact, usage, cost, model,
 *   attempts, duration, tool-use summary and both prompts).
 * @throws {MaxTurnsExceededError} When the turn budget runs out before a reply
 *   without tool calls arrives.
 */
export async function runWithTools(input) {
    const { caseEntry, config, projectRoot, client } = input;
    const maxTurns = clampPositive(config.toolMaxTurns, DEFAULT_MAX_TURNS);
    const limits = {
        maxArgBytes: clampPositive(config.toolMaxArgumentsBytes, DEFAULT_MAX_ARG_BYTES),
        maxResultBytes: clampPositive(config.toolMaxResultBytes, DEFAULT_MAX_RESULT_BYTES)
    };
    const loader = input.loadSkill ?? ((stage) => loadStageSkill(projectRoot, stage));
    const systemPrompt = await loader(caseEntry.stage);
    const tools = input.tools ?? BUILTIN_TOOLS;
    const toolMap = toolsByName(tools);
    const toolsBody = toolsForRequest(tools);
    const sandboxFactory = input.createSandboxFn ?? createSandbox;
    const externalSandbox = input.externalSandbox;
    // A caller-supplied sandbox is borrowed; one created here is owned and
    // must be disposed before returning.
    const sandbox = externalSandbox ??
        (await sandboxFactory({
            projectRoot,
            ...(caseEntry.contextFiles ? { contextFiles: caseEntry.contextFiles } : {})
        }));
    const toolUse = {
        turns: 0,
        calls: 0,
        errors: 0,
        deniedPaths: [],
        byTool: {}
    };
    const usage = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
    let lastModel = config.model;
    let totalAttempts = 0;
    try {
        // Everything that can throw after the sandbox exists runs inside this
        // `try`, so a failure while building the prompt cannot leak an owned
        // sandbox.
        const userPrompt = buildUserPrompt(caseEntry, sandbox, tools, input.promptPreamble);
        const messages = [
            { role: "system", content: systemPrompt },
            { role: "user", content: userPrompt }
        ];
        const started = Date.now();
        for (let turn = 0; turn < maxTurns; turn += 1) {
            toolUse.turns = turn + 1;
            const response = await client.chat({
                model: config.model,
                messages,
                temperature: config.agentTemperature ?? 0.2,
                timeoutMs: config.timeoutMs,
                tools: toolsBody,
                toolChoice: "auto"
            });
            usage.promptTokens += response.usage.promptTokens;
            usage.completionTokens += response.usage.completionTokens;
            usage.totalTokens += response.usage.totalTokens;
            lastModel = response.model;
            totalAttempts += response.attempts;
            const hasToolCalls = response.toolCalls && response.toolCalls.length > 0;
            messages.push(rememberAssistant(response.content, response.toolCalls));
            if (!hasToolCalls) {
                // A reply without tool calls is terminal: resolve and finish.
                const artifact = await resolveArtifact(sandbox, response.content);
                return finalize(artifact, usage, lastModel, totalAttempts, started, toolUse, systemPrompt, userPrompt, config);
            }
            for (const call of response.toolCalls) {
                messages.push(await runToolCall(call, toolMap, toolUse, limits, sandbox));
            }
        }
        throw new MaxTurnsExceededError(maxTurns);
    }
    finally {
        if (!externalSandbox)
            await sandbox.dispose();
    }
}
/**
 * Execute one model-requested tool call, update the tool-use summary, and
 * return the `role: "tool"` message that answers it.
 *
 * Oversized argument payloads and unknown tool names are answered with an
 * error message instead of being invoked; both count as errors.
 */
async function runToolCall(call, toolMap, toolUse, limits, sandbox) {
    // Every requested call is tallied per tool name, whether or not it runs.
    bumpToolCount(toolUse, call.name);
    const rawArgs = call.arguments ?? "";
    if (Buffer.byteLength(rawArgs, "utf8") > limits.maxArgBytes) {
        toolUse.errors += 1;
        return toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `arguments payload exceeds ${limits.maxArgBytes} bytes`
        });
    }
    const tool = toolMap.get(call.name);
    if (!tool) {
        toolUse.errors += 1;
        return toolResponseMessage(call.id, {
            ok: false,
            name: call.name,
            error: `unknown tool "${call.name}"`
        });
    }
    const result = await tool.invoke(rawArgs, {
        sandbox,
        maxResultBytes: limits.maxResultBytes
    });
    if (result.ok) {
        // `calls` counts successful invocations only.
        toolUse.calls += 1;
    }
    else {
        toolUse.errors += 1;
        const denied = result.details && typeof result.details.deniedPath === "string"
            ? result.details.deniedPath
            : undefined;
        // Record each denied path once.
        if (denied && !toolUse.deniedPaths.includes(denied)) {
            toolUse.deniedPaths.push(denied);
        }
    }
    return toolResponseMessage(call.id, result);
}
|
|
159
|
-
/**
 * Build the result object for a finished run.
 *
 * Prices the accumulated token usage with `computeUsageUsd`, trims the
 * artifact text, and measures the duration from `started` to now.
 */
function finalize(artifact, usage, model, attempts, started, toolUse, systemPrompt, userPrompt, config) {
    const pricing = { tokenPricing: config.tokenPricing };
    const usageUsd = computeUsageUsd(model, usage, pricing);
    const trimmedArtifact = artifact.trim();
    const durationMs = Date.now() - started;
    return {
        artifact: trimmedArtifact,
        usage,
        usageUsd,
        model,
        attempts,
        durationMs,
        toolUse,
        systemPrompt,
        userPrompt
    };
}
|
|
175
|
-
/**
 * Build the assistant transcript entry for a model reply.
 *
 * The `toolCalls` key is only present when the reply requested at least one
 * tool call.
 */
function rememberAssistant(content, toolCalls) {
    const requestedTools = Boolean(toolCalls) && toolCalls.length > 0;
    return requestedTools
        ? { role: "assistant", content, toolCalls }
        : { role: "assistant", content };
}
|
|
181
|
-
/**
 * Wrap a tool result in the `role: "tool"` message sent back to the model.
 *
 * The result is serialized as a JSON envelope (`ok` plus `content` or `error`,
 * and `details`, defaulting to `{}`) and passed through `truncatePayload`.
 *
 * @param callId Id of the tool call being answered.
 * @param result Tool result; `ok` selects the success or error envelope.
 * @param maxBytes Size limit passed to `truncatePayload`. Defaults to
 *   `DEFAULT_MAX_RESULT_BYTES`, the same 32 KiB that was previously hard-coded,
 *   so existing two-argument callers behave as before.
 * @returns The tool message with `toolCallId` and `name` set.
 */
function toolResponseMessage(callId, result, maxBytes = DEFAULT_MAX_RESULT_BYTES) {
    const details = result.details ?? {};
    const payload = result.ok
        ? { ok: true, content: result.content, details }
        : { ok: false, error: result.error, details };
    return {
        role: "tool",
        content: truncatePayload(JSON.stringify(payload), maxBytes),
        toolCallId: callId,
        name: result.name
    };
}
|
|
192
|
-
function bumpToolCount(summary, name) {
|
|
193
|
-
summary.byTool[name] = (summary.byTool[name] ?? 0) + 1;
|
|
194
|
-
}
|
|
195
|
-
/**
 * Normalize a configured limit to a positive whole number.
 *
 * The value is floored before the range check, so a fraction in (0, 1) falls
 * back to the default instead of becoming a zero budget.
 *
 * @param value Configured value; may be undefined or not a finite number.
 * @param fallback Value returned when `value` is unusable.
 * @returns `Math.floor(value)` when that is at least 1, otherwise `fallback`.
 */
function clampPositive(value, fallback) {
    // Number.isFinite is false for undefined, NaN, ±Infinity and non-numbers.
    if (!Number.isFinite(value))
        return fallback;
    const whole = Math.floor(value);
    return whole >= 1 ? whole : fallback;
}
|
|
202
|
-
/**
 * Compose the user prompt for a case.
 *
 * Layout, top to bottom: the optional trimmed preamble, the stage and case id,
 * the sandbox root with the tool list, the seeded context files, the task
 * text, and the instructions for delivering the artifact.
 */
function buildUserPrompt(caseEntry, sandbox, tools, preamble) {
    const toolBullets = tools.map(({ descriptor }) => `- ${descriptor.name}: ${descriptor.description}`);
    const seeded = caseEntry.contextFiles ?? [];
    const seededBlock = seeded.length === 0
        ? "(no files seeded)"
        : seeded.map((file) => `- ${file}`).join("\n");
    // A blank or missing preamble contributes nothing, not even a blank line.
    const intro = preamble && preamble.trim().length > 0
        ? [preamble.trim(), ``]
        : [];
    return [
        ...intro,
        `Stage: ${caseEntry.stage}`,
        `Case id: ${caseEntry.id}`,
        ``,
        `Sandbox root: ${sandbox.root}`,
        `You may call the following tools to read or modify files inside the sandbox.`,
        `All paths are relative to the sandbox root.`,
        ``,
        `Tools:`,
        ...toolBullets,
        ``,
        `Seeded context files (available under the sandbox root):`,
        seededBlock,
        ``,
        `Task:`,
        caseEntry.inputPrompt.trim(),
        ``,
        `When you are done, reply with the artifact as the final assistant message.`,
        `Output the artifact directly (markdown with optional YAML frontmatter).`,
        `Do not wrap in code fences, do not add commentary before or after.`,
        `You may optionally write the artifact to \`artifact.md\` in the sandbox; ` +
            `if you do, the last written \`artifact.md\` is preferred over the chat reply.`
    ].join("\n");
}
|
|
236
|
-
/**
 * Pick the artifact text for a finished run.
 *
 * Lookup order: each name in `ARTIFACT_CANDIDATES`, then the first regular
 * file in the sandbox root whose name matches `artifact.*` (case-insensitive),
 * and finally `fallback`. Filesystem and resolve errors are treated as "not
 * found".
 */
async function resolveArtifact(sandbox, fallback) {
    // Reads a sandbox-relative path, yielding undefined for non-files.
    const readIfFile = async (relative) => {
        const absolute = await sandbox.resolve(relative);
        const info = await fs.stat(absolute);
        return info.isFile() ? await fs.readFile(absolute, "utf8") : undefined;
    };
    for (const name of ARTIFACT_CANDIDATES) {
        try {
            const text = await readIfFile(name);
            if (text !== undefined) {
                return text;
            }
        }
        catch {
            // missing or unresolvable candidate: try the next name
        }
    }
    try {
        const root = path.join(sandbox.root);
        const listing = await fs.readdir(root, { withFileTypes: true });
        for (const entry of listing) {
            if (entry.isFile() && /^artifact\./i.test(entry.name)) {
                return await fs.readFile(path.join(root, entry.name), "utf8");
            }
        }
    }
    catch {
        // unreadable root: use the fallback below
    }
    return fallback;
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import type { EvalLlmClient } from "../llm-client.js";
|
|
2
|
-
import { createSandbox } from "../sandbox.js";
|
|
3
|
-
import type { SandboxTool } from "../tools/index.js";
|
|
4
|
-
import type { ResolvedEvalConfig, WorkflowCase, WorkflowStageName, WorkflowStageResult } from "../types.js";
|
|
5
|
-
/** Options accepted by `runWorkflow`. */
export interface WorkflowInput {
    /** The workflow case whose stages are run in order. */
    workflow: WorkflowCase;
    /** Model, pricing and turn/size limits forwarded to each stage run. */
    config: Pick<
        ResolvedEvalConfig,
        | "model"
        | "agentTemperature"
        | "timeoutMs"
        | "tokenPricing"
        | "toolMaxTurns"
        | "toolMaxArgumentsBytes"
        | "toolMaxResultBytes"
        | "workflowMaxTotalTurns"
    >;
    /** Project root handed to the sandbox factory and the skill loader. */
    projectRoot: string;
    /** LLM client used for every stage. */
    client: EvalLlmClient;
    /** Optional tool set; forwarded to each stage run when provided. */
    tools?: SandboxTool[];
    /** Test hook: replaces the SKILL.md loader. */
    loadSkill?: (stage: WorkflowStageName) => Promise<string>;
    /** Test hook: replaces the sandbox factory. */
    createSandboxFn?: typeof createSandbox;
    /**
     * Optional lifecycle hook invoked before a stage runs. The runner uses
     * the stage hooks to emit progress events to stderr, so workflow-mode
     * runs show real-time status instead of going silent for minutes.
     */
    onStageStart?: (stage: WorkflowStageName) => void;
    /** Optional lifecycle hook invoked once a stage's result is recorded. */
    onStageEnd?: (stage: WorkflowStageName, result: WorkflowStageResult) => void;
}
|
|
23
|
-
/** Result of a completed `runWorkflow` call. */
export interface WorkflowOutput {
    /** Id of the workflow case that was run. */
    caseId: string;
    /** One result per completed stage, in execution order. */
    stages: WorkflowStageResult[];
    /** Produced artifact per stage name (also persisted in the sandbox). */
    artifacts: Map<WorkflowStageName, string>;
    /** Sum of the per-stage USD cost. */
    totalUsageUsd: number;
    /** Sum of the per-stage durations, in milliseconds. */
    totalDurationMs: number;
}
|
|
31
|
-
/** Run every stage of a workflow case and collect the per-stage results. */
export declare function runWorkflow(
    input: WorkflowInput
): Promise<WorkflowOutput>;
|
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Workflow-mode agent.
|
|
3
|
-
*
|
|
4
|
-
* Runs the with-tools loop once per stage in a workflow case,
|
|
5
|
-
* sharing a single sandbox across stages so every new stage can read
|
|
6
|
-
* the earlier artifacts the model produced. The shape of the run is:
|
|
7
|
-
*
|
|
8
|
-
* 1. Create one sandbox seeded with `contextFiles`.
|
|
9
|
-
* 2. For each stage in `workflow.stages`:
|
|
10
|
-
* a. Delete any leftover `artifact.md` so the resolver doesn't
|
|
11
|
-
* accidentally pick the previous stage's output.
|
|
12
|
-
* b. Invoke `runWithTools({ externalSandbox: sandbox, promptPreamble })`.
|
|
13
|
-
* The preamble tells the model which stage it is on and lists the
|
|
14
|
-
* `stages/*.md` files available for reading.
|
|
15
|
-
* c. Persist the returned artifact to `stages/<stage>.md` inside the
|
|
16
|
-
* sandbox (deterministic, regardless of whether the model wrote
|
|
17
|
-
* `artifact.md` itself).
|
|
18
|
-
* d. Record `WorkflowStageResult` with usage, duration, and tool use.
|
|
19
|
-
* 3. Dispose the sandbox in a `finally` so temp directories never leak.
|
|
20
|
-
*
|
|
21
|
-
* Errors bubble up from `runWithTools`:
|
|
22
|
-
* - `MaxTurnsExceededError` stops the workflow at the current stage.
|
|
23
|
-
* - `DailyCostCapExceededError` (surfaced by the cost-guard wrapper in
|
|
24
|
-
* the runner) aborts immediately.
|
|
25
|
-
* - Generic `EvalLlmError` subclasses propagate as-is so the runner can
|
|
26
|
-
* record a workflow-level verifier failure.
|
|
27
|
-
*/
|
|
28
|
-
import fs from "node:fs/promises";
|
|
29
|
-
import path from "node:path";
|
|
30
|
-
import { createSandbox } from "../sandbox.js";
|
|
31
|
-
import { loadStageSkill } from "./single-shot.js";
|
|
32
|
-
import { MaxTurnsExceededError, runWithTools } from "./with-tools.js";
|
|
33
|
-
/** Sandbox-relative directory that holds one `<stage>.md` per finished stage. */
const STAGES_SUBDIR = "stages";
/** Root-level artifact file names removed before each stage starts. */
const ARTIFACT_CANDIDATES = [
    "artifact.md",
    "artifact.txt",
    "ARTIFACT.md"
];
/** Turn budget for the whole workflow when the config value is unusable. */
const DEFAULT_WORKFLOW_MAX_TOTAL_TURNS = 40;
/** Per-stage turn cap when `config.toolMaxTurns` is unusable. */
const DEFAULT_STAGE_TURN_CAP = 8;
|
|
37
|
-
/**
 * Return `value` when it is an integer of at least 1, otherwise `fallback`.
 * Fractional, non-numeric and missing values all yield the fallback.
 */
function clampPositive(value, fallback) {
    const usable = Number.isInteger(value) && value >= 1;
    return usable ? value : fallback;
}
|
|
43
|
-
/**
 * Run every stage of a workflow case inside one shared sandbox.
 *
 * For each stage: stale artifact files are cleared, `runWithTools` is invoked
 * against the shared sandbox with a stage preamble, the returned artifact is
 * written to `stages/<stage>.md`, and a stage result is recorded. Turns spent
 * by earlier stages are deducted from the workflow-wide turn budget. The
 * sandbox is always disposed on the way out.
 *
 * @param input Workflow, config, project root, client and optional hooks.
 * @returns Case id, per-stage results, artifacts and summed cost/duration.
 * @throws {MaxTurnsExceededError} When the workflow turn budget is used up
 *   before a stage starts, or when a stage exhausts its own budget.
 */
export async function runWorkflow(input) {
    const { workflow, config, projectRoot, client } = input;
    const makeSandbox = input.createSandboxFn ?? createSandbox;
    const sandboxOptions = workflow.contextFiles
        ? { projectRoot, contextFiles: workflow.contextFiles }
        : { projectRoot };
    const sandbox = await makeSandbox(sandboxOptions);
    const completed = [];
    const artifacts = new Map();
    const totals = { usageUsd: 0, durationMs: 0, turns: 0 };
    const workflowTurnBudget = clampPositive(config.workflowMaxTotalTurns, DEFAULT_WORKFLOW_MAX_TOTAL_TURNS);
    try {
        const stagesDir = await sandbox.resolve(STAGES_SUBDIR, { allowMissing: true });
        await fs.mkdir(stagesDir, { recursive: true });
        for (const step of workflow.stages) {
            const turnsLeft = workflowTurnBudget - totals.turns;
            if (turnsLeft < 1) {
                throw new MaxTurnsExceededError(workflowTurnBudget);
            }
            // A stage gets its own cap or whatever the workflow has left,
            // whichever is smaller.
            const stageCap = clampPositive(config.toolMaxTurns, DEFAULT_STAGE_TURN_CAP);
            const stageTurnBudget = Math.min(stageCap, turnsLeft);
            input.onStageStart?.(step.name);
            await clearArtifactFile(sandbox);
            const earlierStages = completed.map((done) => done.stage);
            const preamble = buildStagePreamble(workflow, step.name, earlierStages);
            const caseEntry = {
                id: `${workflow.id}/${step.name}`,
                stage: step.name,
                inputPrompt: step.inputPrompt
            };
            if (workflow.contextFiles) {
                caseEntry.contextFiles = workflow.contextFiles;
            }
            const skillLoader = input.loadSkill
                ? input.loadSkill
                : (stage) => loadStageSkill(projectRoot, stage);
            const runInput = {
                caseEntry,
                config: { ...config, toolMaxTurns: stageTurnBudget },
                projectRoot,
                client,
                loadSkill: skillLoader,
                externalSandbox: sandbox,
                promptPreamble: preamble
            };
            if (input.tools) {
                runInput.tools = input.tools;
            }
            const outcome = await runWithTools(runInput);
            await persistStageArtifact(sandbox, step.name, outcome.artifact);
            artifacts.set(step.name, outcome.artifact);
            const stageResult = {
                stage: step.name,
                artifact: outcome.artifact,
                durationMs: outcome.durationMs,
                usageUsd: outcome.usageUsd,
                toolUse: outcome.toolUse,
                attempts: outcome.attempts,
                model: outcome.model,
                promptTokens: outcome.usage.promptTokens,
                completionTokens: outcome.usage.completionTokens
            };
            completed.push(stageResult);
            input.onStageEnd?.(step.name, stageResult);
            totals.usageUsd += outcome.usageUsd;
            totals.durationMs += outcome.durationMs;
            totals.turns += outcome.toolUse.turns;
        }
        return {
            caseId: workflow.id,
            stages: completed,
            artifacts,
            // Round the summed cost to six decimal places.
            totalUsageUsd: Number(totals.usageUsd.toFixed(6)),
            totalDurationMs: totals.durationMs
        };
    }
    finally {
        await sandbox.dispose();
    }
}
|
|
123
|
-
/**
 * Remove leftover artifact files from the sandbox root before a stage runs.
 *
 * Two passes, both best-effort:
 *  1. Delete each name in `ARTIFACT_CANDIDATES` via `sandbox.resolve`.
 *  2. Delete every other root-level regular file named `artifact.*`
 *     (case-insensitive). The artifact resolver in with-tools.js falls back to
 *     such files, so leaving one behind would let the next stage inherit the
 *     previous stage's output.
 *
 * NOTE(review): a seeded context file named `artifact.*` is removed as well.
 * The resolver would otherwise return it as the stage artifact — confirm no
 * corpus relies on such a file.
 *
 * @param sandbox Shared workflow sandbox (`resolve` and `root` are used).
 */
async function clearArtifactFile(sandbox) {
    for (const candidate of ARTIFACT_CANDIDATES) {
        try {
            const abs = await sandbox.resolve(candidate);
            await fs.rm(abs, { force: true });
        }
        catch {
            // candidate did not exist — resolve threw SandboxEscapeError for
            // missing realpath; safe to ignore.
        }
    }
    try {
        const entries = await fs.readdir(sandbox.root, { withFileTypes: true });
        for (const entry of entries) {
            if (entry.isFile() && /^artifact\./i.test(entry.name)) {
                await fs.rm(path.join(sandbox.root, entry.name), { force: true });
            }
        }
    }
    catch {
        // Root not listable or a removal failed: cleanup stays best-effort,
        // exactly like the candidate pass above.
    }
}
|
|
135
|
-
/**
 * Write a stage's artifact to `stages/<stage>.md` inside the sandbox, creating
 * the parent directory if needed and ensuring the file ends with a newline.
 */
async function persistStageArtifact(sandbox, stage, artifact) {
    const target = await sandbox.resolve(`${STAGES_SUBDIR}/${stage}.md`, { allowMissing: true });
    await fs.mkdir(path.dirname(target), { recursive: true });
    const body = artifact.endsWith("\n") ? artifact : `${artifact}\n`;
    await fs.writeFile(target, body, "utf8");
}
|
|
141
|
-
/**
 * Build the preamble prepended to a stage's user prompt.
 *
 * It names the current stage and workflow, adds the case description when
 * there is one, and either notes that this is the first stage or lists the
 * `stages/<name>.md` files written by earlier stages.
 */
function buildStagePreamble(workflow, current, priorStages) {
    const header = [`You are running stage "${current}" of the workflow "${workflow.id}".`];
    if (workflow.description) {
        header.push(`Case description: ${workflow.description}`);
    }
    const isFirstStage = priorStages.length === 0;
    const context = isFirstStage
        ? [`This is the first stage. Any context_files have been seeded into the sandbox root.`]
        : [
            `Earlier stage artifacts are available via read_file:`,
            ...priorStages.map((name) => `  - ${STAGES_SUBDIR}/${name}.md`),
            `Read the prior artifacts before drafting your output so decisions and ids carry through.`
        ];
    return header.concat(context).join("\n");
}
|
package/dist/eval/baseline.d.ts
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import type { FlowStage } from "../types.js";
|
|
2
|
-
import type { BaselineDelta, BaselineSnapshot, EvalReport } from "./types.js";
|
|
3
|
-
/** Value of the baseline `schemaVersion` constant exported by this module. */
export declare const BASELINE_SCHEMA_VERSION = 1;
|
|
4
|
-
/**
 * Raised when the digest stored with a signed baseline differs from the
 * canonical encoding of its `{ schemaVersion, stage, cases }` block.
 *
 * Treat this as a hard failure: the baseline was hand-edited or corrupted and
 * cannot be trusted for regression gating.
 */
export declare class BaselineSignatureError extends Error {
    readonly file: string;
    readonly expected: string;
    readonly actual: string;
    constructor(opts: { file: string; expected: string; actual: string });
}
|
|
20
|
-
/**
 * Compute a deterministic sha256 digest over the signable part of a baseline.
 *
 * `generatedAt` and `cclawVersion` are deliberately left out of the digest, so
 * rebuilding the same baseline from identical case results on a newer CLI
 * version keeps the signature valid. Only changes to the observed
 * pass/ok/score payloads alter it.
 */
export declare function computeBaselineDigest(
    snapshot: Pick<BaselineSnapshot, "schemaVersion" | "stage" | "cases">
): string;
|
|
28
|
-
/** Load the baseline for one stage; the promise may resolve to `null`. */
export declare function loadBaseline(
    projectRoot: string,
    stage: FlowStage
): Promise<BaselineSnapshot | null>;
/** Load baselines for several stages into a map keyed by stage. */
export declare function loadBaselinesByStage(
    projectRoot: string,
    stages: readonly FlowStage[]
): Promise<Map<FlowStage, BaselineSnapshot>>;
/** Build a baseline snapshot for `stage` from an eval report. */
export declare function buildBaselineForStage(
    stage: FlowStage,
    report: EvalReport
): BaselineSnapshot;
/**
 * Write baselines derived from `report`.
 * NOTE(review): the returned strings are presumably the written file paths — confirm.
 */
export declare function writeBaselinesFromReport(
    projectRoot: string,
    report: EvalReport
): Promise<string[]>;
|
|
32
|
-
/**
 * Compare a freshly computed report with the loaded baselines.
 *
 * A stage that appears in the report but has no baseline contributes zero
 * regressions (it is that stage's first run). The current report is the
 * source of truth.
 */
export declare function compareAgainstBaselines(
    report: EvalReport,
    baselines: Map<FlowStage, BaselineSnapshot>
): BaselineDelta | undefined;
|
|
38
|
-
/**
 * Resolve to a list of stages for the given project root.
 * NOTE(review): presumably the stages that have a stored baseline — confirm.
 */
export declare function listBaselineStages(
    projectRoot: string
): Promise<FlowStage[]>;
|