cclaw-cli 0.24.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -1
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/agents/with-tools.d.ts +31 -0
- package/dist/eval/agents/with-tools.js +255 -0
- package/dist/eval/config-loader.js +128 -3
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +123 -20
- package/dist/eval/llm-client.js +251 -10
- package/dist/eval/report.js +45 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +193 -12
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +138 -1
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge verifier — Step 3.
|
|
3
|
+
*
|
|
4
|
+
* Given an artifact and the stage's rubric, runs N judge samples (default
|
|
5
|
+
* median-of-3) against the configured LLM, aggregates the per-check
|
|
6
|
+
* scores, and returns one VerifierResult per rubric check plus one
|
|
7
|
+
* aggregate result covering the whole stage.
|
|
8
|
+
*
|
|
9
|
+
* Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
|
|
10
|
+
* so unit tests inject a stub EvalLlmClient and assert on the aggregate
|
|
11
|
+
* math without touching the network.
|
|
12
|
+
*/
|
|
13
|
+
import { type EvalLlmClient } from "../llm-client.js";
|
|
14
|
+
import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
|
|
15
|
+
export interface RunJudgeOptions {
|
|
16
|
+
artifact: string;
|
|
17
|
+
rubric: RubricDoc;
|
|
18
|
+
config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
|
|
19
|
+
client: EvalLlmClient;
|
|
20
|
+
/** Per-case hint that overlays the rubric (sample count, minimums). */
|
|
21
|
+
caseHint?: JudgeExpected;
|
|
22
|
+
* Optional base seed; incremented per sample for reproducibility. */
|
|
23
|
+
baseSeed?: number;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Parse one judge response into a JudgeSample. The parser is intentionally
|
|
27
|
+
* forgiving with rationales (missing -> empty string) but strict with
|
|
28
|
+
* scores: missing or non-numeric entries are dropped and the coverage
|
|
29
|
+
* flag on the aggregate flips to false.
|
|
30
|
+
*/
|
|
31
|
+
export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
|
|
32
|
+
/** Run the judge against an artifact and return per-sample + aggregate data. */
|
|
33
|
+
export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
|
|
34
|
+
/**
|
|
35
|
+
* Convert a JudgeInvocation into VerifierResult[] for the runner. One
|
|
36
|
+
* result per rubric check (score 0..1 normalized from the 1..5 median) +
|
|
37
|
+
* one "coverage" result that flips to `ok:false` when any sample failed
|
|
38
|
+
* to emit a score for a check.
|
|
39
|
+
*/
|
|
40
|
+
export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge verifier — Step 3.
|
|
3
|
+
*
|
|
4
|
+
* Given an artifact and the stage's rubric, runs N judge samples (default
|
|
5
|
+
* median-of-3) against the configured LLM, aggregates the per-check
|
|
6
|
+
* scores, and returns one VerifierResult per rubric check plus one
|
|
7
|
+
* aggregate result covering the whole stage.
|
|
8
|
+
*
|
|
9
|
+
* Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
|
|
10
|
+
* so unit tests inject a stub EvalLlmClient and assert on the aggregate
|
|
11
|
+
* math without touching the network.
|
|
12
|
+
*/
|
|
13
|
+
import { EvalLlmError } from "../llm-client.js";
|
|
14
|
+
import { computeUsageUsd } from "../cost-guard.js";
|
|
15
|
+
// Judge scores are integers on a 1..5 scale; clampScore coerces raw model
// output onto it and judgeResultsToVerifiers normalizes it onto 0..1.
const SCALE_MIN = 1;
const SCALE_MAX = 5;
// System prompt sent with every judge sample: defines the 1..5 scale and
// demands a JSON-only reply of shape {"scores": {...}, "rationales": {...}}
// (parsed by parseJudgeResponse below).
const SYSTEM_PREAMBLE = `You are a strict reviewer for software engineering artifacts. ` +
    `You will receive a rubric and an artifact. ` +
    `Score each rubric check on an integer 1..5 scale, where:\n` +
    ` 1 = does not meet the bar at all\n` +
    ` 2 = barely meets the bar, major gaps\n` +
    ` 3 = partially meets the bar, noticeable gaps\n` +
    ` 4 = mostly meets the bar, small gaps\n` +
    ` 5 = fully meets the bar\n` +
    `Respond with JSON only (no prose, no markdown fences). ` +
    `Shape: {"scores": {"<check-id>": 1..5, ...}, "rationales": {"<check-id>": "one sentence", ...}}. ` +
    `Include every check id in both maps. Use integer scores only.`;
|
|
28
|
+
// Middle value of a numeric list; the average of the two middle values for
// even-length input. An empty list yields 0.
function median(values) {
    if (values.length === 0) {
        return 0;
    }
    const ordered = values.slice().sort((lhs, rhs) => lhs - rhs);
    const half = Math.floor(ordered.length / 2);
    return ordered.length % 2 === 1
        ? ordered[half]
        : (ordered[half - 1] + ordered[half]) / 2;
}
|
|
37
|
+
// Arithmetic mean of a numeric list; 0 for an empty list.
function mean(values) {
    if (values.length === 0) {
        return 0;
    }
    let total = 0;
    for (const value of values) {
        total += value;
    }
    return total / values.length;
}
|
|
42
|
+
// Coerce a raw judge score onto the integer 1..5 scale. Non-numeric or
// non-finite input yields undefined so the caller can drop the entry and
// flag incomplete coverage.
function clampScore(raw) {
    if (typeof raw !== "number" || !Number.isFinite(raw)) {
        return undefined;
    }
    let bounded = raw;
    if (bounded < SCALE_MIN) {
        bounded = SCALE_MIN;
    }
    else if (bounded > SCALE_MAX) {
        bounded = SCALE_MAX;
    }
    return Math.round(bounded);
}
|
|
48
|
+
// Remove optional markdown code fences (``` or ```json) wrapping a judge
// reply; unfenced input is simply trimmed.
function stripFences(raw) {
    const text = raw.trim();
    if (text.startsWith("```")) {
        return text
            .replace(/^```(?:json)?\s*/i, "")
            .replace(/```\s*$/i, "")
            .trim();
    }
    return text;
}
|
|
54
|
+
/**
 * Parse one judge response into a JudgeSample. The parser is intentionally
 * forgiving with rationales (missing -> empty string) but strict with
 * scores: missing or non-numeric entries are dropped and the coverage
 * flag on the aggregate flips to false.
 */
export function parseJudgeResponse(content, rubric) {
    // A "record" here is a plain JSON object (not null, not an array).
    const isRecord = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
    let parsed;
    try {
        parsed = JSON.parse(stripFences(content));
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Judge response was not valid JSON: ${reason}`);
    }
    if (!isRecord(parsed)) {
        throw new Error("Judge response must be a JSON object with scores/rationales maps.");
    }
    const scoreMap = parsed.scores;
    const rationaleMap = parsed.rationales;
    if (!isRecord(scoreMap)) {
        throw new Error('Judge response missing "scores" object.');
    }
    const scores = {};
    const rationales = {};
    for (const check of rubric.checks) {
        // Drop entries clampScore rejects (missing / non-numeric) so the
        // aggregate's coverage flag can flip to false downstream.
        const score = clampScore(scoreMap[check.id]);
        if (score !== undefined) {
            scores[check.id] = score;
        }
        const rawRationale = isRecord(rationaleMap) ? rationaleMap[check.id] : undefined;
        rationales[check.id] = typeof rawRationale === "string" ? rawRationale.trim() : "";
    }
    return { scores, rationales };
}
|
|
93
|
+
// Collapse the per-sample score maps into one aggregate row per rubric
// check: the observed values, their median and mean, and a coverage flag
// that is true only when every sample scored the check (and there was at
// least one sample).
function aggregateSamples(rubric, samples) {
    const perCheck = [];
    for (const check of rubric.checks) {
        const observed = [];
        let everySampleScored = true;
        for (const sample of samples) {
            const score = sample.scores[check.id];
            if (typeof score === "number") {
                observed.push(score);
            }
            else {
                everySampleScored = false;
            }
        }
        perCheck.push({
            checkId: check.id,
            samples: observed,
            median: median(observed),
            mean: Number(mean(observed).toFixed(4)),
            coverage: everySampleScored && samples.length > 0
        });
    }
    return perCheck;
}
|
|
113
|
+
// Build the two-message chat payload for one judge sample: the fixed
// system preamble plus a user message containing the rubric checks and
// the artifact wrapped in triple-quote delimiters.
function buildMessages(artifact, rubric) {
    const lines = [`Rubric (stage=${rubric.stage}, rubric=${rubric.id}):`];
    for (const check of rubric.checks) {
        const criticalTag = check.critical ? " [critical]" : "";
        const scaleSuffix = check.scale ? ` (${check.scale})` : "";
        lines.push(`- ${check.id}${criticalTag}: ${check.prompt}${scaleSuffix}`);
    }
    lines.push("", "Artifact:", `"""`, artifact, `"""`, "", "Return JSON only.");
    return [
        { role: "system", content: SYSTEM_PREAMBLE },
        { role: "user", content: lines.join("\n") }
    ];
}
|
|
135
|
+
// Total token usage across all judge samples.
function sumUsage(usages) {
    const total = { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
    for (const usage of usages) {
        total.promptTokens += usage.promptTokens;
        total.completionTokens += usage.completionTokens;
        total.totalTokens += usage.totalTokens;
    }
    return total;
}
|
|
146
|
+
/**
 * Run the judge against an artifact and return per-sample + aggregate data.
 *
 * Sample count comes from the per-case hint, then config.judgeSamples,
 * then defaults to 3; it must be a positive odd integer so a true median
 * exists. The judge model falls back to the main model and temperature
 * defaults to 0. When baseSeed is given, sample i is sent with seed
 * baseSeed + i for reproducibility.
 *
 * Throws on an invalid sample count, and propagates any error raised by
 * client.chat or by parseJudgeResponse unchanged. (The original wrapped
 * the chat call in a try/catch whose branches both rethrew the error
 * as-is; that dead code has been removed — behavior is identical.)
 */
export async function runJudge(options) {
    const { artifact, rubric, config, client, caseHint, baseSeed } = options;
    // Per-case hint overrides config; default is median-of-3.
    const rawSamples = caseHint?.samples ?? config.judgeSamples ?? 3;
    if (!Number.isInteger(rawSamples) || rawSamples < 1) {
        throw new Error(`Invalid judge sample count: ${rawSamples}. Use a positive integer (1, 3, 5).`);
    }
    if (rawSamples % 2 === 0) {
        throw new Error(`Judge sample count must be odd (so a true median exists), got: ${rawSamples}.`);
    }
    const started = Date.now();
    const model = config.judgeModel ?? config.model;
    const temperature = config.judgeTemperature ?? 0;
    const messages = buildMessages(artifact, rubric);
    const samples = [];
    const usages = [];
    for (let i = 0; i < rawSamples; i += 1) {
        const response = await client.chat({
            model,
            messages,
            temperature,
            responseFormatJson: true,
            // Only pass a seed when the caller provided one.
            ...(baseSeed !== undefined ? { seed: baseSeed + i } : {}),
            timeoutMs: config.timeoutMs
        });
        usages.push(response.usage);
        samples.push(parseJudgeResponse(response.content, rubric));
    }
    const aggregates = aggregateSamples(rubric, samples);
    const usage = sumUsage(usages);
    const usageUsd = computeUsageUsd(model, usage, { tokenPricing: config.tokenPricing });
    return {
        rubricId: rubric.id,
        samples,
        aggregates,
        usageUsd,
        durationMs: Date.now() - started
    };
}
|
|
193
|
+
// Stable verifier id for one rubric check.
function verifierIdFor(check) {
    return "judge:" + check.id;
}
|
|
196
|
+
/**
 * Convert a JudgeInvocation into VerifierResult[] for the runner. One
 * result per rubric check (score 0..1 normalized from the 1..5 median) +
 * one "coverage" result that flips to `ok:false` when any sample failed
 * to emit a score for a check.
 */
export function judgeResultsToVerifiers(rubric, invocation, config, caseHint) {
    const results = [];
    const criticalThreshold = config.regression.failIfCriticalBelow;
    for (const aggregate of invocation.aggregates) {
        const check = rubric.checks.find((candidate) => candidate.id === aggregate.checkId);
        if (!check) {
            continue;
        }
        // Map the 1..5 median onto 0..1 for the runner's score field.
        const normalized = (aggregate.median - SCALE_MIN) / (SCALE_MAX - SCALE_MIN);
        const caseMinimum = caseHint?.minimumScores?.[check.id];
        const criticalFloor = check.critical ? criticalThreshold : undefined;
        // Effective floor is the highest of the per-case minimum and the
        // critical floor, when either is present.
        let floor;
        if (typeof caseMinimum === "number") {
            floor = caseMinimum;
        }
        if (typeof criticalFloor === "number") {
            floor = floor === undefined ? criticalFloor : Math.max(floor, criticalFloor);
        }
        // Missing coverage always fails; otherwise pass unless the median
        // falls below the effective floor.
        let ok;
        if (!aggregate.coverage) {
            ok = false;
        }
        else {
            ok = floor === undefined || aggregate.median >= floor;
        }
        let message;
        if (ok) {
            message = `median=${aggregate.median.toFixed(2)} across ${aggregate.samples.length} sample(s)`;
        }
        else if (aggregate.coverage) {
            message = `median=${aggregate.median.toFixed(2)} below floor=${floor?.toFixed(2) ?? "n/a"}`;
        }
        else {
            message = `judge did not score every sample (${aggregate.samples.length}/${invocation.samples.length}); treated as failing`;
        }
        results.push({
            kind: "judge",
            id: verifierIdFor(check),
            ok,
            score: Number(Math.max(0, Math.min(1, normalized)).toFixed(4)),
            message,
            details: {
                median: aggregate.median,
                mean: aggregate.mean,
                samples: aggregate.samples,
                coverage: aggregate.coverage,
                critical: check.critical === true,
                caseMinimum: caseMinimum ?? null,
                criticalFloor: criticalFloor ?? null
            }
        });
    }
    // A per-case hint may require check ids the rubric does not define;
    // report those as one failing synthetic result.
    const requiredIds = caseHint?.requiredChecks ?? [];
    const knownIds = new Set(rubric.checks.map((check) => check.id));
    const missingRequired = requiredIds.filter((id) => !knownIds.has(id));
    if (missingRequired.length > 0) {
        results.push({
            kind: "judge",
            id: "judge:required-checks",
            ok: false,
            score: 0,
            message: `Rubric is missing required check id(s): ${missingRequired.join(", ")}`,
            details: { missing: missingRequired, rubricId: rubric.id }
        });
    }
    return results;
}
|
package/dist/install.js
CHANGED
|
@@ -28,7 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
|
|
|
28
28
|
import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
|
-
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
31
|
+
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRIC_FILES, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
32
32
|
import { TDD_BATCH_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
33
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
34
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
@@ -198,6 +198,12 @@ async function writeEvalScaffold(projectRoot) {
|
|
|
198
198
|
{ rel: "evals/baselines/README.md", content: EVAL_BASELINES_README },
|
|
199
199
|
{ rel: "evals/reports/README.md", content: EVAL_REPORTS_README }
|
|
200
200
|
];
|
|
201
|
+
for (const rubric of EVAL_RUBRIC_FILES) {
|
|
202
|
+
targets.push({
|
|
203
|
+
rel: `evals/rubrics/${rubric.stage}.yaml`,
|
|
204
|
+
content: rubric.contents
|
|
205
|
+
});
|
|
206
|
+
}
|
|
201
207
|
for (const target of targets) {
|
|
202
208
|
const absolute = runtimePath(projectRoot, ...target.rel.split("/"));
|
|
203
209
|
if (await exists(absolute))
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "cclaw-cli",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.26.0",
|
|
4
4
|
"description": "Installer-first flow toolkit for coding agents",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -40,6 +40,7 @@
|
|
|
40
40
|
"node": ">=20.0.0"
|
|
41
41
|
},
|
|
42
42
|
"dependencies": {
|
|
43
|
+
"openai": "^4.104.0",
|
|
43
44
|
"yaml": "^2.8.1"
|
|
44
45
|
},
|
|
45
46
|
"devDependencies": {
|