cclaw-cli 0.24.0 → 0.26.0

@@ -0,0 +1,40 @@
+ /**
+ * LLM judge verifier — Step 3.
+ *
+ * Given an artifact and the stage's rubric, runs N judge samples (default
+ * median-of-3) against the configured LLM, aggregates the per-check
+ * scores, and returns one VerifierResult per rubric check plus one
+ * aggregate result covering the whole stage.
+ *
+ * Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
+ * so unit tests inject a stub EvalLlmClient and assert on the aggregate
+ * math without touching the network.
+ */
+ import { type EvalLlmClient } from "../llm-client.js";
+ import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
+ export interface RunJudgeOptions {
+ artifact: string;
+ rubric: RubricDoc;
+ config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
+ client: EvalLlmClient;
+ /** Per-case hint that overlays the rubric (sample count, minimums). */
+ caseHint?: JudgeExpected;
+ /** Optional base seed; incremented per sample for reproducibility. */
+ baseSeed?: number;
+ }
+ /**
+ * Parse one judge response into a JudgeSample. The parser is intentionally
+ * forgiving with rationales (missing -> empty string) but strict with
+ * scores: missing or non-numeric entries are dropped and the coverage
+ * flag on the aggregate flips to false.
+ */
+ export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
+ /** Run the judge against an artifact and return per-sample + aggregate data. */
+ export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
+ /**
+ * Convert a JudgeInvocation into VerifierResult[] for the runner. One
+ * result per rubric check (score 0..1 normalized from the 1..5 median) +
+ * one "coverage" result that flips to `ok:false` when any sample failed
+ * to emit a score for a check.
+ */
+ export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];
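
A minimal usage sketch of this declared surface, following the header comment's note that unit tests inject a stub EvalLlmClient. The import path, rubric literal, config values, and canned judge JSON are hypothetical, inferred from the implementation in the next hunk:

```ts
// Path and rubric shape are assumptions for this sketch, not confirmed by the diff.
import { runJudge, judgeResultsToVerifiers } from "./judge.js";

const rubric = {
  id: "plan-quality",
  stage: "plan",
  checks: [
    { id: "completeness", prompt: "Covers every stated requirement?", critical: true },
    { id: "clarity", prompt: "Unambiguous and readable?" }
  ]
};

// Stub client: returns the same JSON every call, so the aggregate math is deterministic.
const stubClient = {
  async chat() {
    return {
      content: JSON.stringify({
        scores: { completeness: 4, clarity: 5 },
        rationales: { completeness: "All requirements addressed.", clarity: "Easy to follow." }
      }),
      usage: { promptTokens: 200, completionTokens: 40, totalTokens: 240 }
    };
  }
};

const invocation = await runJudge({
  artifact: "plan text under evaluation",
  rubric,
  config: { model: "judge-model", judgeSamples: 3, timeoutMs: 30_000 }, // illustrative values
  client: stubClient,
  baseSeed: 7 // samples use seeds 7, 8, 9
});

// completeness: median 4 on the 1..5 scale normalizes to (4 - 1) / (5 - 1) = 0.75.
const results = judgeResultsToVerifiers(rubric, invocation, {
  regression: { failIfCriticalBelow: 3 }
});
```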
@@ -0,0 +1,256 @@
+ /**
+ * LLM judge verifier — Step 3.
+ *
+ * Given an artifact and the stage's rubric, runs N judge samples (default
+ * median-of-3) against the configured LLM, aggregates the per-check
+ * scores, and returns one VerifierResult per rubric check plus one
+ * aggregate result covering the whole stage.
+ *
+ * Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
+ * so unit tests inject a stub EvalLlmClient and assert on the aggregate
+ * math without touching the network.
+ */
+ import { EvalLlmError } from "../llm-client.js";
+ import { computeUsageUsd } from "../cost-guard.js";
+ const SCALE_MIN = 1;
+ const SCALE_MAX = 5;
+ const SYSTEM_PREAMBLE = `You are a strict reviewer for software engineering artifacts. ` +
+ `You will receive a rubric and an artifact. ` +
+ `Score each rubric check on an integer 1..5 scale, where:\n` +
+ ` 1 = does not meet the bar at all\n` +
+ ` 2 = barely meets the bar, major gaps\n` +
+ ` 3 = partially meets the bar, noticeable gaps\n` +
+ ` 4 = mostly meets the bar, small gaps\n` +
+ ` 5 = fully meets the bar\n` +
+ `Respond with JSON only (no prose, no markdown fences). ` +
+ `Shape: {"scores": {"<check-id>": 1..5, ...}, "rationales": {"<check-id>": "one sentence", ...}}. ` +
+ `Include every check id in both maps. Use integer scores only.`;
+ function median(values) {
+ if (values.length === 0)
+ return 0;
+ const sorted = [...values].sort((a, b) => a - b);
+ const mid = Math.floor(sorted.length / 2);
+ if (sorted.length % 2 === 1)
+ return sorted[mid];
+ return ((sorted[mid - 1] + sorted[mid]) / 2);
+ }
+ function mean(values) {
+ if (values.length === 0)
+ return 0;
+ return values.reduce((acc, v) => acc + v, 0) / values.length;
+ }
+ function clampScore(raw) {
+ if (typeof raw !== "number" || !Number.isFinite(raw))
+ return undefined;
+ const clamped = Math.round(Math.min(Math.max(raw, SCALE_MIN), SCALE_MAX));
+ return clamped;
+ }
+ function stripFences(raw) {
+ const trimmed = raw.trim();
+ if (!trimmed.startsWith("```"))
+ return trimmed;
+ return trimmed.replace(/^```(?:json)?\s*/i, "").replace(/```\s*$/i, "").trim();
+ }
+ /**
+ * Parse one judge response into a JudgeSample. The parser is intentionally
+ * forgiving with rationales (missing -> empty string) but strict with
+ * scores: missing or non-numeric entries are dropped and the coverage
+ * flag on the aggregate flips to false.
+ */
+ export function parseJudgeResponse(content, rubric) {
+ let parsed;
+ try {
+ parsed = JSON.parse(stripFences(content));
+ }
+ catch (err) {
+ throw new Error(`Judge response was not valid JSON: ${err instanceof Error ? err.message : String(err)}`);
+ }
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+ throw new Error("Judge response must be a JSON object with scores/rationales maps.");
+ }
+ const rawScores = parsed.scores;
+ const rawRationales = parsed.rationales;
+ if (!rawScores || typeof rawScores !== "object" || Array.isArray(rawScores)) {
+ throw new Error('Judge response missing "scores" object.');
+ }
+ const scores = {};
+ const rationales = {};
+ for (const check of rubric.checks) {
+ const rawScore = rawScores[check.id];
+ const clamped = clampScore(rawScore);
+ if (clamped !== undefined)
+ scores[check.id] = clamped;
+ let rationale = "";
+ if (rawRationales && typeof rawRationales === "object" && !Array.isArray(rawRationales)) {
+ const raw = rawRationales[check.id];
+ if (typeof raw === "string")
+ rationale = raw.trim();
+ }
+ rationales[check.id] = rationale;
+ }
+ return { scores, rationales };
+ }
+ function aggregateSamples(rubric, samples) {
+ return rubric.checks.map((check) => {
+ const values = [];
+ let covered = true;
+ for (const sample of samples) {
+ const value = sample.scores[check.id];
+ if (typeof value === "number")
+ values.push(value);
+ else
+ covered = false;
+ }
+ return {
+ checkId: check.id,
+ samples: values,
+ median: median(values),
+ mean: Number(mean(values).toFixed(4)),
+ coverage: covered && samples.length > 0
+ };
+ });
+ }
+ function buildMessages(artifact, rubric) {
+ const rubricLines = rubric.checks.map((check) => {
+ const scale = check.scale ? ` (${check.scale})` : "";
+ const critical = check.critical ? " [critical]" : "";
+ return `- ${check.id}${critical}: ${check.prompt}${scale}`;
+ });
+ const userContent = [
+ `Rubric (stage=${rubric.stage}, rubric=${rubric.id}):`,
+ ...rubricLines,
+ ``,
+ `Artifact:`,
+ `"""`,
+ artifact,
+ `"""`,
+ ``,
+ `Return JSON only.`
+ ].join("\n");
+ return [
+ { role: "system", content: SYSTEM_PREAMBLE },
+ { role: "user", content: userContent }
+ ];
+ }
+ function sumUsage(usages) {
+ let promptTokens = 0;
+ let completionTokens = 0;
+ let totalTokens = 0;
+ for (const u of usages) {
+ promptTokens += u.promptTokens;
+ completionTokens += u.completionTokens;
+ totalTokens += u.totalTokens;
+ }
+ return { promptTokens, completionTokens, totalTokens };
+ }
+ /** Run the judge against an artifact and return per-sample + aggregate data. */
+ export async function runJudge(options) {
+ const { artifact, rubric, config, client, caseHint, baseSeed } = options;
+ const rawSamples = caseHint?.samples ?? config.judgeSamples ?? 3;
+ if (!Number.isInteger(rawSamples) || rawSamples < 1) {
+ throw new Error(`Invalid judge sample count: ${rawSamples}. Use a positive integer (1, 3, 5).`);
+ }
+ if (rawSamples % 2 === 0) {
+ throw new Error(`Judge sample count must be odd (so a true median exists), got: ${rawSamples}.`);
+ }
+ const started = Date.now();
+ const model = config.judgeModel ?? config.model;
+ const temperature = config.judgeTemperature ?? 0;
+ const messages = buildMessages(artifact, rubric);
+ const samples = [];
+ const usages = [];
+ for (let i = 0; i < rawSamples; i += 1) {
+ let response;
+ try {
+ response = await client.chat({
+ model,
+ messages,
+ temperature,
+ responseFormatJson: true,
+ ...(baseSeed !== undefined ? { seed: baseSeed + i } : {}),
+ timeoutMs: config.timeoutMs
+ });
+ }
+ catch (err) {
+ // An EvalLlmError from the client and any unexpected error both abort the run unchanged.
+ throw err;
+ }
+ usages.push(response.usage);
+ samples.push(parseJudgeResponse(response.content, rubric));
+ }
+ const aggregates = aggregateSamples(rubric, samples);
+ const usage = sumUsage(usages);
+ const usageUsd = computeUsageUsd(model, usage, { tokenPricing: config.tokenPricing });
+ return {
+ rubricId: rubric.id,
+ samples,
+ aggregates,
+ usageUsd,
+ durationMs: Date.now() - started
+ };
+ }
+ function verifierIdFor(check) {
+ return `judge:${check.id}`;
+ }
+ /**
+ * Convert a JudgeInvocation into VerifierResult[] for the runner. One
+ * result per rubric check (score 0..1 normalized from the 1..5 median) +
+ * one "coverage" result that flips to `ok:false` when any sample failed
+ * to emit a score for a check.
+ */
+ export function judgeResultsToVerifiers(rubric, invocation, config, caseHint) {
+ const out = [];
+ const failIfCriticalBelow = config.regression.failIfCriticalBelow;
+ for (const aggregate of invocation.aggregates) {
+ const check = rubric.checks.find((c) => c.id === aggregate.checkId);
+ if (!check)
+ continue;
+ const normalized = (aggregate.median - SCALE_MIN) / (SCALE_MAX - SCALE_MIN);
+ const caseMinimum = caseHint?.minimumScores?.[check.id];
+ const criticalFloor = check.critical ? failIfCriticalBelow : undefined;
+ const floors = [];
+ if (typeof caseMinimum === "number")
+ floors.push(caseMinimum);
+ if (typeof criticalFloor === "number")
+ floors.push(criticalFloor);
+ const floor = floors.length > 0 ? Math.max(...floors) : undefined;
+ const ok = !aggregate.coverage
+ ? false
+ : floor === undefined || aggregate.median >= floor;
+ out.push({
+ kind: "judge",
+ id: verifierIdFor(check),
+ ok,
+ score: Number(Math.max(0, Math.min(1, normalized)).toFixed(4)),
+ message: ok
+ ? `median=${aggregate.median.toFixed(2)} across ${aggregate.samples.length} sample(s)`
+ : aggregate.coverage
+ ? `median=${aggregate.median.toFixed(2)} below floor=${floor?.toFixed(2) ?? "n/a"}`
+ : `judge did not score every sample (${aggregate.samples.length}/${invocation.samples.length}); treated as failing`,
+ details: {
+ median: aggregate.median,
+ mean: aggregate.mean,
+ samples: aggregate.samples,
+ coverage: aggregate.coverage,
+ critical: check.critical === true,
+ caseMinimum: caseMinimum ?? null,
+ criticalFloor: criticalFloor ?? null
+ }
+ });
+ }
+ const required = caseHint?.requiredChecks ?? [];
+ const covered = new Set(rubric.checks.map((c) => c.id));
+ const missingRequired = required.filter((id) => !covered.has(id));
+ if (missingRequired.length > 0) {
+ out.push({
+ kind: "judge",
+ id: "judge:required-checks",
+ ok: false,
+ score: 0,
+ message: `Rubric is missing required check id(s): ${missingRequired.join(", ")}`,
+ details: { missing: missingRequired, rubricId: rubric.id }
+ });
+ }
+ return out;
+ }
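
To make the scoring arithmetic above concrete, here is a small worked example with illustrative numbers: three samples scoring one check 3, 4, 5 give median 4; normalization maps the 1..5 median onto 0..1; and the effective floor is the stricter of the per-case minimum and the critical floor.

```ts
// Per-check scores from three judge samples (judgeSamples = 3).
const samples = [3, 4, 5];
const median = [...samples].sort((a, b) => a - b)[1]; // 4 (odd count, so a true median exists)

// Reported verifier score: (median - SCALE_MIN) / (SCALE_MAX - SCALE_MIN)
const score = (median - 1) / (5 - 1); // 0.75

// Floor resolution for a critical check with a per-case minimum (both values hypothetical):
const caseMinimum = 3;    // caseHint.minimumScores["completeness"]
const criticalFloor = 4;  // config.regression.failIfCriticalBelow
const floor = Math.max(caseMinimum, criticalFloor); // 4, the stricter floor wins
const ok = median >= floor; // true: median 4 meets the floor of 4
```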
package/dist/install.js CHANGED
@@ -28,7 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
  import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
  import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
  import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
- import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
+ import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRIC_FILES, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
  import { TDD_BATCH_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
  import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
  import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
@@ -198,6 +198,12 @@ async function writeEvalScaffold(projectRoot) {
  { rel: "evals/baselines/README.md", content: EVAL_BASELINES_README },
  { rel: "evals/reports/README.md", content: EVAL_REPORTS_README }
  ];
+ for (const rubric of EVAL_RUBRIC_FILES) {
+ targets.push({
+ rel: `evals/rubrics/${rubric.stage}.yaml`,
+ content: rubric.contents
+ });
+ }
  for (const target of targets) {
  const absolute = runtimePath(projectRoot, ...target.rel.split("/"));
  if (await exists(absolute))
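
For orientation, a sketch of what one EVAL_RUBRIC_FILES entry presumably looks like, inferred only from how the loop above reads rubric.stage and rubric.contents; the stage name and YAML body are invented for illustration:

```ts
// Hypothetical entry; only the { stage, contents } shape is implied by the loop above.
const exampleRubricFile = {
  stage: "plan",
  contents: [
    "id: plan-quality",
    "stage: plan",
    "checks:",
    "  - id: completeness",
    "    prompt: Does the plan cover every requirement?",
    "    critical: true"
  ].join("\n")
};
// The scaffold would write this to evals/rubrics/plan.yaml; the exists() check above
// suggests pre-existing files are left untouched, though the hunk truncates at that point.
```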
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cclaw-cli",
- "version": "0.24.0",
+ "version": "0.26.0",
  "description": "Installer-first flow toolkit for coding agents",
  "type": "module",
  "bin": {
@@ -40,6 +40,7 @@
  "node": ">=20.0.0"
  },
  "dependencies": {
+ "openai": "^4.104.0",
  "yaml": "^2.8.1"
  },
  "devDependencies": {