cclaw-cli 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,19 +1,251 @@
1
- export class EvalLlmNotWiredError extends Error {
1
+ /**
2
+ * LLM client for the cclaw eval subsystem.
3
+ *
4
+ * Thin adapter over the `openai` SDK pointed at any OpenAI-compatible
5
+ * `baseURL` (z.ai, OpenAI, vLLM, Ollama+openai-proxy, ...). The surface is
6
+ * deliberately narrow:
7
+ *
8
+ * - `chat()` — one request/response round-trip with timeout, bounded
9
+ * retries on transient errors, and a structured error hierarchy so
10
+ * callers can react policy-style (cost-guard, judge, agent-under-test).
11
+ * - `ChatRequest` / `ChatResponse` — wire format decoupled from the
12
+ * OpenAI types so swapping vendors stays a one-file change.
13
+ *
14
+ * Factories stay side-effect-free: no network calls are made until `chat()`
15
+ * is invoked, so CLI help and dry-run paths never need an API key.
16
+ */
17
+ import OpenAI from "openai";
18
/**
 * Root of the eval LLM error hierarchy, so callers can branch with
 * `err instanceof EvalLlmError`.
 *
 * `opts.retryable` is always recorded. `opts.status` and `opts.cause` are
 * copied onto the instance only when they are not `undefined`.
 */
export class EvalLlmError extends Error {
    retryable;
    status;
    constructor(message, opts) {
        super(message);
        const { retryable, status, cause } = opts;
        this.name = "EvalLlmError";
        this.retryable = retryable;
        // Optional metadata: leave the property untouched when not supplied.
        if (status !== undefined) {
            this.status = status;
        }
        if (cause !== undefined) {
            Object.assign(this, { cause });
        }
    }
}
32
/**
 * Non-retryable error for requests rejected by the provider's auth layer.
 *
 * The recorded `status` mirrors the upstream error: 403 when `cause.status`
 * is 403, otherwise 401 (the previous fixed value), so callers inspecting
 * `status` can tell "bad key" from "key lacks permission".
 *
 * @param cause Underlying error from the SDK/transport, kept as `cause`.
 */
export class EvalLlmAuthError extends EvalLlmError {
    constructor(cause) {
        // Read the upstream status defensively: `cause` may be any thrown value.
        const upstreamStatus = cause !== null && typeof cause === "object" ? cause.status : undefined;
        super("LLM request rejected (auth). Check CCLAW_EVAL_API_KEY and provider permissions.", {
            retryable: false,
            status: upstreamStatus === 403 ? 403 : 401,
            cause
        });
        this.name = "EvalLlmAuthError";
    }
}
42
/**
 * Non-retryable error carrying a caller-supplied message and an optional
 * underlying `cause`. No HTTP status is attached.
 */
export class EvalLlmConfigError extends EvalLlmError {
    constructor(message, cause) {
        const opts = { retryable: false, cause };
        super(message, opts);
        this.name = "EvalLlmConfigError";
    }
}
48
/**
 * Retryable error reporting that a request exceeded its deadline.
 *
 * @param timeoutMs Deadline in milliseconds, echoed in the message.
 */
export class EvalLlmTimeoutError extends EvalLlmError {
    constructor(timeoutMs) {
        const message = `LLM request timed out after ${timeoutMs}ms.`;
        super(message, { retryable: true });
        this.name = "EvalLlmTimeoutError";
    }
}
54
/**
 * Retryable error for HTTP 429 responses; always records `status: 429`.
 *
 * @param cause Underlying error, kept as `cause`.
 */
export class EvalLlmRateLimitedError extends EvalLlmError {
    constructor(cause) {
        const opts = { retryable: true, status: 429, cause };
        super("LLM rate limit hit. Retrying with backoff.", opts);
        this.name = "EvalLlmRateLimitedError";
    }
}
64
/**
 * Retryable catch-all for failures that are not classified more precisely.
 *
 * @param cause Underlying error, kept as `cause`.
 * @param status HTTP status when known; may be `undefined`.
 */
export class EvalLlmTransportError extends EvalLlmError {
    constructor(cause, status) {
        const opts = { retryable: true, status, cause };
        super("LLM transport error.", opts);
        this.name = "EvalLlmTransportError";
    }
}
70
/**
 * Non-retryable error for a response that arrived but cannot be used.
 *
 * @param message Human-readable description of what was wrong.
 * @param details Optional extra context; attached as `details` only when truthy.
 */
export class EvalLlmInvalidResponseError extends EvalLlmError {
    constructor(message, details) {
        super(message, { retryable: false });
        this.name = "EvalLlmInvalidResponseError";
        if (!details) {
            return;
        }
        this.details = details;
    }
}
78
/**
 * Non-retryable error raised when the client is used without the settings
 * named in its message (API key, optionally base URL and model).
 */
export class EvalLlmNotConfiguredError extends EvalLlmError {
    constructor() {
        const message = `LLM client not configured. Set CCLAW_EVAL_API_KEY (and optionally ` +
            `CCLAW_EVAL_BASE_URL / CCLAW_EVAL_MODEL) or run with --schema-only / --rules.`;
        super(message, { retryable: false });
        this.name = "EvalLlmNotConfiguredError";
    }
}
8
85
/**
 * Deprecated shim preserved so older wiring keeps compiling. Prefer
 * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
 * key" case.
 *
 * Instances still satisfy `instanceof EvalLlmNotConfiguredError`, but report
 * their own class name so checks on `err.name` keep matching.
 *
 * @deprecated Use `EvalLlmNotConfiguredError` instead.
 */
export class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
    constructor() {
        super();
        // The parent constructor stamps its own name; restore this class's name.
        this.name = "EvalLlmNotWiredError";
    }
}
92
/**
 * Retry settings used when the caller does not supply a policy:
 * up to 2 retries, backoff starting at 500 ms and capped at 8000 ms.
 */
export const DEFAULT_RETRY_POLICY = {
    maxRetries: 2,
    initialBackoffMs: 500,
    maxBackoffMs: 8000
};
97
/**
 * Report whether `err` represents an aborted/cancelled request.
 *
 * Recognizes the platform markers (`name === "AbortError"`, `code` of
 * `"ABORT_ERR"` or `"ERR_CANCELED"`) and, additionally, an error whose class
 * is named `APIUserAbortError`.
 *
 * NOTE(review): the extra class-name check assumes the openai SDK throws
 * `APIUserAbortError` with the default `name` of "Error" when the caller's
 * signal aborts — confirm against the installed SDK version.
 *
 * @param err Any thrown value.
 * @returns `true` when the value looks like an abort, otherwise `false`.
 */
function isAbortError(err) {
    if (err === null || typeof err !== "object") {
        return false;
    }
    const { name, code } = err;
    if (name === "AbortError" || code === "ABORT_ERR" || code === "ERR_CANCELED") {
        return true;
    }
    // Objects without a prototype have no `constructor`; optional chaining covers that.
    return err.constructor?.name === "APIUserAbortError";
}
104
/**
 * Extract a numeric `status` property from an arbitrary thrown value.
 *
 * @param err Any thrown value.
 * @returns The `status` when `err` is a non-null object whose `status` is a
 *   number; otherwise `undefined`.
 */
function statusFromError(err) {
    const isObject = typeof err === "object" && err !== null;
    if (!isObject) {
        return undefined;
    }
    const { status } = err;
    if (typeof status === "number") {
        return status;
    }
    return undefined;
}
110
/**
 * Map any thrown value onto the `EvalLlmError` hierarchy.
 *
 * Precedence: an existing `EvalLlmError` is returned as-is; abort-like
 * errors become timeouts; then the numeric `status` decides — 401/403 auth,
 * 429 rate limit, any other 4xx a non-retryable `EvalLlmError`, and
 * everything else (including no status) a transport error.
 *
 * @param err Any thrown value.
 * @param timeoutMs Deadline reported if the error is classified as a timeout.
 * @returns An `EvalLlmError` instance (never throws for classification).
 */
function normalizeError(err, timeoutMs) {
    if (err instanceof EvalLlmError) {
        return err;
    }
    if (isAbortError(err)) {
        return new EvalLlmTimeoutError(timeoutMs);
    }
    const status = statusFromError(err);
    switch (status) {
        case 401:
        case 403:
            return new EvalLlmAuthError(err);
        case 429:
            return new EvalLlmRateLimitedError(err);
        default:
            break;
    }
    const isClientError = status !== undefined && status >= 400 && status < 500;
    if (!isClientError) {
        return new EvalLlmTransportError(err, status);
    }
    return new EvalLlmError(`LLM request rejected (HTTP ${status}).`, {
        retryable: false,
        status,
        cause: err
    });
}
129
/**
 * Collapse a provider finish reason into one of four values.
 *
 * `"length"` and `"content_filter"` pass through, `"tool_calls"` and
 * `"function_call"` both become `"tool_calls"`, and anything else
 * (including `null`/`undefined`) becomes `"stop"`.
 *
 * @param raw Finish reason as reported by the provider.
 * @returns `"length"`, `"tool_calls"`, `"content_filter"`, or `"stop"`.
 */
function normalizeFinishReason(raw) {
    if (raw === "length" || raw === "content_filter") {
        return raw;
    }
    if (raw === "tool_calls" || raw === "function_call") {
        return "tool_calls";
    }
    return "stop";
}
145
/**
 * Translate a `ChatRequest`-shaped object into the snake_case request body.
 *
 * `model` and `messages` are always present. Optional fields are added only
 * when the corresponding request property is not `undefined` (so `0` is
 * kept), and `response_format` is added only when `responseFormatJson` is
 * exactly `true`.
 *
 * @param request Request with `model`, `messages`, and optional settings.
 * @returns A new body object; `request` is not modified.
 */
function buildBody(request) {
    const toWireMessage = (message) => ({
        role: message.role,
        content: message.content,
        ...(message.name === undefined ? {} : { name: message.name }),
        ...(message.toolCallId === undefined ? {} : { tool_call_id: message.toolCallId })
    });
    const body = {
        model: request.model,
        messages: request.messages.map((message) => toWireMessage(message))
    };
    // Listed in the order the keys must appear on the body.
    const optionalFields = [
        ["max_tokens", request.maxTokens],
        ["temperature", request.temperature],
        ["seed", request.seed],
        ["tools", request.tools],
        ["tool_choice", request.toolChoice]
    ];
    for (const [key, value] of optionalFields) {
        if (value !== undefined) {
            body[key] = value;
        }
    }
    if (request.responseFormatJson === true) {
        Object.assign(body, { response_format: { type: "json_object" } });
    }
    return body;
}
170
/**
 * Timer-backed sleep used when the caller does not inject its own.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function defaultSleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
173
/**
 * Exponential backoff: `initialBackoffMs` doubled once per attempt, capped
 * at `maxBackoffMs`.
 *
 * @param attempt Zero-based attempt index.
 * @param policy Object providing `initialBackoffMs` and `maxBackoffMs`.
 * @returns Delay in milliseconds.
 */
function backoffDelay(attempt, policy) {
    const { initialBackoffMs, maxBackoffMs } = policy;
    const uncapped = initialBackoffMs * Math.pow(2, attempt);
    return Math.min(uncapped, maxBackoffMs);
}
177
/**
 * Build a client pointed at the configured endpoint.
 *
 * Construction is side-effect-free: the underlying SDK client is created
 * lazily on the first `chat()` call, which is also when
 * `EvalLlmNotConfiguredError` is thrown if `config.apiKey` is missing.
 *
 * `chat(request)` sends one completion request with a per-attempt deadline
 * and retries errors whose normalized form is `retryable`, sleeping
 * `backoffDelay(attempt, retryPolicy)` between attempts.
 *
 * @param config Resolved eval config; reads `apiKey`, `baseUrl`,
 *   `timeoutMs`, and `maxRetries`.
 * @param options Optional overrides: `retryPolicy`, `sleep`, and
 *   `openaiFactory` (used instead of `new OpenAI(...)`).
 * @returns An object exposing `chat(request)`, which resolves to
 *   `{ content, toolCalls?, usage, finishReason, model, attempts }`.
 * @throws EvalLlmError (or a subclass) from `chat()` on failure.
 */
export function createEvalClient(config, options = {}) {
    // Node's setTimeout treats NaN or anything above this as a 1 ms delay.
    const MAX_TIMER_DELAY_MS = 2_147_483_647;
    // Deadline used when neither the request nor the config gives a usable number.
    const FALLBACK_TIMEOUT_MS = 60_000;
    const retryPolicy = options.retryPolicy ?? {
        ...DEFAULT_RETRY_POLICY,
        maxRetries: Math.max(0, config.maxRetries ?? DEFAULT_RETRY_POLICY.maxRetries)
    };
    const sleep = options.sleep ?? defaultSleep;
    let cached;
    const getClient = () => {
        if (cached)
            return cached;
        if (!config.apiKey)
            throw new EvalLlmNotConfiguredError();
        const factory = options.openaiFactory ??
            ((opts) => new OpenAI(opts));
        cached = factory({ apiKey: config.apiKey, baseURL: config.baseUrl });
        return cached;
    };
    /** Clamp the requested deadline into a range the timer actually honors. */
    const resolveTimeoutMs = (requested) => {
        const numeric = Number(requested ?? config.timeoutMs);
        const usable = Number.isFinite(numeric) ? numeric : FALLBACK_TIMEOUT_MS;
        return Math.min(MAX_TIMER_DELAY_MS, Math.max(1_000, usable));
    };
    /** One network round-trip; the deadline timer is always cleared. */
    const requestOnce = async (client, body, timeoutMs) => {
        const controller = new AbortController();
        const handle = setTimeout(() => controller.abort(), timeoutMs);
        try {
            return await client.chat.completions.create(body, {
                signal: controller.signal
            });
        }
        finally {
            clearTimeout(handle);
        }
    };
    /** Convert the raw completion into the vendor-neutral response shape. */
    const toChatResponse = (raw, request, attempts) => {
        const choice = raw.choices?.[0];
        if (!choice) {
            throw new EvalLlmInvalidResponseError("LLM response contained no choices.", { model: raw.model });
        }
        const content = choice.message?.content ?? "";
        const toolCalls = choice.message?.tool_calls?.map((call) => {
            // A malformed tool call is a bad response, not a transient failure.
            if (!call?.function) {
                throw new EvalLlmInvalidResponseError("LLM response contained a tool call without a function payload.", { model: raw.model });
            }
            return {
                id: call.id,
                name: call.function.name,
                arguments: call.function.arguments
            };
        });
        const usage = {
            promptTokens: raw.usage?.prompt_tokens ?? 0,
            completionTokens: raw.usage?.completion_tokens ?? 0,
            totalTokens: raw.usage?.total_tokens ?? 0
        };
        return {
            content,
            ...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
            usage,
            finishReason: normalizeFinishReason(choice.finish_reason),
            model: raw.model ?? request.model,
            attempts
        };
    };
    return {
        async chat(request) {
            const timeoutMs = resolveTimeoutMs(request.timeoutMs);
            const body = buildBody(request);
            const client = getClient();
            let lastError;
            // Always make at least one attempt, even if a supplied policy
            // carries a negative or NaN retry count.
            const retries = retryPolicy.maxRetries;
            const maxAttempts = retries >= 0 ? retries + 1 : 1;
            for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
                try {
                    const raw = await requestOnce(client, body, timeoutMs);
                    return toChatResponse(raw, request, attempt + 1);
                }
                catch (err) {
                    const normalized = normalizeError(err, timeoutMs);
                    lastError = normalized;
                    const isLastAttempt = attempt >= maxAttempts - 1;
                    if (!normalized.retryable || isLastAttempt)
                        throw normalized;
                    await sleep(backoffDelay(attempt, retryPolicy));
                }
            }
            throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
        }
    };
}
@@ -75,6 +75,32 @@ export function formatMarkdownReport(report) {
75
75
  lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
76
76
  }
77
77
  lines.push(``);
78
+ const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
79
+ if (judgeCases.length > 0) {
80
+ lines.push(`## Judge scores`);
81
+ lines.push(``);
82
+ lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
83
+ lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
84
+ for (const item of judgeCases) {
85
+ for (const verifier of item.verifierResults) {
86
+ if (verifier.kind !== "judge")
87
+ continue;
88
+ if (verifier.id === "judge:required-checks")
89
+ continue;
90
+ if (verifier.id === "judge:rubric:missing")
91
+ continue;
92
+ if (verifier.id === "judge:invocation:error")
93
+ continue;
94
+ const details = verifier.details ?? {};
95
+ const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
96
+ const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
97
+ const coverage = details.coverage === true ? "yes" : "no";
98
+ const checkId = verifier.id.replace(/^judge:/, "");
99
+ lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
100
+ }
101
+ }
102
+ lines.push(``);
103
+ }
78
104
  lines.push(`## Verifier details`);
79
105
  lines.push(``);
80
106
  for (const item of report.cases) {
@@ -0,0 +1,20 @@
1
+ import type { FlowStage } from "../types.js";
2
+ import type { RubricCheck, RubricDoc } from "./types.js";
3
/** Path of the rubrics directory beneath `projectRoot`. */
export declare function rubricsDir(projectRoot: string): string;
/** Path of the `<stage>.yaml` rubric file inside the rubrics directory. */
export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
/** Validate one entry of a rubric's `checks` array; `index` and `file` only label errors. */
declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
/** Validate a parsed rubric document; `file` only labels errors. */
declare function validateRubric(raw: unknown, file: string): RubricDoc;
/**
 * Load the rubric for `stage`. Resolves to `undefined` when the file is
 * missing, so callers can report "no rubric" instead of failing.
 */
export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
/** Load every rubric that exists, keyed by stage. */
export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
/** Validators exposed for tests. */
export declare const __internal: {
    validateRubric: typeof validateRubric;
    validateCheck: typeof validateCheck;
};
export {};
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
3
+ *
4
+ * Each file maps to exactly one `RubricDoc` that drives the LLM judge.
5
+ * Validation is strict: unknown top-level keys, missing required fields,
6
+ * duplicate check ids, and malformed weights all surface as actionable
7
+ * errors rather than turning into silent "judge had nothing to score"
8
+ * passes.
9
+ */
10
+ import fs from "node:fs/promises";
11
+ import path from "node:path";
12
+ import { parse } from "yaml";
13
+ import { EVALS_ROOT } from "../constants.js";
14
+ import { exists } from "../fs-utils.js";
15
+ import { FLOW_STAGES } from "../types.js";
16
/**
 * Directory that holds rubric files: `<projectRoot>/<EVALS_ROOT>/rubrics`.
 *
 * @param projectRoot Root directory of the project.
 * @returns The joined directory path.
 */
export function rubricsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "rubrics"];
    return path.join(...segments);
}
19
/**
 * Path of the rubric file for `stage`: `<rubricsDir>/<stage>.yaml`.
 *
 * @param projectRoot Root directory of the project.
 * @param stage Stage whose rubric file is wanted.
 * @returns The joined file path.
 */
export function rubricPath(projectRoot, stage) {
    const fileName = `${stage}.yaml`;
    return path.join(rubricsDir(projectRoot), fileName);
}
22
/**
 * Build (not throw) the error used for every rubric validation failure.
 *
 * @param file Path of the offending rubric file, shown in the message.
 * @param reason What is wrong with it.
 * @returns An `Error` whose message is the reason followed by a schema hint.
 */
function rubricError(file, reason) {
    const headline = `Invalid rubric at ${file}: ${reason}`;
    const hint = `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`;
    return new Error([headline, hint].join("\n"));
}
26
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * @param value Any value.
 * @returns `true` for objects other than `null` and arrays.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
29
/**
 * Validate one entry of a rubric's `checks` array.
 *
 * Checks run in this order, and the first failure throws: the entry is a
 * mapping; `id` is a non-empty kebab-case string; `prompt` is a non-empty
 * string; optional `scale` (non-empty string), `weight` (finite number
 * >= 0), and `critical` (boolean); finally, no unknown keys.
 *
 * @param raw Parsed entry.
 * @param index Position in `checks`, used in error messages.
 * @param file Rubric file path, used in error messages.
 * @returns A check with `id`, trimmed `prompt`, and whichever optional
 *   fields were provided (`scale` trimmed).
 * @throws Error built by `rubricError` on the first violation.
 */
function validateCheck(raw, index, file) {
    const where = `checks[${index}]`;
    const fail = (reason) => rubricError(file, reason);
    const isBlank = (value) => typeof value !== "string" || value.trim().length === 0;
    if (!isRecord(raw)) {
        throw fail(`${where} must be a mapping`);
    }
    const { id, prompt, scale, weight, critical } = raw;
    if (isBlank(id)) {
        throw fail(`${where}.id must be a non-empty string`);
    }
    const kebabCase = /^[a-z][a-z0-9-]*$/;
    if (!kebabCase.test(id)) {
        throw fail(`${where}.id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
    }
    if (isBlank(prompt)) {
        throw fail(`${where}.prompt must be a non-empty string`);
    }
    const check = { id, prompt: prompt.trim() };
    if (scale !== undefined) {
        if (isBlank(scale)) {
            throw fail(`${where}.scale must be a non-empty string when provided`);
        }
        check.scale = scale.trim();
    }
    if (weight !== undefined) {
        const isValidWeight = typeof weight === "number" && Number.isFinite(weight) && weight >= 0;
        if (!isValidWeight) {
            throw fail(`${where}.weight must be a non-negative number when provided`);
        }
        check.weight = weight;
    }
    if (critical !== undefined) {
        if (typeof critical !== "boolean") {
            throw fail(`${where}.critical must be a boolean when provided`);
        }
        check.critical = critical;
    }
    // Unknown keys are reported last, after every known field has been checked.
    const allowed = ["id", "prompt", "scale", "weight", "critical"];
    const extras = Object.keys(raw).filter((key) => !allowed.includes(key));
    if (extras.length > 0) {
        throw fail(`${where} has unknown key(s): ${extras.join(", ")}`);
    }
    return check;
}
73
/**
 * Validate a parsed rubric document.
 *
 * Checks run in this order, and the first failure throws: the document is
 * a mapping; `stage` is one of `FLOW_STAGES`; optional `id` is a non-empty
 * string (defaults to `stage`); `checks` is a non-empty array whose entries
 * pass `validateCheck` and have unique ids; finally, no unknown top-level
 * keys.
 *
 * @param raw Parsed YAML value.
 * @param file Rubric file path, used in error messages.
 * @returns `{ stage, id, checks }` with `id` trimmed.
 * @throws Error built by `rubricError` on the first violation.
 */
function validateRubric(raw, file) {
    const fail = (reason) => rubricError(file, reason);
    if (!isRecord(raw)) {
        throw fail("top-level value must be a mapping");
    }
    const { stage, id, checks } = raw;
    const isKnownStage = typeof stage === "string" && FLOW_STAGES.includes(stage);
    if (!isKnownStage) {
        throw fail(`"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
    }
    let rubricId = stage;
    if (id !== undefined) {
        if (typeof id !== "string" || id.trim().length === 0) {
            throw fail(`"id" must be a non-empty string when provided`);
        }
        rubricId = id.trim();
    }
    if (!Array.isArray(checks) || checks.length === 0) {
        throw fail(`"checks" must be a non-empty array`);
    }
    const validated = [];
    const usedIds = new Set();
    for (const [position, entry] of checks.entries()) {
        const check = validateCheck(entry, position, file);
        if (usedIds.has(check.id)) {
            throw fail(`duplicate check id: "${check.id}"`);
        }
        usedIds.add(check.id);
        validated.push(check);
    }
    // Unknown keys are reported last, after the known fields have been checked.
    const allowed = ["stage", "id", "checks"];
    const extras = Object.keys(raw).filter((key) => !allowed.includes(key));
    if (extras.length > 0) {
        throw fail(`unknown top-level key(s): ${extras.join(", ")}`);
    }
    return { stage, id: rubricId, checks: validated };
}
114
/**
 * Load the rubric for `stage`. Returns `undefined` when the file is
 * missing so callers can emit a "no rubric" verifier result rather than
 * crashing — authors are expected to grow rubrics incrementally.
 *
 * The rubric's declared `stage` must match the stage it was loaded for;
 * otherwise `<stage>.yaml` could silently supply another stage's rubric.
 *
 * @param projectRoot Root directory of the project.
 * @param stage Stage whose rubric file should be read.
 * @returns The validated rubric, or `undefined` when the file does not exist.
 * @throws Error built by `rubricError` when reading or parsing fails, when
 *   validation fails, or when the declared stage differs from `stage`.
 */
export async function loadRubric(projectRoot, stage) {
    const file = rubricPath(projectRoot, stage);
    if (!(await exists(file)))
        return undefined;
    let parsed;
    try {
        parsed = parse(await fs.readFile(file, "utf8"));
    }
    catch (err) {
        throw rubricError(file, err instanceof Error ? err.message : String(err));
    }
    const doc = validateRubric(parsed, file);
    if (doc.stage !== stage) {
        throw rubricError(file, `"stage" is ${JSON.stringify(doc.stage)} but the file name requires ${JSON.stringify(stage)}`);
    }
    return doc;
}
132
/**
 * Load the rubric of every stage in `FLOW_STAGES`, one after another.
 *
 * @param projectRoot Root directory of the project.
 * @returns A `Map` from stage to rubric, holding only the stages for which
 *   `loadRubric` returned a document.
 */
export async function loadAllRubrics(projectRoot) {
    const byStage = new Map();
    for (const stage of FLOW_STAGES) {
        const rubric = await loadRubric(projectRoot, stage);
        if (!rubric) {
            continue;
        }
        byStage.set(stage, rubric);
    }
    return byStage;
}
142
/** Private validators, exported only so tests can call them directly. */
export const __internal = {
    validateRubric: validateRubric,
    validateCheck: validateCheck
};
@@ -1,4 +1,5 @@
1
1
  import type { FlowStage } from "../types.js";
2
+ import { type EvalLlmClient } from "./llm-client.js";
2
3
  import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
3
4
  export interface RunEvalOptions {
4
5
  projectRoot: string;
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
14
15
  dryRun?: boolean;
15
16
  /** Override process.env during tests. */
16
17
  env?: NodeJS.ProcessEnv;
18
+ /**
19
+ * Optional LLM client injection. Primary use case: unit and
20
+ * integration tests that want deterministic judge + agent behavior
21
+ * without hitting the network.
22
+ */
23
+ llmClient?: EvalLlmClient;
17
24
  }
18
25
  export interface DryRunSummary {
19
26
  kind: "dry-run";