cclaw-cli 0.24.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -1
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/agents/with-tools.d.ts +31 -0
- package/dist/eval/agents/with-tools.js +255 -0
- package/dist/eval/config-loader.js +128 -3
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +123 -20
- package/dist/eval/llm-client.js +251 -10
- package/dist/eval/report.js +45 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +193 -12
- package/dist/eval/sandbox.d.ts +38 -0
- package/dist/eval/sandbox.js +137 -0
- package/dist/eval/tools/glob.d.ts +2 -0
- package/dist/eval/tools/glob.js +163 -0
- package/dist/eval/tools/grep.d.ts +2 -0
- package/dist/eval/tools/grep.js +152 -0
- package/dist/eval/tools/index.d.ts +7 -0
- package/dist/eval/tools/index.js +35 -0
- package/dist/eval/tools/read.d.ts +2 -0
- package/dist/eval/tools/read.js +122 -0
- package/dist/eval/tools/types.d.ts +49 -0
- package/dist/eval/tools/types.js +41 -0
- package/dist/eval/tools/write.d.ts +2 -0
- package/dist/eval/tools/write.js +92 -0
- package/dist/eval/types.d.ts +138 -1
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
|
@@ -1,30 +1,38 @@
|
|
|
1
|
-
|
|
2
|
-
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
-
*
|
|
4
|
-
* This module declares the shape of the client without pulling in the
|
|
5
|
-
* `openai` runtime dependency. The real implementation lands when
|
|
6
|
-
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
-
* install an extra dependency or receive network egress warnings.
|
|
9
|
-
*/
|
|
1
|
+
import type { ClientOptions } from "openai";
|
|
10
2
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
-
/**
|
|
12
|
-
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
|
-
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
|
-
*/
|
|
16
3
|
export interface ChatMessage {
|
|
17
4
|
role: "system" | "user" | "assistant" | "tool";
|
|
18
5
|
content: string;
|
|
19
6
|
name?: string;
|
|
20
7
|
toolCallId?: string;
|
|
8
|
+
/**
|
|
9
|
+
* OpenAI-style tool calls carried on a preceding assistant message.
|
|
10
|
+
* Populated by the Tier B loop so the wire transcript stays
|
|
11
|
+
* consistent (assistant message → tool responses).
|
|
12
|
+
*/
|
|
13
|
+
toolCalls?: Array<{
|
|
14
|
+
id: string;
|
|
15
|
+
name: string;
|
|
16
|
+
arguments: string;
|
|
17
|
+
}>;
|
|
21
18
|
}
|
|
22
19
|
export interface ChatRequest {
|
|
23
20
|
model: string;
|
|
24
21
|
messages: ChatMessage[];
|
|
25
22
|
maxTokens?: number;
|
|
26
23
|
temperature?: number;
|
|
24
|
+
/** Per-call timeout override. Falls back to `config.timeoutMs`. */
|
|
27
25
|
timeoutMs?: number;
|
|
26
|
+
/**
|
|
27
|
+
* Ask the provider for a JSON-object response. The judge pipeline sets
|
|
28
|
+
* this; the agent-under-test usually leaves it unset.
|
|
29
|
+
*/
|
|
30
|
+
responseFormatJson?: boolean;
|
|
31
|
+
/**
|
|
32
|
+
* Optional deterministic sampling seed. Providers that don't implement
|
|
33
|
+
* `seed` simply ignore it.
|
|
34
|
+
*/
|
|
35
|
+
seed?: number;
|
|
28
36
|
/**
|
|
29
37
|
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
38
|
* by Tier B. Ignored by the Tier A single-shot path.
|
|
@@ -46,17 +54,112 @@ export interface ChatResponse {
|
|
|
46
54
|
}>;
|
|
47
55
|
usage: ChatUsage;
|
|
48
56
|
finishReason: "stop" | "length" | "tool_calls" | "content_filter";
|
|
57
|
+
model: string;
|
|
58
|
+
attempts: number;
|
|
59
|
+
}
|
|
60
|
+
/** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
|
|
61
|
+
export declare class EvalLlmError extends Error {
|
|
62
|
+
readonly retryable: boolean;
|
|
63
|
+
readonly status?: number;
|
|
64
|
+
constructor(message: string, opts: {
|
|
65
|
+
retryable: boolean;
|
|
66
|
+
status?: number;
|
|
67
|
+
cause?: unknown;
|
|
68
|
+
});
|
|
69
|
+
}
|
|
70
|
+
export declare class EvalLlmAuthError extends EvalLlmError {
|
|
71
|
+
constructor(cause: unknown);
|
|
72
|
+
}
|
|
73
|
+
export declare class EvalLlmConfigError extends EvalLlmError {
|
|
74
|
+
constructor(message: string, cause?: unknown);
|
|
75
|
+
}
|
|
76
|
+
export declare class EvalLlmTimeoutError extends EvalLlmError {
|
|
77
|
+
constructor(timeoutMs: number);
|
|
78
|
+
}
|
|
79
|
+
export declare class EvalLlmRateLimitedError extends EvalLlmError {
|
|
80
|
+
constructor(cause: unknown);
|
|
81
|
+
}
|
|
82
|
+
export declare class EvalLlmTransportError extends EvalLlmError {
|
|
83
|
+
constructor(cause: unknown, status?: number);
|
|
84
|
+
}
|
|
85
|
+
/**
 * Raised when the provider answered but the payload is unusable (for
 * example, it contains no choices). Constructed as non-retryable.
 */
export declare class EvalLlmInvalidResponseError extends EvalLlmError {
    /**
     * Diagnostic context passed to the constructor. The runtime class only
     * assigns this property when a truthy `details` argument is supplied,
     * so it is absent otherwise.
     */
    readonly details?: Record<string, unknown>;
    constructor(message: string, details?: Record<string, unknown>);
}
|
|
88
|
+
export declare class EvalLlmNotConfiguredError extends EvalLlmError {
|
|
89
|
+
constructor();
|
|
49
90
|
}
|
|
50
91
|
/** Lightweight client abstraction shared across eval runners. */
|
|
51
92
|
export interface EvalLlmClient {
|
|
52
93
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
94
|
}
|
|
54
|
-
|
|
55
|
-
|
|
95
|
+
/**
|
|
96
|
+
* Deprecated shim preserved so older wiring keeps compiling. Prefer
|
|
97
|
+
* `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
|
|
98
|
+
* key" case.
|
|
99
|
+
*/
|
|
100
|
+
export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
|
|
101
|
+
}
|
|
102
|
+
/** `createEvalClient` options — mostly for tests to inject a fake transport. */
|
|
103
|
+
export interface CreateEvalClientOptions {
|
|
104
|
+
/** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
|
|
105
|
+
openaiFactory?: (opts: ClientOptions) => OpenAILike;
|
|
106
|
+
/**
|
|
107
|
+
* Override the default retry/backoff policy. Honored by the internal
|
|
108
|
+
* retry loop; transport errors still fall back to the defaults when
|
|
109
|
+
* unset.
|
|
110
|
+
*/
|
|
111
|
+
retryPolicy?: RetryPolicy;
|
|
112
|
+
/** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
|
|
113
|
+
sleep?: (ms: number) => Promise<void>;
|
|
114
|
+
}
|
|
115
|
+
export interface RetryPolicy {
|
|
116
|
+
/** Max retries *on top of* the initial attempt. 0 = single attempt. */
|
|
117
|
+
maxRetries: number;
|
|
118
|
+
/** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
|
|
119
|
+
initialBackoffMs: number;
|
|
120
|
+
/** Upper bound for a single sleep between attempts. */
|
|
121
|
+
maxBackoffMs: number;
|
|
122
|
+
}
|
|
123
|
+
export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
|
|
124
|
+
/**
|
|
125
|
+
* Minimal OpenAI-SDK surface we depend on, declared here so tests can
|
|
126
|
+
* substitute a plain object without pulling the real SDK into the test
|
|
127
|
+
* runtime.
|
|
128
|
+
*/
|
|
129
|
+
export interface OpenAILike {
|
|
130
|
+
chat: {
|
|
131
|
+
completions: {
|
|
132
|
+
create(body: Record<string, unknown>, options: {
|
|
133
|
+
signal: AbortSignal;
|
|
134
|
+
}): Promise<OpenAILikeChatResponse>;
|
|
135
|
+
};
|
|
136
|
+
};
|
|
137
|
+
}
|
|
138
|
+
interface OpenAILikeChatResponse {
|
|
139
|
+
model?: string;
|
|
140
|
+
choices: Array<{
|
|
141
|
+
message?: {
|
|
142
|
+
content?: string | null;
|
|
143
|
+
tool_calls?: Array<{
|
|
144
|
+
id: string;
|
|
145
|
+
function: {
|
|
146
|
+
name: string;
|
|
147
|
+
arguments: string;
|
|
148
|
+
};
|
|
149
|
+
}>;
|
|
150
|
+
};
|
|
151
|
+
finish_reason?: string | null;
|
|
152
|
+
}>;
|
|
153
|
+
usage?: {
|
|
154
|
+
prompt_tokens?: number;
|
|
155
|
+
completion_tokens?: number;
|
|
156
|
+
total_tokens?: number;
|
|
157
|
+
};
|
|
56
158
|
}
|
|
57
159
|
/**
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
160
|
+
* Build a real client pointed at the configured endpoint. Throws
|
|
161
|
+
* `EvalLlmNotConfiguredError` at call time (not construction time) when no
|
|
162
|
+
* API key is available, so CLI help and dry-run paths stay offline-safe.
|
|
61
163
|
*/
|
|
62
|
-
export declare function createEvalClient(
|
|
164
|
+
export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
|
|
165
|
+
export {};
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,260 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* LLM client for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Thin adapter over the `openai` SDK pointed at any OpenAI-compatible
|
|
5
|
+
* `baseURL` (z.ai, OpenAI, vLLM, Ollama+openai-proxy, ...). The surface is
|
|
6
|
+
* deliberately narrow:
|
|
7
|
+
*
|
|
8
|
+
* - `chat()` — one request/response round-trip with timeout, bounded
|
|
9
|
+
* retries on transient errors, and a structured error hierarchy so
|
|
10
|
+
* callers can react policy-style (cost-guard, judge, agent-under-test).
|
|
11
|
+
* - `ChatRequest` / `ChatResponse` — wire format decoupled from the
|
|
12
|
+
* OpenAI types so swapping vendors stays a one-file change.
|
|
13
|
+
*
|
|
14
|
+
* Factories stay side-effect-free: no network calls are made until `chat()`
|
|
15
|
+
* is invoked, so CLI help and dry-run paths never need an API key.
|
|
16
|
+
*/
|
|
17
|
+
import OpenAI from "openai";
|
|
18
|
+
/**
 * Root of the eval LLM error hierarchy, so callers can write
 * `catch (err) { if (err instanceof EvalLlmError) ... }`.
 *
 * `retryable` tells the retry loop whether another attempt is worthwhile.
 * `status` and `cause` are only assigned when the caller provides them.
 */
export class EvalLlmError extends Error {
    retryable;
    status;
    constructor(message, opts) {
        super(message);
        this.name = "EvalLlmError";
        const { retryable, status, cause } = opts;
        this.retryable = retryable;
        if (status !== undefined) {
            this.status = status;
        }
        if (cause !== undefined) {
            this.cause = cause;
        }
    }
}
|
|
32
|
+
/**
 * Raised when the provider rejects the request for authentication or
 * authorization reasons. Never retryable.
 *
 * `status` mirrors the rejection that actually happened: 403 when the
 * underlying error carries `status === 403`, otherwise 401. Previously it
 * was always 401, even though this file's `normalizeError` maps both 401
 * and 403 responses to this class.
 */
export class EvalLlmAuthError extends EvalLlmError {
    constructor(cause) {
        // Only trust a `status` field that lives on an object-shaped cause.
        const reported = cause !== null && typeof cause === "object" ? cause.status : undefined;
        super("LLM request rejected (auth). Check CCLAW_EVAL_API_KEY and provider permissions.", {
            retryable: false,
            status: reported === 403 ? 403 : 401,
            cause
        });
        this.name = "EvalLlmAuthError";
    }
}
|
|
42
|
+
/**
 * Signals a configuration problem. Constructed as non-retryable; the
 * optional `cause` is forwarded to the base class unchanged.
 */
export class EvalLlmConfigError extends EvalLlmError {
    constructor(message, cause) {
        const baseOptions = { retryable: false, cause };
        super(message, baseOptions);
        this.name = "EvalLlmConfigError";
    }
}
|
|
48
|
+
/**
 * Raised when a request did not finish within `timeoutMs` milliseconds.
 * Constructed as retryable.
 */
export class EvalLlmTimeoutError extends EvalLlmError {
    constructor(timeoutMs) {
        const text = `LLM request timed out after ${timeoutMs}ms.`;
        super(text, { retryable: true });
        this.name = "EvalLlmTimeoutError";
    }
}
|
|
54
|
+
/**
 * Raised for a rate-limit rejection. Constructed as retryable with
 * `status` fixed at 429; the original error is kept as `cause`.
 */
export class EvalLlmRateLimitedError extends EvalLlmError {
    constructor(cause) {
        const baseOptions = { retryable: true, status: 429, cause };
        super("LLM rate limit hit. Retrying with backoff.", baseOptions);
        this.name = "EvalLlmRateLimitedError";
    }
}
|
|
64
|
+
/**
 * Catch-all for failures that are not classified more specifically.
 * Constructed as retryable; `status` is passed through and may be undefined.
 */
export class EvalLlmTransportError extends EvalLlmError {
    constructor(cause, status) {
        const baseOptions = { retryable: true, status, cause };
        super("LLM transport error.", baseOptions);
        this.name = "EvalLlmTransportError";
    }
}
|
|
70
|
+
/**
 * Raised when the provider's response cannot be used. Constructed as
 * non-retryable. `details` is attached as a property only when a truthy
 * value is supplied.
 */
export class EvalLlmInvalidResponseError extends EvalLlmError {
    constructor(message, details) {
        super(message, { retryable: false });
        this.name = "EvalLlmInvalidResponseError";
        if (!details) {
            return;
        }
        this.details = details;
    }
}
|
|
78
|
+
/**
 * Raised when the client is used without an API key. The message names
 * the environment variables to set and the flags that avoid the LLM.
 * Constructed as non-retryable.
 */
export class EvalLlmNotConfiguredError extends EvalLlmError {
    constructor() {
        const text = `LLM client not configured. Set CCLAW_EVAL_API_KEY (and optionally ` +
            `CCLAW_EVAL_BASE_URL / CCLAW_EVAL_MODEL) or run with --schema-only / --rules.`;
        super(text, { retryable: false });
        this.name = "EvalLlmNotConfiguredError";
    }
}
|
|
8
85
|
/**
 * Deprecated shim preserved so older wiring keeps compiling. Prefer
 * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
 * key" case.
 *
 * The subclass body is empty, so instances behave exactly like
 * `EvalLlmNotConfiguredError` (including the inherited `name`).
 *
 * @deprecated Use `EvalLlmNotConfiguredError` instead.
 */
export class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
}
|
|
92
|
+
/**
 * Retry/backoff settings used by `createEvalClient` when the caller does
 * not pass `options.retryPolicy`.
 */
export const DEFAULT_RETRY_POLICY = {
    // Retries on top of the initial attempt (so three attempts in total).
    maxRetries: 2,
    // First backoff sleep in milliseconds; doubled for each later retry.
    initialBackoffMs: 500,
    // Upper bound, in milliseconds, for any single sleep between attempts.
    maxBackoffMs: 8_000
};
|
|
97
|
+
/**
 * Report whether `err` represents an aborted (cancelled) request.
 *
 * Recognized shapes:
 *  - `name === "AbortError"`, or `code` of `"ABORT_ERR"` / `"ERR_CANCELED"`;
 *  - an error whose `name` or constructor name is `"APIUserAbortError"`.
 *
 * NOTE(review): the second shape targets the `openai` SDK, which appears
 * to raise a dedicated `APIUserAbortError` class (leaving `name` at the
 * default `"Error"`) when the abort signal fires. Confirm against the
 * installed SDK version.
 *
 * @param err Any thrown value.
 * @returns `true` for an abort-shaped error object, otherwise `false`.
 */
function isAbortError(err) {
    if (err === null || typeof err !== "object")
        return false;
    const { name, code } = err;
    if (name === "AbortError" || code === "ABORT_ERR" || code === "ERR_CANCELED")
        return true;
    // `constructor` can be missing on prototype-less objects, hence `?.`.
    const ctorName = err.constructor?.name;
    return name === "APIUserAbortError" || ctorName === "APIUserAbortError";
}
|
|
104
|
+
/**
 * Pull a numeric `status` off an error-like object.
 *
 * @param err Any thrown value.
 * @returns The `status` property when `err` is a non-null object and the
 *   property is a number; `undefined` in every other case.
 */
function statusFromError(err) {
    const isObject = typeof err === "object" && err !== null;
    if (!isObject) {
        return undefined;
    }
    const { status } = err;
    if (typeof status !== "number") {
        return undefined;
    }
    return status;
}
|
|
110
|
+
/**
 * Convert an arbitrary thrown value into a member of the `EvalLlmError`
 * hierarchy.
 *
 * Classification order:
 *  1. an existing `EvalLlmError` is returned as-is;
 *  2. abort-shaped errors become `EvalLlmTimeoutError`;
 *  3. status 401/403 becomes `EvalLlmAuthError`;
 *  4. status 429 becomes `EvalLlmRateLimitedError`;
 *  5. any other 4xx becomes a non-retryable `EvalLlmError`;
 *  6. everything else becomes `EvalLlmTransportError`.
 *
 * @param err       The value that was thrown.
 * @param timeoutMs Timeout reported in the message of a timeout error.
 */
function normalizeError(err, timeoutMs) {
    if (err instanceof EvalLlmError) {
        return err;
    }
    if (isAbortError(err)) {
        return new EvalLlmTimeoutError(timeoutMs);
    }
    const httpStatus = statusFromError(err);
    switch (httpStatus) {
        case 401:
        case 403:
            return new EvalLlmAuthError(err);
        case 429:
            return new EvalLlmRateLimitedError(err);
        default:
            break;
    }
    const isClientError = httpStatus !== undefined && httpStatus >= 400 && httpStatus < 500;
    if (!isClientError) {
        return new EvalLlmTransportError(err, httpStatus);
    }
    return new EvalLlmError(`LLM request rejected (HTTP ${httpStatus}).`, {
        retryable: false,
        status: httpStatus,
        cause: err
    });
}
|
|
129
|
+
/**
 * Map a provider `finish_reason` onto the client's closed set of values.
 *
 * `"length"` and `"content_filter"` pass through, `"tool_calls"` and
 * `"function_call"` both become `"tool_calls"`, and anything else
 * (including `null`, `undefined`, and unknown strings) becomes `"stop"`.
 */
function normalizeFinishReason(raw) {
    if (raw === "length" || raw === "content_filter") {
        return raw;
    }
    if (raw === "tool_calls" || raw === "function_call") {
        return "tool_calls";
    }
    return "stop";
}
|
|
145
|
+
/**
 * Translate a `ChatRequest` into the snake_case body sent to the
 * chat-completions endpoint.
 *
 * Optional request fields are copied only when they are not `undefined`.
 * `response_format` is added only when `responseFormatJson` is exactly
 * `true`. A message's `tool_calls` are emitted only for a non-empty
 * `toolCalls` array.
 *
 * @param request The client-level chat request.
 * @returns A fresh plain object; `request` is not modified.
 */
function buildBody(request) {
    const toWireMessage = (message) => {
        const wire = { role: message.role, content: message.content };
        if (message.name !== undefined) {
            wire.name = message.name;
        }
        if (message.toolCallId !== undefined) {
            wire.tool_call_id = message.toolCallId;
        }
        if (message.toolCalls && message.toolCalls.length > 0) {
            wire.tool_calls = message.toolCalls.map((call) => ({
                id: call.id,
                type: "function",
                function: { name: call.name, arguments: call.arguments }
            }));
        }
        return wire;
    };
    const body = {
        model: request.model,
        messages: request.messages.map((message) => toWireMessage(message))
    };
    // Wire key → request value, listed in the order the keys must appear.
    const optionalFields = [
        ["max_tokens", request.maxTokens],
        ["temperature", request.temperature],
        ["seed", request.seed],
        ["tools", request.tools],
        ["tool_choice", request.toolChoice]
    ];
    for (const [wireKey, value] of optionalFields) {
        if (value !== undefined) {
            body[wireKey] = value;
        }
    }
    if (request.responseFormatJson === true) {
        body.response_format = { type: "json_object" };
    }
    return body;
}
|
|
179
|
+
/**
 * Timer-backed sleep used between retry attempts when the caller does not
 * inject one.
 *
 * @param ms Delay handed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function defaultSleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
182
|
+
/**
 * Compute the sleep before the next attempt: exponential in `attempt`,
 * capped by the policy.
 *
 * @param attempt Zero-based index of the attempt that just failed.
 * @param policy  Supplies `initialBackoffMs` and `maxBackoffMs`.
 * @returns `initialBackoffMs * 2^attempt`, limited to `maxBackoffMs`.
 */
function backoffDelay(attempt, policy) {
    const { initialBackoffMs, maxBackoffMs } = policy;
    const uncapped = initialBackoffMs * Math.pow(2, attempt);
    return Math.min(uncapped, maxBackoffMs);
}
|
|
186
|
+
/**
 * Build a real client pointed at the configured endpoint.
 *
 * Construction performs no I/O. The SDK instance is created lazily on the
 * first `chat()` call, and `EvalLlmNotConfiguredError` is thrown then
 * (not at construction time) when `config.apiKey` is empty, so CLI help
 * and dry-run paths stay offline-safe.
 *
 * Each `chat()` call makes up to `maxRetries + 1` attempts. An attempt
 * is abandoned after `request.timeoutMs ?? config.timeoutMs` milliseconds
 * (never less than 1000). Failures are normalized into the `EvalLlmError`
 * hierarchy; a retryable failure sleeps with exponential backoff before
 * the next attempt, and a non-retryable failure or the last attempt
 * rethrows.
 *
 * @param config  Resolved eval config; reads `apiKey`, `baseUrl`,
 *   `timeoutMs`, and `maxRetries`.
 * @param options Optional test seams: `openaiFactory`, `retryPolicy`, `sleep`.
 * @returns An object exposing `chat(request)`.
 * @throws {EvalLlmError} (or a subclass) from `chat()` on failure.
 */
export function createEvalClient(config, options = {}) {
    const retryPolicy = options.retryPolicy ?? {
        ...DEFAULT_RETRY_POLICY,
        maxRetries: Math.max(0, config.maxRetries ?? DEFAULT_RETRY_POLICY.maxRetries)
    };
    // Guarantee at least one attempt: a negative or non-finite `maxRetries`
    // would otherwise skip the loop and surface a bogus "unknown" error
    // without ever calling the provider.
    const retries = Number.isFinite(retryPolicy.maxRetries)
        ? Math.max(0, Math.floor(retryPolicy.maxRetries))
        : 0;
    const maxAttempts = retries + 1;
    const sleep = options.sleep ?? defaultSleep;
    let cached;
    const getClient = () => {
        if (cached)
            return cached;
        if (!config.apiKey)
            throw new EvalLlmNotConfiguredError();
        const factory = options.openaiFactory ??
            ((opts) => new OpenAI(opts));
        // This module owns retrying. Leaving the SDK's built-in retries on
        // would multiply attempts (SDK retries inside each of ours), so
        // they are switched off here.
        cached = factory({
            apiKey: config.apiKey,
            baseURL: config.baseUrl,
            maxRetries: 0
        });
        return cached;
    };
    return {
        async chat(request) {
            const timeoutMs = Math.max(1_000, request.timeoutMs ?? config.timeoutMs);
            const body = buildBody(request);
            const client = getClient();
            let lastError;
            for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
                const controller = new AbortController();
                const handle = setTimeout(() => controller.abort(), timeoutMs);
                try {
                    const raw = await client.chat.completions.create(body, {
                        signal: controller.signal
                    });
                    clearTimeout(handle);
                    const choice = raw.choices?.[0];
                    if (!choice) {
                        throw new EvalLlmInvalidResponseError("LLM response contained no choices.", { model: raw.model });
                    }
                    const content = choice.message?.content ?? "";
                    const toolCalls = choice.message?.tool_calls?.map((call) => ({
                        id: call.id,
                        name: call.function.name,
                        arguments: call.function.arguments
                    }));
                    const usage = {
                        promptTokens: raw.usage?.prompt_tokens ?? 0,
                        completionTokens: raw.usage?.completion_tokens ?? 0,
                        totalTokens: raw.usage?.total_tokens ?? 0
                    };
                    return {
                        content,
                        ...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
                        usage,
                        finishReason: normalizeFinishReason(choice.finish_reason),
                        model: raw.model ?? request.model,
                        attempts: attempt + 1
                    };
                }
                catch (err) {
                    clearTimeout(handle);
                    // Only the timer above ever aborts this controller, so an
                    // aborted signal means the deadline elapsed, whatever error
                    // shape the transport raised for it.
                    const timedOut = controller.signal.aborted && !(err instanceof EvalLlmError);
                    const normalized = timedOut
                        ? new EvalLlmTimeoutError(timeoutMs)
                        : normalizeError(err, timeoutMs);
                    lastError = normalized;
                    const isLastAttempt = attempt === maxAttempts - 1;
                    if (!normalized.retryable || isLastAttempt)
                        throw normalized;
                    await sleep(backoffDelay(attempt, retryPolicy));
                }
            }
            // Defensive: every loop iteration either returns or throws.
            throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
        }
    };
}
|
package/dist/eval/report.js
CHANGED
|
@@ -75,6 +75,51 @@ export function formatMarkdownReport(report) {
|
|
|
75
75
|
lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
|
|
76
76
|
}
|
|
77
77
|
lines.push(``);
|
|
78
|
+
const toolCases = report.cases.filter((item) => item.verifierResults.some((r) => r.id === "agent:with-tools" && typeof r.details?.toolUse === "object"));
|
|
79
|
+
if (toolCases.length > 0) {
|
|
80
|
+
lines.push(`## Tool use`);
|
|
81
|
+
lines.push(``);
|
|
82
|
+
lines.push(`| stage | case id | turns | calls | errors | denied | by tool |`);
|
|
83
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
84
|
+
for (const item of toolCases) {
|
|
85
|
+
const verifier = item.verifierResults.find((r) => r.id === "agent:with-tools");
|
|
86
|
+
const toolUse = verifier?.details?.toolUse;
|
|
87
|
+
if (!toolUse)
|
|
88
|
+
continue;
|
|
89
|
+
const byTool = Object.entries(toolUse.byTool)
|
|
90
|
+
.map(([name, count]) => `${name}=${count}`)
|
|
91
|
+
.join(", ");
|
|
92
|
+
const denied = toolUse.deniedPaths.length > 0 ? toolUse.deniedPaths.length : "0";
|
|
93
|
+
lines.push(`| ${item.stage} | ${item.caseId} | ${toolUse.turns} | ${toolUse.calls} | ${toolUse.errors} | ${denied} | ${byTool || "-"} |`);
|
|
94
|
+
}
|
|
95
|
+
lines.push(``);
|
|
96
|
+
}
|
|
97
|
+
const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
|
|
98
|
+
if (judgeCases.length > 0) {
|
|
99
|
+
lines.push(`## Judge scores`);
|
|
100
|
+
lines.push(``);
|
|
101
|
+
lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
|
|
102
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
103
|
+
for (const item of judgeCases) {
|
|
104
|
+
for (const verifier of item.verifierResults) {
|
|
105
|
+
if (verifier.kind !== "judge")
|
|
106
|
+
continue;
|
|
107
|
+
if (verifier.id === "judge:required-checks")
|
|
108
|
+
continue;
|
|
109
|
+
if (verifier.id === "judge:rubric:missing")
|
|
110
|
+
continue;
|
|
111
|
+
if (verifier.id === "judge:invocation:error")
|
|
112
|
+
continue;
|
|
113
|
+
const details = verifier.details ?? {};
|
|
114
|
+
const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
|
|
115
|
+
const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
|
|
116
|
+
const coverage = details.coverage === true ? "yes" : "no";
|
|
117
|
+
const checkId = verifier.id.replace(/^judge:/, "");
|
|
118
|
+
lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
lines.push(``);
|
|
122
|
+
}
|
|
78
123
|
lines.push(`## Verifier details`);
|
|
79
124
|
lines.push(``);
|
|
80
125
|
for (const item of report.cases) {
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { RubricCheck, RubricDoc } from "./types.js";
|
|
3
|
+
export declare function rubricsDir(projectRoot: string): string;
|
|
4
|
+
export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
|
|
5
|
+
declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
|
|
6
|
+
declare function validateRubric(raw: unknown, file: string): RubricDoc;
|
|
7
|
+
/**
|
|
8
|
+
* Load the rubric for `stage`. Returns `undefined` when the file is
|
|
9
|
+
* missing so callers can emit a "no rubric" verifier result rather than
|
|
10
|
+
* crashing — authors are expected to grow rubrics incrementally.
|
|
11
|
+
*/
|
|
12
|
+
export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
|
|
13
|
+
/** Load every rubric present in the given rubrics directory. */
|
|
14
|
+
export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
|
|
15
|
+
/** Exposed for tests. */
|
|
16
|
+
export declare const __internal: {
|
|
17
|
+
validateRubric: typeof validateRubric;
|
|
18
|
+
validateCheck: typeof validateCheck;
|
|
19
|
+
};
|
|
20
|
+
export {};
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
3
|
+
*
|
|
4
|
+
* Each file maps to exactly one `RubricDoc` that drives the LLM judge.
|
|
5
|
+
* Validation is strict: unknown top-level keys, missing required fields,
|
|
6
|
+
* duplicate check ids, and malformed weights all surface as actionable
|
|
7
|
+
* errors rather than turning into silent "judge had nothing to score"
|
|
8
|
+
* passes.
|
|
9
|
+
*/
|
|
10
|
+
import fs from "node:fs/promises";
|
|
11
|
+
import path from "node:path";
|
|
12
|
+
import { parse } from "yaml";
|
|
13
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
14
|
+
import { exists } from "../fs-utils.js";
|
|
15
|
+
import { FLOW_STAGES } from "../types.js";
|
|
16
|
+
/**
 * Location of the rubrics directory for a project: `projectRoot` joined
 * with `EVALS_ROOT` and `"rubrics"`.
 */
export function rubricsDir(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, "rubrics"];
    return path.join(...segments);
}
|
|
19
|
+
/**
 * Path of the rubric file for one stage: `<rubrics dir>/<stage>.yaml`.
 */
export function rubricPath(projectRoot, stage) {
    const directory = rubricsDir(projectRoot);
    const fileName = `${stage}.yaml`;
    return path.join(directory, fileName);
}
|
|
22
|
+
/**
 * Build (not throw) the error used for every rubric validation failure.
 * The message names the offending file and reason, followed by a fixed
 * pointer to the schema documentation.
 */
function rubricError(file, reason) {
    const headline = `Invalid rubric at ${file}: ${reason}\n`;
    const schemaHint = `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`;
    return new Error(headline + schemaHint);
}
|
|
26
|
+
/**
 * True for non-null, non-array values whose `typeof` is `"object"`.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
29
|
+
/**
 * Validate one entry of a rubric's `checks` array and return its
 * normalized form.
 *
 * Rules, applied in this order:
 *  - the entry must be a mapping;
 *  - `id` must be a non-empty kebab-case string;
 *  - `prompt` must be a non-empty string (stored trimmed);
 *  - `scale`, when present, must be a non-empty string (stored trimmed);
 *  - `weight`, when present, must be a finite number >= 0;
 *  - `critical`, when present, must be a boolean;
 *  - no other keys are allowed.
 *
 * @param raw   The parsed YAML value for this check.
 * @param index Position inside `checks`, used in error messages.
 * @param file  Rubric file path, used in error messages.
 * @throws {Error} built by `rubricError` on the first violated rule.
 */
function validateCheck(raw, index, file) {
    const fail = (reason) => rubricError(file, reason);
    const isBlank = (value) => typeof value !== "string" || value.trim().length === 0;
    if (!isRecord(raw)) {
        throw fail(`checks[${index}] must be a mapping`);
    }
    const { id, prompt, scale, weight, critical } = raw;
    if (isBlank(id)) {
        throw fail(`checks[${index}].id must be a non-empty string`);
    }
    const kebabCase = /^[a-z][a-z0-9-]*$/;
    if (!kebabCase.test(id)) {
        throw fail(`checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
    }
    if (isBlank(prompt)) {
        throw fail(`checks[${index}].prompt must be a non-empty string`);
    }
    const check = { id, prompt: prompt.trim() };
    if (scale !== undefined) {
        if (isBlank(scale)) {
            throw fail(`checks[${index}].scale must be a non-empty string when provided`);
        }
        check.scale = scale.trim();
    }
    if (weight !== undefined) {
        const weightOk = typeof weight === "number" && Number.isFinite(weight) && weight >= 0;
        if (!weightOk) {
            throw fail(`checks[${index}].weight must be a non-negative number when provided`);
        }
        check.weight = weight;
    }
    if (critical !== undefined) {
        if (typeof critical !== "boolean") {
            throw fail(`checks[${index}].critical must be a boolean when provided`);
        }
        check.critical = critical;
    }
    const allowedKeys = ["id", "prompt", "scale", "weight", "critical"];
    const extras = Object.keys(raw).filter((key) => !allowedKeys.includes(key));
    if (extras.length > 0) {
        throw fail(`checks[${index}] has unknown key(s): ${extras.join(", ")}`);
    }
    return check;
}
|
|
73
|
+
/**
 * Validate a parsed rubric document and return `{ stage, id, checks }`.
 *
 * Rules, applied in this order:
 *  - the document must be a mapping;
 *  - `stage` must be one of `FLOW_STAGES`;
 *  - `id`, when present, must be a non-empty string (stored trimmed);
 *    it defaults to `stage`;
 *  - `checks` must be a non-empty array of valid checks with unique ids;
 *  - no other top-level keys are allowed.
 *
 * @param raw  The parsed YAML value.
 * @param file Rubric file path, used in error messages.
 * @throws {Error} built by `rubricError` on the first violated rule.
 */
function validateRubric(raw, file) {
    if (!isRecord(raw)) {
        throw rubricError(file, "top-level value must be a mapping");
    }
    const { stage, id, checks } = raw;
    const stageOk = typeof stage === "string" && FLOW_STAGES.includes(stage);
    if (!stageOk) {
        throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
    }
    let rubricId = stage;
    if (id !== undefined) {
        const idOk = typeof id === "string" && id.trim().length !== 0;
        if (!idOk) {
            throw rubricError(file, `"id" must be a non-empty string when provided`);
        }
        rubricId = id.trim();
    }
    if (!Array.isArray(checks) || checks.length === 0) {
        throw rubricError(file, `"checks" must be a non-empty array`);
    }
    const usedIds = new Set();
    const validated = [];
    let position = 0;
    while (position < checks.length) {
        const entry = validateCheck(checks[position], position, file);
        if (usedIds.has(entry.id)) {
            throw rubricError(file, `duplicate check id: "${entry.id}"`);
        }
        usedIds.add(entry.id);
        validated.push(entry);
        position += 1;
    }
    const allowedKeys = ["stage", "id", "checks"];
    const extras = Object.keys(raw).filter((key) => !allowedKeys.includes(key));
    if (extras.length > 0) {
        throw rubricError(file, `unknown top-level key(s): ${extras.join(", ")}`);
    }
    return { stage, id: rubricId, checks: validated };
}
|
|
114
|
+
/**
 * Read and validate the rubric file for `stage`.
 *
 * Resolves to `undefined` when the file does not exist, so callers can
 * report "no rubric" instead of crashing. A read or YAML-parse failure is
 * rethrown as a rubric error carrying the original message; validation
 * errors propagate from `validateRubric`.
 */
export async function loadRubric(projectRoot, stage) {
    const file = rubricPath(projectRoot, stage);
    const present = await exists(file);
    if (!present) {
        return undefined;
    }
    let document;
    try {
        const text = await fs.readFile(file, "utf8");
        document = parse(text);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw rubricError(file, reason);
    }
    return validateRubric(document, file);
}
|
|
132
|
+
/**
 * Load the rubric for every stage in `FLOW_STAGES`, one after another.
 *
 * @returns A `Map` from stage to rubric holding only the stages whose
 *   rubric file exists. The first invalid rubric rejects the promise.
 */
export async function loadAllRubrics(projectRoot) {
    const byStage = new Map();
    for (const stage of FLOW_STAGES) {
        const rubric = await loadRubric(projectRoot, stage);
        if (!rubric) {
            continue;
        }
        byStage.set(stage, rubric);
    }
    return byStage;
}
|
|
142
|
+
/**
 * Exposed for tests: gives access to the module-private validators
 * without making them part of the regular named exports.
 */
export const __internal = { validateRubric, validateCheck };
|
package/dist/eval/runner.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { FlowStage } from "../types.js";
|
|
2
|
+
import { type EvalLlmClient } from "./llm-client.js";
|
|
2
3
|
import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
|
|
3
4
|
export interface RunEvalOptions {
|
|
4
5
|
projectRoot: string;
|
|
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
|
|
|
14
15
|
dryRun?: boolean;
|
|
15
16
|
/** Override process.env during tests. */
|
|
16
17
|
env?: NodeJS.ProcessEnv;
|
|
18
|
+
/**
|
|
19
|
+
* Optional LLM client injection. Primary use case: unit and
|
|
20
|
+
* integration tests that want deterministic judge + agent behavior
|
|
21
|
+
* without hitting the network.
|
|
22
|
+
*/
|
|
23
|
+
llmClient?: EvalLlmClient;
|
|
17
24
|
}
|
|
18
25
|
export interface DryRunSummary {
|
|
19
26
|
kind: "dry-run";
|