cclaw-cli 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +2 -1
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/config-loader.js +96 -3
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +113 -20
- package/dist/eval/llm-client.js +242 -10
- package/dist/eval/report.js +26 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +145 -12
- package/dist/eval/types.d.ts +103 -1
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cost guard for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Two responsibilities:
|
|
5
|
+
*
|
|
6
|
+
* 1. Convert `ChatUsage` (prompt/completion token counts) into USD using
|
|
7
|
+
* a per-model `TokenPricing` schedule. Pricing comes from
|
|
8
|
+
* `config.tokenPricing[model]` first, then from the builtin fallback
|
|
9
|
+
* schedule for well-known models (z.ai GLM 5.1 at publish time).
|
|
10
|
+
* 2. Maintain a per-day running total persisted to
|
|
11
|
+
* `.cclaw/evals/.spend-YYYY-MM-DD.json` so that a long eval session
|
|
12
|
+
* (or a cron-run nightly) can't blow through the configured
|
|
13
|
+
* `dailyUsdCap`. The counter is opt-in: no cap, no writes.
|
|
14
|
+
*
|
|
15
|
+
* The guard is deliberately conservative — it rounds USD to 6 decimals (nearest)
|
|
16
|
+
* and never subtracts, so a CI run that errors mid-flight still shows the
|
|
17
|
+
* partial spend in the next report.
|
|
18
|
+
*/
|
|
19
|
+
import fs from "node:fs/promises";
|
|
20
|
+
import path from "node:path";
|
|
21
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
22
|
+
import { exists } from "../fs-utils.js";
|
|
23
|
+
/**
|
|
24
|
+
* Builtin pricing fallback. Intentionally conservative: when the user
|
|
25
|
+
* hasn't configured pricing and we don't know the model, we default to a
|
|
26
|
+
* "small model" USD schedule so the cap can still do something useful.
|
|
27
|
+
*
|
|
28
|
+
* Values are USD per 1K tokens. Sources are public pricing pages as of
|
|
29
|
+
* 2026-04; update by editing this constant, not the guard logic.
|
|
30
|
+
*/
|
|
31
|
+
export const DEFAULT_TOKEN_PRICING = {
|
|
32
|
+
"glm-5.1": { input: 0.0005, output: 0.0015 },
|
|
33
|
+
"glm-4.6": { input: 0.0005, output: 0.0015 },
|
|
34
|
+
"gpt-4o-mini": { input: 0.00015, output: 0.0006 },
|
|
35
|
+
"gpt-4o": { input: 0.005, output: 0.015 }
|
|
36
|
+
};
|
|
37
|
+
/** Hard default when neither config nor builtins know the model. */
|
|
38
|
+
export const UNKNOWN_MODEL_PRICING = { input: 0.001, output: 0.003 };
|
|
39
|
+
/**
 * Thrown by the cost guard when recording a call would push the day's
 * running total above the configured cap. The three figures are exposed
 * as fields so callers can report them without parsing the message.
 */
export class DailyCostCapExceededError extends Error {
    capUsd;
    projectedUsd;
    currentUsd;
    /**
     * @param opts `{ capUsd, projectedUsd, currentUsd }`, all numbers in USD.
     */
    constructor(opts) {
        super([
            `Daily cost cap would be exceeded: `,
            `current=$${opts.currentUsd.toFixed(4)}, `,
            `projected=$${opts.projectedUsd.toFixed(4)}, `,
            `cap=$${opts.capUsd.toFixed(4)}. `,
            `Unset CCLAW_EVAL_DAILY_USD_CAP or increase the cap to continue.`
        ].join(""));
        const { capUsd, projectedUsd, currentUsd } = opts;
        this.name = "DailyCostCapExceededError";
        this.capUsd = capUsd;
        this.projectedUsd = projectedUsd;
        this.currentUsd = currentUsd;
    }
}
|
|
55
|
+
/** Return the first ten characters of `now`'s ISO-8601 (UTC) timestamp, i.e. `YYYY-MM-DD`. Defaults to the current time. */
function utcDate(now = new Date()) {
    const iso = now.toISOString();
    return iso.substring(0, 10);
}
|
|
58
|
+
/**
 * Resolve the pricing schedule for `model`.
 *
 * Lookup order: `config.tokenPricing[model]`, then `DEFAULT_TOKEN_PRICING`,
 * then `UNKNOWN_MODEL_PRICING`. Only *own* properties count, so a model id
 * such as `"constructor"` or `"toString"` cannot resolve to an inherited
 * `Object.prototype` member and be mistaken for a schedule.
 *
 * @param model  Model identifier as sent to the provider.
 * @param config Eval config; `tokenPricing` is optional.
 * @returns The first truthy schedule found, else `UNKNOWN_MODEL_PRICING`.
 */
function pricingFor(model, config) {
    const ownEntry = (table) => table !== null &&
        table !== undefined &&
        Object.prototype.hasOwnProperty.call(table, model)
        ? table[model]
        : undefined;
    // `||` (not `??`) keeps the original truthiness rule: a falsy custom
    // entry falls through to the builtin table.
    return (ownEntry(config.tokenPricing) ||
        ownEntry(DEFAULT_TOKEN_PRICING) ||
        UNKNOWN_MODEL_PRICING);
}
|
|
67
|
+
/**
 * Compute the USD cost of a single `ChatUsage` using the pricing schedule
 * for `model` (rates are USD per 1K tokens).
 *
 * The cost is derived from `promptTokens` and `completionTokens` only;
 * `totalTokens` is not consulted, because a usage record may carry the
 * two component counts while reporting a total of 0. Counts that are
 * missing, non-finite or negative are treated as 0.
 *
 * @param model  Model identifier used to select the schedule.
 * @param usage  Token counts for one call; may be null/undefined.
 * @param config Eval config forwarded to the pricing lookup.
 * @returns Cost rounded to 6 decimals; 0 when there is nothing billable
 *          or when the schedule does not produce a finite number.
 */
export function computeUsageUsd(model, usage, config) {
    if (!usage)
        return 0;
    const billable = (count) => typeof count === "number" && Number.isFinite(count) && count > 0 ? count : 0;
    const promptTokens = billable(usage.promptTokens);
    const completionTokens = billable(usage.completionTokens);
    if (promptTokens === 0 && completionTokens === 0)
        return 0;
    const schedule = pricingFor(model, config);
    const cost = (promptTokens * schedule.input) / 1_000 +
        (completionTokens * schedule.output) / 1_000;
    // A malformed schedule must not leak NaN into the ledger: NaN compares
    // false against any cap and serialises to `null`.
    if (!Number.isFinite(cost))
        return 0;
    return Math.max(0, Number(cost.toFixed(6)));
}
|
|
80
|
+
/** Build a fresh zeroed ledger for `date`: no spend, no calls, no per-model entries. */
function emptyLedger(date) {
    const ledger = {
        date,
        totalUsd: 0,
        calls: 0,
        byModel: {}
    };
    return ledger;
}
|
|
83
|
+
/** Path of the spend ledger for `date`: `<projectRoot>/<EVALS_ROOT>/.spend-<date>.json`. */
function ledgerPath(projectRoot, date) {
    const fileName = `.spend-${date}.json`;
    return path.join(projectRoot, EVALS_ROOT, fileName);
}
|
|
86
|
+
/**
 * Load the spend ledger stored at `file` for the given `date`.
 *
 * Best-effort by design: a missing file, unreadable file, invalid JSON, or
 * a ledger recorded for a different date all yield a fresh empty ledger
 * rather than an error. A missing file surfaces as a read error and is
 * handled by the same catch, so no separate existence probe is needed.
 *
 * Individual fields are sanitised: `totalUsd` and `calls` must be finite,
 * non-negative numbers (JSON can encode `-3` or `1e999`, which parses to
 * `Infinity`), and `byModel` must be a non-array object. Anything else is
 * replaced by its zero value.
 *
 * @param file Absolute or relative path of the ledger JSON.
 * @param date Expected `YYYY-MM-DD` stamp of the ledger.
 * @returns A ledger object `{ date, totalUsd, calls, byModel }`.
 */
async function readLedger(file, date) {
    const isAmount = (value) => typeof value === "number" && Number.isFinite(value) && value >= 0;
    try {
        const raw = JSON.parse(await fs.readFile(file, "utf8"));
        if (raw?.date !== date)
            return emptyLedger(date);
        const byModel = raw.byModel !== null &&
            typeof raw.byModel === "object" &&
            !Array.isArray(raw.byModel)
            ? raw.byModel
            : {};
        return {
            date,
            totalUsd: isAmount(raw.totalUsd) ? raw.totalUsd : 0,
            calls: isAmount(raw.calls) ? raw.calls : 0,
            byModel
        };
    }
    catch {
        return emptyLedger(date);
    }
}
|
|
104
|
+
/**
 * Persist `ledger` to `file` as pretty-printed JSON with a trailing newline,
 * creating parent directories as needed.
 *
 * The payload is written to a sibling temp file and then renamed over the
 * target, so a reader never observes a half-written ledger. This matters
 * because an unparsable ledger is read back as an empty one, which would
 * silently reset the day's recorded spend.
 *
 * @param file   Destination path of the ledger JSON.
 * @param ledger Ledger object to serialise.
 * @throws Re-throws any filesystem error after best-effort temp cleanup.
 */
async function writeLedger(file, ledger) {
    await fs.mkdir(path.dirname(file), { recursive: true });
    const suffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
    const tmp = `${file}.${suffix}.tmp`;
    try {
        await fs.writeFile(tmp, `${JSON.stringify(ledger, null, 2)}\n`, "utf8");
        await fs.rename(tmp, file);
    }
    catch (err) {
        // Do not leave stray temp files behind; the original error wins.
        await fs.rm(tmp, { force: true }).catch(() => undefined);
        throw err;
    }
}
|
|
108
|
+
/**
 * Create a cost guard bound to `projectRoot` and `config`.
 *
 * The returned object has two methods:
 *
 *  - `commit(model, usage)` prices the call and returns its USD cost. When
 *    `config.dailyUsdCap` is set it also adds the cost to today's ledger,
 *    or throws `DailyCostCapExceededError` (leaving the ledger untouched)
 *    if the new total would exceed the cap. Without a cap nothing is read
 *    or written.
 *  - `snapshot()` returns today's ledger, or `undefined` when no cap is
 *    configured.
 *
 * @param projectRoot Root directory under which the ledger lives.
 * @param config      Eval config (`dailyUsdCap`, `tokenPricing`).
 * @param options     Optional `now` clock and `ledgerPath` override (tests).
 */
export function createCostGuard(projectRoot, config, options = {}) {
    const now = options.now ?? (() => new Date());
    // Derive the date stamp and the file path from ONE clock reading. Reading
    // the clock twice can straddle UTC midnight, pairing yesterday's date
    // with today's file.
    const resolveTarget = () => {
        const date = utcDate(now());
        return { date, file: options.ledgerPath ?? ledgerPath(projectRoot, date) };
    };
    return {
        async commit(model, usage) {
            const usd = computeUsageUsd(model, usage, config);
            if (config.dailyUsdCap === undefined)
                return usd;
            const { date, file } = resolveTarget();
            const ledger = await readLedger(file, date);
            const projected = Number((ledger.totalUsd + usd).toFixed(6));
            if (projected > config.dailyUsdCap) {
                throw new DailyCostCapExceededError({
                    capUsd: config.dailyUsdCap,
                    projectedUsd: projected,
                    currentUsd: ledger.totalUsd
                });
            }
            ledger.totalUsd = projected;
            ledger.calls += 1;
            // Own-property check: a model id like "constructor" must not pick
            // up an inherited member as its accumulator.
            const entry = Object.prototype.hasOwnProperty.call(ledger.byModel, model)
                ? ledger.byModel[model]
                : { tokensIn: 0, tokensOut: 0, usd: 0 };
            // `usage` may be absent (the pricing step tolerates that and
            // returns 0); count such a call with zero tokens.
            entry.tokensIn += usage?.promptTokens ?? 0;
            entry.tokensOut += usage?.completionTokens ?? 0;
            entry.usd = Number((entry.usd + usd).toFixed(6));
            ledger.byModel[model] = entry;
            // NOTE(review): read-modify-write is not serialised; overlapping
            // commits on the same ledger could lose an update — confirm
            // whether callers ever commit concurrently.
            await writeLedger(file, ledger);
            return usd;
        },
        async snapshot() {
            if (config.dailyUsdCap === undefined)
                return undefined;
            const { date, file } = resolveTarget();
            return readLedger(file, date);
        }
    };
}
|
|
146
|
+
/** Exposed for tests. */
|
|
147
|
+
export const __internal = {
|
|
148
|
+
utcDate,
|
|
149
|
+
pricingFor,
|
|
150
|
+
ledgerPath,
|
|
151
|
+
readLedger,
|
|
152
|
+
writeLedger
|
|
153
|
+
};
|
|
@@ -1,18 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
-
*
|
|
4
|
-
* This module declares the shape of the client without pulling in the
|
|
5
|
-
* `openai` runtime dependency. The real implementation lands when
|
|
6
|
-
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
-
* install an extra dependency or receive network egress warnings.
|
|
9
|
-
*/
|
|
1
|
+
import type { ClientOptions } from "openai";
|
|
10
2
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
-
/**
|
|
12
|
-
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
|
-
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
|
-
*/
|
|
16
3
|
export interface ChatMessage {
|
|
17
4
|
role: "system" | "user" | "assistant" | "tool";
|
|
18
5
|
content: string;
|
|
@@ -24,7 +11,18 @@ export interface ChatRequest {
|
|
|
24
11
|
messages: ChatMessage[];
|
|
25
12
|
maxTokens?: number;
|
|
26
13
|
temperature?: number;
|
|
14
|
+
/** Per-call timeout override. Falls back to `config.timeoutMs`. */
|
|
27
15
|
timeoutMs?: number;
|
|
16
|
+
/**
|
|
17
|
+
* Ask the provider for a JSON-object response. The judge pipeline sets
|
|
18
|
+
* this; the agent-under-test usually leaves it unset.
|
|
19
|
+
*/
|
|
20
|
+
responseFormatJson?: boolean;
|
|
21
|
+
/**
|
|
22
|
+
* Optional deterministic sampling seed. Providers that don't implement
|
|
23
|
+
* `seed` simply ignore it.
|
|
24
|
+
*/
|
|
25
|
+
seed?: number;
|
|
28
26
|
/**
|
|
29
27
|
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
28
|
* by Tier B. Ignored by the Tier A single-shot path.
|
|
@@ -46,17 +44,112 @@ export interface ChatResponse {
|
|
|
46
44
|
}>;
|
|
47
45
|
usage: ChatUsage;
|
|
48
46
|
finishReason: "stop" | "length" | "tool_calls" | "content_filter";
|
|
47
|
+
model: string;
|
|
48
|
+
attempts: number;
|
|
49
|
+
}
|
|
50
|
+
/** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
|
|
51
|
+
export declare class EvalLlmError extends Error {
|
|
52
|
+
readonly retryable: boolean;
|
|
53
|
+
readonly status?: number;
|
|
54
|
+
constructor(message: string, opts: {
|
|
55
|
+
retryable: boolean;
|
|
56
|
+
status?: number;
|
|
57
|
+
cause?: unknown;
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
export declare class EvalLlmAuthError extends EvalLlmError {
|
|
61
|
+
constructor(cause: unknown);
|
|
62
|
+
}
|
|
63
|
+
export declare class EvalLlmConfigError extends EvalLlmError {
|
|
64
|
+
constructor(message: string, cause?: unknown);
|
|
65
|
+
}
|
|
66
|
+
export declare class EvalLlmTimeoutError extends EvalLlmError {
|
|
67
|
+
constructor(timeoutMs: number);
|
|
68
|
+
}
|
|
69
|
+
export declare class EvalLlmRateLimitedError extends EvalLlmError {
|
|
70
|
+
constructor(cause: unknown);
|
|
71
|
+
}
|
|
72
|
+
export declare class EvalLlmTransportError extends EvalLlmError {
|
|
73
|
+
constructor(cause: unknown, status?: number);
|
|
74
|
+
}
|
|
75
|
+
export declare class EvalLlmInvalidResponseError extends EvalLlmError {
|
|
76
|
+
constructor(message: string, details?: Record<string, unknown>);
|
|
77
|
+
}
|
|
78
|
+
export declare class EvalLlmNotConfiguredError extends EvalLlmError {
|
|
79
|
+
constructor();
|
|
49
80
|
}
|
|
50
81
|
/** Lightweight client abstraction shared across eval runners. */
|
|
51
82
|
export interface EvalLlmClient {
|
|
52
83
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
84
|
}
|
|
54
|
-
|
|
55
|
-
|
|
85
|
+
/**
|
|
86
|
+
* Deprecated shim preserved so older wiring keeps compiling. Prefer
|
|
87
|
+
* `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
|
|
88
|
+
* key" case.
|
|
89
|
+
*/
|
|
90
|
+
export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
|
|
91
|
+
}
|
|
92
|
+
/** `createEvalClient` options — mostly for tests to inject a fake transport. */
|
|
93
|
+
export interface CreateEvalClientOptions {
|
|
94
|
+
/** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
|
|
95
|
+
openaiFactory?: (opts: ClientOptions) => OpenAILike;
|
|
96
|
+
/**
|
|
97
|
+
* Override the default retry/backoff policy. Honored by the internal
|
|
98
|
+
* retry loop; transport errors still fall back to the defaults when
|
|
99
|
+
* unset.
|
|
100
|
+
*/
|
|
101
|
+
retryPolicy?: RetryPolicy;
|
|
102
|
+
/** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
|
|
103
|
+
sleep?: (ms: number) => Promise<void>;
|
|
104
|
+
}
|
|
105
|
+
export interface RetryPolicy {
|
|
106
|
+
/** Max retries *on top of* the initial attempt. 0 = single attempt. */
|
|
107
|
+
maxRetries: number;
|
|
108
|
+
/** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
|
|
109
|
+
initialBackoffMs: number;
|
|
110
|
+
/** Upper bound for a single sleep between attempts. */
|
|
111
|
+
maxBackoffMs: number;
|
|
112
|
+
}
|
|
113
|
+
export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
|
|
114
|
+
/**
|
|
115
|
+
* Minimal OpenAI-SDK surface we depend on, declared here so tests can
|
|
116
|
+
* substitute a plain object without pulling the real SDK into the test
|
|
117
|
+
* runtime.
|
|
118
|
+
*/
|
|
119
|
+
export interface OpenAILike {
|
|
120
|
+
chat: {
|
|
121
|
+
completions: {
|
|
122
|
+
create(body: Record<string, unknown>, options: {
|
|
123
|
+
signal: AbortSignal;
|
|
124
|
+
}): Promise<OpenAILikeChatResponse>;
|
|
125
|
+
};
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
interface OpenAILikeChatResponse {
|
|
129
|
+
model?: string;
|
|
130
|
+
choices: Array<{
|
|
131
|
+
message?: {
|
|
132
|
+
content?: string | null;
|
|
133
|
+
tool_calls?: Array<{
|
|
134
|
+
id: string;
|
|
135
|
+
function: {
|
|
136
|
+
name: string;
|
|
137
|
+
arguments: string;
|
|
138
|
+
};
|
|
139
|
+
}>;
|
|
140
|
+
};
|
|
141
|
+
finish_reason?: string | null;
|
|
142
|
+
}>;
|
|
143
|
+
usage?: {
|
|
144
|
+
prompt_tokens?: number;
|
|
145
|
+
completion_tokens?: number;
|
|
146
|
+
total_tokens?: number;
|
|
147
|
+
};
|
|
56
148
|
}
|
|
57
149
|
/**
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
150
|
+
* Build a real client pointed at the configured endpoint. Throws
|
|
151
|
+
* `EvalLlmNotConfiguredError` at call time (not construction time) when no
|
|
152
|
+
* API key is available, so CLI help and dry-run paths stay offline-safe.
|
|
61
153
|
*/
|
|
62
|
-
export declare function createEvalClient(
|
|
154
|
+
export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
|
|
155
|
+
export {};
|
package/dist/eval/llm-client.js
CHANGED
|
@@ -1,19 +1,251 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* LLM client for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Thin adapter over the `openai` SDK pointed at any OpenAI-compatible
|
|
5
|
+
* `baseURL` (z.ai, OpenAI, vLLM, Ollama+openai-proxy, ...). The surface is
|
|
6
|
+
* deliberately narrow:
|
|
7
|
+
*
|
|
8
|
+
* - `chat()` — one request/response round-trip with timeout, bounded
|
|
9
|
+
* retries on transient errors, and a structured error hierarchy so
|
|
10
|
+
* callers can react policy-style (cost-guard, judge, agent-under-test).
|
|
11
|
+
* - `ChatRequest` / `ChatResponse` — wire format decoupled from the
|
|
12
|
+
* OpenAI types so swapping vendors stays a one-file change.
|
|
13
|
+
*
|
|
14
|
+
* Factories stay side-effect-free: no network calls are made until `chat()`
|
|
15
|
+
* is invoked, so CLI help and dry-run paths never need an API key.
|
|
16
|
+
*/
|
|
17
|
+
import OpenAI from "openai";
|
|
18
|
+
/**
 * Common ancestor of every error raised by the eval LLM client, so callers
 * can write `catch (err) { if (err instanceof EvalLlmError) ... }`.
 *
 * `retryable` is always set from `opts`; `status` and `cause` are assigned
 * only when the corresponding option is not `undefined`.
 */
export class EvalLlmError extends Error {
    retryable;
    status;
    constructor(message, opts) {
        super(message);
        const { retryable, status, cause } = opts;
        this.name = "EvalLlmError";
        this.retryable = retryable;
        if (status !== undefined) {
            this.status = status;
        }
        if (cause !== undefined) {
            this.cause = cause;
        }
    }
}
|
|
32
|
+
/**
 * The provider rejected the request for authentication/authorization
 * reasons. Never retryable.
 *
 * `status` reflects the underlying response when the cause reports 403;
 * otherwise it is 401. (The error normaliser in this module routes both
 * 401 and 403 here, so a fixed 401 would misreport permission failures.)
 */
export class EvalLlmAuthError extends EvalLlmError {
    /**
     * @param cause The original error; its numeric `status` is consulted
     *              when present.
     */
    constructor(cause) {
        super("LLM request rejected (auth). Check CCLAW_EVAL_API_KEY and provider permissions.", {
            retryable: false,
            status: cause?.status === 403 ? 403 : 401,
            cause
        });
        this.name = "EvalLlmAuthError";
    }
}
|
|
42
|
+
/** Configuration problem reported with a caller-supplied message; never retryable. */
export class EvalLlmConfigError extends EvalLlmError {
    constructor(message, cause) {
        const opts = { retryable: false, cause };
        super(message, opts);
        this.name = "EvalLlmConfigError";
    }
}
|
|
48
|
+
/** The request did not complete within `timeoutMs`; marked retryable. */
export class EvalLlmTimeoutError extends EvalLlmError {
    constructor(timeoutMs) {
        const opts = { retryable: true };
        super(`LLM request timed out after ${timeoutMs}ms.`, opts);
        this.name = "EvalLlmTimeoutError";
    }
}
|
|
54
|
+
/** The provider answered with a rate-limit response (status 429); marked retryable. */
export class EvalLlmRateLimitedError extends EvalLlmError {
    constructor(cause) {
        const opts = { retryable: true, status: 429, cause };
        super("LLM rate limit hit. Retrying with backoff.", opts);
        this.name = "EvalLlmRateLimitedError";
    }
}
|
|
64
|
+
/** Catch-all transport failure; marked retryable and carries the optional HTTP status. */
export class EvalLlmTransportError extends EvalLlmError {
    constructor(cause, status) {
        const opts = { retryable: true, status, cause };
        super("LLM transport error.", opts);
        this.name = "EvalLlmTransportError";
    }
}
|
|
70
|
+
/**
 * The provider's response could not be used. Never retryable. A truthy
 * `details` argument is attached as `this.details`.
 */
export class EvalLlmInvalidResponseError extends EvalLlmError {
    constructor(message, details) {
        const opts = { retryable: false };
        super(message, opts);
        this.name = "EvalLlmInvalidResponseError";
        if (details) {
            this.details = details;
        }
    }
}
|
|
78
|
+
/** Raised when the client is used without an API key; never retryable. */
export class EvalLlmNotConfiguredError extends EvalLlmError {
    constructor() {
        const message = [
            `LLM client not configured. Set CCLAW_EVAL_API_KEY (and optionally `,
            `CCLAW_EVAL_BASE_URL / CCLAW_EVAL_MODEL) or run with --schema-only / --rules.`
        ].join("");
        super(message, { retryable: false });
        this.name = "EvalLlmNotConfiguredError";
    }
}
|
|
8
85
|
/**
 * Backward-compatibility alias kept so older wiring that references this
 * name keeps working. It adds nothing to its parent.
 *
 * @deprecated Use `EvalLlmNotConfiguredError` for the "caller forgot to
 * provide an API key" case.
 */
export class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
}
|
|
92
|
+
export const DEFAULT_RETRY_POLICY = {
|
|
93
|
+
maxRetries: 2,
|
|
94
|
+
initialBackoffMs: 500,
|
|
95
|
+
maxBackoffMs: 8_000
|
|
96
|
+
};
|
|
97
|
+
/**
 * Report whether `err` represents an aborted/cancelled request.
 *
 * Recognised shapes:
 *  - `name === "AbortError"`,
 *  - `code === "ABORT_ERR"` or `code === "ERR_CANCELED"`,
 *  - an instance of a class named `APIUserAbortError`. The openai SDK
 *    wraps caller-initiated aborts in that class and it does not carry the
 *    `AbortError` name — NOTE(review): confirm against the installed SDK
 *    version.
 *
 * @param err Any thrown value.
 * @returns `true` for abort-like errors, `false` otherwise (including
 *          non-object values).
 */
function isAbortError(err) {
    if (err === null || typeof err !== "object")
        return false;
    const { name, code } = err;
    if (name === "AbortError" || code === "ABORT_ERR" || code === "ERR_CANCELED")
        return true;
    return err.constructor?.name === "APIUserAbortError";
}
|
|
104
|
+
/** Extract a numeric `status` from a thrown value; `undefined` for non-objects or non-numeric statuses. */
function statusFromError(err) {
    const candidate = err !== null && typeof err === "object" ? err.status : undefined;
    if (typeof candidate === "number") {
        return candidate;
    }
    return undefined;
}
|
|
110
|
+
/**
 * Map an arbitrary thrown value onto the `EvalLlmError` hierarchy.
 *
 * Precedence: an existing `EvalLlmError` is returned unchanged; abort-like
 * errors become `EvalLlmTimeoutError`; status 401/403 becomes
 * `EvalLlmAuthError`; 429 becomes `EvalLlmRateLimitedError`; any other 4xx
 * becomes a non-retryable `EvalLlmError`; everything else becomes an
 * `EvalLlmTransportError`.
 *
 * @param err       The caught value.
 * @param timeoutMs Timeout reported in the timeout error's message.
 */
function normalizeError(err, timeoutMs) {
    if (err instanceof EvalLlmError) {
        return err;
    }
    if (isAbortError(err)) {
        return new EvalLlmTimeoutError(timeoutMs);
    }
    const status = statusFromError(err);
    switch (status) {
        case 401:
        case 403:
            return new EvalLlmAuthError(err);
        case 429:
            return new EvalLlmRateLimitedError(err);
        default:
            break;
    }
    const isClientError = status !== undefined && status >= 400 && status < 500;
    if (!isClientError) {
        return new EvalLlmTransportError(err, status);
    }
    return new EvalLlmError(`LLM request rejected (HTTP ${status}).`, {
        retryable: false,
        status,
        cause: err
    });
}
|
|
129
|
+
/**
 * Collapse a provider finish reason onto the four values the client
 * exposes. `"function_call"` is folded into `"tool_calls"`; anything
 * unrecognised (including `null`/`undefined`) becomes `"stop"`.
 */
function normalizeFinishReason(raw) {
    if (raw === "length" || raw === "content_filter") {
        return raw;
    }
    if (raw === "tool_calls" || raw === "function_call") {
        return "tool_calls";
    }
    return "stop";
}
|
|
145
|
+
/**
 * Translate a `ChatRequest` into the snake_case request body.
 *
 * `model` and `messages` are always present. Each optional field is copied
 * only when it is not `undefined`, and `response_format` is added only
 * when `responseFormatJson` is exactly `true`.
 */
function buildBody(request) {
    const toWireMessage = (message) => {
        const wire = { role: message.role, content: message.content };
        if (message.name !== undefined) {
            wire.name = message.name;
        }
        if (message.toolCallId !== undefined) {
            wire.tool_call_id = message.toolCallId;
        }
        return wire;
    };
    const body = {
        model: request.model,
        messages: request.messages.map(toWireMessage)
    };
    const optionalFields = [
        ["max_tokens", request.maxTokens],
        ["temperature", request.temperature],
        ["seed", request.seed],
        ["tools", request.tools],
        ["tool_choice", request.toolChoice]
    ];
    for (const [wireKey, value] of optionalFields) {
        if (value !== undefined) {
            body[wireKey] = value;
        }
    }
    if (request.responseFormatJson === true) {
        body.response_format = { type: "json_object" };
    }
    return body;
}
|
|
170
|
+
/** Promise that resolves (with no value) once a `setTimeout` of `ms` milliseconds fires. */
function defaultSleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
173
|
+
/** Exponential backoff: `initialBackoffMs` doubled `attempt` times, capped at `maxBackoffMs`. */
function backoffDelay(attempt, policy) {
    const { initialBackoffMs, maxBackoffMs } = policy;
    const uncapped = initialBackoffMs * Math.pow(2, attempt);
    return uncapped < maxBackoffMs ? uncapped : Math.min(uncapped, maxBackoffMs);
}
|
|
177
|
+
/**
 * Build a client pointed at the configured endpoint.
 *
 * Construction is side-effect-free: the underlying SDK client is created
 * lazily on the first `chat()` call, which is also when
 * `EvalLlmNotConfiguredError` is thrown if no API key is available — so
 * CLI help and dry-run paths stay offline-safe.
 *
 * Retry behaviour:
 *  - This function owns the retry loop, so the SDK client is created with
 *    `maxRetries: 0`. Otherwise the SDK's own retries would multiply with
 *    the ones here and `attempts` would under-report the requests sent.
 *  - `options.retryPolicy` wins over `config.maxRetries`. A non-finite or
 *    negative `maxRetries` is treated as 0, so one attempt is always made.
 *  - Each attempt gets its own abort timer (at least 1000 ms). If that
 *    timer fired, the failure is reported as `EvalLlmTimeoutError`
 *    whatever shape the transport's error has.
 *
 * @param config  Resolved eval config (`apiKey`, `baseUrl`, `timeoutMs`,
 *                `maxRetries`).
 * @param options Test seams: `openaiFactory`, `retryPolicy`, `sleep`.
 * @returns An object exposing `chat(request)`.
 */
export function createEvalClient(config, options = {}) {
    const retryPolicy = options.retryPolicy ?? {
        ...DEFAULT_RETRY_POLICY,
        maxRetries: Math.max(0, config.maxRetries ?? DEFAULT_RETRY_POLICY.maxRetries)
    };
    const sleep = options.sleep ?? defaultSleep;
    const retries = Number.isFinite(retryPolicy.maxRetries)
        ? Math.max(0, Math.floor(retryPolicy.maxRetries))
        : 0;
    const maxAttempts = retries + 1;
    let cached;
    const getClient = () => {
        if (cached)
            return cached;
        if (!config.apiKey)
            throw new EvalLlmNotConfiguredError();
        const factory = options.openaiFactory ??
            ((opts) => new OpenAI(opts));
        cached = factory({
            apiKey: config.apiKey,
            baseURL: config.baseUrl,
            maxRetries: 0
        });
        return cached;
    };
    return {
        async chat(request) {
            const timeoutMs = Math.max(1_000, request.timeoutMs ?? config.timeoutMs);
            const body = buildBody(request);
            const client = getClient();
            let lastError;
            for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
                const controller = new AbortController();
                const handle = setTimeout(() => controller.abort(), timeoutMs);
                try {
                    const raw = await client.chat.completions.create(body, {
                        signal: controller.signal
                    });
                    const choice = raw.choices?.[0];
                    if (!choice) {
                        throw new EvalLlmInvalidResponseError("LLM response contained no choices.", { model: raw.model });
                    }
                    const content = choice.message?.content ?? "";
                    const toolCalls = choice.message?.tool_calls?.map((call) => ({
                        id: call.id,
                        name: call.function.name,
                        arguments: call.function.arguments
                    }));
                    const usage = {
                        promptTokens: raw.usage?.prompt_tokens ?? 0,
                        completionTokens: raw.usage?.completion_tokens ?? 0,
                        totalTokens: raw.usage?.total_tokens ?? 0
                    };
                    return {
                        content,
                        ...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
                        usage,
                        finishReason: normalizeFinishReason(choice.finish_reason),
                        model: raw.model ?? request.model,
                        attempts: attempt + 1
                    };
                }
                catch (err) {
                    // Our own timer is the only thing that aborts this signal,
                    // so `aborted` identifies a timeout without relying on the
                    // transport's error name or code.
                    const timedOut = controller.signal.aborted && !(err instanceof EvalLlmError);
                    const normalized = timedOut
                        ? new EvalLlmTimeoutError(timeoutMs)
                        : normalizeError(err, timeoutMs);
                    lastError = normalized;
                    const isLastAttempt = attempt === maxAttempts - 1;
                    if (!normalized.retryable || isLastAttempt)
                        throw normalized;
                }
                finally {
                    clearTimeout(handle);
                }
                // Reached only after a retryable failure with attempts left.
                await sleep(backoffDelay(attempt, retryPolicy));
            }
            throw lastError ?? new EvalLlmTransportError(new Error("unknown"));
        }
    };
}
|
package/dist/eval/report.js
CHANGED
|
@@ -75,6 +75,32 @@ export function formatMarkdownReport(report) {
|
|
|
75
75
|
lines.push(`| ${item.stage} | ${item.caseId} | ${item.passed ? "yes" : "no"} | ${item.durationMs} | ${cost} |`);
|
|
76
76
|
}
|
|
77
77
|
lines.push(``);
|
|
78
|
+
const judgeCases = report.cases.filter((item) => item.verifierResults.some((r) => r.kind === "judge"));
|
|
79
|
+
if (judgeCases.length > 0) {
|
|
80
|
+
lines.push(`## Judge scores`);
|
|
81
|
+
lines.push(``);
|
|
82
|
+
lines.push(`| stage | case id | check | median | mean | coverage | ok |`);
|
|
83
|
+
lines.push(`| --- | --- | --- | --- | --- | --- | --- |`);
|
|
84
|
+
for (const item of judgeCases) {
|
|
85
|
+
for (const verifier of item.verifierResults) {
|
|
86
|
+
if (verifier.kind !== "judge")
|
|
87
|
+
continue;
|
|
88
|
+
if (verifier.id === "judge:required-checks")
|
|
89
|
+
continue;
|
|
90
|
+
if (verifier.id === "judge:rubric:missing")
|
|
91
|
+
continue;
|
|
92
|
+
if (verifier.id === "judge:invocation:error")
|
|
93
|
+
continue;
|
|
94
|
+
const details = verifier.details ?? {};
|
|
95
|
+
const median = typeof details.median === "number" ? details.median.toFixed(2) : "-";
|
|
96
|
+
const mean = typeof details.mean === "number" ? details.mean.toFixed(2) : "-";
|
|
97
|
+
const coverage = details.coverage === true ? "yes" : "no";
|
|
98
|
+
const checkId = verifier.id.replace(/^judge:/, "");
|
|
99
|
+
lines.push(`| ${item.stage} | ${item.caseId} | ${checkId} | ${median} | ${mean} | ${coverage} | ${verifier.ok ? "yes" : "no"} |`);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
lines.push(``);
|
|
103
|
+
}
|
|
78
104
|
lines.push(`## Verifier details`);
|
|
79
105
|
lines.push(``);
|
|
80
106
|
for (const item of report.cases) {
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { FlowStage } from "../types.js";
|
|
2
|
+
import type { RubricCheck, RubricDoc } from "./types.js";
|
|
3
|
+
export declare function rubricsDir(projectRoot: string): string;
|
|
4
|
+
export declare function rubricPath(projectRoot: string, stage: FlowStage): string;
|
|
5
|
+
declare function validateCheck(raw: unknown, index: number, file: string): RubricCheck;
|
|
6
|
+
declare function validateRubric(raw: unknown, file: string): RubricDoc;
|
|
7
|
+
/**
|
|
8
|
+
* Load the rubric for `stage`. Returns `undefined` when the file is
|
|
9
|
+
* missing so callers can emit a "no rubric" verifier result rather than
|
|
10
|
+
* crashing — authors are expected to grow rubrics incrementally.
|
|
11
|
+
*/
|
|
12
|
+
export declare function loadRubric(projectRoot: string, stage: FlowStage): Promise<RubricDoc | undefined>;
|
|
13
|
+
/** Load every rubric present in the given rubrics directory. */
|
|
14
|
+
export declare function loadAllRubrics(projectRoot: string): Promise<Map<FlowStage, RubricDoc>>;
|
|
15
|
+
/** Exposed for tests. */
|
|
16
|
+
export declare const __internal: {
|
|
17
|
+
validateRubric: typeof validateRubric;
|
|
18
|
+
validateCheck: typeof validateCheck;
|
|
19
|
+
};
|
|
20
|
+
export {};
|