cclaw-cli 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -2
- package/dist/content/eval-scaffold.d.ts +5 -1
- package/dist/content/eval-scaffold.js +284 -3
- package/dist/eval/agents/single-shot.d.ts +27 -0
- package/dist/eval/agents/single-shot.js +79 -0
- package/dist/eval/config-loader.js +96 -3
- package/dist/eval/corpus.d.ts +11 -0
- package/dist/eval/corpus.js +162 -7
- package/dist/eval/cost-guard.d.ts +80 -0
- package/dist/eval/cost-guard.js +153 -0
- package/dist/eval/llm-client.d.ts +113 -20
- package/dist/eval/llm-client.js +242 -10
- package/dist/eval/report.js +26 -0
- package/dist/eval/rubric-loader.d.ts +20 -0
- package/dist/eval/rubric-loader.js +143 -0
- package/dist/eval/runner.d.ts +7 -0
- package/dist/eval/runner.js +213 -34
- package/dist/eval/types.d.ts +171 -4
- package/dist/eval/verifiers/judge.d.ts +40 -0
- package/dist/eval/verifiers/judge.js +256 -0
- package/dist/eval/verifiers/rules.d.ts +24 -0
- package/dist/eval/verifiers/rules.js +218 -0
- package/dist/eval/verifiers/traceability.d.ts +23 -0
- package/dist/eval/verifiers/traceability.js +84 -0
- package/dist/install.js +7 -1
- package/package.json +2 -1
package/dist/eval/corpus.js
CHANGED
|
@@ -58,6 +58,128 @@ function parseStructural(filePath, raw) {
|
|
|
58
58
|
structural.maxChars = maxChars;
|
|
59
59
|
return structural;
|
|
60
60
|
}
|
|
61
|
+
/**
 * Normalise one regex rule entry from a corpus case.
 *
 * Two input shapes are accepted: a bare pattern string, or a mapping with a
 * required `pattern` and optional `flags` / `description` strings.
 *
 * @param filePath Corpus file being parsed; forwarded to `corpusError`.
 * @param context  Location label of the entry, quoted in error messages.
 * @param value    Raw entry value.
 * @returns `{ pattern }`, plus `flags` / `description` when they were given.
 * @throws Whatever `corpusError` builds when the entry is malformed.
 */
function parseRegexRule(filePath, context, value) {
    if (typeof value === "string") {
        // Apply the same non-empty requirement as the mapping form below:
        // an empty regex source matches every input, so accepting it here
        // would make the rule meaningless without any diagnostic.
        if (value.length === 0) {
            throw corpusError(filePath, `"${context}" pattern must be a non-empty string`);
        }
        return { pattern: value };
    }
    if (!isRecord(value)) {
        throw corpusError(filePath, `"${context}" entries must be either a string or a mapping with "pattern"`);
    }
    const pattern = value.pattern;
    if (typeof pattern !== "string" || pattern.length === 0) {
        throw corpusError(filePath, `"${context}" mapping entry must include a non-empty "pattern" string`);
    }
    const flags = value.flags;
    if (flags !== undefined && typeof flags !== "string") {
        throw corpusError(filePath, `"${context}" flags must be a string`);
    }
    const description = value.description;
    if (description !== undefined && typeof description !== "string") {
        throw corpusError(filePath, `"${context}" description must be a string`);
    }
    // Optional keys are only attached when present so the result never
    // carries explicit `undefined` properties.
    const rule = { pattern };
    if (flags !== undefined)
        rule.flags = flags;
    if (description !== undefined)
        rule.description = description;
    return rule;
}
|
|
87
|
+
/**
 * Parse an optional list of regex rules.
 *
 * @param filePath Corpus file being parsed; forwarded to the error helper.
 * @param context  Location label of the list, quoted in error messages.
 * @param value    Raw value; `undefined` means the list was not declared.
 * @returns `undefined` when absent, otherwise one parsed rule per entry.
 */
function parseRegexRules(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    if (Array.isArray(value)) {
        // Each entry is reported under its own indexed label.
        const parseAt = (entry, position) => parseRegexRule(filePath, `${context}[${position}]`, entry);
        return value.map(parseAt);
    }
    throw corpusError(filePath, `"${context}" must be an array`);
}
|
|
95
|
+
/**
 * Parse an optional `phrase → count` mapping.
 *
 * @param filePath Corpus file being parsed; forwarded to the error helper.
 * @param context  Location label of the mapping, quoted in error messages.
 * @param value    Raw value; `undefined` means the mapping was not declared.
 * @returns `undefined` when absent, otherwise a copy holding only validated
 *          non-negative integer counts.
 */
function parseOccurrenceBounds(filePath, context, value) {
    if (value === undefined) {
        return undefined;
    }
    if (!isRecord(value)) {
        throw corpusError(filePath, `"${context}" must be a mapping of phrase → integer`);
    }
    const bounds = {};
    for (const phrase of Object.keys(value)) {
        const count = value[phrase];
        // `Number.isInteger` is false for non-numbers, NaN and ±Infinity,
        // so it covers the type and finiteness checks in one call.
        const isValidCount = Number.isInteger(count) && count >= 0;
        if (!isValidCount) {
            throw corpusError(filePath, `"${context}.${phrase}" must be a non-negative integer`);
        }
        bounds[phrase] = count;
    }
    return bounds;
}
|
|
110
|
+
/**
 * Parse the optional `expected.rules` mapping of a corpus case.
 *
 * Every rule family is looked up under its snake_case key first and its
 * camelCase key second. Families are parsed in a fixed order, so the first
 * malformed one is the one reported.
 *
 * @param filePath Corpus file being parsed; forwarded to the error helper.
 * @param raw      Raw `expected.rules` value.
 * @returns `undefined` when `raw` is absent or no family produced a value,
 *          otherwise an object holding only the families that did.
 */
function parseRules(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.rules" must be a mapping`);
    }
    // Array elements are evaluated top to bottom, which fixes both the
    // validation order and the key order of the result.
    const families = [
        ["mustContain", readStringArray(filePath, "expected.rules.must_contain", raw.must_contain ?? raw.mustContain)],
        ["mustNotContain", readStringArray(filePath, "expected.rules.must_not_contain", raw.must_not_contain ?? raw.mustNotContain)],
        ["regexRequired", parseRegexRules(filePath, "expected.rules.regex_required", raw.regex_required ?? raw.regexRequired)],
        ["regexForbidden", parseRegexRules(filePath, "expected.rules.regex_forbidden", raw.regex_forbidden ?? raw.regexForbidden)],
        ["minOccurrences", parseOccurrenceBounds(filePath, "expected.rules.min_occurrences", raw.min_occurrences ?? raw.minOccurrences)],
        ["maxOccurrences", parseOccurrenceBounds(filePath, "expected.rules.max_occurrences", raw.max_occurrences ?? raw.maxOccurrences)],
        ["uniqueBulletsInSection", readStringArray(filePath, "expected.rules.unique_bullets_in_section", raw.unique_bullets_in_section ?? raw.uniqueBulletsInSection)]
    ];
    const rules = {};
    for (const [key, parsed] of families) {
        if (parsed) {
            rules[key] = parsed;
        }
    }
    return Object.keys(rules).length > 0 ? rules : undefined;
}
|
|
140
|
+
/**
 * Parse the optional `expected.traceability` mapping of a corpus case.
 *
 * Required keys: `id_pattern` (non-empty string), `source` (non-empty
 * string) and `require_in` (non-empty array). `id_flags` is an optional
 * string. Each key is also accepted in camelCase.
 *
 * Only non-emptiness of `source` is checked here; the error text describes
 * the intended values as "self" or an extra_fixtures label.
 *
 * @param filePath Corpus file being parsed; forwarded to the error helper.
 * @param raw      Raw `expected.traceability` value.
 * @returns `undefined` when absent, otherwise
 *          `{ idPattern, source, requireIn }` plus `idFlags` when given.
 */
function parseTraceability(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"expected.traceability" must be a mapping`);
    }
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;

    const idPattern = raw.id_pattern ?? raw.idPattern;
    if (!isNonEmptyString(idPattern)) {
        throw corpusError(filePath, `"expected.traceability.id_pattern" must be a non-empty regex source`);
    }

    const idFlags = raw.id_flags ?? raw.idFlags;
    const hasIdFlags = idFlags !== undefined;
    if (hasIdFlags && typeof idFlags !== "string") {
        throw corpusError(filePath, `"expected.traceability.id_flags" must be a string`);
    }

    const source = raw.source;
    if (!isNonEmptyString(source)) {
        throw corpusError(filePath, `"expected.traceability.source" must be "self" or an extra_fixtures label`);
    }

    const requireIn = readStringArray(filePath, "expected.traceability.require_in", raw.require_in ?? raw.requireIn);
    if (!requireIn || requireIn.length === 0) {
        throw corpusError(filePath, `"expected.traceability.require_in" must be a non-empty array`);
    }

    return hasIdFlags
        ? { idPattern, source, requireIn, idFlags }
        : { idPattern, source, requireIn };
}
|
|
168
|
+
/**
 * Parse the optional `extra_fixtures` mapping (`label → path`) of a case.
 *
 * @param filePath Corpus file being parsed; forwarded to the error helper.
 * @param raw      Raw `extra_fixtures` value.
 * @returns `undefined` when absent or empty, otherwise a copy whose values
 *          are all non-empty strings.
 */
function parseExtraFixtures(filePath, raw) {
    if (raw === undefined) {
        return undefined;
    }
    if (!isRecord(raw)) {
        throw corpusError(filePath, `"extra_fixtures" must be a mapping of label → path`);
    }
    const fixtures = {};
    for (const label of Object.keys(raw)) {
        const fixturePath = raw[label];
        const isUsablePath = typeof fixturePath === "string" && fixturePath.length > 0;
        if (!isUsablePath) {
            throw corpusError(filePath, `"extra_fixtures.${label}" must be a non-empty path string`);
        }
        fixtures[label] = fixturePath;
    }
    // An empty mapping is reported the same way as a missing one.
    return Object.keys(fixtures).length > 0 ? fixtures : undefined;
}
|
|
61
183
|
function parseExpected(filePath, raw) {
|
|
62
184
|
if (raw === undefined)
|
|
63
185
|
return undefined;
|
|
@@ -68,12 +190,12 @@ function parseExpected(filePath, raw) {
|
|
|
68
190
|
const structural = parseStructural(filePath, raw.structural);
|
|
69
191
|
if (structural)
|
|
70
192
|
shape.structural = structural;
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
193
|
+
const rules = parseRules(filePath, raw.rules);
|
|
194
|
+
if (rules)
|
|
195
|
+
shape.rules = rules;
|
|
196
|
+
const traceability = parseTraceability(filePath, raw.traceability);
|
|
197
|
+
if (traceability)
|
|
198
|
+
shape.traceability = traceability;
|
|
77
199
|
if (raw.judge !== undefined) {
|
|
78
200
|
if (!isRecord(raw.judge)) {
|
|
79
201
|
throw corpusError(filePath, `"expected.judge" must be a mapping`);
|
|
@@ -101,13 +223,15 @@ function validateCase(filePath, raw) {
|
|
|
101
223
|
const contextFiles = readStringArray(filePath, "context_files", raw.context_files ?? raw.contextFiles);
|
|
102
224
|
const expected = parseExpected(filePath, raw.expected);
|
|
103
225
|
const fixture = typeof raw.fixture === "string" ? raw.fixture : undefined;
|
|
226
|
+
const extraFixtures = parseExtraFixtures(filePath, raw.extra_fixtures ?? raw.extraFixtures);
|
|
104
227
|
return {
|
|
105
228
|
id: id.trim(),
|
|
106
229
|
stage: stageRaw,
|
|
107
230
|
inputPrompt: inputPrompt.trim(),
|
|
108
231
|
contextFiles,
|
|
109
232
|
expected,
|
|
110
|
-
fixture
|
|
233
|
+
fixture,
|
|
234
|
+
extraFixtures
|
|
111
235
|
};
|
|
112
236
|
}
|
|
113
237
|
/**
|
|
@@ -173,3 +297,34 @@ export async function readFixtureArtifact(projectRoot, caseEntry) {
|
|
|
173
297
|
}
|
|
174
298
|
return fs.readFile(fixturePath, "utf8");
|
|
175
299
|
}
|
|
300
|
+
/**
 * Turn one `extraFixtures` entry into an absolute path.
 *
 * The entry is resolved against
 * `<projectRoot>/<EVALS_ROOT>/corpus/<caseEntry.stage>/`.
 *
 * @param projectRoot Root directory the eval tree lives under.
 * @param caseEntry   Case whose `extraFixtures` map and `stage` are read.
 * @param label       Key to look up in `caseEntry.extraFixtures`.
 * @returns The resolved path, or `undefined` when the case declares no
 *          (non-empty) entry for `label`.
 */
export function extraFixturePath(projectRoot, caseEntry, label) {
    const relative = caseEntry.extraFixtures?.[label];
    return relative
        ? path.resolve(projectRoot, EVALS_ROOT, "corpus", caseEntry.stage, relative)
        : undefined;
}
|
|
310
|
+
/**
 * Load the text of every extra fixture a case declares.
 *
 * Fixtures are read one at a time in key order. A declared fixture whose
 * file does not exist raises an error naming the case, the label and the
 * resolved path, rather than being skipped.
 *
 * @param projectRoot Root directory the eval tree lives under.
 * @param caseEntry   Case whose `extraFixtures` map is read.
 * @returns A `{ label → file contents }` object; empty when the case has
 *          no `extraFixtures`.
 */
export async function readExtraFixtures(projectRoot, caseEntry) {
    const contents = {};
    const labels = caseEntry.extraFixtures ? Object.keys(caseEntry.extraFixtures) : [];
    for (const label of labels) {
        const resolved = extraFixturePath(projectRoot, caseEntry, label);
        if (!resolved) {
            continue;
        }
        const present = await exists(resolved);
        if (!present) {
            throw new Error(`Extra fixture missing for ${caseEntry.stage}/${caseEntry.id} ` +
                `(label="${label}"): ${resolved}`);
        }
        contents[label] = await fs.readFile(resolved, "utf8");
    }
    return contents;
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import type { ChatUsage } from "./llm-client.js";
|
|
2
|
+
import type { ResolvedEvalConfig, TokenPricing } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Builtin pricing fallback. Intentionally conservative: when the user
|
|
5
|
+
* hasn't configured pricing and we don't know the model, we default to a
|
|
6
|
+
* "small model" USD schedule so the cap can still do something useful.
|
|
7
|
+
*
|
|
8
|
+
* Values are USD per 1K tokens. Sources are public pricing pages as of
|
|
9
|
+
* 2026-04; update by editing this constant, not the guard logic.
|
|
10
|
+
*/
|
|
11
|
+
export declare const DEFAULT_TOKEN_PRICING: Readonly<Record<string, TokenPricing>>;
|
|
12
|
+
/** Hard default when neither config nor builtins know the model. */
|
|
13
|
+
export declare const UNKNOWN_MODEL_PRICING: TokenPricing;
|
|
14
|
+
export interface SpendLedger {
|
|
15
|
+
/** ISO date (`YYYY-MM-DD` in UTC) — also embedded in the file name. */
|
|
16
|
+
date: string;
|
|
17
|
+
/** USD spent so far today across every call that hit the guard. */
|
|
18
|
+
totalUsd: number;
|
|
19
|
+
/** Number of `chat()` calls accounted for. */
|
|
20
|
+
calls: number;
|
|
21
|
+
/** Per-model breakdown for the report. */
|
|
22
|
+
byModel: Record<string, {
|
|
23
|
+
tokensIn: number;
|
|
24
|
+
tokensOut: number;
|
|
25
|
+
usd: number;
|
|
26
|
+
}>;
|
|
27
|
+
}
|
|
28
|
+
export declare class DailyCostCapExceededError extends Error {
|
|
29
|
+
readonly capUsd: number;
|
|
30
|
+
readonly projectedUsd: number;
|
|
31
|
+
readonly currentUsd: number;
|
|
32
|
+
constructor(opts: {
|
|
33
|
+
capUsd: number;
|
|
34
|
+
projectedUsd: number;
|
|
35
|
+
currentUsd: number;
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
declare function utcDate(now?: Date): string;
|
|
39
|
+
declare function pricingFor(model: string, config: Pick<ResolvedEvalConfig, "tokenPricing">): TokenPricing;
|
|
40
|
+
/**
|
|
41
|
+
* Compute USD cost of a single `ChatUsage` using the given `model` pricing
|
|
42
|
+
* schedule. Returns 0 when `usage.totalTokens` is 0 (e.g. transport error
|
|
43
|
+
* before first token).
|
|
44
|
+
*/
|
|
45
|
+
export declare function computeUsageUsd(model: string, usage: ChatUsage, config: Pick<ResolvedEvalConfig, "tokenPricing">): number;
|
|
46
|
+
declare function ledgerPath(projectRoot: string, date: string): string;
|
|
47
|
+
declare function readLedger(file: string, date: string): Promise<SpendLedger>;
|
|
48
|
+
declare function writeLedger(file: string, ledger: SpendLedger): Promise<void>;
|
|
49
|
+
/**
|
|
50
|
+
* Guards LLM calls against the daily USD cap. `commit` resolves to the
|
|
51
|
+
* call's USD cost on success and rejects with `DailyCostCapExceededError`
|
|
52
|
+
* when the projected total would cross the cap. When `config.dailyUsdCap`
|
|
53
|
+
* is unset, the guard is a no-op — no file writes, no ledger — so non-judge
|
|
54
|
+
* runs never touch the filesystem.
|
|
55
|
+
*/
|
|
56
|
+
export interface CostGuard {
|
|
57
|
+
/**
|
|
58
|
+
* Commit the USD cost of a finished call to the ledger. When `dailyUsdCap`
|
|
59
|
+
* is set, refuses the commit if the projected total would exceed the cap.
|
|
60
|
+
*/
|
|
61
|
+
commit(model: string, usage: ChatUsage): Promise<number>;
|
|
62
|
+
/** Snapshot the current ledger (or undefined when no cap is set). */
|
|
63
|
+
snapshot(): Promise<SpendLedger | undefined>;
|
|
64
|
+
}
|
|
65
|
+
export interface CreateCostGuardOptions {
|
|
66
|
+
/** Clock injection for tests. */
|
|
67
|
+
now?: () => Date;
|
|
68
|
+
/** Override the default filesystem root for the ledger. */
|
|
69
|
+
ledgerPath?: string;
|
|
70
|
+
}
|
|
71
|
+
export declare function createCostGuard(projectRoot: string, config: Pick<ResolvedEvalConfig, "dailyUsdCap" | "tokenPricing">, options?: CreateCostGuardOptions): CostGuard;
|
|
72
|
+
/** Exposed for tests. */
|
|
73
|
+
export declare const __internal: {
|
|
74
|
+
utcDate: typeof utcDate;
|
|
75
|
+
pricingFor: typeof pricingFor;
|
|
76
|
+
ledgerPath: typeof ledgerPath;
|
|
77
|
+
readLedger: typeof readLedger;
|
|
78
|
+
writeLedger: typeof writeLedger;
|
|
79
|
+
};
|
|
80
|
+
export {};
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cost guard for the cclaw eval subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Two responsibilities:
|
|
5
|
+
*
|
|
6
|
+
* 1. Convert `ChatUsage` (prompt/completion token counts) into USD using
|
|
7
|
+
* a per-model `TokenPricing` schedule. Pricing comes from
|
|
8
|
+
* `config.tokenPricing[model]` first, then from the builtin fallback
|
|
9
|
+
* schedule for well-known models (z.ai GLM 5.1 at publish time).
|
|
10
|
+
* 2. Maintain a per-day running total persisted to
|
|
11
|
+
* `.cclaw/evals/.spend-YYYY-MM-DD.json` so that a long eval session
|
|
12
|
+
* (or a cron-run nightly) can't blow through the configured
|
|
13
|
+
* `dailyUsdCap`. The counter is opt-in: no cap, no writes.
|
|
14
|
+
*
|
|
15
|
+
* The guard is deliberately pessimistic — it rounds USD to 6 decimals
|
|
16
|
+
* and never subtracts, so a CI run that errors mid-flight still shows the
|
|
17
|
+
* partial spend in the next report.
|
|
18
|
+
*/
|
|
19
|
+
import fs from "node:fs/promises";
|
|
20
|
+
import path from "node:path";
|
|
21
|
+
import { EVALS_ROOT } from "../constants.js";
|
|
22
|
+
import { exists } from "../fs-utils.js";
|
|
23
|
+
/**
 * Built-in per-model price table, used when the user's configuration has
 * no entry for a model.
 *
 * Each entry gives USD per 1K tokens, split into `input` and `output`
 * rates. The figures were recorded from public pricing pages as of
 * 2026-04. To change a price, edit this table rather than the guard logic.
 */
export const DEFAULT_TOKEN_PRICING = {
    "glm-5.1": {
        input: 0.0005,
        output: 0.0015
    },
    "glm-4.6": {
        input: 0.0005,
        output: 0.0015
    },
    "gpt-4o-mini": {
        input: 0.00015,
        output: 0.0006
    },
    "gpt-4o": {
        input: 0.005,
        output: 0.015
    }
};
|
|
37
|
+
/**
 * Last-resort rates (USD per 1K tokens) for a model that appears in
 * neither the user's configuration nor the built-in table.
 */
export const UNKNOWN_MODEL_PRICING = {
    input: 0.001,
    output: 0.003
};
|
|
39
|
+
/**
 * Error raised when recording a call would push the day's spend past the
 * configured cap.
 *
 * The three amounts from `opts` are exposed as fields and also appear in
 * the message, each formatted to four decimals.
 */
export class DailyCostCapExceededError extends Error {
    capUsd;
    projectedUsd;
    currentUsd;
    /**
     * @param opts `capUsd`, `projectedUsd` and `currentUsd`, all numbers.
     */
    constructor(opts) {
        super([
            `Daily cost cap would be exceeded: `,
            `current=$${opts.currentUsd.toFixed(4)}, `,
            `projected=$${opts.projectedUsd.toFixed(4)}, `,
            `cap=$${opts.capUsd.toFixed(4)}. `,
            `Unset CCLAW_EVAL_DAILY_USD_CAP or increase the cap to continue.`
        ].join(""));
        this.name = "DailyCostCapExceededError";
        this.capUsd = opts.capUsd;
        this.projectedUsd = opts.projectedUsd;
        this.currentUsd = opts.currentUsd;
    }
}
|
|
55
|
+
/**
 * Format a date as the first ten characters of its ISO-8601 UTC string
 * (`YYYY-MM-DD` for four-digit years).
 *
 * @param now Date to format; defaults to the current time.
 */
function utcDate(now = new Date()) {
    const isoTimestamp = now.toISOString();
    return isoTimestamp.substring(0, 10);
}
|
|
58
|
+
/**
 * Pick the pricing schedule for a model.
 *
 * Lookup order: `config.tokenPricing[model]`, then the built-in table,
 * then the unknown-model default. A falsy entry at any step falls through
 * to the next.
 *
 * @param model  Model identifier used as the lookup key.
 * @param config Object whose optional `tokenPricing` map is consulted first.
 */
function pricingFor(model, config) {
    return config.tokenPricing?.[model]
        || DEFAULT_TOKEN_PRICING[model]
        || UNKNOWN_MODEL_PRICING;
}
|
|
67
|
+
/**
 * Compute the USD cost of one `ChatUsage` under the pricing schedule
 * selected for `model`.
 *
 * Prompt and completion tokens are priced separately (rates are per 1K
 * tokens), summed, and rounded to 6 decimals.
 *
 * @param model  Model identifier used to select the schedule.
 * @param usage  Token counts: `promptTokens`, `completionTokens`,
 *               `totalTokens`.
 * @param config Object carrying the optional `tokenPricing` overrides.
 * @returns A finite, non-negative USD amount. The result is 0 when `usage`
 *          is missing or `usage.totalTokens` is 0 or negative (e.g.
 *          transport error before first token).
 */
export function computeUsageUsd(model, usage, config) {
    if (!usage || usage.totalTokens <= 0)
        return 0;
    const schedule = pricingFor(model, config);
    // A missing or non-finite count would turn the whole sum into NaN.
    // Callers add this value to a running total that is serialised as
    // JSON, where NaN becomes `null`, so one bad usage record could wipe
    // the accumulated figure. Such counts are treated as zero tokens.
    const finiteCount = (count) => (Number.isFinite(count) ? count : 0);
    const cost = (finiteCount(usage.promptTokens) * schedule.input) / 1_000 +
        (finiteCount(usage.completionTokens) * schedule.output) / 1_000;
    const rounded = Number(cost.toFixed(6));
    // `Math.max(0, NaN)` is NaN, so a non-finite result (for example from
    // a malformed schedule) needs an explicit check.
    return Number.isFinite(rounded) ? Math.max(0, rounded) : 0;
}
|
|
80
|
+
/**
 * Build a zeroed ledger for the given date.
 *
 * @param date Date string stored on the ledger.
 * @returns A new ledger with no spend, no calls and an empty per-model map.
 */
function emptyLedger(date) {
    const byModel = {};
    const ledger = { date, totalUsd: 0, calls: 0, byModel };
    return ledger;
}
|
|
83
|
+
/**
 * Location of the spend ledger for a given day:
 * `<projectRoot>/<EVALS_ROOT>/.spend-<date>.json`.
 *
 * @param projectRoot Root directory the eval tree lives under.
 * @param date        Date string embedded in the file name.
 */
function ledgerPath(projectRoot, date) {
    const fileName = `.spend-${date}.json`;
    return path.join(projectRoot, EVALS_ROOT, fileName);
}
|
|
86
|
+
/**
 * Load the spend ledger stored at `file`, best effort.
 *
 * An empty ledger for `date` is returned when the file does not exist,
 * cannot be read or parsed, or records a different date. Individual fields
 * with an unexpected type fall back to their zero value.
 *
 * @param file Path of the ledger JSON file.
 * @param date Date the caller expects the ledger to belong to.
 * @returns A ledger whose `date` is always the requested one.
 */
async function readLedger(file, date) {
    if (!(await exists(file)))
        return emptyLedger(date);
    try {
        const raw = JSON.parse(await fs.readFile(file, "utf8"));
        if (raw?.date !== date)
            return emptyLedger(date);
        const storedByModel = raw.byModel;
        // Arrays pass a bare `typeof === "object"` check. Per-model entries
        // are stored under string keys, and JSON serialisation drops such
        // keys on an array, so only non-array objects are accepted.
        const hasModelMap = Boolean(storedByModel) &&
            typeof storedByModel === "object" &&
            !Array.isArray(storedByModel);
        return {
            date,
            totalUsd: typeof raw.totalUsd === "number" ? raw.totalUsd : 0,
            calls: typeof raw.calls === "number" ? raw.calls : 0,
            byModel: hasModelMap ? storedByModel : {}
        };
    }
    catch {
        // Deliberate best effort: an unreadable ledger is treated as empty
        // rather than failing the caller.
        return emptyLedger(date);
    }
}
|
|
104
|
+
/**
 * Persist a ledger as pretty-printed JSON (2-space indent, trailing
 * newline), creating the parent directory first if needed.
 *
 * @param file   Destination path.
 * @param ledger Ledger object to serialise.
 */
async function writeLedger(file, ledger) {
    const parentDir = path.dirname(file);
    await fs.mkdir(parentDir, { recursive: true });
    const serialised = `${JSON.stringify(ledger, null, 2)}\n`;
    await fs.writeFile(file, serialised, "utf8");
}
|
|
108
|
+
/**
 * Create a cost guard bound to a project and an eval configuration.
 *
 * When `config.dailyUsdCap` is `undefined` the guard never reads or writes
 * a ledger: `commit` only computes the cost and `snapshot` resolves to
 * `undefined`.
 *
 * @param projectRoot Root directory used to derive the default ledger path.
 * @param config      Supplies `dailyUsdCap` and the pricing overrides.
 * @param options     Optional `now` clock and `ledgerPath` override.
 * @returns An object with `commit(model, usage)` and `snapshot()`.
 */
export function createCostGuard(projectRoot, config, options = {}) {
    const now = options.now ?? (() => new Date());
    // The date and the ledger file are derived from a single clock reading.
    // Reading the clock separately for each could, across a UTC midnight,
    // pair one day's date with the next day's file name; the ledger written
    // that way would carry a date that later reads reject as stale.
    const resolveLedger = () => {
        const date = utcDate(now());
        return { date, target: options.ledgerPath ?? ledgerPath(projectRoot, date) };
    };
    return {
        /**
         * Record the cost of a finished call.
         *
         * @returns The USD cost of this call.
         * @throws DailyCostCapExceededError when a cap is set and adding
         *         this call would exceed it; nothing is written then.
         */
        async commit(model, usage) {
            const usd = computeUsageUsd(model, usage, config);
            if (config.dailyUsdCap === undefined)
                return usd;
            const { date, target } = resolveLedger();
            const ledger = await readLedger(target, date);
            // Re-round after every addition so float drift does not build up.
            const projected = Number((ledger.totalUsd + usd).toFixed(6));
            if (projected > config.dailyUsdCap) {
                throw new DailyCostCapExceededError({
                    capUsd: config.dailyUsdCap,
                    projectedUsd: projected,
                    currentUsd: ledger.totalUsd
                });
            }
            ledger.totalUsd = projected;
            ledger.calls += 1;
            const byModel = ledger.byModel[model] ?? { tokensIn: 0, tokensOut: 0, usd: 0 };
            byModel.tokensIn += usage.promptTokens;
            byModel.tokensOut += usage.completionTokens;
            byModel.usd = Number((byModel.usd + usd).toFixed(6));
            ledger.byModel[model] = byModel;
            await writeLedger(target, ledger);
            return usd;
        },
        /** Read today's ledger, or resolve to `undefined` when no cap is set. */
        async snapshot() {
            if (config.dailyUsdCap === undefined)
                return undefined;
            const { date, target } = resolveLedger();
            return readLedger(target, date);
        }
    };
}
|
|
146
|
+
/**
 * Module-private helpers, re-exported so tests can call them directly.
 */
export const __internal = {
    utcDate: utcDate,
    pricingFor: pricingFor,
    ledgerPath: ledgerPath,
    readLedger: readLedger,
    writeLedger: writeLedger
};
|
|
@@ -1,18 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
* LLM client skeleton for the cclaw eval subsystem.
|
|
3
|
-
*
|
|
4
|
-
* This module declares the shape of the client without pulling in the
|
|
5
|
-
* `openai` runtime dependency. The real implementation lands when
|
|
6
|
-
* single-shot (Tier A) evals and LLM judging come online. Keeping this stub
|
|
7
|
-
* separate means users who only run structural + rule-based verifiers never
|
|
8
|
-
* install an extra dependency or receive network egress warnings.
|
|
9
|
-
*/
|
|
1
|
+
import type { ClientOptions } from "openai";
|
|
10
2
|
import type { ResolvedEvalConfig } from "./types.js";
|
|
11
|
-
/**
|
|
12
|
-
* Minimal chat interface the rest of the eval code will depend on. It is
|
|
13
|
-
* intentionally a subset of OpenAI's Chat Completions surface so that the
|
|
14
|
-
* real implementation is a thin adapter around `OpenAI.chat.completions.create`.
|
|
15
|
-
*/
|
|
16
3
|
export interface ChatMessage {
|
|
17
4
|
role: "system" | "user" | "assistant" | "tool";
|
|
18
5
|
content: string;
|
|
@@ -24,7 +11,18 @@ export interface ChatRequest {
|
|
|
24
11
|
messages: ChatMessage[];
|
|
25
12
|
maxTokens?: number;
|
|
26
13
|
temperature?: number;
|
|
14
|
+
/** Per-call timeout override. Falls back to `config.timeoutMs`. */
|
|
27
15
|
timeoutMs?: number;
|
|
16
|
+
/**
|
|
17
|
+
* Ask the provider for a JSON-object response. The judge pipeline sets
|
|
18
|
+
* this; the agent-under-test usually leaves it unset.
|
|
19
|
+
*/
|
|
20
|
+
responseFormatJson?: boolean;
|
|
21
|
+
/**
|
|
22
|
+
* Optional deterministic sampling seed. Providers that don't implement
|
|
23
|
+
* `seed` simply ignore it.
|
|
24
|
+
*/
|
|
25
|
+
seed?: number;
|
|
28
26
|
/**
|
|
29
27
|
* Tool/function-calling definitions in OpenAI wire format. Populated only
|
|
30
28
|
* by Tier B. Ignored by the Tier A single-shot path.
|
|
@@ -46,17 +44,112 @@ export interface ChatResponse {
|
|
|
46
44
|
}>;
|
|
47
45
|
usage: ChatUsage;
|
|
48
46
|
finishReason: "stop" | "length" | "tool_calls" | "content_filter";
|
|
47
|
+
model: string;
|
|
48
|
+
attempts: number;
|
|
49
|
+
}
|
|
50
|
+
/** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
|
|
51
|
+
export declare class EvalLlmError extends Error {
|
|
52
|
+
readonly retryable: boolean;
|
|
53
|
+
readonly status?: number;
|
|
54
|
+
constructor(message: string, opts: {
|
|
55
|
+
retryable: boolean;
|
|
56
|
+
status?: number;
|
|
57
|
+
cause?: unknown;
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
export declare class EvalLlmAuthError extends EvalLlmError {
|
|
61
|
+
constructor(cause: unknown);
|
|
62
|
+
}
|
|
63
|
+
export declare class EvalLlmConfigError extends EvalLlmError {
|
|
64
|
+
constructor(message: string, cause?: unknown);
|
|
65
|
+
}
|
|
66
|
+
export declare class EvalLlmTimeoutError extends EvalLlmError {
|
|
67
|
+
constructor(timeoutMs: number);
|
|
68
|
+
}
|
|
69
|
+
export declare class EvalLlmRateLimitedError extends EvalLlmError {
|
|
70
|
+
constructor(cause: unknown);
|
|
71
|
+
}
|
|
72
|
+
export declare class EvalLlmTransportError extends EvalLlmError {
|
|
73
|
+
constructor(cause: unknown, status?: number);
|
|
74
|
+
}
|
|
75
|
+
export declare class EvalLlmInvalidResponseError extends EvalLlmError {
|
|
76
|
+
constructor(message: string, details?: Record<string, unknown>);
|
|
77
|
+
}
|
|
78
|
+
export declare class EvalLlmNotConfiguredError extends EvalLlmError {
|
|
79
|
+
constructor();
|
|
49
80
|
}
|
|
50
81
|
/** Lightweight client abstraction shared across eval runners. */
|
|
51
82
|
export interface EvalLlmClient {
|
|
52
83
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
53
84
|
}
|
|
54
|
-
|
|
55
|
-
|
|
85
|
+
/**
|
|
86
|
+
* Deprecated shim preserved so older wiring keeps compiling. Prefer
|
|
87
|
+
* `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
|
|
88
|
+
* key" case.
|
|
89
|
+
*/
|
|
90
|
+
export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
|
|
91
|
+
}
|
|
92
|
+
/** `createEvalClient` options — mostly for tests to inject a fake transport. */
|
|
93
|
+
export interface CreateEvalClientOptions {
|
|
94
|
+
/** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
|
|
95
|
+
openaiFactory?: (opts: ClientOptions) => OpenAILike;
|
|
96
|
+
/**
|
|
97
|
+
* Override the default retry/backoff policy. Honored by the internal
|
|
98
|
+
* retry loop; transport errors still fall back to the defaults when
|
|
99
|
+
* unset.
|
|
100
|
+
*/
|
|
101
|
+
retryPolicy?: RetryPolicy;
|
|
102
|
+
/** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
|
|
103
|
+
sleep?: (ms: number) => Promise<void>;
|
|
104
|
+
}
|
|
105
|
+
export interface RetryPolicy {
|
|
106
|
+
/** Max retries *on top of* the initial attempt. 0 = single attempt. */
|
|
107
|
+
maxRetries: number;
|
|
108
|
+
/** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
|
|
109
|
+
initialBackoffMs: number;
|
|
110
|
+
/** Upper bound for a single sleep between attempts. */
|
|
111
|
+
maxBackoffMs: number;
|
|
112
|
+
}
|
|
113
|
+
export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
|
|
114
|
+
/**
|
|
115
|
+
* Minimal OpenAI-SDK surface we depend on, declared here so tests can
|
|
116
|
+
* substitute a plain object without pulling the real SDK into the test
|
|
117
|
+
* runtime.
|
|
118
|
+
*/
|
|
119
|
+
export interface OpenAILike {
|
|
120
|
+
chat: {
|
|
121
|
+
completions: {
|
|
122
|
+
create(body: Record<string, unknown>, options: {
|
|
123
|
+
signal: AbortSignal;
|
|
124
|
+
}): Promise<OpenAILikeChatResponse>;
|
|
125
|
+
};
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
interface OpenAILikeChatResponse {
|
|
129
|
+
model?: string;
|
|
130
|
+
choices: Array<{
|
|
131
|
+
message?: {
|
|
132
|
+
content?: string | null;
|
|
133
|
+
tool_calls?: Array<{
|
|
134
|
+
id: string;
|
|
135
|
+
function: {
|
|
136
|
+
name: string;
|
|
137
|
+
arguments: string;
|
|
138
|
+
};
|
|
139
|
+
}>;
|
|
140
|
+
};
|
|
141
|
+
finish_reason?: string | null;
|
|
142
|
+
}>;
|
|
143
|
+
usage?: {
|
|
144
|
+
prompt_tokens?: number;
|
|
145
|
+
completion_tokens?: number;
|
|
146
|
+
total_tokens?: number;
|
|
147
|
+
};
|
|
56
148
|
}
|
|
57
149
|
/**
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
150
|
+
* Build a real client pointed at the configured endpoint. Throws
|
|
151
|
+
* `EvalLlmNotConfiguredError` at call time (not construction time) when no
|
|
152
|
+
* API key is available, so CLI help and dry-run paths stay offline-safe.
|
|
61
153
|
*/
|
|
62
|
-
export declare function createEvalClient(
|
|
154
|
+
export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
|
|
155
|
+
export {};
|