cclaw-cli 0.24.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { SandboxEscapeError } from "../sandbox.js";
4
+ import { parseArgs, requireString, optionalNumber, truncatePayload } from "./types.js";
5
const DESCRIPTION = "Search the sandbox for a regular expression. Returns matching lines in " +
    "`path:line:text` form. Accepts optional `caseInsensitive` and a per-call " +
    "`maxMatches` cap (default 100, hard max 500).";
/** Match cap used when the caller omits `maxMatches`. */
const DEFAULT_MAX = 100;
/** Ceiling applied to any caller-supplied `maxMatches`. */
const HARD_MAX = 500;
/** Extract a message from a thrown value, which is not guaranteed to be an `Error`. */
const describeError = (err) => (err instanceof Error ? err.message : String(err));
/**
 * Sandbox `grep` tool: scans every regular file under `ctx.sandbox.root`
 * line by line and reports matches as `path:line:text`.
 */
export const grepTool = {
    descriptor: {
        name: "grep",
        description: DESCRIPTION,
        parameters: {
            type: "object",
            additionalProperties: false,
            required: ["pattern"],
            properties: {
                pattern: {
                    type: "string",
                    description: "Regular expression compiled with JavaScript semantics."
                },
                caseInsensitive: {
                    type: "boolean",
                    description: "Match case-insensitively (default false)."
                },
                maxMatches: {
                    type: "integer",
                    minimum: 1,
                    description: "Stop after N matches (default 100, hard max 500)."
                }
            }
        }
    },
    /**
     * Run the search.
     *
     * @param rawArgs JSON-encoded argument object (`pattern`, optional
     *   `caseInsensitive`, optional `maxMatches`).
     * @param ctx Tool context; `ctx.sandbox.root` is the directory scanned and
     *   `ctx.maxResultBytes` bounds the returned `content`.
     * @returns An `ok: true` result whose `content` is the newline-joined hits
     *   (or `(no matches)`), or an `ok: false` result describing the failure.
     *   Argument and regex problems are reported as failures, never thrown.
     */
    async invoke(rawArgs, ctx) {
        const name = this.descriptor.name;
        const fail = (error, details) => details === undefined ? { ok: false, name, error } : { ok: false, name, error, details };
        let pattern;
        let caseInsensitive;
        let maxMatches;
        try {
            // Validation is sequential, so the first invalid argument wins.
            const args = parseArgs(rawArgs);
            pattern = requireString(args, "pattern");
            caseInsensitive = args.caseInsensitive === true;
            const requested = optionalNumber(args, "maxMatches");
            maxMatches = requested === undefined
                ? DEFAULT_MAX
                : Math.min(HARD_MAX, Math.max(1, Math.floor(requested)));
        }
        catch (err) {
            return fail(describeError(err));
        }
        let regex;
        try {
            // NOTE(review): the pattern is caller-supplied and nothing here bounds
            // matching time, so a pathological expression can stall the scan.
            regex = new RegExp(pattern, caseInsensitive ? "i" : "");
        }
        catch (err) {
            return fail(`invalid regex: ${describeError(err)}`);
        }
        let filesScanned = 0;
        const hits = [];
        try {
            await walk(ctx.sandbox.root, "", async (relPath, abs) => {
                if (hits.length >= maxMatches)
                    return false;
                let content;
                try {
                    content = await fs.readFile(abs, "utf8");
                }
                catch {
                    // Unreadable files are skipped; the scan is best-effort.
                    return true;
                }
                filesScanned += 1;
                const lines = content.split(/\r?\n/);
                for (let i = 0; i < lines.length; i += 1) {
                    const line = lines[i];
                    if (regex.test(line)) {
                        hits.push(`${relPath}:${i + 1}:${line}`);
                        if (hits.length >= maxMatches)
                            return false;
                    }
                }
                return true;
            });
        }
        catch (err) {
            if (err instanceof SandboxEscapeError) {
                // The regex is not a filesystem path, so it is reported as
                // `pattern` rather than as a denied path.
                return fail(err.message, { pattern });
            }
            return fail(`walk failed: ${describeError(err)}`);
        }
        const body = hits.length > 0 ? hits.join("\n") : "(no matches)";
        return {
            ok: true,
            name,
            content: truncatePayload(body, ctx.maxResultBytes),
            details: {
                pattern,
                caseInsensitive,
                matches: hits.length,
                filesScanned,
                // True once the cap is reached; further matches may or may not exist.
                truncated: hits.length >= maxMatches
            }
        };
    }
};
129
/**
 * Depth-first traversal of the regular files under `root`/`rel`.
 *
 * Symbolic links are never followed or visited. Directories that cannot be
 * read are skipped silently. Entries are processed in name order so results
 * do not depend on the platform's `readdir` ordering.
 *
 * @param root Absolute directory the traversal is anchored at.
 * @param rel Path of the directory to scan, relative to `root` ("" for root).
 * @param visit Async callback `(relPath, absPath)`; `relPath` always uses "/"
 *   separators. Returning `false` stops the whole traversal.
 * @returns `false` if the visitor stopped the traversal, otherwise `true`.
 */
async function walk(root, rel, visit) {
    let entries;
    try {
        entries = await fs.readdir(path.join(root, rel), { withFileTypes: true });
    }
    catch {
        // Best-effort scan: an unreadable directory does not abort the walk.
        return true;
    }
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));
    for (const entry of entries) {
        if (entry.isSymbolicLink())
            continue;
        const childRel = rel ? path.join(rel, entry.name) : entry.name;
        if (entry.isDirectory()) {
            // Propagate a stop request upward so ancestors do not keep
            // reading sibling directories after the visitor said "enough".
            if ((await walk(root, childRel, visit)) === false)
                return false;
            continue;
        }
        if (entry.isFile()) {
            const keepGoing = await visit(childRel.replace(/\\/g, "/"), path.join(root, childRel));
            if (keepGoing === false)
                return false;
        }
    }
    return true;
}
@@ -0,0 +1,7 @@
1
+ import type { SandboxTool } from "./types.js";
2
+ export { SandboxTool, ToolResult, ToolContext, truncatePayload } from "./types.js";
3
+ export declare const BUILTIN_TOOLS: SandboxTool[];
4
+ /** Build a lookup for the agent loop. */
5
+ export declare function toolsByName(tools?: SandboxTool[]): Map<string, SandboxTool>;
6
+ /** Shape a tool list for OpenAI-style `tools[]` in the chat request. */
7
+ export declare function toolsForRequest(tools?: SandboxTool[]): unknown[];
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Registry of sandbox-confined tools used by the Tier B with-tools agent.
3
+ *
4
+ * The registry order defines the advertised schema order in the
5
+ * function-calling payload. Keeping it stable means judges reading
6
+ * generated traces can rely on predictable tool descriptions.
7
+ */
8
+ import { globTool } from "./glob.js";
9
+ import { grepTool } from "./grep.js";
10
+ import { readTool } from "./read.js";
11
+ import { writeTool } from "./write.js";
12
+ export { truncatePayload } from "./types.js";
13
+ export const BUILTIN_TOOLS = [readTool, writeTool, globTool, grepTool];
14
/**
 * Index tools by `descriptor.name` for the agent loop.
 *
 * Insertion order follows the input order. Throws when two tools share a name.
 */
export function toolsByName(tools = BUILTIN_TOOLS) {
    const lookup = new Map();
    for (const candidate of tools) {
        const { name } = candidate.descriptor;
        if (!lookup.has(name)) {
            lookup.set(name, candidate);
            continue;
        }
        throw new Error(`duplicate tool name: ${name}`);
    }
    return lookup;
}
25
/**
 * Convert tools into OpenAI-style `tools[]` entries for the chat request.
 *
 * Each entry is `{ type: "function", function: { name, description, parameters } }`,
 * in the same order as the input list.
 */
export function toolsForRequest(tools = BUILTIN_TOOLS) {
    const toEntry = ({ descriptor }) => {
        const { name, description, parameters } = descriptor;
        return { type: "function", function: { name, description, parameters } };
    };
    return tools.map((tool) => toEntry(tool));
}
@@ -0,0 +1,2 @@
1
+ import { type SandboxTool } from "./types.js";
2
+ export declare const readTool: SandboxTool;
@@ -0,0 +1,122 @@
1
+ import fs from "node:fs/promises";
2
+ import { SandboxEscapeError } from "../sandbox.js";
3
+ import { parseArgs, requireString, optionalNumber, truncatePayload } from "./types.js";
4
const DESCRIPTION = "Read a UTF-8 text file from the sandbox. Returns the file contents. " +
    "Supports optional 1-indexed `offset` and `limit` to read a slice.";
/**
 * Sandbox `read_file` tool: returns a UTF-8 file, or a line slice of it,
 * after the path has been resolved through `ctx.sandbox.resolve`.
 */
export const readTool = {
    descriptor: {
        name: "read_file",
        description: DESCRIPTION,
        parameters: {
            type: "object",
            additionalProperties: false,
            required: ["path"],
            properties: {
                path: {
                    type: "string",
                    description: "Path relative to the sandbox root."
                },
                offset: {
                    type: "integer",
                    minimum: 1,
                    description: "1-indexed start line (inclusive)."
                },
                limit: {
                    type: "integer",
                    minimum: 1,
                    description: "Maximum number of lines to return."
                }
            }
        }
    },
    /**
     * Read the requested file.
     *
     * @param rawArgs JSON-encoded argument object (`path`, optional `offset`
     *   and `limit`).
     * @param ctx Tool context supplying the sandbox and `maxResultBytes`.
     * @returns `ok: true` with the (possibly truncated) text, or `ok: false`
     *   with an error message; every failure is returned, none is thrown.
     */
    async invoke(rawArgs, ctx) {
        const name = this.descriptor.name;
        const failure = (error, extra = {}) => ({ ok: false, name, error, ...extra });
        let relPath;
        let offset;
        let limit;
        try {
            // Both numbers are parsed before either is range-checked, so a
            // wrongly typed `limit` is reported ahead of an out-of-range `offset`.
            const args = parseArgs(rawArgs);
            relPath = requireString(args, "path");
            offset = optionalNumber(args, "offset");
            limit = optionalNumber(args, "limit");
        }
        catch (err) {
            return failure(err.message);
        }
        const isBadBound = (n) => n !== undefined && (!Number.isInteger(n) || n < 1);
        if (isBadBound(offset)) {
            return failure('"offset" must be a positive integer');
        }
        if (isBadBound(limit)) {
            return failure('"limit" must be a positive integer');
        }
        let abs;
        try {
            abs = await ctx.sandbox.resolve(relPath);
        }
        catch (err) {
            // Only sandbox escapes are tagged with the offending path.
            const escaped = err instanceof SandboxEscapeError;
            return failure(err.message, {
                details: escaped ? { deniedPath: relPath } : undefined
            });
        }
        let raw;
        try {
            raw = await fs.readFile(abs, "utf8");
        }
        catch (err) {
            return failure(`read failed: ${err.message}`, { details: { path: relPath } });
        }
        const wantsSlice = offset !== undefined || limit !== undefined;
        let content = raw;
        let lineCount;
        if (wantsSlice) {
            const allLines = raw.split(/\r?\n/);
            const first = Math.max(0, (offset ?? 1) - 1);
            const last = limit === undefined
                ? allLines.length
                : Math.min(allLines.length, first + limit);
            const picked = allLines.slice(first, last);
            content = picked.join("\n");
            lineCount = picked.length;
        }
        const payload = truncatePayload(content, ctx.maxResultBytes);
        const details = {
            path: relPath,
            bytes: Buffer.byteLength(payload, "utf8"),
            truncated: payload !== content
        };
        // `lines` is reported only when a slice was requested.
        if (lineCount !== undefined) {
            details.lines = lineCount;
        }
        return { ok: true, name, content: payload, details };
    }
};
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Shared types for Tier B sandbox-confined tools.
3
+ *
4
+ * Tools are plain async functions: they take validated arguments and a
5
+ * sandbox handle and return a structured result. The runner serializes
6
+ * results for the model as JSON; the `SandboxTool.invoke` wrapper keeps
7
+ * both the raw structured output (for tests/metrics) and the stringified
8
+ * model-facing payload.
9
+ */
10
+ import type { Sandbox } from "../sandbox.js";
11
+ export interface ToolDescriptor {
12
+ /** Name the model calls (must match the function-calling schema). */
13
+ name: string;
14
+ /** Human-readable prompt shown to the model. */
15
+ description: string;
16
+ /** JSON schema shipped with the OpenAI-style `tools[]` array. */
17
+ parameters: Record<string, unknown>;
18
+ }
19
+ export interface ToolContext {
20
+ sandbox: Sandbox;
21
+ /**
22
+ * Maximum bytes the tool may return in `content`. Results longer than
23
+ * this are truncated with a trailing marker so the model sees the
24
+ * cutoff.
25
+ */
26
+ maxResultBytes: number;
27
+ }
28
+ export interface ToolSuccess { // Result variant for a tool call that completed.
29
+ ok: true; // Literal discriminant that separates this variant from `ToolFailure` in `ToolResult`.
30
+ name: string; // The built-in tools set this to their own `descriptor.name`.
31
+ content: string; // Text payload; `ToolContext.maxResultBytes` documents its size limit.
32
+ details?: Record<string, unknown>; // Optional structured metadata; keys are tool-specific.
33
+ }
34
+ export interface ToolFailure { // Result variant for a tool call that failed.
35
+ ok: false; // Literal discriminant that separates this variant from `ToolSuccess` in `ToolResult`.
36
+ name: string; // The built-in tools set this to their own `descriptor.name`.
37
+ error: string; // Human-readable failure message.
38
+ details?: Record<string, unknown>; // Optional context; the built-in read/write tools put `deniedPath` or `path` here.
39
+ }
40
+ export type ToolResult = ToolSuccess | ToolFailure;
41
+ export interface SandboxTool { // A callable tool together with the schema advertised for it.
42
+ descriptor: ToolDescriptor; // Name, description and JSON-schema parameters of the tool.
43
+ invoke(rawArgs: string, ctx: ToolContext): Promise<ToolResult>; // `rawArgs` is the JSON-encoded argument object; the built-in tools report bad arguments as an `ok: false` result instead of throwing.
44
+ }
45
+ /** Truncate a result payload to `maxBytes` with a visible cutoff marker. */
46
+ export declare function truncatePayload(payload: string, maxBytes: number): string;
47
+ export declare function parseArgs(raw: string): Record<string, unknown>;
48
+ export declare function requireString(args: Record<string, unknown>, key: string): string;
49
+ export declare function optionalNumber(args: Record<string, unknown>, key: string): number | undefined;
@@ -0,0 +1,41 @@
1
/**
 * Truncate a result payload to `maxBytes` with a visible cutoff marker.
 *
 * Payloads that already fit are returned unchanged. Longer payloads are cut
 * on a UTF-8 character boundary, so the kept text never ends in a partial
 * character, and the marker is appended. The result fits within `maxBytes`
 * whenever `maxBytes` is at least the marker's own byte length; for smaller
 * limits the marker alone is returned so the cutoff stays visible.
 *
 * @param payload Text to bound.
 * @param maxBytes Byte budget for the UTF-8 encoding of the result.
 * @returns The original payload, or a truncated prefix followed by the marker.
 */
export function truncatePayload(payload, maxBytes) {
    const bytes = Buffer.from(payload, "utf8");
    if (bytes.length <= maxBytes)
        return payload;
    const marker = "\n…[truncated by cclaw sandbox]";
    const markerBytes = Buffer.byteLength(marker, "utf8");
    let cut = Math.max(0, Math.floor(maxBytes) - markerBytes);
    // If the first dropped byte is a UTF-8 continuation byte (0b10xxxxxx) the
    // cut sits inside a character. Back up to that character's lead byte so it
    // is dropped whole rather than decoded as U+FFFD.
    while (cut > 0 && (bytes[cut] & 0xc0) === 0x80) {
        cut -= 1;
    }
    return `${bytes.subarray(0, cut).toString("utf8")}${marker}`;
}
10
/**
 * Decode the raw JSON argument string of a tool call.
 *
 * @param raw JSON text expected to encode an object.
 * @returns The decoded object.
 * @throws Error when `raw` is not a non-blank string, is not valid JSON, or
 *   decodes to anything other than a non-array object.
 */
export function parseArgs(raw) {
    const isBlank = typeof raw !== "string" || raw.trim() === "";
    if (isBlank) {
        throw new Error("tool arguments missing");
    }
    const decode = () => {
        try {
            return JSON.parse(raw);
        }
        catch (err) {
            throw new Error(`tool arguments are not valid JSON: ${err.message}`);
        }
    };
    const decoded = decode();
    const isObject = Boolean(decoded) && typeof decoded === "object" && !Array.isArray(decoded);
    if (isObject) {
        return decoded;
    }
    throw new Error("tool arguments must be a JSON object");
}
26
/**
 * Read a mandatory string argument.
 *
 * @param args Decoded argument object.
 * @param key Property to read.
 * @returns The property value.
 * @throws Error when the value is missing, not a string, or empty.
 */
export function requireString(args, key) {
    const candidate = args[key];
    const isUsable = typeof candidate === "string" && candidate.length > 0;
    if (isUsable) {
        return candidate;
    }
    throw new Error(`"${key}" must be a non-empty string`);
}
33
/**
 * Read an optional numeric argument.
 *
 * @param args Decoded argument object.
 * @param key Property to read.
 * @returns The number, or `undefined` when the property is absent or `null`.
 * @throws Error when the value is present but is not a finite number.
 */
export function optionalNumber(args, key) {
    const candidate = args[key];
    if (candidate == null) {
        return undefined;
    }
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
        return candidate;
    }
    throw new Error(`"${key}" must be a finite number`);
}
@@ -0,0 +1,2 @@
1
+ import { type SandboxTool } from "./types.js";
2
+ export declare const writeTool: SandboxTool;
@@ -0,0 +1,92 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { SandboxEscapeError } from "../sandbox.js";
4
+ import { parseArgs, requireString, truncatePayload } from "./types.js";
5
const DESCRIPTION = "Write a UTF-8 text file inside the sandbox. Creates parent directories " +
    "as needed. Overwrites existing files. Only paths inside the sandbox " +
    "are accepted.";
/**
 * Sandbox `write_file` tool: writes UTF-8 text to a path resolved through
 * `ctx.sandbox.resolve`, creating parent directories first.
 */
export const writeTool = {
    descriptor: {
        name: "write_file",
        description: DESCRIPTION,
        parameters: {
            type: "object",
            additionalProperties: false,
            required: ["path", "content"],
            properties: {
                path: {
                    type: "string",
                    description: "Path relative to the sandbox root."
                },
                content: {
                    type: "string",
                    description: "UTF-8 contents to write."
                }
            }
        }
    },
    /**
     * Write the file.
     *
     * @param rawArgs JSON-encoded argument object (`path`, `content`).
     * @param ctx Tool context supplying the sandbox and `maxResultBytes`.
     * @returns `ok: true` with a one-line summary, or `ok: false` with an
     *   error message; every failure is returned, none is thrown.
     */
    async invoke(rawArgs, ctx) {
        const name = this.descriptor.name;
        const reject = (error, extra = {}) => ({ ok: false, name, error, ...extra });
        let args;
        let relPath;
        try {
            args = parseArgs(rawArgs);
            relPath = requireString(args, "path");
        }
        catch (err) {
            return reject(err.message);
        }
        const { content: rawContent } = args;
        if (typeof rawContent !== "string") {
            return reject('"content" must be a string');
        }
        const payloadBytes = Buffer.byteLength(rawContent, "utf8");
        // The write ceiling is four times the result-size budget.
        const ceiling = ctx.maxResultBytes * 4;
        if (payloadBytes > ceiling) {
            return reject(`"content" exceeds per-invocation ceiling (${payloadBytes} bytes).`);
        }
        let abs;
        try {
            abs = await ctx.sandbox.resolve(relPath, { allowMissing: true });
        }
        catch (err) {
            // Only sandbox escapes are tagged with the offending path.
            const escaped = err instanceof SandboxEscapeError;
            return reject(err.message, {
                details: escaped ? { deniedPath: relPath } : undefined
            });
        }
        try {
            await fs.mkdir(path.dirname(abs), { recursive: true });
            await fs.writeFile(abs, rawContent, "utf8");
        }
        catch (err) {
            return reject(`write failed: ${err.message}`, { details: { path: relPath } });
        }
        return {
            ok: true,
            name,
            content: truncatePayload(`wrote ${payloadBytes} byte(s) to ${relPath}`, ctx.maxResultBytes),
            details: { path: relPath, bytes: payloadBytes }
        };
    }
};
@@ -114,6 +114,31 @@ export interface TraceabilityExpected {
114
114
  */
115
115
  requireIn: string[];
116
116
  }
117
+ /**
118
+ * LLM-judge expectations — Step 3.
119
+ *
120
+ * When present, the judge runs against the resolved artifact (live-agent
121
+ * output in Tier A/B/C, or the pre-generated fixture when `--judge` is
122
+ * combined with `--schema-only` for smoke tests). Every field below is
123
+ * optional; the case-level hint overlays the stage-level rubric loaded
124
+ * from `.cclaw/evals/rubrics/<stage>.yaml`.
125
+ */
126
+ export interface JudgeExpected {
127
+ /**
128
+ * Per-case check ids that MUST be present in the stage rubric. Used when
129
+ * a case wants to assert the rubric covers scenario-specific properties.
130
+ */
131
+ requiredChecks?: string[];
132
+ /**
133
+ * Stage rubric identifier when a stage ships multiple rubrics (e.g.
134
+ * "strict" vs. "lenient"). Defaults to the stage name.
135
+ */
136
+ rubric?: string;
137
+ /** Optional override of `config.judgeSamples` for the case. */
138
+ samples?: number;
139
+ /** Per-check minimum score (1..5 scale). Fail when any score drops below. */
140
+ minimumScores?: Record<string, number>;
141
+ }
117
142
  /** Superset of per-verifier expectation shapes. */
118
143
  export interface ExpectedShape {
119
144
  structural?: StructuralExpected;
@@ -122,7 +147,7 @@ export interface ExpectedShape {
122
147
  /** Cross-stage ID propagation checks — Step 2. */
123
148
  traceability?: TraceabilityExpected;
124
149
  /** LLM-judge rubrics — Step 3. */
125
- judge?: Record<string, unknown>;
150
+ judge?: JudgeExpected;
126
151
  }
127
152
  /**
128
153
  * A single eval case describes one input scenario for one stage. Cases live in
@@ -228,6 +253,44 @@ export interface EvalConfig {
228
253
  timeoutMs: number;
229
254
  /** Max retries per API call on transient failures. */
230
255
  maxRetries: number;
256
+ /**
257
+ * Number of judge samples per case (median-of-N). Defaults to 3 when unset.
258
+ * Must be odd so a true median exists.
259
+ */
260
+ judgeSamples?: number;
261
+ /** Sampling temperature for judge calls. Defaults to 0.0. */
262
+ judgeTemperature?: number;
263
+ /** Sampling temperature for the agent-under-test. Defaults to 0.2. */
264
+ agentTemperature?: number;
265
+ /**
266
+ * Optional per-model USD pricing used by the cost guard. Keys match
267
+ * `model` / `judgeModel`. Values in USD per 1K tokens, so
268
+ * `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
269
+ */
270
+ tokenPricing?: Record<string, TokenPricing>;
271
+ /**
272
+ * Maximum assistant turns (tool_calls → tool result cycles) allowed by
273
+ * the Tier B with-tools agent. Defaults to 8 when unset. Runs that
274
+ * exceed the cap fail with a `MaxTurnsExceededError` and surface as a
275
+ * workflow verifier result.
276
+ */
277
+ toolMaxTurns?: number;
278
+ /**
279
+ * Per-invocation ceiling on tool call arguments bytes. Defends against
280
+ * runaway writes. Defaults to 64 KiB.
281
+ */
282
+ toolMaxArgumentsBytes?: number;
283
+ /**
284
+ * Per-invocation ceiling on tool call result bytes returned to the
285
+ * model. Defaults to 32 KiB; longer results are truncated with a
286
+ * marker so the model sees the cutoff.
287
+ */
288
+ toolMaxResultBytes?: number;
289
+ }
290
+ /** Per-model pricing schedule, expressed as USD per 1K tokens. */
291
+ export interface TokenPricing {
292
+ input: number;
293
+ output: number;
231
294
  }
232
295
  /** Resolved config with env overrides applied. */
233
296
  export interface ResolvedEvalConfig extends EvalConfig {
@@ -279,3 +342,77 @@ export interface BaselineRegression {
279
342
  previousScore?: number;
280
343
  currentScore?: number;
281
344
  }
345
+ /**
346
+ * One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
347
+ * 5 means "the artifact fully meets the bar described by `prompt`".
348
+ */
349
+ export interface RubricCheck {
350
+ /** Kebab-case slug, unique per rubric. Stable across runs. */
351
+ id: string;
352
+ /** Natural-language question posed to the judge. */
353
+ prompt: string;
354
+ /** Human-readable scale description rendered in judge prompts. */
355
+ scale?: string;
356
+ /** Relative weight for the stage's aggregate score. Defaults to 1.0. */
357
+ weight?: number;
358
+ /**
359
+ * When true, any sample below `config.regression.failIfCriticalBelow`
360
+ * flips the verifier to `ok:false` (not just a score drop).
361
+ */
362
+ critical?: boolean;
363
+ }
364
+ /** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
365
+ export interface RubricDoc {
366
+ stage: FlowStage;
367
+ /** Rubric variant label. Required on the parsed doc; defaults to the stage name when none is given. */
368
+ id: string;
369
+ checks: RubricCheck[];
370
+ }
371
+ /**
372
+ * Judge response for a single sample (one API call). The judge is asked to
373
+ * return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
374
+ * `rationales[id]` is a short plain-text explanation, useful in reports but
375
+ * never used for gating.
376
+ */
377
+ export interface JudgeSample {
378
+ scores: Record<string, number>;
379
+ rationales: Record<string, string>;
380
+ }
381
+ /** Aggregated judge output across N samples, per rubric check. */
382
+ export interface JudgeAggregate {
383
+ checkId: string;
384
+ samples: number[];
385
+ median: number;
386
+ mean: number;
387
+ /** True iff every sample returned a score for this check. */
388
+ coverage: boolean;
389
+ }
390
+ /**
391
+ * Judge invocation result. Produced by `runJudge` and consumed by the
392
+ * runner: the runner converts each aggregate into a `VerifierResult` and
393
+ * records `usageUsd` toward the per-case cost.
394
+ */
395
+ export interface JudgeInvocation {
396
+ rubricId: string;
397
+ samples: JudgeSample[];
398
+ aggregates: JudgeAggregate[];
399
+ usageUsd: number;
400
+ durationMs: number;
401
+ }
402
+ /**
403
+ * Tool-use summary produced by the Tier B with-tools agent. Captured so
404
+ * the runner can surface per-case tool metrics in the markdown report
405
+ * (number of calls, depth, error rate, denied paths).
406
+ */
407
+ export interface ToolUseSummary {
408
+ /** Turns consumed before the agent produced a terminal assistant message. */
409
+ turns: number;
410
+ /** Total successful tool invocations across all turns. */
411
+ calls: number;
412
+ /** Tool invocations that returned an error (bad args, denied path, etc.). */
413
+ errors: number;
414
+ /** Paths the sandbox refused to resolve (escape attempts, missing files). */
415
+ deniedPaths: string[];
416
+ /** Per-tool call counts, keyed by tool name. */
417
+ byTool: Record<string, number>;
418
+ }