@alis-build/harness-eval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -10
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +204 -127
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
- package/dist/index.d.ts +397 -153
- package/dist/index.js +125 -5
- package/dist/index.js.map +1 -0
- package/dist/loader-B1WmGGzf.d.ts +107 -0
- package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
- package/dist/loader-DnQ6Jt0i.js.map +1 -0
- package/dist/reporter-Biy-5-9M.js +2216 -0
- package/dist/reporter-Biy-5-9M.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
- package/dist/suite-BcP64nlb.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/types-Bu8uOZZN.d.ts +77 -0
- package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
- package/package.json +7 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js +0 -1396
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/claude-code-ycT0JQZF.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/loader-DTvoVfN0.d.ts +0 -33
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
package/dist/index.d.ts
CHANGED
|
@@ -1,120 +1,39 @@
|
|
|
1
|
-
import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-B9H4IZtA.js";
|
|
2
|
-
import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-6Z17eKZx.js";
|
|
3
|
-
import {
|
|
4
|
-
import { i as
|
|
5
|
-
import { t as
|
|
1
|
+
import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-C0gBkl0-.js";
|
|
2
|
+
import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-DnvP1UBl.js";
|
|
3
|
+
import { i as CodexOptions } from "./types-Bu8uOZZN.js";
|
|
4
|
+
import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-0QkNVyp9.js";
|
|
5
|
+
import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-B1WmGGzf.js";
|
|
6
|
+
import { t as runSuite } from "./suite-BEShV0by.js";
|
|
6
7
|
import { Readable } from "node:stream";
|
|
7
8
|
|
|
8
|
-
//#region src/types/eval-interchange.d.ts
|
|
9
|
-
/**
|
|
10
|
-
* TypeScript types for eval interchange output.
|
|
11
|
-
*/
|
|
12
|
-
interface InterchangeToolCall {
|
|
13
|
-
tool_name: string;
|
|
14
|
-
tool_input: string;
|
|
15
|
-
}
|
|
16
|
-
interface InterchangeTrajectory {
|
|
17
|
-
tool_calls: InterchangeToolCall[];
|
|
18
|
-
}
|
|
19
|
-
interface TabularToolCall {
|
|
20
|
-
tool_name: string;
|
|
21
|
-
tool_input: unknown;
|
|
22
|
-
}
|
|
23
|
-
interface ContentPart {
|
|
24
|
-
text?: string;
|
|
25
|
-
function_call?: {
|
|
26
|
-
name: string;
|
|
27
|
-
args: unknown;
|
|
28
|
-
};
|
|
29
|
-
function_response?: {
|
|
30
|
-
name: string;
|
|
31
|
-
response: unknown;
|
|
32
|
-
};
|
|
33
|
-
}
|
|
34
|
-
interface AgentEvent {
|
|
35
|
-
author: string;
|
|
36
|
-
content: {
|
|
37
|
-
parts: ContentPart[];
|
|
38
|
-
};
|
|
39
|
-
event_time?: string;
|
|
40
|
-
state_delta?: Record<string, unknown>;
|
|
41
|
-
active_tools?: Array<{
|
|
42
|
-
name: string;
|
|
43
|
-
}>;
|
|
44
|
-
}
|
|
45
|
-
interface ConversationTurn {
|
|
46
|
-
turn_index: number;
|
|
47
|
-
turn_id?: string;
|
|
48
|
-
events: AgentEvent[];
|
|
49
|
-
}
|
|
50
|
-
interface AgentConfig {
|
|
51
|
-
agent_id: string;
|
|
52
|
-
agent_type?: string;
|
|
53
|
-
description?: string;
|
|
54
|
-
instruction?: string;
|
|
55
|
-
tools?: Array<{
|
|
56
|
-
name: string;
|
|
57
|
-
}>;
|
|
58
|
-
sub_agents?: string[];
|
|
59
|
-
}
|
|
60
|
-
interface AgentTrace {
|
|
61
|
-
agents: Record<string, AgentConfig>;
|
|
62
|
-
turns: ConversationTurn[];
|
|
63
|
-
}
|
|
64
|
-
interface EvalDatasetRow {
|
|
65
|
-
prompt?: string;
|
|
66
|
-
response?: string;
|
|
67
|
-
reference?: string;
|
|
68
|
-
predicted_trajectory: TabularToolCall[];
|
|
69
|
-
reference_trajectory?: TabularToolCall[];
|
|
70
|
-
latency_in_seconds: number;
|
|
71
|
-
failure: 0 | 1;
|
|
72
|
-
human_ratings?: Record<string, number>;
|
|
73
|
-
}
|
|
74
|
-
interface ProtoTrajectoryInstance {
|
|
75
|
-
predicted_trajectory: InterchangeTrajectory;
|
|
76
|
-
reference_trajectory?: InterchangeTrajectory;
|
|
77
|
-
prompt?: string;
|
|
78
|
-
response?: string;
|
|
79
|
-
reference?: string;
|
|
80
|
-
}
|
|
81
|
-
interface TrajectoryMetrics {
|
|
82
|
-
trajectory_exact_match: number;
|
|
83
|
-
trajectory_in_order_match: number;
|
|
84
|
-
trajectory_any_order_match: number;
|
|
85
|
-
trajectory_precision: number;
|
|
86
|
-
trajectory_recall: number;
|
|
87
|
-
trajectory_single_tool_use: number;
|
|
88
|
-
}
|
|
89
|
-
interface ToolCallMetrics {
|
|
90
|
-
tool_call_valid: number;
|
|
91
|
-
tool_name_match: number;
|
|
92
|
-
tool_parameter_key_match: number;
|
|
93
|
-
tool_parameter_kv_match: number;
|
|
94
|
-
}
|
|
95
|
-
//#endregion
|
|
96
9
|
//#region src/grader/types.d.ts
|
|
97
10
|
/**
|
|
98
11
|
* Outcome grading types (LLM-as-judge layer).
|
|
99
12
|
*
|
|
100
13
|
* Behavioral assertions live in harness-eval assertions; expectations here
|
|
101
14
|
* are natural-language outcome checks graded from trajectory transcripts.
|
|
15
|
+
* Grading runs as a second pass over a {@link SuiteReport} JSON artifact.
|
|
102
16
|
*/
|
|
103
17
|
interface GradedExpectation {
|
|
18
|
+
/** Original expectation text from the suite or sidecar file. */
|
|
104
19
|
text: string;
|
|
105
20
|
passed: boolean;
|
|
21
|
+
/** Quote or description supporting the pass/fail decision. */
|
|
106
22
|
evidence: string;
|
|
107
23
|
}
|
|
24
|
+
/** Aggregate pass/fail counts for one grading unit (rep or full report). */
|
|
108
25
|
interface GradingSummary {
|
|
109
26
|
passed: number;
|
|
110
27
|
failed: number;
|
|
111
28
|
total: number;
|
|
112
29
|
passRate: number;
|
|
113
30
|
}
|
|
31
|
+
/** Suggestion for improving an expectation or assertion wording. */
|
|
114
32
|
interface EvalFeedbackSuggestion {
|
|
115
33
|
assertion?: string;
|
|
116
34
|
reason: string;
|
|
117
35
|
}
|
|
36
|
+
/** Optional meta-feedback from the judge about expectation quality. */
|
|
118
37
|
interface EvalFeedback {
|
|
119
38
|
suggestions: EvalFeedbackSuggestion[];
|
|
120
39
|
overall: string;
|
|
@@ -132,14 +51,22 @@ interface RepGradingResult {
|
|
|
132
51
|
graderError?: string;
|
|
133
52
|
durationMs: number;
|
|
134
53
|
}
|
|
54
|
+
/** Full grading report for a suite run. */
|
|
135
55
|
interface SuiteGradingReport {
|
|
136
56
|
gradedAt: string;
|
|
137
57
|
sourceReport: string;
|
|
138
58
|
/** Grading YAML path when `--config` was used. */
|
|
139
59
|
gradingConfigPath?: string;
|
|
60
|
+
/** Judge that produced outcome grades. */
|
|
61
|
+
judge?: {
|
|
62
|
+
id: string;
|
|
63
|
+
model?: string;
|
|
64
|
+
adapter?: string;
|
|
65
|
+
};
|
|
140
66
|
results: RepGradingResult[];
|
|
141
67
|
summary: GradingSummary;
|
|
142
68
|
}
|
|
69
|
+
/** Options controlling {@link gradeReport} and the CLI `grade` command. */
|
|
143
70
|
interface GradeReportOptions {
|
|
144
71
|
/** Path to the report being graded (stored in output). */
|
|
145
72
|
sourceReport?: string;
|
|
@@ -159,14 +86,19 @@ interface GradeReportOptions {
|
|
|
159
86
|
env?: Record<string, string>;
|
|
160
87
|
/** Working directory for the judge subprocess. */
|
|
161
88
|
cwd?: string;
|
|
89
|
+
/** Grading adapter id. Default: `claude-code`. */
|
|
90
|
+
judgeAdapter?: "claude-code" | "codex";
|
|
162
91
|
/** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
|
|
163
92
|
claudeCode?: Record<string, unknown>;
|
|
93
|
+
/** Codex CLI options for the judge (nested in grading YAML under `codex`). */
|
|
94
|
+
codex?: Record<string, unknown>;
|
|
164
95
|
/** Path to grading YAML when `--config` was used. */
|
|
165
96
|
gradingConfigPath?: string;
|
|
166
97
|
/** Inject a custom grader (for tests). */
|
|
167
98
|
gradeFn?: GraderFn;
|
|
168
99
|
onProgress?: (event: GradeProgressEvent) => void;
|
|
169
100
|
}
|
|
101
|
+
/** Progress events emitted during outcome grading. */
|
|
170
102
|
type GradeProgressEvent = {
|
|
171
103
|
kind: "grade-start";
|
|
172
104
|
total: number;
|
|
@@ -185,13 +117,16 @@ type GradeProgressEvent = {
|
|
|
185
117
|
totalExpectations: number;
|
|
186
118
|
passedExpectations: number;
|
|
187
119
|
};
|
|
120
|
+
/** Pluggable grader implementation (defaults to Claude subprocess). */
|
|
188
121
|
type GraderFn = (input: GraderInput) => Promise<GraderOutput>;
|
|
122
|
+
/** Input passed to a grader for one repetition. */
|
|
189
123
|
interface GraderInput {
|
|
190
124
|
prompt: string;
|
|
191
125
|
transcript: string;
|
|
192
126
|
expectations: string[];
|
|
193
127
|
systemInstruction?: string;
|
|
194
128
|
}
|
|
129
|
+
/** Parsed grader response before alignment with input expectation order. */
|
|
195
130
|
interface GraderOutput {
|
|
196
131
|
expectations: GradedExpectation[];
|
|
197
132
|
summary: GradingSummary;
|
|
@@ -206,20 +141,14 @@ declare const EVAL_RUN_SCHEMA_VERSION = "1.0";
|
|
|
206
141
|
declare const TRAJECTORY_SCHEMA_VERSION = "1.0";
|
|
207
142
|
/** Link to the suite spec that produced a run. */
|
|
208
143
|
interface SuiteReference {
|
|
209
|
-
/** Absolute or repo-relative path to the suite YAML. */
|
|
210
144
|
uri?: string;
|
|
211
|
-
/** Stable suite identifier when known (e.g. case bundle name). */
|
|
212
145
|
id?: string;
|
|
213
|
-
/** SHA-256 or similar hash of suite file contents. */
|
|
214
146
|
contentHash?: string;
|
|
215
147
|
}
|
|
216
148
|
/** Harness that executed the run. */
|
|
217
149
|
interface HarnessInfo {
|
|
218
|
-
/** Adapter id from suite YAML, e.g. `claude-code`. */
|
|
219
150
|
adapter: string;
|
|
220
|
-
/** harness-eval package version when envelope was built. */
|
|
221
151
|
frameworkVersion?: string;
|
|
222
|
-
/** Optional harness binary version (e.g. `claude -v`). */
|
|
223
152
|
harnessVersion?: string;
|
|
224
153
|
}
|
|
225
154
|
/** CI, git, or runtime provenance for correlation in the DB. */
|
|
@@ -244,9 +173,7 @@ interface EvalProvenance {
|
|
|
244
173
|
interface EvalRunSummary {
|
|
245
174
|
cellsTotal: number;
|
|
246
175
|
cellsPassed: number;
|
|
247
|
-
/** All cells passed behavioral assertion thresholds. */
|
|
248
176
|
behavioralPass: boolean;
|
|
249
|
-
/** All graded expectations passed (when outcome layer present). */
|
|
250
177
|
outcomePass?: boolean;
|
|
251
178
|
}
|
|
252
179
|
/** Identity of the judge that produced outcome grades. */
|
|
@@ -254,6 +181,8 @@ interface JudgeInfo {
|
|
|
254
181
|
id: string;
|
|
255
182
|
model?: string;
|
|
256
183
|
version?: string;
|
|
184
|
+
/** Grading adapter id when known (e.g. `codex`, `claude-code`). */
|
|
185
|
+
adapter?: string;
|
|
257
186
|
}
|
|
258
187
|
/** Outcome grades for one repetition (built-in or external judge). */
|
|
259
188
|
interface OutcomeGrades {
|
|
@@ -278,20 +207,14 @@ interface ExternalScore {
|
|
|
278
207
|
}
|
|
279
208
|
/** Optional large or vendor-specific blobs (store by reference in DB when possible). */
|
|
280
209
|
interface EvalArtifacts {
|
|
281
|
-
/** Claude Code `stream-json` lines — debug only, not cross-harness. */
|
|
282
210
|
rawStreamEvents?: unknown[];
|
|
283
|
-
/** URI to OTLP JSON (S3, GCS, etc.). */
|
|
284
211
|
otlpTraceUri?: string;
|
|
285
|
-
/** Text transcript for judges (`trajectoryToTranscript`). */
|
|
286
212
|
transcript?: string;
|
|
287
213
|
}
|
|
288
|
-
/**
|
|
289
|
-
* One harness invocation — the unit external judges and trajectory queries use.
|
|
290
|
-
*/
|
|
214
|
+
/** One harness invocation — the unit external judges and trajectory queries use. */
|
|
291
215
|
interface EvalRepetition {
|
|
292
216
|
repetitionIndex: number;
|
|
293
217
|
durationMs: number;
|
|
294
|
-
/** Normalized harness session. Required when the harness completed with a view. */
|
|
295
218
|
trajectory?: TrajectoryView & {
|
|
296
219
|
schemaVersion: string;
|
|
297
220
|
};
|
|
@@ -300,18 +223,14 @@ interface EvalRepetition {
|
|
|
300
223
|
outcomeGrades?: OutcomeGrades;
|
|
301
224
|
externalScores?: ExternalScore[];
|
|
302
225
|
artifacts?: EvalArtifacts;
|
|
303
|
-
/**
|
|
304
|
-
|
|
305
|
-
/**
|
|
306
|
-
|
|
307
|
-
/**
|
|
308
|
-
|
|
309
|
-
|
|
226
|
+
/** Vertex EvaluationInstance protojson wire object. */
|
|
227
|
+
evaluationInstance?: EvaluationInstanceJson;
|
|
228
|
+
/** Vertex Trajectory*Instance protojson wire objects keyed by metric. */
|
|
229
|
+
trajectoryInstances?: TrajectoryInstancesJson;
|
|
230
|
+
/** Harness-precomputed trajectory metric scores (camelCase). */
|
|
231
|
+
harnessMetrics?: HarnessMetrics;
|
|
232
|
+
latencySeconds?: number;
|
|
310
233
|
failure?: 0 | 1;
|
|
311
|
-
/** Trajectory-level metrics when reference_trajectory is provided. */
|
|
312
|
-
trajectoryMetrics?: TrajectoryMetrics;
|
|
313
|
-
/** Tool-call-level metrics when reference_trajectory is provided. */
|
|
314
|
-
toolCallMetrics?: ToolCallMetrics;
|
|
315
234
|
error?: {
|
|
316
235
|
message: string;
|
|
317
236
|
diagnostics?: Partial<AdapterDiagnostics>;
|
|
@@ -335,24 +254,16 @@ interface EvalCellResult {
|
|
|
335
254
|
expectations?: string[];
|
|
336
255
|
cellLabel: string;
|
|
337
256
|
axes?: Record<string, string>;
|
|
338
|
-
/** Reference
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
human_ratings?: Record<string, number>;
|
|
257
|
+
/** Reference trajectory in Vertex protojson wire format. */
|
|
258
|
+
referenceTrajectory?: ProtojsonTrajectory;
|
|
259
|
+
humanRatings?: Record<string, number>;
|
|
342
260
|
assertionStats: EvalAssertionStat[];
|
|
343
261
|
adapterErrors: number;
|
|
344
|
-
/** Passed all behavioral assertion thresholds for this cell. */
|
|
345
262
|
behavioralPass: boolean;
|
|
346
|
-
/** Passed all outcome expectations when graded; omitted if not graded. */
|
|
347
263
|
outcomePass?: boolean;
|
|
348
264
|
repetitions: EvalRepetition[];
|
|
349
265
|
}
|
|
350
|
-
/**
|
|
351
|
-
* Top-level document for CI/CD pipelines, APIs, and databases.
|
|
352
|
-
*
|
|
353
|
-
* This is the interchange format your storage layer should target — not
|
|
354
|
-
* {@link import("./stream").StreamEvent} or OTLP traces.
|
|
355
|
-
*/
|
|
266
|
+
/** Top-level document for CI/CD pipelines, APIs, and databases. */
|
|
356
267
|
interface EvalRunEnvelope {
|
|
357
268
|
schemaVersion: typeof EVAL_RUN_SCHEMA_VERSION;
|
|
358
269
|
runId: string;
|
|
@@ -365,12 +276,15 @@ interface EvalRunEnvelope {
|
|
|
365
276
|
cells: EvalCellResult[];
|
|
366
277
|
}
|
|
367
278
|
interface BuildEvalRunEnvelopeOptions {
|
|
368
|
-
/**
|
|
279
|
+
/** Override envelope runId; defaults to a random UUID. */
|
|
369
280
|
runId?: string;
|
|
281
|
+
/** Link to the suite YAML that produced the run. */
|
|
370
282
|
suite?: SuiteReference;
|
|
283
|
+
/** Harness adapter metadata; adapter defaults to `"claude-code"`. */
|
|
371
284
|
harness?: Partial<HarnessInfo>;
|
|
285
|
+
/** CI, git, and runtime provenance for correlation. */
|
|
372
286
|
provenance?: EvalProvenance;
|
|
373
|
-
/**
|
|
287
|
+
/** Outcome grades to merge from a grader run. */
|
|
374
288
|
grading?: {
|
|
375
289
|
gradedAt?: string;
|
|
376
290
|
sourceReport?: string;
|
|
@@ -386,9 +300,9 @@ interface BuildEvalRunEnvelopeOptions {
|
|
|
386
300
|
}>;
|
|
387
301
|
judge?: JudgeInfo;
|
|
388
302
|
};
|
|
389
|
-
/** Include transcript
|
|
303
|
+
/** Include text transcript artifact (default true). */
|
|
390
304
|
includeTranscript?: boolean;
|
|
391
|
-
/** Include raw stream events
|
|
305
|
+
/** Include raw stream-json events (default false; debug only). */
|
|
392
306
|
includeRawStreamEvents?: boolean;
|
|
393
307
|
}
|
|
394
308
|
//#endregion
|
|
@@ -506,6 +420,7 @@ declare function getDefaultAdapter(): HarnessAdapter;
|
|
|
506
420
|
declare const DEFAULT_REPETITIONS = 5;
|
|
507
421
|
/** Default assertion pass-rate threshold when `threshold` is omitted. */
|
|
508
422
|
declare const DEFAULT_THRESHOLD = 1;
|
|
423
|
+
/** Injectable adapter run function (used by tests to stub harness I/O). */
|
|
509
424
|
type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Promise<AdapterResult>;
|
|
510
425
|
/**
|
|
511
426
|
* Build the effective adapter config for one (suite, case, cell).
|
|
@@ -513,8 +428,21 @@ type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Pro
|
|
|
513
428
|
* Merge order (later wins): defaultConfig < case.config < cell.config.
|
|
514
429
|
*/
|
|
515
430
|
declare function mergeConfig(suite: TestSuite, testCase: TestCase, cell: MatrixCell): BaseAdapterConfig & Record<string, unknown>;
|
|
431
|
+
/** Effective repetition count for a case (`case.repetitions` or default). */
|
|
516
432
|
declare function getRepetitions(testCase: TestCase): number;
|
|
433
|
+
/**
|
|
434
|
+
* Run one repetition: invoke the adapter, evaluate assertions, capture errors.
|
|
435
|
+
*
|
|
436
|
+
* Adapter failures are returned as {@link RepetitionResult.error} rather than
|
|
437
|
+
* thrown so the suite runner can continue other reps and report adapter error counts.
|
|
438
|
+
*/
|
|
517
439
|
declare function runRepetition(testCase: TestCase, _cell: MatrixCell, config: BaseAdapterConfig & Record<string, unknown>, repetitionIndex: number, run: AdapterRunFn, signal?: AbortSignal): Promise<RepetitionResult>;
|
|
440
|
+
/**
|
|
441
|
+
* Roll up repetition results into a {@link CellReport}.
|
|
442
|
+
*
|
|
443
|
+
* Adapter errors reduce `evaluatedCount` but do not fail the cell by
|
|
444
|
+
* themselves — only assertion threshold misses mark a cell as failed.
|
|
445
|
+
*/
|
|
518
446
|
declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions: RepetitionResult[]): CellReport;
|
|
519
447
|
//#endregion
|
|
520
448
|
//#region src/runner/limit.d.ts
|
|
@@ -536,6 +464,106 @@ declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions
|
|
|
536
464
|
type LimitedRunner = <T>(fn: () => Promise<T>) => Promise<T>;
|
|
537
465
|
declare function createLimit(max: number): LimitedRunner;
|
|
538
466
|
//#endregion
|
|
467
|
+
//#region src/cli/commands/envelope.d.ts
|
|
468
|
+
/** Supported `--projection` values for envelope output. */
|
|
469
|
+
type EnvelopeProjection = "envelope" | "trajectory" | "instances";
|
|
470
|
+
//#endregion
|
|
471
|
+
//#region src/pipeline/resolve-inputs.d.ts
|
|
472
|
+
type PipelineStepName = "run" | "grade" | "envelope";
|
|
473
|
+
/** CLI overrides for pipeline artifact paths (take precedence over YAML). */
|
|
474
|
+
interface PipelineCliOverrides {
|
|
475
|
+
run?: {
|
|
476
|
+
output?: string;
|
|
477
|
+
maxConcurrent?: number;
|
|
478
|
+
};
|
|
479
|
+
grade?: {
|
|
480
|
+
input?: string;
|
|
481
|
+
output?: string;
|
|
482
|
+
maxConcurrent?: number;
|
|
483
|
+
};
|
|
484
|
+
envelope?: {
|
|
485
|
+
report?: string;
|
|
486
|
+
grading?: string;
|
|
487
|
+
output?: string;
|
|
488
|
+
projection?: EnvelopeProjection;
|
|
489
|
+
};
|
|
490
|
+
}
|
|
491
|
+
/** Resolved paths for the harness run step. */
|
|
492
|
+
interface ResolvedPipelineRun {
|
|
493
|
+
output: string;
|
|
494
|
+
maxConcurrent?: number;
|
|
495
|
+
}
|
|
496
|
+
/** Resolved input (suite report) and output (grading JSON) for the grade step. */
|
|
497
|
+
interface ResolvedPipelineGrade {
|
|
498
|
+
input: string;
|
|
499
|
+
output: string;
|
|
500
|
+
maxConcurrent?: number;
|
|
501
|
+
}
|
|
502
|
+
/** Resolved artifact paths for the envelope export step. */
|
|
503
|
+
interface ResolvedPipelineEnvelope {
|
|
504
|
+
report: string;
|
|
505
|
+
grading?: string;
|
|
506
|
+
output: string;
|
|
507
|
+
projection: EnvelopeProjection;
|
|
508
|
+
includeRawStreamEvents: boolean;
|
|
509
|
+
noTranscript: boolean;
|
|
510
|
+
}
|
|
511
|
+
/** Fully resolved pipeline inputs for one or more enabled steps. */
|
|
512
|
+
interface ResolvedPipeline {
|
|
513
|
+
suitePath: string;
|
|
514
|
+
run?: ResolvedPipelineRun;
|
|
515
|
+
grade?: ResolvedPipelineGrade;
|
|
516
|
+
envelope?: ResolvedPipelineEnvelope;
|
|
517
|
+
}
|
|
518
|
+
/** Inputs for {@link resolvePipelineInputs}. */
|
|
519
|
+
interface ResolvePipelineInputsOptions {
|
|
520
|
+
suitePath: string;
|
|
521
|
+
suiteDir: string;
|
|
522
|
+
pipeline: PipelineConfig;
|
|
523
|
+
steps: PipelineStepName[];
|
|
524
|
+
executed?: {
|
|
525
|
+
run?: {
|
|
526
|
+
output: string;
|
|
527
|
+
};
|
|
528
|
+
grade?: {
|
|
529
|
+
input: string;
|
|
530
|
+
output: string;
|
|
531
|
+
};
|
|
532
|
+
};
|
|
533
|
+
overrides?: PipelineCliOverrides;
|
|
534
|
+
}
|
|
535
|
+
/** Resolve absolute paths for enabled pipeline steps. */
|
|
536
|
+
declare function resolvePipelineInputs(options: ResolvePipelineInputsOptions): Promise<ResolvedPipeline>;
|
|
537
|
+
/**
|
|
538
|
+
* Resolve a grading artifact path from a unified suite's `pipeline:` block.
|
|
539
|
+
*
|
|
540
|
+
* Used by `harness-eval envelope --suite` when `--grading` is omitted (spec C-7).
|
|
541
|
+
* Checks `pipeline.envelope.grading` then default `pipeline.grade.output` on disk.
|
|
542
|
+
*/
|
|
543
|
+
declare function resolveGradingArtifactFromSuite(suitePath: string): Promise<string | undefined>;
|
|
544
|
+
//#endregion
|
|
545
|
+
//#region src/pipeline/run-pipeline.d.ts
|
|
546
|
+
/** Options for {@link runPipeline} (CLI flags and progress callbacks). */
|
|
547
|
+
interface RunPipelineOptions {
|
|
548
|
+
/** Comma-separated subset of configured steps (e.g. `run,grade`). */
|
|
549
|
+
steps?: string;
|
|
550
|
+
maxConcurrent?: number;
|
|
551
|
+
overrides?: PipelineCliOverrides;
|
|
552
|
+
onRunProgress?: ProgressCallback;
|
|
553
|
+
onGradeProgress?: GradeReportOptions["onProgress"];
|
|
554
|
+
/** Framework version stamped on envelope export. */
|
|
555
|
+
frameworkVersion?: string;
|
|
556
|
+
}
|
|
557
|
+
/** Outcome of a pipeline run including per-step exit semantics. */
|
|
558
|
+
interface RunPipelineResult {
|
|
559
|
+
/** 0 pass, 1 eval/grade/envelope failure, 2 load error (thrown before return). */
|
|
560
|
+
exitCode: number;
|
|
561
|
+
stepsRun: PipelineStepName[];
|
|
562
|
+
runReport?: SuiteReport;
|
|
563
|
+
}
|
|
564
|
+
/** Execute configured pipeline steps in order; stop on first failure. */
|
|
565
|
+
declare function runPipeline(doc: SuiteDocument, options?: RunPipelineOptions): Promise<RunPipelineResult>;
|
|
566
|
+
//#endregion
|
|
539
567
|
//#region src/otel/types.d.ts
|
|
540
568
|
/**
|
|
541
569
|
* Minimal OTLP JSON types for trace export.
|
|
@@ -543,24 +571,30 @@ declare function createLimit(max: number): LimitedRunner;
|
|
|
543
571
|
* Shapes follow OTLP/HTTP JSON Protobuf encoding (lowerCamelCase field names).
|
|
544
572
|
* @see https://opentelemetry.io/docs/specs/otlp/
|
|
545
573
|
*/
|
|
574
|
+
/** OTLP ExportTraceServiceRequest root — batch of resource spans. */
|
|
546
575
|
interface ExportTraceServiceRequest {
|
|
547
576
|
resourceSpans: ResourceSpans[];
|
|
548
577
|
}
|
|
578
|
+
/** Resource-attributed span group in an export batch. */
|
|
549
579
|
interface ResourceSpans {
|
|
550
580
|
resource: Resource;
|
|
551
581
|
scopeSpans: ScopeSpans[];
|
|
552
582
|
}
|
|
583
|
+
/** OTLP resource descriptor (service.name, agent metadata). */
|
|
553
584
|
interface Resource {
|
|
554
585
|
attributes: KeyValue[];
|
|
555
586
|
}
|
|
587
|
+
/** Spans emitted by one instrumentation scope within a resource. */
|
|
556
588
|
interface ScopeSpans {
|
|
557
589
|
scope: InstrumentationScope;
|
|
558
590
|
spans: Span[];
|
|
559
591
|
}
|
|
592
|
+
/** Instrumentation library identity (name + optional version). */
|
|
560
593
|
interface InstrumentationScope {
|
|
561
594
|
name: string;
|
|
562
595
|
version?: string;
|
|
563
596
|
}
|
|
597
|
+
/** One span in OTLP JSON encoding (nanosecond timestamps as strings). */
|
|
564
598
|
interface Span {
|
|
565
599
|
traceId: string;
|
|
566
600
|
spanId: string;
|
|
@@ -572,14 +606,17 @@ interface Span {
|
|
|
572
606
|
attributes: KeyValue[];
|
|
573
607
|
status?: SpanStatus;
|
|
574
608
|
}
|
|
609
|
+
/** OTLP span status (OK, ERROR, or UNSET). */
|
|
575
610
|
interface SpanStatus {
|
|
576
611
|
code: number;
|
|
577
612
|
message?: string;
|
|
578
613
|
}
|
|
614
|
+
/** Key-value attribute pair on a span or resource. */
|
|
579
615
|
interface KeyValue {
|
|
580
616
|
key: string;
|
|
581
617
|
value: AnyValue;
|
|
582
618
|
}
|
|
619
|
+
/** Discriminated OTLP attribute value (one of the typed fields set). */
|
|
583
620
|
interface AnyValue {
|
|
584
621
|
stringValue?: string;
|
|
585
622
|
boolValue?: boolean;
|
|
@@ -595,6 +632,7 @@ interface ArrayValue {
|
|
|
595
632
|
interface KeyValueList {
|
|
596
633
|
values: KeyValue[];
|
|
597
634
|
}
|
|
635
|
+
/** Options passed to {@link trajectoryToOtlp} / {@link emitOtel}. */
|
|
598
636
|
interface EmitOtelOptions {
|
|
599
637
|
/** User prompt for the first `gen_ai.input.messages` entry. */
|
|
600
638
|
prompt?: string;
|
|
@@ -627,13 +665,20 @@ interface EmitOtelOptions {
|
|
|
627
665
|
* ```
|
|
628
666
|
*/
|
|
629
667
|
declare function trajectoryToOtlp(view: TrajectoryView, options?: EmitOtelOptions): ExportTraceServiceRequest;
|
|
630
|
-
/** Alias
|
|
668
|
+
/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
|
|
631
669
|
declare const emitOtel: typeof trajectoryToOtlp;
|
|
632
670
|
//#endregion
|
|
633
671
|
//#region src/grader/grade-report.d.ts
|
|
672
|
+
/**
|
|
673
|
+
* Grade every repetition in a {@link SuiteReport} that has expectations.
|
|
674
|
+
*
|
|
675
|
+
* Expectations come from inline case fields or an optional sidecar YAML/JSON
|
|
676
|
+
* map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
|
|
677
|
+
*/
|
|
634
678
|
declare function gradeReport(report: SuiteReport, options?: GradeReportOptions): Promise<SuiteGradingReport>;
|
|
635
679
|
//#endregion
|
|
636
680
|
//#region src/grader/resolve-grade-options.d.ts
|
|
681
|
+
/** CLI flag overrides for grading (take precedence over grading YAML). */
|
|
637
682
|
interface GradeCliOverrides {
|
|
638
683
|
model?: string;
|
|
639
684
|
binary?: string;
|
|
@@ -648,9 +693,16 @@ interface GradeCliOverrides {
|
|
|
648
693
|
declare function resolveGradeOptions(fileConfig?: GradingConfig, cli?: GradeCliOverrides, configPath?: string): GradeReportOptions;
|
|
649
694
|
//#endregion
|
|
650
695
|
//#region src/grader/transcript.d.ts
|
|
696
|
+
/**
|
|
697
|
+
* Render a {@link TrajectoryView} as markdown for LLM graders.
|
|
698
|
+
*
|
|
699
|
+
* Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
|
|
700
|
+
* prompts within reasonable token limits.
|
|
701
|
+
*/
|
|
651
702
|
declare function trajectoryToTranscript(view: TrajectoryView, prompt?: string): string;
|
|
652
703
|
//#endregion
|
|
653
704
|
//#region src/grader/claude-grader.d.ts
|
|
705
|
+
/** Options for {@link createClaudeGrader} / {@link runClaudeGrader}. */
|
|
654
706
|
interface ClaudeGraderOptions {
|
|
655
707
|
binary?: string;
|
|
656
708
|
model?: string;
|
|
@@ -659,14 +711,36 @@ interface ClaudeGraderOptions {
|
|
|
659
711
|
cwd?: string;
|
|
660
712
|
claudeCode?: ClaudeCodeOptions;
|
|
661
713
|
}
|
|
714
|
+
/** Factory returning a {@link GraderFn} bound to subprocess options. */
|
|
662
715
|
declare function createClaudeGrader(options?: ClaudeGraderOptions): GraderFn;
|
|
663
716
|
//#endregion
|
|
717
|
+
//#region src/grader/codex-grader.d.ts
|
|
718
|
+
/** Options for {@link createCodexGrader} / {@link runCodexGrader}. */
|
|
719
|
+
interface CodexGraderOptions {
|
|
720
|
+
binary?: string;
|
|
721
|
+
model?: string;
|
|
722
|
+
timeoutMs?: number;
|
|
723
|
+
env?: Record<string, string>;
|
|
724
|
+
cwd?: string;
|
|
725
|
+
codex?: CodexOptions;
|
|
726
|
+
}
|
|
727
|
+
/** Factory returning a {@link GraderFn} bound to subprocess options. */
|
|
728
|
+
declare function createCodexGrader(options?: CodexGraderOptions): GraderFn;
|
|
729
|
+
//#endregion
|
|
664
730
|
//#region src/grader/format-console.d.ts
|
|
731
|
+
/**
|
|
732
|
+
* Format a {@link SuiteGradingReport} for terminal output.
|
|
733
|
+
*
|
|
734
|
+
* @param color When true, emit ANSI status colors (default for TTY console).
|
|
735
|
+
*/
|
|
665
736
|
declare function formatGradingConsole(report: SuiteGradingReport, color?: boolean): string;
|
|
737
|
+
/** True when every graded rep passed all expectations without grader errors. */
|
|
666
738
|
declare function gradingReportPassed(report: SuiteGradingReport): boolean;
|
|
667
739
|
//#endregion
|
|
668
740
|
//#region src/reporter/types.d.ts
|
|
741
|
+
/** Output format selector for {@link formatReport}. */
|
|
669
742
|
type ReportFormat = "console" | "markdown" | "json";
|
|
743
|
+
/** Options for suite report formatting. */
|
|
670
744
|
interface ReporterOptions {
|
|
671
745
|
format: ReportFormat;
|
|
672
746
|
baseline?: SuiteReport;
|
|
@@ -674,52 +748,222 @@ interface ReporterOptions {
|
|
|
674
748
|
}
|
|
675
749
|
//#endregion
|
|
676
750
|
//#region src/reporter/index.d.ts
|
|
751
|
+
/**
|
|
752
|
+
* Format a {@link SuiteReport} for console, markdown, or JSON output.
|
|
753
|
+
*
|
|
754
|
+
* JSON format bypasses the renderable intermediate model and serializes the
|
|
755
|
+
* report directly. Console and markdown apply optional baseline deltas.
|
|
756
|
+
*/
|
|
677
757
|
declare function formatReport(report: SuiteReport, options: ReporterOptions): string;
|
|
678
758
|
//#endregion
|
|
679
759
|
//#region src/eval-record/build.d.ts
|
|
680
760
|
/**
|
|
681
761
|
* Convert a {@link SuiteReport} (and optional grading) into a versioned
|
|
682
762
|
* {@link EvalRunEnvelope} for storage or API handoff.
|
|
763
|
+
*
|
|
764
|
+
* @param report - Runner output for one suite execution.
|
|
765
|
+
* @param options - Provenance, grading merge, and artifact inclusion flags.
|
|
766
|
+
* @returns A fully populated envelope with protojson interchange fields on each repetition.
|
|
683
767
|
*/
|
|
684
768
|
declare function buildEvalRunEnvelope(report: SuiteReport, options?: BuildEvalRunEnvelopeOptions): EvalRunEnvelope;
|
|
685
|
-
/**
|
|
769
|
+
/**
|
|
770
|
+
* Build an envelope from on-disk runner and grader JSON artifacts.
|
|
771
|
+
*
|
|
772
|
+
* Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
|
|
773
|
+
* outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
|
|
774
|
+
* attaches suite URI and SHA-256 content hash for reproducibility.
|
|
775
|
+
*
|
|
776
|
+
* @param reportPath - Path to the suite run report JSON from `harness-eval run`.
|
|
777
|
+
* @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
|
|
778
|
+
*/
|
|
686
779
|
declare function buildEvalRunEnvelopeFromFiles(reportPath: string, options?: BuildEvalRunEnvelopeOptions & {
|
|
687
780
|
gradingPath?: string;
|
|
688
781
|
suitePath?: string;
|
|
689
782
|
}): Promise<EvalRunEnvelope>;
|
|
690
783
|
//#endregion
|
|
691
|
-
//#region src/
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
784
|
+
//#region src/eval-interchange/enrich.d.ts
|
|
785
|
+
/**
|
|
786
|
+
* Attach Vertex protojson interchange fields to one {@link EvalRepetition}.
|
|
787
|
+
*
|
|
788
|
+
* When no trajectory exists (adapter error), sets `failure: 1` and skips
|
|
789
|
+
* protojson payloads. Trajectory instances and harness metrics are only
|
|
790
|
+
* computed when the suite defines a non-empty reference trajectory.
|
|
791
|
+
*
|
|
792
|
+
* @param repetition - Base repetition from the runner (trajectory, assertions, grades).
|
|
793
|
+
* @param options.prompt - Case prompt for EvaluationInstance.
|
|
794
|
+
* @param options.reference - Suite reference trajectory config, if any.
|
|
795
|
+
*/
|
|
796
|
+
declare function enrichRepetitionWithProtojson(repetition: EvalRepetition, options?: {
|
|
797
|
+
prompt?: string;
|
|
798
|
+
reference?: ReferenceTrajectoryConfig;
|
|
799
|
+
}): EvalRepetition;
|
|
800
|
+
//#endregion
|
|
801
|
+
//#region src/eval-interchange/protojson/evaluation-instance.d.ts
|
|
802
|
+
/**
|
|
803
|
+
* Build an EvaluationInstance protojson object from harness strings.
|
|
804
|
+
*
|
|
805
|
+
* Omitted fields are excluded from the output object rather than set to
|
|
806
|
+
* empty wrappers — protojson omits unset optional fields.
|
|
807
|
+
*
|
|
808
|
+
* @param options.prompt - Case prompt sent to the agent.
|
|
809
|
+
* @param options.response - Final agent response from the trajectory.
|
|
810
|
+
* @param options.reference - Optional reference answer text (rare in harness eval).
|
|
811
|
+
*/
|
|
812
|
+
declare function toEvaluationInstance(options: {
|
|
813
|
+
prompt?: string;
|
|
814
|
+
response?: string;
|
|
815
|
+
reference?: string;
|
|
816
|
+
}): EvaluationInstanceJson;
|
|
817
|
+
//#endregion
|
|
818
|
+
//#region src/eval-interchange/protojson/harness-metrics.d.ts
|
|
819
|
+
/** Suite YAML reference step shape accepted by metric computation. */
|
|
820
|
+
type ReferenceStep$1 = {
|
|
696
821
|
tool_name: string;
|
|
697
822
|
tool_input: unknown;
|
|
698
823
|
};
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
824
|
+
/**
|
|
825
|
+
* Compute trajectory metrics and map snake_case keys to Vertex camelCase.
|
|
826
|
+
*
|
|
827
|
+
* When `referenceToolNameMode` is `"bare"`, both predicted and reference tool
|
|
828
|
+
* names are stripped to the suffix after the last `__` so suite reference steps
|
|
829
|
+
* authored with bare names (e.g. `ListLandingZones`) match harness MCP names
|
|
830
|
+
* (e.g. `mcp__plugin__ListLandingZones`).
|
|
831
|
+
*
|
|
832
|
+
* @param predicted - Tool calls from the harness trajectory view.
|
|
833
|
+
* @param reference - Reference steps from suite YAML.
|
|
834
|
+
* @param options.referenceToolNameMode - Name normalization mode from suite YAML.
|
|
835
|
+
*/
|
|
836
|
+
declare function toHarnessMetrics(predicted: ToolCall[], reference: ReferenceStep$1[], options?: {
|
|
837
|
+
referenceToolNameMode?: ReferenceToolNameMode;
|
|
838
|
+
}): HarnessMetrics;
|
|
839
|
+
//#endregion
|
|
840
|
+
//#region src/eval-interchange/protojson/trajectory-instances.d.ts
|
|
841
|
+
type ReferenceStep = {
|
|
842
|
+
tool_name: string;
|
|
843
|
+
tool_input: unknown;
|
|
844
|
+
};
|
|
845
|
+
/**
|
|
846
|
+
* Build all Trajectory*Instance payloads for one predicted/reference pair.
|
|
847
|
+
*
|
|
848
|
+
* Pair metrics (exact, in-order, any-order, precision, recall) share the
|
|
849
|
+
* same trajectory pair; single-tool-use omits the reference trajectory
|
|
850
|
+
* per Vertex API shape.
|
|
851
|
+
*/
|
|
852
|
+
declare function toTrajectoryInstances(options: {
|
|
853
|
+
predicted: ToolCall[];
|
|
854
|
+
reference: ReferenceStep[];
|
|
855
|
+
referenceToolNameMode?: ReferenceToolNameMode;
|
|
856
|
+
}): TrajectoryInstancesJson;
|
|
704
857
|
//#endregion
|
|
705
858
|
//#region src/eval-interchange/projections.d.ts
|
|
859
|
+
/**
|
|
860
|
+
* Trajectory projection — all repetitions in the envelope as dataset rows.
|
|
861
|
+
*/
|
|
706
862
|
declare function toTrajectory(envelope: EvalRunEnvelope): EvalDatasetRow[];
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
863
|
+
/**
|
|
864
|
+
* Instances projection — all trajectory metric instances as JSONL rows.
|
|
865
|
+
*/
|
|
866
|
+
declare function toInstancesJsonl(envelope: EvalRunEnvelope): InstancesJsonlRow[];
|
|
710
867
|
//#endregion
|
|
711
868
|
//#region src/metrics/trajectory.d.ts
|
|
712
|
-
|
|
869
|
+
/**
|
|
870
|
+
* Trajectory-level metrics for comparing predicted and reference tool-call sequences.
|
|
871
|
+
*
|
|
872
|
+
* Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
|
|
873
|
+
* in-order, any-order, precision, recall, single tool use). Tool calls are
|
|
874
|
+
* compared by `(tool_name, serialized tool_input)` identity after normalization.
|
|
875
|
+
*
|
|
876
|
+
* Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
|
|
877
|
+
*/
|
|
878
|
+
/** Canonical wire tool call used internally for comparison. */
|
|
879
|
+
interface WireToolCall {
|
|
713
880
|
tool_name: string;
|
|
714
|
-
tool_input:
|
|
881
|
+
tool_input: string;
|
|
882
|
+
}
|
|
883
|
+
/** All trajectory metric scores for one predicted/reference pair. */
|
|
884
|
+
interface TrajectoryMetrics {
|
|
885
|
+
trajectory_exact_match: number;
|
|
886
|
+
trajectory_in_order_match: number;
|
|
887
|
+
trajectory_any_order_match: number;
|
|
888
|
+
trajectory_precision: number;
|
|
889
|
+
trajectory_recall: number;
|
|
890
|
+
trajectory_single_tool_use: number;
|
|
891
|
+
}
|
|
892
|
+
/** Input accepted by trajectory metrics — wire or harness/YAML shapes. */
|
|
893
|
+
type TrajectoryInput = WireToolCall[] | Array<{
|
|
894
|
+
tool_name: string;
|
|
895
|
+
tool_input: unknown | string;
|
|
715
896
|
}>;
|
|
897
|
+
/** Exact sequence equality after normalization. */
|
|
716
898
|
declare function trajectoryExactMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
899
|
+
/** Reference is a subsequence of predicted (order preserved, extras allowed). */
|
|
717
900
|
declare function trajectoryInOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
901
|
+
/** Same multiset of tool calls; length must match. */
|
|
718
902
|
declare function trajectoryAnyOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
903
|
+
/**
|
|
904
|
+
* Fraction of predicted tool calls that appear in reference (multiset).
|
|
905
|
+
*
|
|
906
|
+
* Returns 1 when both trajectories are empty.
|
|
907
|
+
*/
|
|
719
908
|
declare function trajectoryPrecision(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
909
|
+
/**
|
|
910
|
+
* Fraction of reference tool calls matched in predicted (multiset recall).
|
|
911
|
+
*
|
|
912
|
+
* Returns 1 when both the reference and predicted trajectories are empty.
|
|
913
|
+
*/
|
|
720
914
|
declare function trajectoryRecall(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
915
|
+
/** Both trajectories have exactly one call and they match. */
|
|
721
916
|
declare function trajectorySingleToolUse(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
917
|
+
/** Compute all trajectory metrics in one pass. */
|
|
722
918
|
declare function computeTrajectoryMetrics(predicted: TrajectoryInput, reference: TrajectoryInput): TrajectoryMetrics;
|
|
723
919
|
//#endregion
|
|
724
|
-
|
|
920
|
+
//#region src/metrics/tool-calls.d.ts
|
|
921
|
+
/** Options for parameter value comparison. */
|
|
922
|
+
interface ToolCallMetricOptions {
|
|
923
|
+
/** When true, compare serialized JSON strictly (reserved for future semantics). */
|
|
924
|
+
useStrictStringMatch?: boolean;
|
|
925
|
+
}
|
|
926
|
+
/** Aggregated tool-call metric scores (each 0..1). */
|
|
927
|
+
interface ToolCallMetrics {
|
|
928
|
+
tool_call_valid: number;
|
|
929
|
+
tool_name_match: number;
|
|
930
|
+
tool_parameter_key_match: number;
|
|
931
|
+
tool_parameter_kv_match: number;
|
|
932
|
+
}
|
|
933
|
+
type ToolCallInput = TrajectoryInput[number];
|
|
934
|
+
/**
|
|
935
|
+
* Whether a predicted tool call is well-formed (non-empty name, parseable JSON input).
|
|
936
|
+
*
|
|
937
|
+
* @returns 1 when valid, 0 otherwise.
|
|
938
|
+
*/
|
|
939
|
+
declare function toolCallValid(toolCall: ToolCallInput): number;
|
|
940
|
+
/**
|
|
941
|
+
* Whether predicted and reference tool names match exactly.
|
|
942
|
+
*
|
|
943
|
+
* @returns 1 on match, 0 otherwise.
|
|
944
|
+
*/
|
|
945
|
+
declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
|
|
946
|
+
/**
|
|
947
|
+
* Whether parameter key sets match (the sorted key lists are identical).
|
|
948
|
+
*
|
|
949
|
+
* Requires matching tool names first. Returns 0 when args are not objects.
|
|
950
|
+
*/
|
|
951
|
+
declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
|
|
952
|
+
/**
|
|
953
|
+
* Whether all reference parameter key-value pairs match in the predicted call.
|
|
954
|
+
*
|
|
955
|
+
* Requires {@link toolParameterKeyMatch} first. Only keys present in reference
|
|
956
|
+
* are checked (predicted may have extra keys).
|
|
957
|
+
*/
|
|
958
|
+
declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
|
|
959
|
+
/**
|
|
960
|
+
* Average tool-call metrics across index-aligned predicted/reference pairs.
|
|
961
|
+
*
|
|
962
|
+
* Denominator is `max(predicted.length, reference.length, 1)`. Missing
|
|
963
|
+
* predicted calls at an index are skipped for pair metrics; validity still
|
|
964
|
+
* counts when a predicted call exists.
|
|
965
|
+
*/
|
|
966
|
+
declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
|
|
967
|
+
//#endregion
|
|
968
|
+
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, EvaluationInstanceJson, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, HarnessMetrics, InstanceData, InstancesJsonlRow, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, type PipelineConfig, Predicate, ProgressCallback, ProgressEvent, ProtojsonToolCall, ProtojsonTrajectory, ReferenceToolNameMode, ReferenceTrajectoryConfig, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteDocument, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryInstanceMetricKey, TrajectoryInstancesJson, TrajectoryPairInstanceJson, TrajectorySingleToolUseInstanceJson, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createCodexGrader, 
createLimit, emitOtel, enrichRepetitionWithProtojson, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, loadSuiteDocument, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, resolveGradingArtifactFromSuite, resolvePipelineInputs, runPipeline, runRepetition, runSuite, toEvaluationInstance, toHarnessMetrics, toInstancesJsonl, toTrajectory, toTrajectoryInstances, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
|
|
725
969
|
//# sourceMappingURL=index.d.ts.map
|