@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +104 -10
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +204 -127
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
  15. package/dist/index.d.ts +397 -153
  16. package/dist/index.js +125 -5
  17. package/dist/index.js.map +1 -0
  18. package/dist/loader-B1WmGGzf.d.ts +107 -0
  19. package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
  20. package/dist/loader-DnQ6Jt0i.js.map +1 -0
  21. package/dist/reporter-Biy-5-9M.js +2216 -0
  22. package/dist/reporter-Biy-5-9M.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
  26. package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
  27. package/dist/suite-BcP64nlb.js.map +1 -0
  28. package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
  29. package/dist/types-Bac8_Ixb.js +246 -0
  30. package/dist/types-Bac8_Ixb.js.map +1 -0
  31. package/dist/types-Bu8uOZZN.d.ts +77 -0
  32. package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
  33. package/package.json +7 -2
  34. package/schemas/eval-interchange-instances.schema.json +196 -0
  35. package/schemas/eval-interchange.schema.json +65 -52
  36. package/schemas/eval-run-envelope.schema.json +182 -425
  37. package/dist/build-DsVJ_UeU.js +0 -1396
  38. package/dist/build-DsVJ_UeU.js.map +0 -1
  39. package/dist/claude-code-ycT0JQZF.js.map +0 -1
  40. package/dist/loader-BCnFJ8rm.js.map +0 -1
  41. package/dist/loader-DTvoVfN0.d.ts +0 -33
  42. package/dist/suite-chj0j22j.js.map +0 -1
  43. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  44. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
package/dist/index.d.ts CHANGED
@@ -1,120 +1,39 @@
1
- import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-B9H4IZtA.js";
2
- import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-6Z17eKZx.js";
3
- import { _ as ObjectPredicate, a as ProgressEvent, b as ToolPattern, c as RunSuiteOptions, d as TestSuite, f as Assertion, g as LeafPredicate, h as CompoundPredicate, i as ProgressCallback, l as SuiteReport, m as Cardinality, n as CellReport, o as RepetitionError, p as AssertionResult, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as Predicate, y as ThresholdedAssertion } from "./types-BQol062t.js";
4
- import { i as GradingConfig, r as parseSuite, s as ConfigError, t as loadSuite } from "./loader-DTvoVfN0.js";
5
- import { t as runSuite } from "./suite-BoOvK_lq.js";
1
+ import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-C0gBkl0-.js";
2
+ import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-DnvP1UBl.js";
3
+ import { i as CodexOptions } from "./types-Bu8uOZZN.js";
4
+ import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-0QkNVyp9.js";
5
+ import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-B1WmGGzf.js";
6
+ import { t as runSuite } from "./suite-BEShV0by.js";
6
7
  import { Readable } from "node:stream";
7
8
 
8
- //#region src/types/eval-interchange.d.ts
9
- /**
10
- * TypeScript types for eval interchange output.
11
- */
12
- interface InterchangeToolCall {
13
- tool_name: string;
14
- tool_input: string;
15
- }
16
- interface InterchangeTrajectory {
17
- tool_calls: InterchangeToolCall[];
18
- }
19
- interface TabularToolCall {
20
- tool_name: string;
21
- tool_input: unknown;
22
- }
23
- interface ContentPart {
24
- text?: string;
25
- function_call?: {
26
- name: string;
27
- args: unknown;
28
- };
29
- function_response?: {
30
- name: string;
31
- response: unknown;
32
- };
33
- }
34
- interface AgentEvent {
35
- author: string;
36
- content: {
37
- parts: ContentPart[];
38
- };
39
- event_time?: string;
40
- state_delta?: Record<string, unknown>;
41
- active_tools?: Array<{
42
- name: string;
43
- }>;
44
- }
45
- interface ConversationTurn {
46
- turn_index: number;
47
- turn_id?: string;
48
- events: AgentEvent[];
49
- }
50
- interface AgentConfig {
51
- agent_id: string;
52
- agent_type?: string;
53
- description?: string;
54
- instruction?: string;
55
- tools?: Array<{
56
- name: string;
57
- }>;
58
- sub_agents?: string[];
59
- }
60
- interface AgentTrace {
61
- agents: Record<string, AgentConfig>;
62
- turns: ConversationTurn[];
63
- }
64
- interface EvalDatasetRow {
65
- prompt?: string;
66
- response?: string;
67
- reference?: string;
68
- predicted_trajectory: TabularToolCall[];
69
- reference_trajectory?: TabularToolCall[];
70
- latency_in_seconds: number;
71
- failure: 0 | 1;
72
- human_ratings?: Record<string, number>;
73
- }
74
- interface ProtoTrajectoryInstance {
75
- predicted_trajectory: InterchangeTrajectory;
76
- reference_trajectory?: InterchangeTrajectory;
77
- prompt?: string;
78
- response?: string;
79
- reference?: string;
80
- }
81
- interface TrajectoryMetrics {
82
- trajectory_exact_match: number;
83
- trajectory_in_order_match: number;
84
- trajectory_any_order_match: number;
85
- trajectory_precision: number;
86
- trajectory_recall: number;
87
- trajectory_single_tool_use: number;
88
- }
89
- interface ToolCallMetrics {
90
- tool_call_valid: number;
91
- tool_name_match: number;
92
- tool_parameter_key_match: number;
93
- tool_parameter_kv_match: number;
94
- }
95
- //#endregion
96
9
  //#region src/grader/types.d.ts
97
10
  /**
98
11
  * Outcome grading types (LLM-as-judge layer).
99
12
  *
100
13
  * Behavioral assertions live in harness-eval assertions; expectations here
101
14
  * are natural-language outcome checks graded from trajectory transcripts.
15
+ * Grading runs as a second pass over a {@link SuiteReport} JSON artifact.
102
16
  */
103
17
  interface GradedExpectation {
18
+ /** Original expectation text from the suite or sidecar file. */
104
19
  text: string;
105
20
  passed: boolean;
21
+ /** Quote or description supporting the pass/fail decision. */
106
22
  evidence: string;
107
23
  }
24
+ /** Aggregate pass/fail counts for one grading unit (rep or full report). */
108
25
  interface GradingSummary {
109
26
  passed: number;
110
27
  failed: number;
111
28
  total: number;
112
29
  passRate: number;
113
30
  }
31
+ /** Suggestion for improving an expectation or assertion wording. */
114
32
  interface EvalFeedbackSuggestion {
115
33
  assertion?: string;
116
34
  reason: string;
117
35
  }
36
+ /** Optional meta-feedback from the judge about expectation quality. */
118
37
  interface EvalFeedback {
119
38
  suggestions: EvalFeedbackSuggestion[];
120
39
  overall: string;
@@ -132,14 +51,22 @@ interface RepGradingResult {
132
51
  graderError?: string;
133
52
  durationMs: number;
134
53
  }
54
+ /** Full grading report for a suite run. */
135
55
  interface SuiteGradingReport {
136
56
  gradedAt: string;
137
57
  sourceReport: string;
138
58
  /** Grading YAML path when `--config` was used. */
139
59
  gradingConfigPath?: string;
60
+ /** Judge that produced outcome grades. */
61
+ judge?: {
62
+ id: string;
63
+ model?: string;
64
+ adapter?: string;
65
+ };
140
66
  results: RepGradingResult[];
141
67
  summary: GradingSummary;
142
68
  }
69
+ /** Options controlling {@link gradeReport} and the CLI `grade` command. */
143
70
  interface GradeReportOptions {
144
71
  /** Path to the report being graded (stored in output). */
145
72
  sourceReport?: string;
@@ -159,14 +86,19 @@ interface GradeReportOptions {
159
86
  env?: Record<string, string>;
160
87
  /** Working directory for the judge subprocess. */
161
88
  cwd?: string;
89
+ /** Grading adapter id. Default: `claude-code`. */
90
+ judgeAdapter?: "claude-code" | "codex";
162
91
  /** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
163
92
  claudeCode?: Record<string, unknown>;
93
+ /** Codex CLI options for the judge (nested in grading YAML under `codex`). */
94
+ codex?: Record<string, unknown>;
164
95
  /** Path to grading YAML when `--config` was used. */
165
96
  gradingConfigPath?: string;
166
97
  /** Inject a custom grader (for tests). */
167
98
  gradeFn?: GraderFn;
168
99
  onProgress?: (event: GradeProgressEvent) => void;
169
100
  }
101
+ /** Progress events emitted during outcome grading. */
170
102
  type GradeProgressEvent = {
171
103
  kind: "grade-start";
172
104
  total: number;
@@ -185,13 +117,16 @@ type GradeProgressEvent = {
185
117
  totalExpectations: number;
186
118
  passedExpectations: number;
187
119
  };
120
+ /** Pluggable grader implementation (defaults to Claude subprocess). */
188
121
  type GraderFn = (input: GraderInput) => Promise<GraderOutput>;
122
+ /** Input passed to a grader for one repetition. */
189
123
  interface GraderInput {
190
124
  prompt: string;
191
125
  transcript: string;
192
126
  expectations: string[];
193
127
  systemInstruction?: string;
194
128
  }
129
+ /** Parsed grader response before alignment with input expectation order. */
195
130
  interface GraderOutput {
196
131
  expectations: GradedExpectation[];
197
132
  summary: GradingSummary;
@@ -206,20 +141,14 @@ declare const EVAL_RUN_SCHEMA_VERSION = "1.0";
206
141
  declare const TRAJECTORY_SCHEMA_VERSION = "1.0";
207
142
  /** Link to the suite spec that produced a run. */
208
143
  interface SuiteReference {
209
- /** Absolute or repo-relative path to the suite YAML. */
210
144
  uri?: string;
211
- /** Stable suite identifier when known (e.g. case bundle name). */
212
145
  id?: string;
213
- /** SHA-256 or similar hash of suite file contents. */
214
146
  contentHash?: string;
215
147
  }
216
148
  /** Harness that executed the run. */
217
149
  interface HarnessInfo {
218
- /** Adapter id from suite YAML, e.g. `claude-code`. */
219
150
  adapter: string;
220
- /** harness-eval package version when envelope was built. */
221
151
  frameworkVersion?: string;
222
- /** Optional harness binary version (e.g. `claude -v`). */
223
152
  harnessVersion?: string;
224
153
  }
225
154
  /** CI, git, or runtime provenance for correlation in the DB. */
@@ -244,9 +173,7 @@ interface EvalProvenance {
244
173
  interface EvalRunSummary {
245
174
  cellsTotal: number;
246
175
  cellsPassed: number;
247
- /** All cells passed behavioral assertion thresholds. */
248
176
  behavioralPass: boolean;
249
- /** All graded expectations passed (when outcome layer present). */
250
177
  outcomePass?: boolean;
251
178
  }
252
179
  /** Identity of the judge that produced outcome grades. */
@@ -254,6 +181,8 @@ interface JudgeInfo {
254
181
  id: string;
255
182
  model?: string;
256
183
  version?: string;
184
+ /** Grading adapter id when known (e.g. `codex`, `claude-code`). */
185
+ adapter?: string;
257
186
  }
258
187
  /** Outcome grades for one repetition (built-in or external judge). */
259
188
  interface OutcomeGrades {
@@ -278,20 +207,14 @@ interface ExternalScore {
278
207
  }
279
208
  /** Optional large or vendor-specific blobs (store by reference in DB when possible). */
280
209
  interface EvalArtifacts {
281
- /** Claude Code `stream-json` lines — debug only, not cross-harness. */
282
210
  rawStreamEvents?: unknown[];
283
- /** URI to OTLP JSON (S3, GCS, etc.). */
284
211
  otlpTraceUri?: string;
285
- /** Text transcript for judges (`trajectoryToTranscript`). */
286
212
  transcript?: string;
287
213
  }
288
- /**
289
- * One harness invocation — the unit external judges and trajectory queries use.
290
- */
214
+ /** One harness invocation — the unit external judges and trajectory queries use. */
291
215
  interface EvalRepetition {
292
216
  repetitionIndex: number;
293
217
  durationMs: number;
294
- /** Normalized harness session. Required when the harness completed with a view. */
295
218
  trajectory?: TrajectoryView & {
296
219
  schemaVersion: string;
297
220
  };
@@ -300,18 +223,14 @@ interface EvalRepetition {
300
223
  outcomeGrades?: OutcomeGrades;
301
224
  externalScores?: ExternalScore[];
302
225
  artifacts?: EvalArtifacts;
303
- /** Interchange-format predicted tool-call trajectory. */
304
- predicted_trajectory?: InterchangeToolCall[];
305
- /** Full multi-turn agent trace in interchange format. */
306
- agent_trace?: AgentTrace;
307
- /** Session latency in seconds (interchange field). */
308
- latency_in_seconds?: number;
309
- /** 1 when the harness run failed, 0 on success (interchange field). */
226
+ /** Vertex EvaluationInstance protojson wire object. */
227
+ evaluationInstance?: EvaluationInstanceJson;
228
+ /** Vertex Trajectory*Instance protojson wire objects keyed by metric. */
229
+ trajectoryInstances?: TrajectoryInstancesJson;
230
+ /** Harness-precomputed trajectory metric scores (camelCase). */
231
+ harnessMetrics?: HarnessMetrics;
232
+ latencySeconds?: number;
310
233
  failure?: 0 | 1;
311
- /** Trajectory-level metrics when reference_trajectory is provided. */
312
- trajectoryMetrics?: TrajectoryMetrics;
313
- /** Tool-call-level metrics when reference_trajectory is provided. */
314
- toolCallMetrics?: ToolCallMetrics;
315
234
  error?: {
316
235
  message: string;
317
236
  diagnostics?: Partial<AdapterDiagnostics>;
@@ -335,24 +254,16 @@ interface EvalCellResult {
335
254
  expectations?: string[];
336
255
  cellLabel: string;
337
256
  axes?: Record<string, string>;
338
- /** Reference tool-call trajectory for metric computation. */
339
- reference_trajectory?: TabularToolCall[];
340
- /** Human ratings keyed by metric name for judge calibration. */
341
- human_ratings?: Record<string, number>;
257
+ /** Reference trajectory in Vertex protojson wire format. */
258
+ referenceTrajectory?: ProtojsonTrajectory;
259
+ humanRatings?: Record<string, number>;
342
260
  assertionStats: EvalAssertionStat[];
343
261
  adapterErrors: number;
344
- /** Passed all behavioral assertion thresholds for this cell. */
345
262
  behavioralPass: boolean;
346
- /** Passed all outcome expectations when graded; omitted if not graded. */
347
263
  outcomePass?: boolean;
348
264
  repetitions: EvalRepetition[];
349
265
  }
350
- /**
351
- * Top-level document for CI/CD pipelines, APIs, and databases.
352
- *
353
- * This is the interchange format your storage layer should target — not
354
- * {@link import("./stream").StreamEvent} or OTLP traces.
355
- */
266
+ /** Top-level document for CI/CD pipelines, APIs, and databases. */
356
267
  interface EvalRunEnvelope {
357
268
  schemaVersion: typeof EVAL_RUN_SCHEMA_VERSION;
358
269
  runId: string;
@@ -365,12 +276,15 @@ interface EvalRunEnvelope {
365
276
  cells: EvalCellResult[];
366
277
  }
367
278
  interface BuildEvalRunEnvelopeOptions {
368
- /** UUID for this run; generated if omitted. */
279
+ /** Override envelope runId; defaults to a random UUID. */
369
280
  runId?: string;
281
+ /** Link to the suite YAML that produced the run. */
370
282
  suite?: SuiteReference;
283
+ /** Harness adapter metadata; adapter defaults to `"claude-code"`. */
371
284
  harness?: Partial<HarnessInfo>;
285
+ /** CI, git, and runtime provenance for correlation. */
372
286
  provenance?: EvalProvenance;
373
- /** Merge outcome grades from `gradeReport()` or compatible structure. */
287
+ /** Outcome grades to merge from a grader run. */
374
288
  grading?: {
375
289
  gradedAt?: string;
376
290
  sourceReport?: string;
@@ -386,9 +300,9 @@ interface BuildEvalRunEnvelopeOptions {
386
300
  }>;
387
301
  judge?: JudgeInfo;
388
302
  };
389
- /** Include transcript in each repetition's artifacts. Default true. */
303
+ /** Include text transcript artifact (default true). */
390
304
  includeTranscript?: boolean;
391
- /** Include raw stream events when adapter provides them. Default false. */
305
+ /** Include raw stream-json events (default false; debug only). */
392
306
  includeRawStreamEvents?: boolean;
393
307
  }
394
308
  //#endregion
@@ -506,6 +420,7 @@ declare function getDefaultAdapter(): HarnessAdapter;
506
420
  declare const DEFAULT_REPETITIONS = 5;
507
421
  /** Default assertion pass-rate threshold when `threshold` is omitted. */
508
422
  declare const DEFAULT_THRESHOLD = 1;
423
+ /** Injectable adapter run function (used by tests to stub harness I/O). */
509
424
  type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Promise<AdapterResult>;
510
425
  /**
511
426
  * Build the effective adapter config for one (suite, case, cell).
@@ -513,8 +428,21 @@ type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Pro
513
428
  * Merge order (later wins): defaultConfig < case.config < cell.config.
514
429
  */
515
430
  declare function mergeConfig(suite: TestSuite, testCase: TestCase, cell: MatrixCell): BaseAdapterConfig & Record<string, unknown>;
431
+ /** Effective repetition count for a case (`case.repetitions` or default). */
516
432
  declare function getRepetitions(testCase: TestCase): number;
433
+ /**
434
+ * Run one repetition: invoke the adapter, evaluate assertions, capture errors.
435
+ *
436
+ * Adapter failures are returned as {@link RepetitionResult.error} rather than
437
+ * thrown so the suite runner can continue other reps and report adapter error counts.
438
+ */
517
439
  declare function runRepetition(testCase: TestCase, _cell: MatrixCell, config: BaseAdapterConfig & Record<string, unknown>, repetitionIndex: number, run: AdapterRunFn, signal?: AbortSignal): Promise<RepetitionResult>;
440
+ /**
441
+ * Roll up repetition results into a {@link CellReport}.
442
+ *
443
+ * Adapter errors reduce `evaluatedCount` but do not fail the cell by
444
+ * themselves — only assertion threshold misses mark a cell as failed.
445
+ */
518
446
  declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions: RepetitionResult[]): CellReport;
519
447
  //#endregion
520
448
  //#region src/runner/limit.d.ts
@@ -536,6 +464,106 @@ declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions
536
464
  type LimitedRunner = <T>(fn: () => Promise<T>) => Promise<T>;
537
465
  declare function createLimit(max: number): LimitedRunner;
538
466
  //#endregion
467
+ //#region src/cli/commands/envelope.d.ts
468
+ /** Supported `--projection` values for envelope output. */
469
+ type EnvelopeProjection = "envelope" | "trajectory" | "instances";
470
+ //#endregion
471
+ //#region src/pipeline/resolve-inputs.d.ts
472
+ type PipelineStepName = "run" | "grade" | "envelope";
473
+ /** CLI overrides for pipeline artifact paths (take precedence over YAML). */
474
+ interface PipelineCliOverrides {
475
+ run?: {
476
+ output?: string;
477
+ maxConcurrent?: number;
478
+ };
479
+ grade?: {
480
+ input?: string;
481
+ output?: string;
482
+ maxConcurrent?: number;
483
+ };
484
+ envelope?: {
485
+ report?: string;
486
+ grading?: string;
487
+ output?: string;
488
+ projection?: EnvelopeProjection;
489
+ };
490
+ }
491
+ /** Resolved paths for the harness run step. */
492
+ interface ResolvedPipelineRun {
493
+ output: string;
494
+ maxConcurrent?: number;
495
+ }
496
+ /** Resolved input (suite report) and output (grading JSON) for the grade step. */
497
+ interface ResolvedPipelineGrade {
498
+ input: string;
499
+ output: string;
500
+ maxConcurrent?: number;
501
+ }
502
+ /** Resolved artifact paths for the envelope export step. */
503
+ interface ResolvedPipelineEnvelope {
504
+ report: string;
505
+ grading?: string;
506
+ output: string;
507
+ projection: EnvelopeProjection;
508
+ includeRawStreamEvents: boolean;
509
+ noTranscript: boolean;
510
+ }
511
+ /** Fully resolved pipeline inputs for one or more enabled steps. */
512
+ interface ResolvedPipeline {
513
+ suitePath: string;
514
+ run?: ResolvedPipelineRun;
515
+ grade?: ResolvedPipelineGrade;
516
+ envelope?: ResolvedPipelineEnvelope;
517
+ }
518
+ /** Inputs for {@link resolvePipelineInputs}. */
519
+ interface ResolvePipelineInputsOptions {
520
+ suitePath: string;
521
+ suiteDir: string;
522
+ pipeline: PipelineConfig;
523
+ steps: PipelineStepName[];
524
+ executed?: {
525
+ run?: {
526
+ output: string;
527
+ };
528
+ grade?: {
529
+ input: string;
530
+ output: string;
531
+ };
532
+ };
533
+ overrides?: PipelineCliOverrides;
534
+ }
535
+ /** Resolve absolute paths for enabled pipeline steps. */
536
+ declare function resolvePipelineInputs(options: ResolvePipelineInputsOptions): Promise<ResolvedPipeline>;
537
+ /**
538
+ * Resolve a grading artifact path from a unified suite's `pipeline:` block.
539
+ *
540
+ * Used by `harness-eval envelope --suite` when `--grading` is omitted (spec C-7).
541
+ * Checks `pipeline.envelope.grading` then default `pipeline.grade.output` on disk.
542
+ */
543
+ declare function resolveGradingArtifactFromSuite(suitePath: string): Promise<string | undefined>;
544
+ //#endregion
545
+ //#region src/pipeline/run-pipeline.d.ts
546
+ /** Options for {@link runPipeline} (CLI flags and progress callbacks). */
547
+ interface RunPipelineOptions {
548
+ /** Comma-separated subset of configured steps (e.g. `run,grade`). */
549
+ steps?: string;
550
+ maxConcurrent?: number;
551
+ overrides?: PipelineCliOverrides;
552
+ onRunProgress?: ProgressCallback;
553
+ onGradeProgress?: GradeReportOptions["onProgress"];
554
+ /** Framework version stamped on envelope export. */
555
+ frameworkVersion?: string;
556
+ }
557
+ /** Outcome of a pipeline run including per-step exit semantics. */
558
+ interface RunPipelineResult {
559
+ /** 0 pass, 1 eval/grade/envelope failure, 2 load error (thrown before return). */
560
+ exitCode: number;
561
+ stepsRun: PipelineStepName[];
562
+ runReport?: SuiteReport;
563
+ }
564
+ /** Execute configured pipeline steps in order; stop on first failure. */
565
+ declare function runPipeline(doc: SuiteDocument, options?: RunPipelineOptions): Promise<RunPipelineResult>;
566
+ //#endregion
539
567
  //#region src/otel/types.d.ts
540
568
  /**
541
569
  * Minimal OTLP JSON types for trace export.
@@ -543,24 +571,30 @@ declare function createLimit(max: number): LimitedRunner;
543
571
  * Shapes follow OTLP/HTTP JSON Protobuf encoding (lowerCamelCase field names).
544
572
  * @see https://opentelemetry.io/docs/specs/otlp/
545
573
  */
574
+ /** OTLP ExportTraceServiceRequest root — batch of resource spans. */
546
575
  interface ExportTraceServiceRequest {
547
576
  resourceSpans: ResourceSpans[];
548
577
  }
578
+ /** Resource-attributed span group in an export batch. */
549
579
  interface ResourceSpans {
550
580
  resource: Resource;
551
581
  scopeSpans: ScopeSpans[];
552
582
  }
583
+ /** OTLP resource descriptor (service.name, agent metadata). */
553
584
  interface Resource {
554
585
  attributes: KeyValue[];
555
586
  }
587
+ /** Spans emitted by one instrumentation scope within a resource. */
556
588
  interface ScopeSpans {
557
589
  scope: InstrumentationScope;
558
590
  spans: Span[];
559
591
  }
592
+ /** Instrumentation library identity (name + optional version). */
560
593
  interface InstrumentationScope {
561
594
  name: string;
562
595
  version?: string;
563
596
  }
597
+ /** One span in OTLP JSON encoding (nanosecond timestamps as strings). */
564
598
  interface Span {
565
599
  traceId: string;
566
600
  spanId: string;
@@ -572,14 +606,17 @@ interface Span {
572
606
  attributes: KeyValue[];
573
607
  status?: SpanStatus;
574
608
  }
609
+ /** OTLP span status (OK, ERROR, or UNSET). */
575
610
  interface SpanStatus {
576
611
  code: number;
577
612
  message?: string;
578
613
  }
614
+ /** Key-value attribute pair on a span or resource. */
579
615
  interface KeyValue {
580
616
  key: string;
581
617
  value: AnyValue;
582
618
  }
619
+ /** Discriminated OTLP attribute value (one of the typed fields set). */
583
620
  interface AnyValue {
584
621
  stringValue?: string;
585
622
  boolValue?: boolean;
@@ -595,6 +632,7 @@ interface ArrayValue {
595
632
  interface KeyValueList {
596
633
  values: KeyValue[];
597
634
  }
635
+ /** Options passed to {@link trajectoryToOtlp} / {@link emitOtel}. */
598
636
  interface EmitOtelOptions {
599
637
  /** User prompt for the first `gen_ai.input.messages` entry. */
600
638
  prompt?: string;
@@ -627,13 +665,20 @@ interface EmitOtelOptions {
627
665
  * ```
628
666
  */
629
667
  declare function trajectoryToOtlp(view: TrajectoryView, options?: EmitOtelOptions): ExportTraceServiceRequest;
630
- /** Alias matching the implementation plan naming. */
668
+ /** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
631
669
  declare const emitOtel: typeof trajectoryToOtlp;
632
670
  //#endregion
633
671
  //#region src/grader/grade-report.d.ts
672
+ /**
673
+ * Grade every repetition in a {@link SuiteReport} that has expectations.
674
+ *
675
+ * Expectations come from inline case fields or an optional sidecar YAML/JSON
676
+ * map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
677
+ */
634
678
  declare function gradeReport(report: SuiteReport, options?: GradeReportOptions): Promise<SuiteGradingReport>;
635
679
  //#endregion
636
680
  //#region src/grader/resolve-grade-options.d.ts
681
+ /** CLI flag overrides for grading (take precedence over grading YAML). */
637
682
  interface GradeCliOverrides {
638
683
  model?: string;
639
684
  binary?: string;
@@ -648,9 +693,16 @@ interface GradeCliOverrides {
648
693
  declare function resolveGradeOptions(fileConfig?: GradingConfig, cli?: GradeCliOverrides, configPath?: string): GradeReportOptions;
649
694
  //#endregion
650
695
  //#region src/grader/transcript.d.ts
696
+ /**
697
+ * Render a {@link TrajectoryView} as markdown for LLM graders.
698
+ *
699
+ * Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
700
+ * prompts within reasonable token limits.
701
+ */
651
702
  declare function trajectoryToTranscript(view: TrajectoryView, prompt?: string): string;
652
703
  //#endregion
653
704
  //#region src/grader/claude-grader.d.ts
705
+ /** Options for {@link createClaudeGrader} / {@link runClaudeGrader}. */
654
706
  interface ClaudeGraderOptions {
655
707
  binary?: string;
656
708
  model?: string;
@@ -659,14 +711,36 @@ interface ClaudeGraderOptions {
659
711
  cwd?: string;
660
712
  claudeCode?: ClaudeCodeOptions;
661
713
  }
714
+ /** Factory returning a {@link GraderFn} bound to subprocess options. */
662
715
  declare function createClaudeGrader(options?: ClaudeGraderOptions): GraderFn;
663
716
  //#endregion
717
+ //#region src/grader/codex-grader.d.ts
718
+ /** Options for {@link createCodexGrader} / {@link runCodexGrader}. */
719
+ interface CodexGraderOptions {
720
+ binary?: string;
721
+ model?: string;
722
+ timeoutMs?: number;
723
+ env?: Record<string, string>;
724
+ cwd?: string;
725
+ codex?: CodexOptions;
726
+ }
727
+ /** Factory returning a {@link GraderFn} bound to subprocess options. */
728
+ declare function createCodexGrader(options?: CodexGraderOptions): GraderFn;
729
+ //#endregion
664
730
  //#region src/grader/format-console.d.ts
731
+ /**
732
+ * Format a {@link SuiteGradingReport} for terminal output.
733
+ *
734
+ * @param color When true, emit ANSI status colors (default for TTY console).
735
+ */
665
736
  declare function formatGradingConsole(report: SuiteGradingReport, color?: boolean): string;
737
+ /** True when every graded rep passed all expectations without grader errors. */
666
738
  declare function gradingReportPassed(report: SuiteGradingReport): boolean;
667
739
  //#endregion
668
740
  //#region src/reporter/types.d.ts
741
+ /** Output format selector for {@link formatReport}. */
669
742
  type ReportFormat = "console" | "markdown" | "json";
743
+ /** Options for suite report formatting. */
670
744
  interface ReporterOptions {
671
745
  format: ReportFormat;
672
746
  baseline?: SuiteReport;
@@ -674,52 +748,222 @@ interface ReporterOptions {
674
748
  }
675
749
  //#endregion
676
750
  //#region src/reporter/index.d.ts
751
+ /**
752
+ * Format a {@link SuiteReport} for console, markdown, or JSON output.
753
+ *
754
+ * JSON format bypasses the renderable intermediate model and serializes the
755
+ * report directly. Console and markdown apply optional baseline deltas.
756
+ */
677
757
  declare function formatReport(report: SuiteReport, options: ReporterOptions): string;
678
758
  //#endregion
679
759
  //#region src/eval-record/build.d.ts
680
760
  /**
681
761
  * Convert a {@link SuiteReport} (and optional grading) into a versioned
682
762
  * {@link EvalRunEnvelope} for storage or API handoff.
763
+ *
764
+ * @param report - Runner output for one suite execution.
765
+ * @param options - Provenance, grading merge, and artifact inclusion flags.
766
+ * @returns A fully populated envelope with protojson interchange fields on each repetition.
683
767
  */
684
768
  declare function buildEvalRunEnvelope(report: SuiteReport, options?: BuildEvalRunEnvelopeOptions): EvalRunEnvelope;
685
- /** Build envelope from on-disk report + optional grading JSON paths. */
769
+ /**
770
+ * Build an envelope from on-disk runner and grader JSON artifacts.
771
+ *
772
+ * Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
773
+ * outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
774
+ * attaches suite URI and SHA-256 content hash for reproducibility.
775
+ *
776
+ * @param reportPath - Path to the suite run report JSON from `harness-eval run`.
777
+ * @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
778
+ */
686
779
  declare function buildEvalRunEnvelopeFromFiles(reportPath: string, options?: BuildEvalRunEnvelopeOptions & {
687
780
  gradingPath?: string;
688
781
  suitePath?: string;
689
782
  }): Promise<EvalRunEnvelope>;
690
783
  //#endregion
691
- //#region src/metrics/tool-calls.d.ts
692
- interface ToolCallMetricOptions {
693
- useStrictStringMatch?: boolean;
694
- }
695
- type ToolCallInput = InterchangeToolCall | TabularToolCall | {
784
+ //#region src/eval-interchange/enrich.d.ts
785
+ /**
786
+ * Attach Vertex protojson interchange fields to one {@link EvalRepetition}.
787
+ *
788
+ * When no trajectory exists (adapter error), sets `failure: 1` and skips
789
+ * protojson payloads. Trajectory instances and harness metrics are only
790
+ * computed when the suite defines a non-empty reference trajectory.
791
+ *
792
+ * @param repetition - Base repetition from the runner (trajectory, assertions, grades).
793
+ * @param options.prompt - Case prompt for EvaluationInstance.
794
+ * @param options.reference - Suite reference trajectory config, if any.
795
+ */
796
+ declare function enrichRepetitionWithProtojson(repetition: EvalRepetition, options?: {
797
+ prompt?: string;
798
+ reference?: ReferenceTrajectoryConfig;
799
+ }): EvalRepetition;
800
+ //#endregion
801
+ //#region src/eval-interchange/protojson/evaluation-instance.d.ts
802
+ /**
803
+ * Build an EvaluationInstance protojson object from harness strings.
804
+ *
805
+ * Omitted fields are excluded from the output object rather than set to
806
+ * empty wrappers — protojson omits unset optional fields.
807
+ *
808
+ * @param options.prompt - Case prompt sent to the agent.
809
+ * @param options.response - Final agent response from the trajectory.
810
+ * @param options.reference - Optional reference answer text (rare in harness eval).
811
+ */
812
+ declare function toEvaluationInstance(options: {
813
+ prompt?: string;
814
+ response?: string;
815
+ reference?: string;
816
+ }): EvaluationInstanceJson;
817
+ //#endregion
818
+ //#region src/eval-interchange/protojson/harness-metrics.d.ts
819
+ /** Suite YAML reference step shape accepted by metric computation. */
820
+ type ReferenceStep$1 = {
696
821
  tool_name: string;
697
822
  tool_input: unknown;
698
823
  };
699
- declare function toolCallValid(toolCall: ToolCallInput): number;
700
- declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
701
- declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
702
- declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
703
- declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
824
+ /**
825
+ * Compute trajectory metrics and map snake_case keys to Vertex camelCase.
826
+ *
827
+ * When `referenceToolNameMode` is `"bare"`, both predicted and reference tool
828
+ * names are stripped to the suffix after the last `__` so suite reference steps
829
+ * authored with bare names (e.g. `ListLandingZones`) match harness MCP names
830
+ * (e.g. `mcp__plugin__ListLandingZones`).
831
+ *
832
+ * @param predicted - Tool calls from the harness trajectory view.
833
+ * @param reference - Reference steps from suite YAML.
834
+ * @param options.referenceToolNameMode - Name normalization mode from suite YAML.
835
+ */
836
+ declare function toHarnessMetrics(predicted: ToolCall[], reference: ReferenceStep$1[], options?: {
837
+ referenceToolNameMode?: ReferenceToolNameMode;
838
+ }): HarnessMetrics;
839
+ //#endregion
840
+ //#region src/eval-interchange/protojson/trajectory-instances.d.ts
841
+ type ReferenceStep = {
842
+ tool_name: string;
843
+ tool_input: unknown;
844
+ };
845
+ /**
846
+ * Build all Trajectory*Instance payloads for one predicted/reference pair.
847
+ *
848
+ * Pair metrics (exact, in-order, any-order, precision, recall) share the
849
+ * same trajectory pair; single-tool-use omits the reference trajectory
850
+ * per Vertex API shape.
851
+ */
852
+ declare function toTrajectoryInstances(options: {
853
+ predicted: ToolCall[];
854
+ reference: ReferenceStep[];
855
+ referenceToolNameMode?: ReferenceToolNameMode;
856
+ }): TrajectoryInstancesJson;
704
857
  //#endregion
705
858
  //#region src/eval-interchange/projections.d.ts
859
+ /**
860
+ * Trajectory projection — all repetitions in the envelope as dataset rows.
861
+ */
706
862
  declare function toTrajectory(envelope: EvalRunEnvelope): EvalDatasetRow[];
707
- declare function toProtoInstances(envelope: EvalRunEnvelope): ProtoTrajectoryInstance[];
708
- declare function toAgentTrace(envelope: EvalRunEnvelope): AgentTrace[];
709
- declare function enrichRepetitionWithInterchange(repetition: EvalRepetition, referenceTrajectory?: TabularToolCall[]): EvalRepetition;
863
+ /**
864
+ * Instances projection — all trajectory metric instances as JSONL rows.
865
+ */
866
+ declare function toInstancesJsonl(envelope: EvalRunEnvelope): InstancesJsonlRow[];
710
867
  //#endregion
711
868
  //#region src/metrics/trajectory.d.ts
712
- type TrajectoryInput = InterchangeToolCall[] | TabularToolCall[] | Array<{
869
+ /**
870
+ * Trajectory-level metrics for comparing predicted and reference tool-call sequences.
871
+ *
872
+ * Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
873
+ * in-order, any-order, precision, recall, single tool use). Tool calls are
874
+ * compared by `(tool_name, serialized tool_input)` identity after normalization.
875
+ *
876
+ * Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
877
+ */
878
+ /** Canonical wire tool call used internally for comparison. */
879
+ interface WireToolCall {
713
880
  tool_name: string;
714
- tool_input: unknown;
881
+ tool_input: string;
882
+ }
883
+ /** All trajectory metric scores for one predicted/reference pair. */
884
+ interface TrajectoryMetrics {
885
+ trajectory_exact_match: number;
886
+ trajectory_in_order_match: number;
887
+ trajectory_any_order_match: number;
888
+ trajectory_precision: number;
889
+ trajectory_recall: number;
890
+ trajectory_single_tool_use: number;
891
+ }
892
+ /** Input accepted by trajectory metrics — wire or harness/YAML shapes. */
893
+ type TrajectoryInput = WireToolCall[] | Array<{
894
+ tool_name: string;
895
+ tool_input: unknown | string;
715
896
  }>;
897
+ /** Exact sequence equality after normalization. */
716
898
  declare function trajectoryExactMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
899
+ /** Reference is a subsequence of predicted (order preserved, extras allowed). */
717
900
  declare function trajectoryInOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
901
+ /** Same multiset of tool calls; length must match. */
718
902
  declare function trajectoryAnyOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
903
+ /**
904
+ * Fraction of predicted tool calls that appear in reference (multiset).
905
+ *
906
+ * Returns 1 when both trajectories are empty.
907
+ */
719
908
  declare function trajectoryPrecision(predicted: TrajectoryInput, reference: TrajectoryInput): number;
909
+ /**
910
+ * Fraction of reference tool calls matched in predicted (multiset recall).
911
+ *
912
+ * Returns 1 when reference is empty and predicted is empty.
913
+ */
720
914
  declare function trajectoryRecall(predicted: TrajectoryInput, reference: TrajectoryInput): number;
915
+ /** Both trajectories have exactly one call and they match. */
721
916
  declare function trajectorySingleToolUse(predicted: TrajectoryInput, reference: TrajectoryInput): number;
917
+ /** Compute all trajectory metrics in one pass. */
722
918
  declare function computeTrajectoryMetrics(predicted: TrajectoryInput, reference: TrajectoryInput): TrajectoryMetrics;
723
919
  //#endregion
724
- export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, AgentConfig, AgentEvent, AgentTrace, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, ContentPart, ConversationTurn, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, InterchangeToolCall, InterchangeTrajectory, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, Predicate, ProgressCallback, ProgressEvent, ProtoTrajectoryInstance, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TabularToolCall, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolCallMetrics, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryMetrics, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createLimit, emitOtel, enrichRepetitionWithInterchange, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, 
gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, runRepetition, runSuite, toAgentTrace, toProtoInstances, toTrajectory, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
920
+ //#region src/metrics/tool-calls.d.ts
921
+ /** Options for parameter value comparison. */
922
+ interface ToolCallMetricOptions {
923
+ /** When true, compare serialized JSON strictly (reserved for future semantics). */
924
+ useStrictStringMatch?: boolean;
925
+ }
926
+ /** Aggregated tool-call metric scores (each 0..1). */
927
+ interface ToolCallMetrics {
928
+ tool_call_valid: number;
929
+ tool_name_match: number;
930
+ tool_parameter_key_match: number;
931
+ tool_parameter_kv_match: number;
932
+ }
933
+ type ToolCallInput = TrajectoryInput[number];
934
+ /**
935
+ * Whether a predicted tool call is well-formed (non-empty name, parseable JSON input).
936
+ *
937
+ * @returns 1 when valid, 0 otherwise.
938
+ */
939
+ declare function toolCallValid(toolCall: ToolCallInput): number;
940
+ /**
941
+ * Whether predicted and reference tool names match exactly.
942
+ *
943
+ * @returns 1 on match, 0 otherwise.
944
+ */
945
+ declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
946
+ /**
947
+ * Whether parameter key sets match (same keys, same order after sort).
948
+ *
949
+ * Requires matching tool names first. Returns 0 when args are not objects.
950
+ */
951
+ declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
952
+ /**
953
+ * Whether all reference parameter key-value pairs match in the predicted call.
954
+ *
955
+ * Requires {@link toolParameterKeyMatch} first. Only keys present in reference
956
+ * are checked (predicted may have extra keys).
957
+ */
958
+ declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
959
+ /**
960
+ * Average tool-call metrics across index-aligned predicted/reference pairs.
961
+ *
962
+ * Denominator is `max(predicted.length, reference.length, 1)`. Missing
963
+ * predicted calls at an index are skipped for pair metrics; validity still
964
+ * counts when a predicted call exists.
965
+ */
966
+ declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
967
+ //#endregion
968
+ export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, EvaluationInstanceJson, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, HarnessMetrics, InstanceData, InstancesJsonlRow, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, type PipelineConfig, Predicate, ProgressCallback, ProgressEvent, ProtojsonToolCall, ProtojsonTrajectory, ReferenceToolNameMode, ReferenceTrajectoryConfig, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteDocument, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryInstanceMetricKey, TrajectoryInstancesJson, TrajectoryPairInstanceJson, TrajectorySingleToolUseInstanceJson, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createCodexGrader, 
createLimit, emitOtel, enrichRepetitionWithProtojson, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, loadSuiteDocument, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, resolveGradingArtifactFromSuite, resolvePipelineInputs, runPipeline, runRepetition, runSuite, toEvaluationInstance, toHarnessMetrics, toInstancesJsonl, toTrajectory, toTrajectoryInstances, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
725
969
  //# sourceMappingURL=index.d.ts.map