@alis-build/harness-eval 0.1.0 → 0.1.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (33)
  1. package/README.md +17 -4
  2. package/dist/adapters/claude-code/index.d.ts +1 -1
  3. package/dist/adapters/claude-code/index.js +1 -1
  4. package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
  5. package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
  6. package/dist/cli/bin.js +109 -12
  7. package/dist/cli/bin.js.map +1 -1
  8. package/dist/config/loader.d.ts +1 -1
  9. package/dist/config/loader.js +1 -1
  10. package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
  11. package/dist/index.d.ts +270 -152
  12. package/dist/index.js +124 -5
  13. package/dist/index.js.map +1 -0
  14. package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
  15. package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
  16. package/dist/loader-DcI0KfRX.js.map +1 -0
  17. package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
  18. package/dist/projections-BcX7w-f6.js.map +1 -0
  19. package/dist/runner/suite.d.ts +1 -1
  20. package/dist/runner/suite.js +1 -1
  21. package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
  22. package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
  23. package/dist/suite-Dlzl-HI0.js.map +1 -0
  24. package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
  25. package/package.json +4 -2
  26. package/schemas/eval-interchange-instances.schema.json +196 -0
  27. package/schemas/eval-interchange.schema.json +65 -52
  28. package/schemas/eval-run-envelope.schema.json +182 -425
  29. package/dist/build-DsVJ_UeU.js.map +0 -1
  30. package/dist/loader-BCnFJ8rm.js.map +0 -1
  31. package/dist/suite-chj0j22j.js.map +0 -1
  32. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  33. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
package/dist/index.d.ts CHANGED
@@ -1,120 +1,38 @@
1
1
  import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-B9H4IZtA.js";
2
- import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-6Z17eKZx.js";
3
- import { _ as ObjectPredicate, a as ProgressEvent, b as ToolPattern, c as RunSuiteOptions, d as TestSuite, f as Assertion, g as LeafPredicate, h as CompoundPredicate, i as ProgressCallback, l as SuiteReport, m as Cardinality, n as CellReport, o as RepetitionError, p as AssertionResult, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as Predicate, y as ThresholdedAssertion } from "./types-BQol062t.js";
4
- import { i as GradingConfig, r as parseSuite, s as ConfigError, t as loadSuite } from "./loader-DTvoVfN0.js";
5
- import { t as runSuite } from "./suite-BoOvK_lq.js";
2
+ import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-V22PrR0p.js";
3
+ import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-CD3TwOtZ.js";
4
+ import { i as GradingConfig, r as parseSuite, s as ConfigError, t as loadSuite } from "./loader-C9yQHUPC.js";
5
+ import { t as runSuite } from "./suite-DPJMIEbu.js";
6
6
  import { Readable } from "node:stream";
7
7
 
8
- //#region src/types/eval-interchange.d.ts
9
- /**
10
- * TypeScript types for eval interchange output.
11
- */
12
- interface InterchangeToolCall {
13
- tool_name: string;
14
- tool_input: string;
15
- }
16
- interface InterchangeTrajectory {
17
- tool_calls: InterchangeToolCall[];
18
- }
19
- interface TabularToolCall {
20
- tool_name: string;
21
- tool_input: unknown;
22
- }
23
- interface ContentPart {
24
- text?: string;
25
- function_call?: {
26
- name: string;
27
- args: unknown;
28
- };
29
- function_response?: {
30
- name: string;
31
- response: unknown;
32
- };
33
- }
34
- interface AgentEvent {
35
- author: string;
36
- content: {
37
- parts: ContentPart[];
38
- };
39
- event_time?: string;
40
- state_delta?: Record<string, unknown>;
41
- active_tools?: Array<{
42
- name: string;
43
- }>;
44
- }
45
- interface ConversationTurn {
46
- turn_index: number;
47
- turn_id?: string;
48
- events: AgentEvent[];
49
- }
50
- interface AgentConfig {
51
- agent_id: string;
52
- agent_type?: string;
53
- description?: string;
54
- instruction?: string;
55
- tools?: Array<{
56
- name: string;
57
- }>;
58
- sub_agents?: string[];
59
- }
60
- interface AgentTrace {
61
- agents: Record<string, AgentConfig>;
62
- turns: ConversationTurn[];
63
- }
64
- interface EvalDatasetRow {
65
- prompt?: string;
66
- response?: string;
67
- reference?: string;
68
- predicted_trajectory: TabularToolCall[];
69
- reference_trajectory?: TabularToolCall[];
70
- latency_in_seconds: number;
71
- failure: 0 | 1;
72
- human_ratings?: Record<string, number>;
73
- }
74
- interface ProtoTrajectoryInstance {
75
- predicted_trajectory: InterchangeTrajectory;
76
- reference_trajectory?: InterchangeTrajectory;
77
- prompt?: string;
78
- response?: string;
79
- reference?: string;
80
- }
81
- interface TrajectoryMetrics {
82
- trajectory_exact_match: number;
83
- trajectory_in_order_match: number;
84
- trajectory_any_order_match: number;
85
- trajectory_precision: number;
86
- trajectory_recall: number;
87
- trajectory_single_tool_use: number;
88
- }
89
- interface ToolCallMetrics {
90
- tool_call_valid: number;
91
- tool_name_match: number;
92
- tool_parameter_key_match: number;
93
- tool_parameter_kv_match: number;
94
- }
95
- //#endregion
96
8
  //#region src/grader/types.d.ts
97
9
  /**
98
10
  * Outcome grading types (LLM-as-judge layer).
99
11
  *
100
12
  * Behavioral assertions live in harness-eval assertions; expectations here
101
13
  * are natural-language outcome checks graded from trajectory transcripts.
14
+ * Grading runs as a second pass over a {@link SuiteReport} JSON artifact.
102
15
  */
103
16
  interface GradedExpectation {
17
+ /** Original expectation text from the suite or sidecar file. */
104
18
  text: string;
105
19
  passed: boolean;
20
+ /** Quote or description supporting the pass/fail decision. */
106
21
  evidence: string;
107
22
  }
23
+ /** Aggregate pass/fail counts for one grading unit (rep or full report). */
108
24
  interface GradingSummary {
109
25
  passed: number;
110
26
  failed: number;
111
27
  total: number;
112
28
  passRate: number;
113
29
  }
30
+ /** Suggestion for improving an expectation or assertion wording. */
114
31
  interface EvalFeedbackSuggestion {
115
32
  assertion?: string;
116
33
  reason: string;
117
34
  }
35
+ /** Optional meta-feedback from the judge about expectation quality. */
118
36
  interface EvalFeedback {
119
37
  suggestions: EvalFeedbackSuggestion[];
120
38
  overall: string;
@@ -132,6 +50,7 @@ interface RepGradingResult {
132
50
  graderError?: string;
133
51
  durationMs: number;
134
52
  }
53
+ /** Full grading report for a suite run. */
135
54
  interface SuiteGradingReport {
136
55
  gradedAt: string;
137
56
  sourceReport: string;
@@ -140,6 +59,7 @@ interface SuiteGradingReport {
140
59
  results: RepGradingResult[];
141
60
  summary: GradingSummary;
142
61
  }
62
+ /** Options controlling {@link gradeReport} and the CLI `grade` command. */
143
63
  interface GradeReportOptions {
144
64
  /** Path to the report being graded (stored in output). */
145
65
  sourceReport?: string;
@@ -167,6 +87,7 @@ interface GradeReportOptions {
167
87
  gradeFn?: GraderFn;
168
88
  onProgress?: (event: GradeProgressEvent) => void;
169
89
  }
90
+ /** Progress events emitted during outcome grading. */
170
91
  type GradeProgressEvent = {
171
92
  kind: "grade-start";
172
93
  total: number;
@@ -185,13 +106,16 @@ type GradeProgressEvent = {
185
106
  totalExpectations: number;
186
107
  passedExpectations: number;
187
108
  };
109
+ /** Pluggable grader implementation (defaults to Claude subprocess). */
188
110
  type GraderFn = (input: GraderInput) => Promise<GraderOutput>;
111
+ /** Input passed to a grader for one repetition. */
189
112
  interface GraderInput {
190
113
  prompt: string;
191
114
  transcript: string;
192
115
  expectations: string[];
193
116
  systemInstruction?: string;
194
117
  }
118
+ /** Parsed grader response before alignment with input expectation order. */
195
119
  interface GraderOutput {
196
120
  expectations: GradedExpectation[];
197
121
  summary: GradingSummary;
@@ -206,20 +130,14 @@ declare const EVAL_RUN_SCHEMA_VERSION = "1.0";
206
130
  declare const TRAJECTORY_SCHEMA_VERSION = "1.0";
207
131
  /** Link to the suite spec that produced a run. */
208
132
  interface SuiteReference {
209
- /** Absolute or repo-relative path to the suite YAML. */
210
133
  uri?: string;
211
- /** Stable suite identifier when known (e.g. case bundle name). */
212
134
  id?: string;
213
- /** SHA-256 or similar hash of suite file contents. */
214
135
  contentHash?: string;
215
136
  }
216
137
  /** Harness that executed the run. */
217
138
  interface HarnessInfo {
218
- /** Adapter id from suite YAML, e.g. `claude-code`. */
219
139
  adapter: string;
220
- /** harness-eval package version when envelope was built. */
221
140
  frameworkVersion?: string;
222
- /** Optional harness binary version (e.g. `claude -v`). */
223
141
  harnessVersion?: string;
224
142
  }
225
143
  /** CI, git, or runtime provenance for correlation in the DB. */
@@ -244,9 +162,7 @@ interface EvalProvenance {
244
162
  interface EvalRunSummary {
245
163
  cellsTotal: number;
246
164
  cellsPassed: number;
247
- /** All cells passed behavioral assertion thresholds. */
248
165
  behavioralPass: boolean;
249
- /** All graded expectations passed (when outcome layer present). */
250
166
  outcomePass?: boolean;
251
167
  }
252
168
  /** Identity of the judge that produced outcome grades. */
@@ -278,20 +194,14 @@ interface ExternalScore {
278
194
  }
279
195
  /** Optional large or vendor-specific blobs (store by reference in DB when possible). */
280
196
  interface EvalArtifacts {
281
- /** Claude Code `stream-json` lines — debug only, not cross-harness. */
282
197
  rawStreamEvents?: unknown[];
283
- /** URI to OTLP JSON (S3, GCS, etc.). */
284
198
  otlpTraceUri?: string;
285
- /** Text transcript for judges (`trajectoryToTranscript`). */
286
199
  transcript?: string;
287
200
  }
288
- /**
289
- * One harness invocation — the unit external judges and trajectory queries use.
290
- */
201
+ /** One harness invocation — the unit external judges and trajectory queries use. */
291
202
  interface EvalRepetition {
292
203
  repetitionIndex: number;
293
204
  durationMs: number;
294
- /** Normalized harness session. Required when the harness completed with a view. */
295
205
  trajectory?: TrajectoryView & {
296
206
  schemaVersion: string;
297
207
  };
@@ -300,18 +210,14 @@ interface EvalRepetition {
300
210
  outcomeGrades?: OutcomeGrades;
301
211
  externalScores?: ExternalScore[];
302
212
  artifacts?: EvalArtifacts;
303
- /** Interchange-format predicted tool-call trajectory. */
304
- predicted_trajectory?: InterchangeToolCall[];
305
- /** Full multi-turn agent trace in interchange format. */
306
- agent_trace?: AgentTrace;
307
- /** Session latency in seconds (interchange field). */
308
- latency_in_seconds?: number;
309
- /** 1 when the harness run failed, 0 on success (interchange field). */
213
+ /** Vertex EvaluationInstance protojson wire object. */
214
+ evaluationInstance?: EvaluationInstanceJson;
215
+ /** Vertex Trajectory*Instance protojson wire objects keyed by metric. */
216
+ trajectoryInstances?: TrajectoryInstancesJson;
217
+ /** Harness-precomputed trajectory metric scores (camelCase). */
218
+ harnessMetrics?: HarnessMetrics;
219
+ latencySeconds?: number;
310
220
  failure?: 0 | 1;
311
- /** Trajectory-level metrics when reference_trajectory is provided. */
312
- trajectoryMetrics?: TrajectoryMetrics;
313
- /** Tool-call-level metrics when reference_trajectory is provided. */
314
- toolCallMetrics?: ToolCallMetrics;
315
221
  error?: {
316
222
  message: string;
317
223
  diagnostics?: Partial<AdapterDiagnostics>;
@@ -335,24 +241,16 @@ interface EvalCellResult {
335
241
  expectations?: string[];
336
242
  cellLabel: string;
337
243
  axes?: Record<string, string>;
338
- /** Reference tool-call trajectory for metric computation. */
339
- reference_trajectory?: TabularToolCall[];
340
- /** Human ratings keyed by metric name for judge calibration. */
341
- human_ratings?: Record<string, number>;
244
+ /** Reference trajectory in Vertex protojson wire format. */
245
+ referenceTrajectory?: ProtojsonTrajectory;
246
+ humanRatings?: Record<string, number>;
342
247
  assertionStats: EvalAssertionStat[];
343
248
  adapterErrors: number;
344
- /** Passed all behavioral assertion thresholds for this cell. */
345
249
  behavioralPass: boolean;
346
- /** Passed all outcome expectations when graded; omitted if not graded. */
347
250
  outcomePass?: boolean;
348
251
  repetitions: EvalRepetition[];
349
252
  }
350
- /**
351
- * Top-level document for CI/CD pipelines, APIs, and databases.
352
- *
353
- * This is the interchange format your storage layer should target — not
354
- * {@link import("./stream").StreamEvent} or OTLP traces.
355
- */
253
+ /** Top-level document for CI/CD pipelines, APIs, and databases. */
356
254
  interface EvalRunEnvelope {
357
255
  schemaVersion: typeof EVAL_RUN_SCHEMA_VERSION;
358
256
  runId: string;
@@ -365,12 +263,15 @@ interface EvalRunEnvelope {
365
263
  cells: EvalCellResult[];
366
264
  }
367
265
  interface BuildEvalRunEnvelopeOptions {
368
- /** UUID for this run; generated if omitted. */
266
+ /** Override envelope runId; defaults to a random UUID. */
369
267
  runId?: string;
268
+ /** Link to the suite YAML that produced the run. */
370
269
  suite?: SuiteReference;
270
+ /** Harness adapter metadata; adapter defaults to `"claude-code"`. */
371
271
  harness?: Partial<HarnessInfo>;
272
+ /** CI, git, and runtime provenance for correlation. */
372
273
  provenance?: EvalProvenance;
373
- /** Merge outcome grades from `gradeReport()` or compatible structure. */
274
+ /** Outcome grades to merge from a grader run. */
374
275
  grading?: {
375
276
  gradedAt?: string;
376
277
  sourceReport?: string;
@@ -386,9 +287,9 @@ interface BuildEvalRunEnvelopeOptions {
386
287
  }>;
387
288
  judge?: JudgeInfo;
388
289
  };
389
- /** Include transcript in each repetition's artifacts. Default true. */
290
+ /** Include text transcript artifact (default true). */
390
291
  includeTranscript?: boolean;
391
- /** Include raw stream events when adapter provides them. Default false. */
292
+ /** Include raw stream-json events (default false; debug only). */
392
293
  includeRawStreamEvents?: boolean;
393
294
  }
394
295
  //#endregion
@@ -506,6 +407,7 @@ declare function getDefaultAdapter(): HarnessAdapter;
506
407
  declare const DEFAULT_REPETITIONS = 5;
507
408
  /** Default assertion pass-rate threshold when `threshold` is omitted. */
508
409
  declare const DEFAULT_THRESHOLD = 1;
410
+ /** Injectable adapter run function (used by tests to stub harness I/O). */
509
411
  type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Promise<AdapterResult>;
510
412
  /**
511
413
  * Build the effective adapter config for one (suite, case, cell).
@@ -513,8 +415,21 @@ type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Pro
513
415
  * Merge order (later wins): defaultConfig < case.config < cell.config.
514
416
  */
515
417
  declare function mergeConfig(suite: TestSuite, testCase: TestCase, cell: MatrixCell): BaseAdapterConfig & Record<string, unknown>;
418
+ /** Effective repetition count for a case (`case.repetitions` or default). */
516
419
  declare function getRepetitions(testCase: TestCase): number;
420
+ /**
421
+ * Run one repetition: invoke the adapter, evaluate assertions, capture errors.
422
+ *
423
+ * Adapter failures are returned as {@link RepetitionResult.error} rather than
424
+ * thrown so the suite runner can continue other reps and report adapter error counts.
425
+ */
517
426
  declare function runRepetition(testCase: TestCase, _cell: MatrixCell, config: BaseAdapterConfig & Record<string, unknown>, repetitionIndex: number, run: AdapterRunFn, signal?: AbortSignal): Promise<RepetitionResult>;
427
+ /**
428
+ * Roll up repetition results into a {@link CellReport}.
429
+ *
430
+ * Adapter errors reduce `evaluatedCount` but do not fail the cell by
431
+ * themselves — only assertion threshold misses mark a cell as failed.
432
+ */
518
433
  declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions: RepetitionResult[]): CellReport;
519
434
  //#endregion
520
435
  //#region src/runner/limit.d.ts
@@ -543,24 +458,30 @@ declare function createLimit(max: number): LimitedRunner;
543
458
  * Shapes follow OTLP/HTTP JSON Protobuf encoding (lowerCamelCase field names).
544
459
  * @see https://opentelemetry.io/docs/specs/otlp/
545
460
  */
461
+ /** OTLP ExportTraceServiceRequest root — batch of resource spans. */
546
462
  interface ExportTraceServiceRequest {
547
463
  resourceSpans: ResourceSpans[];
548
464
  }
465
+ /** Resource-attributed span group in an export batch. */
549
466
  interface ResourceSpans {
550
467
  resource: Resource;
551
468
  scopeSpans: ScopeSpans[];
552
469
  }
470
+ /** OTLP resource descriptor (service.name, agent metadata). */
553
471
  interface Resource {
554
472
  attributes: KeyValue[];
555
473
  }
474
+ /** Spans emitted by one instrumentation scope within a resource. */
556
475
  interface ScopeSpans {
557
476
  scope: InstrumentationScope;
558
477
  spans: Span[];
559
478
  }
479
+ /** Instrumentation library identity (name + optional version). */
560
480
  interface InstrumentationScope {
561
481
  name: string;
562
482
  version?: string;
563
483
  }
484
+ /** One span in OTLP JSON encoding (nanosecond timestamps as strings). */
564
485
  interface Span {
565
486
  traceId: string;
566
487
  spanId: string;
@@ -572,14 +493,17 @@ interface Span {
572
493
  attributes: KeyValue[];
573
494
  status?: SpanStatus;
574
495
  }
496
+ /** OTLP span status (OK, ERROR, or UNSET). */
575
497
  interface SpanStatus {
576
498
  code: number;
577
499
  message?: string;
578
500
  }
501
+ /** Key-value attribute pair on a span or resource. */
579
502
  interface KeyValue {
580
503
  key: string;
581
504
  value: AnyValue;
582
505
  }
506
+ /** Discriminated OTLP attribute value (one of the typed fields set). */
583
507
  interface AnyValue {
584
508
  stringValue?: string;
585
509
  boolValue?: boolean;
@@ -595,6 +519,7 @@ interface ArrayValue {
595
519
  interface KeyValueList {
596
520
  values: KeyValue[];
597
521
  }
522
+ /** Options passed to {@link trajectoryToOtlp} / {@link emitOtel}. */
598
523
  interface EmitOtelOptions {
599
524
  /** User prompt for the first `gen_ai.input.messages` entry. */
600
525
  prompt?: string;
@@ -627,13 +552,20 @@ interface EmitOtelOptions {
627
552
  * ```
628
553
  */
629
554
  declare function trajectoryToOtlp(view: TrajectoryView, options?: EmitOtelOptions): ExportTraceServiceRequest;
630
- /** Alias matching the implementation plan naming. */
555
+ /** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
631
556
  declare const emitOtel: typeof trajectoryToOtlp;
632
557
  //#endregion
633
558
  //#region src/grader/grade-report.d.ts
559
+ /**
560
+ * Grade every repetition in a {@link SuiteReport} that has expectations.
561
+ *
562
+ * Expectations come from inline case fields or an optional sidecar YAML/JSON
563
+ * map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
564
+ */
634
565
  declare function gradeReport(report: SuiteReport, options?: GradeReportOptions): Promise<SuiteGradingReport>;
635
566
  //#endregion
636
567
  //#region src/grader/resolve-grade-options.d.ts
568
+ /** CLI flag overrides for grading (take precedence over grading YAML). */
637
569
  interface GradeCliOverrides {
638
570
  model?: string;
639
571
  binary?: string;
@@ -648,9 +580,16 @@ interface GradeCliOverrides {
648
580
  declare function resolveGradeOptions(fileConfig?: GradingConfig, cli?: GradeCliOverrides, configPath?: string): GradeReportOptions;
649
581
  //#endregion
650
582
  //#region src/grader/transcript.d.ts
583
+ /**
584
+ * Render a {@link TrajectoryView} as markdown for LLM graders.
585
+ *
586
+ * Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
587
+ * prompts within reasonable token limits.
588
+ */
651
589
  declare function trajectoryToTranscript(view: TrajectoryView, prompt?: string): string;
652
590
  //#endregion
653
591
  //#region src/grader/claude-grader.d.ts
592
+ /** Options for {@link createClaudeGrader} / {@link runClaudeGrader}. */
654
593
  interface ClaudeGraderOptions {
655
594
  binary?: string;
656
595
  model?: string;
@@ -659,14 +598,23 @@ interface ClaudeGraderOptions {
659
598
  cwd?: string;
660
599
  claudeCode?: ClaudeCodeOptions;
661
600
  }
601
+ /** Factory returning a {@link GraderFn} bound to subprocess options. */
662
602
  declare function createClaudeGrader(options?: ClaudeGraderOptions): GraderFn;
663
603
  //#endregion
664
604
  //#region src/grader/format-console.d.ts
605
+ /**
606
+ * Format a {@link SuiteGradingReport} for terminal output.
607
+ *
608
+ * @param color When true, emit ANSI status colors (default for TTY console).
609
+ */
665
610
  declare function formatGradingConsole(report: SuiteGradingReport, color?: boolean): string;
611
+ /** True when every graded rep passed all expectations without grader errors. */
666
612
  declare function gradingReportPassed(report: SuiteGradingReport): boolean;
667
613
  //#endregion
668
614
  //#region src/reporter/types.d.ts
615
+ /** Output format selector for {@link formatReport}. */
669
616
  type ReportFormat = "console" | "markdown" | "json";
617
+ /** Options for suite report formatting. */
670
618
  interface ReporterOptions {
671
619
  format: ReportFormat;
672
620
  baseline?: SuiteReport;
@@ -674,52 +622,222 @@ interface ReporterOptions {
674
622
  }
675
623
  //#endregion
676
624
  //#region src/reporter/index.d.ts
625
+ /**
626
+ * Format a {@link SuiteReport} for console, markdown, or JSON output.
627
+ *
628
+ * JSON format bypasses the renderable intermediate model and serializes the
629
+ * report directly. Console and markdown apply optional baseline deltas.
630
+ */
677
631
  declare function formatReport(report: SuiteReport, options: ReporterOptions): string;
678
632
  //#endregion
679
633
  //#region src/eval-record/build.d.ts
680
634
  /**
681
635
  * Convert a {@link SuiteReport} (and optional grading) into a versioned
682
636
  * {@link EvalRunEnvelope} for storage or API handoff.
637
+ *
638
+ * @param report - Runner output for one suite execution.
639
+ * @param options - Provenance, grading merge, and artifact inclusion flags.
640
+ * @returns A fully populated envelope with protojson interchange fields on each repetition.
683
641
  */
684
642
  declare function buildEvalRunEnvelope(report: SuiteReport, options?: BuildEvalRunEnvelopeOptions): EvalRunEnvelope;
685
- /** Build envelope from on-disk report + optional grading JSON paths. */
643
+ /**
644
+ * Build an envelope from on-disk runner and grader JSON artifacts.
645
+ *
646
+ * Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
647
+ * outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
648
+ * attaches suite URI and SHA-256 content hash for reproducibility.
649
+ *
650
+ * @param reportPath - Path to the suite run report JSON from `harness-eval run`.
651
+ * @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
652
+ */
686
653
  declare function buildEvalRunEnvelopeFromFiles(reportPath: string, options?: BuildEvalRunEnvelopeOptions & {
687
654
  gradingPath?: string;
688
655
  suitePath?: string;
689
656
  }): Promise<EvalRunEnvelope>;
690
657
  //#endregion
691
- //#region src/metrics/tool-calls.d.ts
692
- interface ToolCallMetricOptions {
693
- useStrictStringMatch?: boolean;
694
- }
695
- type ToolCallInput = InterchangeToolCall | TabularToolCall | {
658
+ //#region src/eval-interchange/enrich.d.ts
659
+ /**
660
+ * Attach Vertex protojson interchange fields to one {@link EvalRepetition}.
661
+ *
662
+ * When no trajectory exists (adapter error), sets `failure: 1` and skips
663
+ * protojson payloads. Trajectory instances and harness metrics are only
664
+ * computed when the suite defines a non-empty reference trajectory.
665
+ *
666
+ * @param repetition - Base repetition from the runner (trajectory, assertions, grades).
667
+ * @param options.prompt - Case prompt for EvaluationInstance.
668
+ * @param options.reference - Suite reference trajectory config, if any.
669
+ */
670
+ declare function enrichRepetitionWithProtojson(repetition: EvalRepetition, options?: {
671
+ prompt?: string;
672
+ reference?: ReferenceTrajectoryConfig;
673
+ }): EvalRepetition;
674
+ //#endregion
675
+ //#region src/eval-interchange/protojson/evaluation-instance.d.ts
676
+ /**
677
+ * Build an EvaluationInstance protojson object from harness strings.
678
+ *
679
+ * Omitted fields are excluded from the output object rather than set to
680
+ * empty wrappers — protojson omits unset optional fields.
681
+ *
682
+ * @param options.prompt - Case prompt sent to the agent.
683
+ * @param options.response - Final agent response from the trajectory.
684
+ * @param options.reference - Optional reference answer text (rare in harness eval).
685
+ */
686
+ declare function toEvaluationInstance(options: {
687
+ prompt?: string;
688
+ response?: string;
689
+ reference?: string;
690
+ }): EvaluationInstanceJson;
691
+ //#endregion
692
+ //#region src/eval-interchange/protojson/harness-metrics.d.ts
693
+ /** Suite YAML reference step shape accepted by metric computation. */
694
+ type ReferenceStep$1 = {
696
695
  tool_name: string;
697
696
  tool_input: unknown;
698
697
  };
699
- declare function toolCallValid(toolCall: ToolCallInput): number;
700
- declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
701
- declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
702
- declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
703
- declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
698
+ /**
699
+ * Compute trajectory metrics and map snake_case keys to Vertex camelCase.
700
+ *
701
+ * When `referenceToolNameMode` is `"bare"`, both predicted and reference tool
702
+ * names are stripped to the suffix after the last `__` so suite reference steps
703
+ * authored with bare names (e.g. `ListLandingZones`) match harness MCP names
704
+ * (e.g. `mcp__plugin__ListLandingZones`).
705
+ *
706
+ * @param predicted - Tool calls from the harness trajectory view.
707
+ * @param reference - Reference steps from suite YAML.
708
+ * @param options.referenceToolNameMode - Name normalization mode from suite YAML.
709
+ */
710
+ declare function toHarnessMetrics(predicted: ToolCall[], reference: ReferenceStep$1[], options?: {
711
+ referenceToolNameMode?: ReferenceToolNameMode;
712
+ }): HarnessMetrics;
713
+ //#endregion
714
+ //#region src/eval-interchange/protojson/trajectory-instances.d.ts
715
+ type ReferenceStep = {
716
+ tool_name: string;
717
+ tool_input: unknown;
718
+ };
719
+ /**
720
+ * Build all Trajectory*Instance payloads for one predicted/reference pair.
721
+ *
722
+ * Pair metrics (exact, in-order, any-order, precision, recall) share the
723
+ * same trajectory pair; single-tool-use omits the reference trajectory
724
+ * per Vertex API shape.
725
+ */
726
+ declare function toTrajectoryInstances(options: {
727
+ predicted: ToolCall[];
728
+ reference: ReferenceStep[];
729
+ referenceToolNameMode?: ReferenceToolNameMode;
730
+ }): TrajectoryInstancesJson;
704
731
  //#endregion
705
732
  //#region src/eval-interchange/projections.d.ts
733
+ /**
734
+ * Trajectory projection — all repetitions in the envelope as dataset rows.
735
+ */
706
736
  declare function toTrajectory(envelope: EvalRunEnvelope): EvalDatasetRow[];
707
- declare function toProtoInstances(envelope: EvalRunEnvelope): ProtoTrajectoryInstance[];
708
- declare function toAgentTrace(envelope: EvalRunEnvelope): AgentTrace[];
709
- declare function enrichRepetitionWithInterchange(repetition: EvalRepetition, referenceTrajectory?: TabularToolCall[]): EvalRepetition;
737
+ /**
738
+ * Instances projection — all trajectory metric instances as JSONL rows.
739
+ */
740
+ declare function toInstancesJsonl(envelope: EvalRunEnvelope): InstancesJsonlRow[];
710
741
  //#endregion
711
742
  //#region src/metrics/trajectory.d.ts
712
- type TrajectoryInput = InterchangeToolCall[] | TabularToolCall[] | Array<{
743
+ /**
744
+ * Trajectory-level metrics for comparing predicted and reference tool-call sequences.
745
+ *
746
+ * Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
747
+ * in-order, any-order, precision, recall, single tool use). Tool calls are
748
+ * compared by `(tool_name, serialized tool_input)` identity after normalization.
749
+ *
750
+ * Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
751
+ */
752
+ /** Canonical wire tool call used internally for comparison. */
753
+ interface WireToolCall {
713
754
  tool_name: string;
714
- tool_input: unknown;
755
+ tool_input: string;
756
+ }
757
+ /** All trajectory metric scores for one predicted/reference pair. */
758
+ interface TrajectoryMetrics {
759
+ trajectory_exact_match: number;
760
+ trajectory_in_order_match: number;
761
+ trajectory_any_order_match: number;
762
+ trajectory_precision: number;
763
+ trajectory_recall: number;
764
+ trajectory_single_tool_use: number;
765
+ }
766
+ /** Input accepted by trajectory metrics — wire or harness/YAML shapes. */
767
+ type TrajectoryInput = WireToolCall[] | Array<{
768
+ tool_name: string;
769
+ tool_input: unknown | string;
715
770
  }>;
771
+ /** Exact sequence equality after normalization. */
716
772
  declare function trajectoryExactMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
773
+ /** Reference is a subsequence of predicted (order preserved, extras allowed). */
717
774
  declare function trajectoryInOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
775
+ /** Same multiset of tool calls; length must match. */
718
776
  declare function trajectoryAnyOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
777
+ /**
778
+ * Fraction of predicted tool calls that appear in reference (multiset).
779
+ *
780
+ * Returns 1 when both trajectories are empty.
781
+ */
719
782
  declare function trajectoryPrecision(predicted: TrajectoryInput, reference: TrajectoryInput): number;
783
+ /**
784
+ * Fraction of reference tool calls matched in predicted (multiset recall).
785
+ *
786
+ * Returns 1 when both trajectories are empty.
787
+ */
720
788
  declare function trajectoryRecall(predicted: TrajectoryInput, reference: TrajectoryInput): number;
789
+ /** Both trajectories have exactly one call and they match. */
721
790
  declare function trajectorySingleToolUse(predicted: TrajectoryInput, reference: TrajectoryInput): number;
791
+ /** Compute all trajectory metrics in one pass. */
722
792
  declare function computeTrajectoryMetrics(predicted: TrajectoryInput, reference: TrajectoryInput): TrajectoryMetrics;
723
793
  //#endregion
724
- export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, AgentConfig, AgentEvent, AgentTrace, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, ContentPart, ConversationTurn, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, InterchangeToolCall, InterchangeTrajectory, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, Predicate, ProgressCallback, ProgressEvent, ProtoTrajectoryInstance, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TabularToolCall, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolCallMetrics, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryMetrics, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createLimit, emitOtel, enrichRepetitionWithInterchange, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, 
gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, runRepetition, runSuite, toAgentTrace, toProtoInstances, toTrajectory, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
794
+ //#region src/metrics/tool-calls.d.ts
795
+ /** Options for parameter value comparison. */
796
+ interface ToolCallMetricOptions {
797
+ /** When true, compare serialized JSON strictly (reserved for future semantics). */
798
+ useStrictStringMatch?: boolean;
799
+ }
800
+ /** Aggregated tool-call metric scores (each 0..1). */
801
+ interface ToolCallMetrics {
802
+ tool_call_valid: number;
803
+ tool_name_match: number;
804
+ tool_parameter_key_match: number;
805
+ tool_parameter_kv_match: number;
806
+ }
807
+ type ToolCallInput = TrajectoryInput[number];
808
+ /**
809
+ * Whether a predicted tool call is well-formed (non-empty name, parseable JSON input).
810
+ *
811
+ * @returns 1 when valid, 0 otherwise.
812
+ */
813
+ declare function toolCallValid(toolCall: ToolCallInput): number;
814
+ /**
815
+ * Whether predicted and reference tool names match exactly.
816
+ *
817
+ * @returns 1 on match, 0 otherwise.
818
+ */
819
+ declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
820
+ /**
821
+ * Whether predicted and reference have the same set of parameter keys (key lists are compared after sorting).
822
+ *
823
+ * Requires matching tool names first. Returns 0 when args are not objects.
824
+ */
825
+ declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
826
+ /**
827
+ * Whether all reference parameter key-value pairs match in the predicted call.
828
+ *
829
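+ * Requires {@link toolParameterKeyMatch} first. Only keys present in reference
830
+ * are checked (predicted may have extra keys). NOTE(review): extra predicted keys would already fail the key-set precondition — confirm which behavior holds.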
831
+ */
832
+ declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
833
+ /**
834
+ * Average tool-call metrics across index-aligned predicted/reference pairs.
835
+ *
836
+ * Denominator is `max(predicted.length, reference.length, 1)`. Missing
837
+ * predicted calls at an index are skipped for pair metrics; validity still
838
+ * counts when a predicted call exists.
839
+ */
840
+ declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
841
+ //#endregion
842
+ export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, EvaluationInstanceJson, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, HarnessMetrics, InstanceData, InstancesJsonlRow, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, Predicate, ProgressCallback, ProgressEvent, ProtojsonToolCall, ProtojsonTrajectory, ReferenceToolNameMode, ReferenceTrajectoryConfig, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryInstanceMetricKey, TrajectoryInstancesJson, TrajectoryPairInstanceJson, TrajectorySingleToolUseInstanceJson, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createLimit, emitOtel, enrichRepetitionWithProtojson, evaluate, 
evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, runRepetition, runSuite, toEvaluationInstance, toHarnessMetrics, toInstancesJsonl, toTrajectory, toTrajectoryInstances, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
725
843
  //# sourceMappingURL=index.d.ts.map