npm - @alis-build/harness-eval - Versions diffs - 0.1.1 → 0.1.3 - Mend

@alis-build/harness-eval 0.1.1 → 0.1.3

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (44) hide show

package/README.md +104 -10
package/dist/adapters/claude-code/index.d.ts +2 -2
package/dist/adapters/claude-code/index.js +2 -1
package/dist/adapters/codex/index.d.ts +68 -0
package/dist/adapters/codex/index.js +3 -0
package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
package/dist/claude-code-C_7hxC8z.js.map +1 -0
package/dist/cli/bin.js +204 -127
package/dist/cli/bin.js.map +1 -1
package/dist/codex-0cHO2te9.js +496 -0
package/dist/codex-0cHO2te9.js.map +1 -0
package/dist/config/loader.d.ts +2 -2
package/dist/config/loader.js +2 -2
package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
package/dist/index.d.ts +397 -153
package/dist/index.js +125 -5
package/dist/index.js.map +1 -0
package/dist/loader-B1WmGGzf.d.ts +107 -0
package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
package/dist/loader-DnQ6Jt0i.js.map +1 -0
package/dist/reporter-Biy-5-9M.js +2216 -0
package/dist/reporter-Biy-5-9M.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
package/dist/suite-BcP64nlb.js.map +1 -0
package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
package/dist/types-Bac8_Ixb.js +246 -0
package/dist/types-Bac8_Ixb.js.map +1 -0
package/dist/types-Bu8uOZZN.d.ts +77 -0
package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
package/package.json +7 -2
package/schemas/eval-interchange-instances.schema.json +196 -0
package/schemas/eval-interchange.schema.json +65 -52
package/schemas/eval-run-envelope.schema.json +182 -425
package/dist/build-DsVJ_UeU.js +0 -1396
package/dist/build-DsVJ_UeU.js.map +0 -1
package/dist/claude-code-ycT0JQZF.js.map +0 -1
package/dist/loader-BCnFJ8rm.js.map +0 -1
package/dist/loader-DTvoVfN0.d.ts +0 -33
package/dist/suite-chj0j22j.js.map +0 -1
package/schemas/eval-interchange-agent-trace.schema.json +0 -322
package/schemas/eval-interchange-proto-instance.schema.json +0 -106

package/dist/index.d.ts CHANGED Viewed

@@ -1,120 +1,39 @@
-import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-B9H4IZtA.js";
-import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-6Z17eKZx.js";
-import { _ as ObjectPredicate, a as ProgressEvent, b as ToolPattern, c as RunSuiteOptions, d as TestSuite, f as Assertion, g as LeafPredicate, h as CompoundPredicate, i as ProgressCallback, l as SuiteReport, m as Cardinality, n as CellReport, o as RepetitionError, p as AssertionResult, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as Predicate, y as ThresholdedAssertion } from "./types-BQol062t.js";
-import { i as GradingConfig, r as parseSuite, s as ConfigError, t as loadSuite } from "./loader-DTvoVfN0.js";
-import { t as runSuite } from "./suite-BoOvK_lq.js";
+import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-C0gBkl0-.js";
+import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-DnvP1UBl.js";
+import { i as CodexOptions } from "./types-Bu8uOZZN.js";
+import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-0QkNVyp9.js";
+import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-B1WmGGzf.js";
+import { t as runSuite } from "./suite-BEShV0by.js";
 import { Readable } from "node:stream";
-//#region src/types/eval-interchange.d.ts
-/**
- * TypeScript types for eval interchange output.
- */
-interface InterchangeToolCall {
-  tool_name: string;
-  tool_input: string;
-}
-interface InterchangeTrajectory {
-  tool_calls: InterchangeToolCall[];
-}
-interface TabularToolCall {
-  tool_name: string;
-  tool_input: unknown;
-}
-interface ContentPart {
-  text?: string;
-  function_call?: {
-    name: string;
-    args: unknown;
-  };
-  function_response?: {
-    name: string;
-    response: unknown;
-  };
-}
-interface AgentEvent {
-  author: string;
-  content: {
-    parts: ContentPart[];
-  };
-  event_time?: string;
-  state_delta?: Record<string, unknown>;
-  active_tools?: Array<{
-    name: string;
-  }>;
-}
-interface ConversationTurn {
-  turn_index: number;
-  turn_id?: string;
-  events: AgentEvent[];
-}
-interface AgentConfig {
-  agent_id: string;
-  agent_type?: string;
-  description?: string;
-  instruction?: string;
-  tools?: Array<{
-    name: string;
-  }>;
-  sub_agents?: string[];
-}
-interface AgentTrace {
-  agents: Record<string, AgentConfig>;
-  turns: ConversationTurn[];
-}
-interface EvalDatasetRow {
-  prompt?: string;
-  response?: string;
-  reference?: string;
-  predicted_trajectory: TabularToolCall[];
-  reference_trajectory?: TabularToolCall[];
-  latency_in_seconds: number;
-  failure: 0 | 1;
-  human_ratings?: Record<string, number>;
-}
-interface ProtoTrajectoryInstance {
-  predicted_trajectory: InterchangeTrajectory;
-  reference_trajectory?: InterchangeTrajectory;
-  prompt?: string;
-  response?: string;
-  reference?: string;
-}
-interface TrajectoryMetrics {
-  trajectory_exact_match: number;
-  trajectory_in_order_match: number;
-  trajectory_any_order_match: number;
-  trajectory_precision: number;
-  trajectory_recall: number;
-  trajectory_single_tool_use: number;
-}
-interface ToolCallMetrics {
-  tool_call_valid: number;
-  tool_name_match: number;
-  tool_parameter_key_match: number;
-  tool_parameter_kv_match: number;
-}
-//#endregion
 //#region src/grader/types.d.ts
 /**
  * Outcome grading types (LLM-as-judge layer).
  *
  * Behavioral assertions live in harness-eval assertions; expectations here
  * are natural-language outcome checks graded from trajectory transcripts.
+ * Grading runs as a second pass over a {@link SuiteReport} JSON artifact.
  */
 interface GradedExpectation {
+  /** Original expectation text from the suite or sidecar file. */
   text: string;
   passed: boolean;
+  /** Quote or description supporting the pass/fail decision. */
   evidence: string;
 }
+/** Aggregate pass/fail counts for one grading unit (rep or full report). */
 interface GradingSummary {
   passed: number;
   failed: number;
   total: number;
   passRate: number;
 }
+/** Suggestion for improving an expectation or assertion wording. */
 interface EvalFeedbackSuggestion {
   assertion?: string;
   reason: string;
 }
+/** Optional meta-feedback from the judge about expectation quality. */
 interface EvalFeedback {
   suggestions: EvalFeedbackSuggestion[];
   overall: string;
@@ -132,14 +51,22 @@ interface RepGradingResult {
   graderError?: string;
   durationMs: number;
 }
+/** Full grading report for a suite run. */
 interface SuiteGradingReport {
   gradedAt: string;
   sourceReport: string;
   /** Grading YAML path when `--config` was used. */
   gradingConfigPath?: string;
+  /** Judge that produced outcome grades. */
+  judge?: {
+    id: string;
+    model?: string;
+    adapter?: string;
+  };
   results: RepGradingResult[];
   summary: GradingSummary;
 }
+/** Options controlling {@link gradeReport} and the CLI `grade` command. */
 interface GradeReportOptions {
   /** Path to the report being graded (stored in output). */
   sourceReport?: string;
@@ -159,14 +86,19 @@ interface GradeReportOptions {
   env?: Record<string, string>;
   /** Working directory for the judge subprocess. */
   cwd?: string;
+  /** Grading adapter id. Default: `claude-code`. */
+  judgeAdapter?: "claude-code" | "codex";
   /** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
   claudeCode?: Record<string, unknown>;
+  /** Codex CLI options for the judge (nested in grading YAML under `codex`). */
+  codex?: Record<string, unknown>;
   /** Path to grading YAML when `--config` was used. */
   gradingConfigPath?: string;
   /** Inject a custom grader (for tests). */
   gradeFn?: GraderFn;
   onProgress?: (event: GradeProgressEvent) => void;
 }
+/** Progress events emitted during outcome grading. */
 type GradeProgressEvent = {
   kind: "grade-start";
   total: number;
@@ -185,13 +117,16 @@ type GradeProgressEvent = {
   totalExpectations: number;
   passedExpectations: number;
 };
+/** Pluggable grader implementation (defaults to Claude subprocess). */
 type GraderFn = (input: GraderInput) => Promise<GraderOutput>;
+/** Input passed to a grader for one repetition. */
 interface GraderInput {
   prompt: string;
   transcript: string;
   expectations: string[];
   systemInstruction?: string;
 }
+/** Parsed grader response before alignment with input expectation order. */
 interface GraderOutput {
   expectations: GradedExpectation[];
   summary: GradingSummary;
@@ -206,20 +141,14 @@ declare const EVAL_RUN_SCHEMA_VERSION = "1.0";
 declare const TRAJECTORY_SCHEMA_VERSION = "1.0";
 /** Link to the suite spec that produced a run. */
 interface SuiteReference {
-  /** Absolute or repo-relative path to the suite YAML. */
   uri?: string;
-  /** Stable suite identifier when known (e.g. case bundle name). */
   id?: string;
-  /** SHA-256 or similar hash of suite file contents. */
   contentHash?: string;
 }
 /** Harness that executed the run. */
 interface HarnessInfo {
-  /** Adapter id from suite YAML, e.g. `claude-code`. */
   adapter: string;
-  /** harness-eval package version when envelope was built. */
   frameworkVersion?: string;
-  /** Optional harness binary version (e.g. `claude -v`). */
   harnessVersion?: string;
 }
 /** CI, git, or runtime provenance for correlation in the DB. */
@@ -244,9 +173,7 @@ interface EvalProvenance {
 interface EvalRunSummary {
   cellsTotal: number;
   cellsPassed: number;
-  /** All cells passed behavioral assertion thresholds. */
   behavioralPass: boolean;
-  /** All graded expectations passed (when outcome layer present). */
   outcomePass?: boolean;
 }
 /** Identity of the judge that produced outcome grades. */
@@ -254,6 +181,8 @@ interface JudgeInfo {
   id: string;
   model?: string;
   version?: string;
+  /** Grading adapter id when known (e.g. `codex`, `claude-code`). */
+  adapter?: string;
 }
 /** Outcome grades for one repetition (built-in or external judge). */
 interface OutcomeGrades {
@@ -278,20 +207,14 @@ interface ExternalScore {
 }
 /** Optional large or vendor-specific blobs (store by reference in DB when possible). */
 interface EvalArtifacts {
-  /** Claude Code `stream-json` lines — debug only, not cross-harness. */
   rawStreamEvents?: unknown[];
-  /** URI to OTLP JSON (S3, GCS, etc.). */
   otlpTraceUri?: string;
-  /** Text transcript for judges (`trajectoryToTranscript`). */
   transcript?: string;
 }
-/**
- * One harness invocation — the unit external judges and trajectory queries use.
- */
+/** One harness invocation — the unit external judges and trajectory queries use. */
 interface EvalRepetition {
   repetitionIndex: number;
   durationMs: number;
-  /** Normalized harness session. Required when the harness completed with a view. */
   trajectory?: TrajectoryView & {
     schemaVersion: string;
   };
@@ -300,18 +223,14 @@ interface EvalRepetition {
   outcomeGrades?: OutcomeGrades;
   externalScores?: ExternalScore[];
   artifacts?: EvalArtifacts;
-  /** Interchange-format predicted tool-call trajectory. */
-  predicted_trajectory?: InterchangeToolCall[];
-  /** Full multi-turn agent trace in interchange format. */
-  agent_trace?: AgentTrace;
-  /** Session latency in seconds (interchange field). */
-  latency_in_seconds?: number;
-  /** 1 when the harness run failed, 0 on success (interchange field). */
+  /** Vertex EvaluationInstance protojson wire object. */
+  evaluationInstance?: EvaluationInstanceJson;
+  /** Vertex Trajectory*Instance protojson wire objects keyed by metric. */
+  trajectoryInstances?: TrajectoryInstancesJson;
+  /** Harness-precomputed trajectory metric scores (camelCase). */
+  harnessMetrics?: HarnessMetrics;
+  latencySeconds?: number;
   failure?: 0 | 1;
-  /** Trajectory-level metrics when reference_trajectory is provided. */
-  trajectoryMetrics?: TrajectoryMetrics;
-  /** Tool-call-level metrics when reference_trajectory is provided. */
-  toolCallMetrics?: ToolCallMetrics;
   error?: {
     message: string;
     diagnostics?: Partial<AdapterDiagnostics>;
@@ -335,24 +254,16 @@ interface EvalCellResult {
   expectations?: string[];
   cellLabel: string;
   axes?: Record<string, string>;
-  /** Reference tool-call trajectory for metric computation. */
-  reference_trajectory?: TabularToolCall[];
-  /** Human ratings keyed by metric name for judge calibration. */
-  human_ratings?: Record<string, number>;
+  /** Reference trajectory in Vertex protojson wire format. */
+  referenceTrajectory?: ProtojsonTrajectory;
+  humanRatings?: Record<string, number>;
   assertionStats: EvalAssertionStat[];
   adapterErrors: number;
-  /** Passed all behavioral assertion thresholds for this cell. */
   behavioralPass: boolean;
-  /** Passed all outcome expectations when graded; omitted if not graded. */
   outcomePass?: boolean;
   repetitions: EvalRepetition[];
 }
-/**
- * Top-level document for CI/CD pipelines, APIs, and databases.
- *
- * This is the interchange format your storage layer should target — not
- * {@link import("./stream").StreamEvent} or OTLP traces.
- */
+/** Top-level document for CI/CD pipelines, APIs, and databases. */
 interface EvalRunEnvelope {
   schemaVersion: typeof EVAL_RUN_SCHEMA_VERSION;
   runId: string;
@@ -365,12 +276,15 @@ interface EvalRunEnvelope {
   cells: EvalCellResult[];
 }
 interface BuildEvalRunEnvelopeOptions {
-  /** UUID for this run; generated if omitted. */
+  /** Override envelope runId; defaults to a random UUID. */
   runId?: string;
+  /** Link to the suite YAML that produced the run. */
   suite?: SuiteReference;
+  /** Harness adapter metadata; adapter defaults to `"claude-code"`. */
   harness?: Partial<HarnessInfo>;
+  /** CI, git, and runtime provenance for correlation. */
   provenance?: EvalProvenance;
-  /** Merge outcome grades from `gradeReport()` or compatible structure. */
+  /** Outcome grades to merge from a grader run. */
   grading?: {
     gradedAt?: string;
     sourceReport?: string;
@@ -386,9 +300,9 @@ interface BuildEvalRunEnvelopeOptions {
     }>;
     judge?: JudgeInfo;
   };
-  /** Include transcript in each repetition's artifacts. Default true. */
+  /** Include text transcript artifact (default true). */
   includeTranscript?: boolean;
-  /** Include raw stream events when adapter provides them. Default false. */
+  /** Include raw stream-json events (default false; debug only). */
   includeRawStreamEvents?: boolean;
 }
 //#endregion
@@ -506,6 +420,7 @@ declare function getDefaultAdapter(): HarnessAdapter;
 declare const DEFAULT_REPETITIONS = 5;
 /** Default assertion pass-rate threshold when `threshold` is omitted. */
 declare const DEFAULT_THRESHOLD = 1;
+/** Injectable adapter run function (used by tests to stub harness I/O). */
 type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Promise<AdapterResult>;
 /**
  * Build the effective adapter config for one (suite, case, cell).
@@ -513,8 +428,21 @@ type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Pro
  * Merge order (later wins): defaultConfig < case.config < cell.config.
  */
 declare function mergeConfig(suite: TestSuite, testCase: TestCase, cell: MatrixCell): BaseAdapterConfig & Record<string, unknown>;
+/** Effective repetition count for a case (`case.repetitions` or default). */
 declare function getRepetitions(testCase: TestCase): number;
+/**
+ * Run one repetition: invoke the adapter, evaluate assertions, capture errors.
+ *
+ * Adapter failures are returned as {@link RepetitionResult.error} rather than
+ * thrown so the suite runner can continue other reps and report adapter error counts.
+ */
 declare function runRepetition(testCase: TestCase, _cell: MatrixCell, config: BaseAdapterConfig & Record<string, unknown>, repetitionIndex: number, run: AdapterRunFn, signal?: AbortSignal): Promise<RepetitionResult>;
+/**
+ * Roll up repetition results into a {@link CellReport}.
+ *
+ * Adapter errors reduce `evaluatedCount` but do not fail the cell by
+ * themselves — only assertion threshold misses mark a cell as failed.
+ */
 declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions: RepetitionResult[]): CellReport;
 //#endregion
 //#region src/runner/limit.d.ts
@@ -536,6 +464,106 @@ declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions
 type LimitedRunner = <T>(fn: () => Promise<T>) => Promise<T>;
 declare function createLimit(max: number): LimitedRunner;
 //#endregion
+//#region src/cli/commands/envelope.d.ts
+/** Supported `--projection` values for envelope output. */
+type EnvelopeProjection = "envelope" | "trajectory" | "instances";
+//#endregion
+//#region src/pipeline/resolve-inputs.d.ts
+type PipelineStepName = "run" | "grade" | "envelope";
+/** CLI overrides for pipeline artifact paths (take precedence over YAML). */
+interface PipelineCliOverrides {
+  run?: {
+    output?: string;
+    maxConcurrent?: number;
+  };
+  grade?: {
+    input?: string;
+    output?: string;
+    maxConcurrent?: number;
+  };
+  envelope?: {
+    report?: string;
+    grading?: string;
+    output?: string;
+    projection?: EnvelopeProjection;
+  };
+}
+/** Resolved paths for the harness run step. */
+interface ResolvedPipelineRun {
+  output: string;
+  maxConcurrent?: number;
+}
+/** Resolved input (suite report) and output (grading JSON) for the grade step. */
+interface ResolvedPipelineGrade {
+  input: string;
+  output: string;
+  maxConcurrent?: number;
+}
+/** Resolved artifact paths for the envelope export step. */
+interface ResolvedPipelineEnvelope {
+  report: string;
+  grading?: string;
+  output: string;
+  projection: EnvelopeProjection;
+  includeRawStreamEvents: boolean;
+  noTranscript: boolean;
+}
+/** Fully resolved pipeline inputs for one or more enabled steps. */
+interface ResolvedPipeline {
+  suitePath: string;
+  run?: ResolvedPipelineRun;
+  grade?: ResolvedPipelineGrade;
+  envelope?: ResolvedPipelineEnvelope;
+}
+/** Inputs for {@link resolvePipelineInputs}. */
+interface ResolvePipelineInputsOptions {
+  suitePath: string;
+  suiteDir: string;
+  pipeline: PipelineConfig;
+  steps: PipelineStepName[];
+  executed?: {
+    run?: {
+      output: string;
+    };
+    grade?: {
+      input: string;
+      output: string;
+    };
+  };
+  overrides?: PipelineCliOverrides;
+}
+/** Resolve absolute paths for enabled pipeline steps. */
+declare function resolvePipelineInputs(options: ResolvePipelineInputsOptions): Promise<ResolvedPipeline>;
+/**
+ * Resolve a grading artifact path from a unified suite's `pipeline:` block.
+ *
+ * Used by `harness-eval envelope --suite` when `--grading` is omitted (spec C-7).
+ * Checks `pipeline.envelope.grading` then default `pipeline.grade.output` on disk.
+ */
+declare function resolveGradingArtifactFromSuite(suitePath: string): Promise<string | undefined>;
+//#endregion
+//#region src/pipeline/run-pipeline.d.ts
+/** Options for {@link runPipeline} (CLI flags and progress callbacks). */
+interface RunPipelineOptions {
+  /** Comma-separated subset of configured steps (e.g. `run,grade`). */
+  steps?: string;
+  maxConcurrent?: number;
+  overrides?: PipelineCliOverrides;
+  onRunProgress?: ProgressCallback;
+  onGradeProgress?: GradeReportOptions["onProgress"];
+  /** Framework version stamped on envelope export. */
+  frameworkVersion?: string;
+}
+/** Outcome of a pipeline run including per-step exit semantics. */
+interface RunPipelineResult {
+  /** 0 pass, 1 eval/grade/envelope failure, 2 load error (thrown before return). */
+  exitCode: number;
+  stepsRun: PipelineStepName[];
+  runReport?: SuiteReport;
+}
+/** Execute configured pipeline steps in order; stop on first failure. */
+declare function runPipeline(doc: SuiteDocument, options?: RunPipelineOptions): Promise<RunPipelineResult>;
+//#endregion
 //#region src/otel/types.d.ts
 /**
  * Minimal OTLP JSON types for trace export.
@@ -543,24 +571,30 @@ declare function createLimit(max: number): LimitedRunner;
  * Shapes follow OTLP/HTTP JSON Protobuf encoding (lowerCamelCase field names).
  * @see https://opentelemetry.io/docs/specs/otlp/
  */
+/** OTLP ExportTraceServiceRequest root — batch of resource spans. */
 interface ExportTraceServiceRequest {
   resourceSpans: ResourceSpans[];
 }
+/** Resource-attributed span group in an export batch. */
 interface ResourceSpans {
   resource: Resource;
   scopeSpans: ScopeSpans[];
 }
+/** OTLP resource descriptor (service.name, agent metadata). */
 interface Resource {
   attributes: KeyValue[];
 }
+/** Spans emitted by one instrumentation scope within a resource. */
 interface ScopeSpans {
   scope: InstrumentationScope;
   spans: Span[];
 }
+/** Instrumentation library identity (name + optional version). */
 interface InstrumentationScope {
   name: string;
   version?: string;
 }
+/** One span in OTLP JSON encoding (nanosecond timestamps as strings). */
 interface Span {
   traceId: string;
   spanId: string;
@@ -572,14 +606,17 @@ interface Span {
   attributes: KeyValue[];
   status?: SpanStatus;
 }
+/** OTLP span status (OK, ERROR, or UNSET). */
 interface SpanStatus {
   code: number;
   message?: string;
 }
+/** Key-value attribute pair on a span or resource. */
 interface KeyValue {
   key: string;
   value: AnyValue;
 }
+/** Discriminated OTLP attribute value (one of the typed fields set). */
 interface AnyValue {
   stringValue?: string;
   boolValue?: boolean;
@@ -595,6 +632,7 @@ interface ArrayValue {
 interface KeyValueList {
   values: KeyValue[];
 }
+/** Options passed to {@link trajectoryToOtlp} / {@link emitOtel}. */
 interface EmitOtelOptions {
   /** User prompt for the first `gen_ai.input.messages` entry. */
   prompt?: string;
@@ -627,13 +665,20 @@ interface EmitOtelOptions {
  * ```
  */
 declare function trajectoryToOtlp(view: TrajectoryView, options?: EmitOtelOptions): ExportTraceServiceRequest;
-/** Alias matching the implementation plan naming. */
+/** Alias for {@link trajectoryToOtlp} — matches implementation plan naming. */
 declare const emitOtel: typeof trajectoryToOtlp;
 //#endregion
 //#region src/grader/grade-report.d.ts
+/**
+ * Grade every repetition in a {@link SuiteReport} that has expectations.
+ *
+ * Expectations come from inline case fields or an optional sidecar YAML/JSON
+ * map. Runs are concurrent under {@link GradeReportOptions.maxConcurrent}.
+ */
 declare function gradeReport(report: SuiteReport, options?: GradeReportOptions): Promise<SuiteGradingReport>;
 //#endregion
 //#region src/grader/resolve-grade-options.d.ts
+/** CLI flag overrides for grading (take precedence over grading YAML). */
 interface GradeCliOverrides {
   model?: string;
   binary?: string;
@@ -648,9 +693,16 @@ interface GradeCliOverrides {
 declare function resolveGradeOptions(fileConfig?: GradingConfig, cli?: GradeCliOverrides, configPath?: string): GradeReportOptions;
 //#endregion
 //#region src/grader/transcript.d.ts
+/**
+ * Render a {@link TrajectoryView} as markdown for LLM graders.
+ *
+ * Tool results are truncated at {@link MAX_RESULT_CHARS} to keep judge
+ * prompts within reasonable token limits.
+ */
 declare function trajectoryToTranscript(view: TrajectoryView, prompt?: string): string;
 //#endregion
 //#region src/grader/claude-grader.d.ts
+/** Options for {@link createClaudeGrader} / {@link runClaudeGrader}. */
 interface ClaudeGraderOptions {
   binary?: string;
   model?: string;
@@ -659,14 +711,36 @@ interface ClaudeGraderOptions {
   cwd?: string;
   claudeCode?: ClaudeCodeOptions;
 }
+/** Factory returning a {@link GraderFn} bound to subprocess options. */
 declare function createClaudeGrader(options?: ClaudeGraderOptions): GraderFn;
 //#endregion
+//#region src/grader/codex-grader.d.ts
+/** Options for {@link createCodexGrader} / {@link runCodexGrader}. */
+interface CodexGraderOptions {
+  binary?: string;
+  model?: string;
+  timeoutMs?: number;
+  env?: Record<string, string>;
+  cwd?: string;
+  codex?: CodexOptions;
+}
+/** Factory returning a {@link GraderFn} bound to subprocess options. */
+declare function createCodexGrader(options?: CodexGraderOptions): GraderFn;
+//#endregion
 //#region src/grader/format-console.d.ts
+/**
+ * Format a {@link SuiteGradingReport} for terminal output.
+ *
+ * @param color When true, emit ANSI status colors (default for TTY console).
+ */
 declare function formatGradingConsole(report: SuiteGradingReport, color?: boolean): string;
+/** True when every graded rep passed all expectations without grader errors. */
 declare function gradingReportPassed(report: SuiteGradingReport): boolean;
 //#endregion
 //#region src/reporter/types.d.ts
+/** Output format selector for {@link formatReport}. */
 type ReportFormat = "console" | "markdown" | "json";
+/** Options for suite report formatting. */
 interface ReporterOptions {
   format: ReportFormat;
   baseline?: SuiteReport;
@@ -674,52 +748,222 @@ interface ReporterOptions {
 }
 //#endregion
 //#region src/reporter/index.d.ts
+/**
+ * Format a {@link SuiteReport} for console, markdown, or JSON output.
+ *
+ * JSON format bypasses the renderable intermediate model and serializes the
+ * report directly. Console and markdown apply optional baseline deltas.
+ */
 declare function formatReport(report: SuiteReport, options: ReporterOptions): string;
 //#endregion
 //#region src/eval-record/build.d.ts
 /**
  * Convert a {@link SuiteReport} (and optional grading) into a versioned
  * {@link EvalRunEnvelope} for storage or API handoff.
+ *
+ * @param report - Runner output for one suite execution.
+ * @param options - Provenance, grading merge, and artifact inclusion flags.
+ * @returns A fully populated envelope with protojson interchange fields on each repetition.
  */
 declare function buildEvalRunEnvelope(report: SuiteReport, options?: BuildEvalRunEnvelopeOptions): EvalRunEnvelope;
-/** Build envelope from on-disk report + optional grading JSON paths. */
+/**
+ * Build an envelope from on-disk runner and grader JSON artifacts.
+ *
+ * Reads `reportPath` as a {@link SuiteReport}. When `gradingPath` is set, merges
+ * outcome grades from a {@link SuiteGradingReport}. When `suitePath` is set,
+ * attaches suite URI and SHA-256 content hash for reproducibility.
+ *
+ * @param reportPath - Path to the suite run report JSON from `harness-eval run`.
+ * @param options - Same build options as {@link buildEvalRunEnvelope}, plus file paths.
+ */
 declare function buildEvalRunEnvelopeFromFiles(reportPath: string, options?: BuildEvalRunEnvelopeOptions & {
   gradingPath?: string;
   suitePath?: string;
 }): Promise<EvalRunEnvelope>;
 //#endregion
-//#region src/metrics/tool-calls.d.ts
-interface ToolCallMetricOptions {
-  useStrictStringMatch?: boolean;
-}
-type ToolCallInput = InterchangeToolCall | TabularToolCall | {
+//#region src/eval-interchange/enrich.d.ts
+/**
+ * Attach Vertex protojson interchange fields to one {@link EvalRepetition}.
+ *
+ * When no trajectory exists (adapter error), sets `failure: 1` and skips
+ * protojson payloads. Trajectory instances and harness metrics are only
+ * computed when the suite defines a non-empty reference trajectory.
+ *
+ * @param repetition - Base repetition from the runner (trajectory, assertions, grades).
+ * @param options.prompt - Case prompt for EvaluationInstance.
+ * @param options.reference - Suite reference trajectory config, if any.
+ */
+declare function enrichRepetitionWithProtojson(repetition: EvalRepetition, options?: {
+  prompt?: string;
+  reference?: ReferenceTrajectoryConfig;
+}): EvalRepetition;
+//#endregion
+//#region src/eval-interchange/protojson/evaluation-instance.d.ts
+/**
+ * Build an EvaluationInstance protojson object from harness strings.
+ *
+ * Omitted fields are excluded from the output object rather than set to
+ * empty wrappers — protojson omits unset optional fields.
+ *
+ * @param options.prompt - Case prompt sent to the agent.
+ * @param options.response - Final agent response from the trajectory.
+ * @param options.reference - Optional reference answer text (rare in harness eval).
+ */
+declare function toEvaluationInstance(options: {
+  prompt?: string;
+  response?: string;
+  reference?: string;
+}): EvaluationInstanceJson;
+//#endregion
+//#region src/eval-interchange/protojson/harness-metrics.d.ts
+/** Suite YAML reference step shape accepted by metric computation. */
+type ReferenceStep$1 = {
   tool_name: string;
   tool_input: unknown;
 };
-declare function toolCallValid(toolCall: ToolCallInput): number;
-declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
-declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
-declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
-declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
+/**
+ * Compute trajectory metrics and map snake_case keys to Vertex camelCase.
+ *
+ * When `referenceToolNameMode` is `"bare"`, both predicted and reference tool
+ * names are stripped to the suffix after the last `__` so suite reference steps
+ * authored with bare names (e.g. `ListLandingZones`) match harness MCP names
+ * (e.g. `mcp__plugin__ListLandingZones`).
+ *
+ * @param predicted - Tool calls from the harness trajectory view.
+ * @param reference - Reference steps from suite YAML.
+ * @param options.referenceToolNameMode - Name normalization mode from suite YAML.
+ */
+declare function toHarnessMetrics(predicted: ToolCall[], reference: ReferenceStep$1[], options?: {
+  referenceToolNameMode?: ReferenceToolNameMode;
+}): HarnessMetrics;
+//#endregion
+//#region src/eval-interchange/protojson/trajectory-instances.d.ts
+type ReferenceStep = {
+  tool_name: string;
+  tool_input: unknown;
+};
+/**
+ * Build all Trajectory*Instance payloads for one predicted/reference pair.
+ *
+ * Pair metrics (exact, in-order, any-order, precision, recall) share the
+ * same trajectory pair; single-tool-use omits the reference trajectory
+ * per Vertex API shape.
+ */
+declare function toTrajectoryInstances(options: {
+  predicted: ToolCall[];
+  reference: ReferenceStep[];
+  referenceToolNameMode?: ReferenceToolNameMode;
+}): TrajectoryInstancesJson;
 //#endregion
 //#region src/eval-interchange/projections.d.ts
+/**
+ * Trajectory projection — all repetitions in the envelope as dataset rows.
+ */
 declare function toTrajectory(envelope: EvalRunEnvelope): EvalDatasetRow[];
-declare function toProtoInstances(envelope: EvalRunEnvelope): ProtoTrajectoryInstance[];
-declare function toAgentTrace(envelope: EvalRunEnvelope): AgentTrace[];
-declare function enrichRepetitionWithInterchange(repetition: EvalRepetition, referenceTrajectory?: TabularToolCall[]): EvalRepetition;
+/**
+ * Instances projection — all trajectory metric instances as JSONL rows.
+ */
+declare function toInstancesJsonl(envelope: EvalRunEnvelope): InstancesJsonlRow[];
 //#endregion
 //#region src/metrics/trajectory.d.ts
-type TrajectoryInput = InterchangeToolCall[] | TabularToolCall[] | Array<{
+/**
+ * Trajectory-level metrics for comparing predicted and reference tool-call sequences.
+ *
+ * Aligns with Vertex AI EvaluationService trajectory metrics (exact match,
+ * in-order, any-order, precision, recall, single tool use). Tool calls are
+ * compared by `(tool_name, serialized tool_input)` identity after normalization.
+ *
+ * Binary metrics return 0 or 1; precision and recall return fractions in [0, 1].
+ */
+/** Canonical wire tool call used internally for comparison. */
+interface WireToolCall {
   tool_name: string;
-  tool_input: unknown;
+  tool_input: string;
+}
+/** All trajectory metric scores for one predicted/reference pair. */
+interface TrajectoryMetrics {
+  trajectory_exact_match: number;
+  trajectory_in_order_match: number;
+  trajectory_any_order_match: number;
+  trajectory_precision: number;
+  trajectory_recall: number;
+  trajectory_single_tool_use: number;
+}
+/** Input accepted by trajectory metrics — wire or harness/YAML shapes. */
+type TrajectoryInput = WireToolCall[] | Array<{
+  tool_name: string;
+  tool_input: unknown | string;
 }>;
+/** Exact sequence equality after normalization. */
 declare function trajectoryExactMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/** Reference is a subsequence of predicted (order preserved, extras allowed). */
 declare function trajectoryInOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/** Same multiset of tool calls; length must match. */
 declare function trajectoryAnyOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/**
+ * Fraction of predicted tool calls that appear in reference (multiset).
+ *
+ * Returns 1 when both trajectories are empty.
+ */
 declare function trajectoryPrecision(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/**
+ * Fraction of reference tool calls matched in predicted (multiset recall).
+ *
+ * Returns 1 when both the reference and predicted trajectories are empty.
+ */
 declare function trajectoryRecall(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/** Both trajectories have exactly one call and they match. */
 declare function trajectorySingleToolUse(predicted: TrajectoryInput, reference: TrajectoryInput): number;
+/** Compute all trajectory metrics in one pass. */
 declare function computeTrajectoryMetrics(predicted: TrajectoryInput, reference: TrajectoryInput): TrajectoryMetrics;
 //#endregion
-export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, AgentConfig, AgentEvent, AgentTrace, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, ContentPart, ConversationTurn, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, InterchangeToolCall, InterchangeTrajectory, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, Predicate, ProgressCallback, ProgressEvent, ProtoTrajectoryInstance, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TabularToolCall, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolCallMetrics, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryMetrics, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createLimit, emitOtel, enrichRepetitionWithInterchange, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, 
gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, runRepetition, runSuite, toAgentTrace, toProtoInstances, toTrajectory, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
+//#region src/metrics/tool-calls.d.ts
+/** Options for parameter value comparison. */
+interface ToolCallMetricOptions {
+  /** When true, compare serialized JSON strictly (reserved for future semantics). */
+  useStrictStringMatch?: boolean;
+}
+/** Aggregated tool-call metric scores (each 0..1). */
+interface ToolCallMetrics {
+  tool_call_valid: number;
+  tool_name_match: number;
+  tool_parameter_key_match: number;
+  tool_parameter_kv_match: number;
+}
+type ToolCallInput = TrajectoryInput[number];
+/**
+ * Whether a predicted tool call is well-formed (non-empty name, parseable JSON input).
+ *
+ * @returns 1 when valid, 0 otherwise.
+ */
+declare function toolCallValid(toolCall: ToolCallInput): number;
+/**
+ * Whether predicted and reference tool names match exactly.
+ *
+ * @returns 1 on match, 0 otherwise.
+ */
+declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
+/**
+ * Whether parameter key sets match (keys are sorted before comparison, so key order does not matter).
+ *
+ * Requires matching tool names first. Returns 0 when args are not objects.
+ */
+declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
+/**
+ * Whether all reference parameter key-value pairs match in the predicted call.
+ *
+ * Requires {@link toolParameterKeyMatch} first. Only keys present in reference
+ * are checked (predicted may have extra keys).
+ */
+declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
+/**
+ * Average tool-call metrics across index-aligned predicted/reference pairs.
+ *
+ * Denominator is `max(predicted.length, reference.length, 1)`. Missing
+ * predicted calls at an index are skipped for pair metrics; validity still
+ * counts when a predicted call exists.
+ */
+declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
+//#endregion
+export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, EvaluationInstanceJson, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, HarnessMetrics, InstanceData, InstancesJsonlRow, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, type PipelineConfig, Predicate, ProgressCallback, ProgressEvent, ProtojsonToolCall, ProtojsonTrajectory, ReferenceToolNameMode, ReferenceTrajectoryConfig, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteDocument, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryInstanceMetricKey, TrajectoryInstancesJson, TrajectoryPairInstanceJson, TrajectorySingleToolUseInstanceJson, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createCodexGrader, 
createLimit, emitOtel, enrichRepetitionWithProtojson, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, loadSuiteDocument, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, resolveGradingArtifactFromSuite, resolvePipelineInputs, runPipeline, runRepetition, runSuite, toEvaluationInstance, toHarnessMetrics, toInstancesJsonl, toTrajectory, toTrajectoryInstances, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
 //# sourceMappingURL=index.d.ts.map