@plaited/agent-eval-harness 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -91,6 +91,7 @@ const extractFromRaw = (rawOutput: RawOutput, parser: ReturnType<typeof createOu
     output: finalOutput,
     trajectory,
     toolErrors: toolErrors || !!rawOutput.error,
+    metadata: rawOutput.metadata,
     timing: rawOutput.timing,
     ...(rawOutput.error && { error: rawOutput.error }),
   }
@@ -186,7 +187,6 @@ export const extract = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness extract [raw.jsonl] --schema <schema.json> [options]

@@ -235,7 +235,6 @@ export const format = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness format [results.jsonl] [options]

@@ -51,6 +51,7 @@ export const runGrade = async (
     output: extracted.output,
     hint: extracted.hint,
     trajectory: extracted.trajectory,
+    metadata: extracted.metadata,
   })

   const graded: GradedResult = {
@@ -110,7 +111,6 @@ export const grade = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness grade [extracted.jsonl] --grader <grader> [options]

@@ -15,7 +15,7 @@
  */

 // Commands
-export { compare } from './compare.ts'
+export { type CompareStrategy, compare, type ExtendedCompareConfig, runCompare } from './compare.ts'
 export { extract } from './extract.ts'
 export { format } from './format.ts'
 export { grade } from './grade.ts'
@@ -27,6 +27,7 @@ export type {
   ComparisonGraderResult,
   ComparisonRanking,
   ComparisonResult,
+  ComparisonRunData,
   ExtractConfig,
   ExtractedResult,
   FormatConfig,
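For programmatic consumers, the newly public symbols sit alongside the existing command exports. A minimal sketch, assuming the package entry point re-exports this commands module (`runCompare`'s signature lives in `compare.ts` and is not shown in this diff):

```ts
import {
  compare,
  runCompare,
  type CompareStrategy,
  type ComparisonRunData,
  type ExtendedCompareConfig,
} from '@plaited/agent-eval-harness'
```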
@@ -26,6 +26,8 @@ export type RawOutput = {
   input: string | string[]
   /** Grader context hint */
   hint?: string
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Raw output lines from the agent (JSON strings) */
   rawLines: string[]
   /** Timing metadata */
@@ -58,6 +60,8 @@ export type ExtractedResult = {
   trajectory: TrajectoryStep[]
   /** Whether tool errors were detected */
   toolErrors: boolean
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Timing metadata */
   timing: {
     start: number
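The new field is a verbatim pass-through: whatever object a prompt line carries under `metadata` is copied onto the `RawOutput` and again onto the `ExtractedResult` handed to graders. A sketch of the slice of an extracted record this diff touches (values hypothetical, other fields omitted):

```ts
// One extracted record, shown as a TS object literal for readability.
// Only fields visible in this diff; values are illustrative.
const extracted = {
  trajectory: [],
  toolErrors: false,
  metadata: { category: 'math', difficulty: 'easy' }, // new in 0.6.0, copied from the prompt
  timing: { start: 1_700_000_000_000 }, // full timing shape defined elsewhere in the package
}
```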
@@ -158,6 +162,28 @@ export type LabeledRun = {
   path: string
 }

+/**
+ * Run data provided to comparison graders.
+ *
+ * @remarks
+ * Extended run data includes optional fields that built-in graders use:
+ * - `score`: Grader result if the run was previously graded
+ * - `duration`: Total duration from timing
+ * - `toolErrors`: Whether tool errors occurred
+ */
+export type ComparisonRunData = {
+  /** Final agent output */
+  output: string
+  /** Execution trajectory (optional, varies by adapter) */
+  trajectory?: TrajectoryStep[]
+  /** Grader score (if run was graded) */
+  score?: GraderResult
+  /** Total duration in milliseconds */
+  duration?: number
+  /** Whether tool errors occurred */
+  toolErrors?: boolean
+}
+
 /**
  * Input to comparison grader function.
  *
@@ -172,8 +198,10 @@ export type ComparisonGraderInput = {
   input: string | string[]
   /** Grader context hint */
   hint?: string
+  /** Optional metadata from original prompt */
+  metadata?: Record<string, unknown>
   /** Results keyed by run label */
-  runs: Record<string, { output: string; trajectory?: TrajectoryStep[] }>
+  runs: Record<string, ComparisonRunData>
 }

 /**
@@ -105,7 +105,7 @@ const runShell = async (
  */
 export const runPipeline = async (
   config: RunConfig,
-  prompts: Array<{ id: string; input: string | string[]; hint?: string }>,
+  prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>,
   outputPath?: string,
 ): Promise<void> => {
   const {
@@ -181,6 +181,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines,
       timing: {
         start: startTime,
@@ -224,6 +225,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines: allLines,
       timing: {
         start: startTime,
@@ -267,6 +269,7 @@ export const runPipeline = async (
       id: promptCase.id,
       input: promptCase.input,
       hint: promptCase.hint,
+      metadata: promptCase.metadata,
       rawLines: allLines,
       timing: {
         start: startTime,
@@ -331,7 +334,6 @@ export const run = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness run [prompts.jsonl] [options]

@@ -383,7 +385,7 @@ Examples:

   // Load prompts from file or stdin
   const promptsPath = positionals[0]
-  let prompts: Array<{ id: string; input: string | string[]; hint?: string }>
+  let prompts: Array<{ id: string; input: string | string[]; hint?: string; metadata?: Record<string, unknown> }>

   if (promptsPath) {
     prompts = await loadPrompts(promptsPath)
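Prompt files can now attach free-form metadata per case. A hypothetical `prompts.jsonl`, written here from TypeScript to match the inline prompt type above (ids and values illustrative):

```ts
// Each line is one JSON object matching { id, input, hint?, metadata? }.
const prompts = [
  { id: 'math-001', input: 'What is 6 * 7?', hint: '42', metadata: { category: 'math', difficulty: 'easy' } },
  { id: 'code-002', input: ['Write a fizzbuzz function', 'Now add tests'], metadata: { category: 'coding', tags: ['multi-turn'] } },
]
await Bun.write('prompts.jsonl', prompts.map((p) => JSON.stringify(p)).join('\n'))
```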
@@ -40,12 +40,20 @@ const resolvePath = (path: string): string => {
 // Executable Grader
 // ============================================================================

-/** Input format for executable graders (stdin JSON) */
+/**
+ * Input format for executable graders (stdin JSON).
+ *
+ * @remarks
+ * The metadata field contains arbitrary key-value pairs from the original
+ * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
+ * category-specific grading logic or filter calibration samples.
+ */
 type ExecGraderInput = {
   input: string | string[]
   output: string
   hint?: string
   trajectory?: TrajectoryStep[]
+  metadata?: Record<string, unknown>
 }

 /**
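A sketch of an executable grader honoring that stdin contract, using Bun since the package itself targets it. The reply shape is an assumption: a GraderResult-like JSON object with a 0-1 `score` and a `pass` flag, which this diff does not confirm:

```ts
#!/usr/bin/env bun
// Reads ExecGraderInput as JSON from stdin; metadata drives category-specific logic.
type ExecGraderInput = {
  input: string | string[]
  output: string
  hint?: string
  metadata?: Record<string, unknown>
}

const graderInput = (await Bun.stdin.json()) as ExecGraderInput

// Stricter pass threshold for coding prompts (assumed convention, per the @remarks above).
const threshold = graderInput.metadata?.category === 'coding' ? 0.8 : 0.5
const score = graderInput.hint && graderInput.output.includes(graderInput.hint) ? 1 : 0

// Assumed output contract: GraderResult-shaped JSON on stdout.
console.log(JSON.stringify({ score, pass: score >= threshold }))
```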
@@ -99,10 +99,8 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
   // List mode
   if (list) {
     const names = Object.keys(SCHEMA_REGISTRY)
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log('Available schemas:')
     for (const name of names) {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(` - ${name}`)
     }
     return names
@@ -123,7 +121,6 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
     if (outputPath) {
       await Bun.write(resolvePath(outputPath), output)
     } else {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(output)
     }

@@ -154,15 +151,12 @@ export const runSchemas = async (config: SchemasConfig): Promise<Record<string,
     if (outputPath) {
       await Bun.write(resolvePath(outputPath), output)
     } else {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(output)
     }
   } else {
     // Default: list schemas
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log('Available schemas (use --json to export):')
     for (const name of Object.keys(allSchemas)) {
-      // biome-ignore lint/suspicious/noConsole: CLI stdout output
       console.log(` - ${name}`)
     }
   }
@@ -193,7 +187,6 @@ export const schemasCli = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness schemas [schema-name] [options]

@@ -229,12 +229,14 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
  * User-provided graders implement this interface to score agent outputs.
  * - `input` is the original prompt (string or array for multi-turn)
  * - `hint` provides grader context (renamed from `expected`)
+ * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
  */
 export type Grader = (params: {
   input: string | string[]
   output: string
   hint?: string
   trajectory?: TrajectoryStep[]
+  metadata?: Record<string, unknown>
 }) => Promise<GraderResult>

 // ============================================================================
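A programmatic grader receives the same field. A minimal sketch; the returned fields (`score`, `pass`) are assumptions drawn from the quality-metric comments elsewhere in this diff, so the cast stands in for the real `GraderResult` shape:

```ts
import type { Grader } from '@plaited/agent-eval-harness'

// Sketch: vary grading strictness using prompt metadata.
const grader: Grader = async ({ output, hint, metadata }) => {
  const strict = metadata?.difficulty === 'hard'
  const matched = hint ? output.includes(hint) : output.length > 0
  const score = matched ? 1 : strict ? 0 : 0.25 // partial credit outside strict mode
  return { score, pass: score >= 0.5 } as Awaited<ReturnType<Grader>> // assumed GraderResult fields
}
```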
@@ -556,3 +558,212 @@ export const ValidationResultSchema = z.object({

 /** Validation result type */
 export type ValidationResult = z.infer<typeof ValidationResultSchema>
+
+// ============================================================================
+// Comparison Report Schemas
+// ============================================================================
+
+/**
+ * Score distribution histogram for quality analysis.
+ *
+ * @remarks
+ * Buckets divide the 0-1 score range into 5 equal bins.
+ */
+export const ScoreDistributionSchema = z.object({
+  '0.0-0.2': z.number(),
+  '0.2-0.4': z.number(),
+  '0.4-0.6': z.number(),
+  '0.6-0.8': z.number(),
+  '0.8-1.0': z.number(),
+})
+
+/** Score distribution type */
+export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>
+
+/**
+ * Quality metrics for a single run in comparison.
+ */
+export const QualityMetricsSchema = z.object({
+  /** Mean grader score (0-1) */
+  avgScore: z.number(),
+  /** Percentage of pass=true results */
+  passRate: z.number(),
+  /** Count of passing results */
+  passCount: z.number(),
+  /** Count of failing results */
+  failCount: z.number(),
+  /** Score distribution histogram */
+  scoreDistribution: ScoreDistributionSchema,
+})
+
+/** Quality metrics type */
+export type QualityMetrics = z.infer<typeof QualityMetricsSchema>
+
+/**
+ * Latency statistics for performance analysis.
+ */
+export const LatencyStatsSchema = z.object({
+  /** 50th percentile (median) in milliseconds */
+  p50: z.number(),
+  /** 90th percentile in milliseconds */
+  p90: z.number(),
+  /** 99th percentile in milliseconds */
+  p99: z.number(),
+  /** Mean latency in milliseconds */
+  mean: z.number(),
+  /** Minimum latency in milliseconds */
+  min: z.number(),
+  /** Maximum latency in milliseconds */
+  max: z.number(),
+})
+
+/** Latency stats type */
+export type LatencyStats = z.infer<typeof LatencyStatsSchema>
+
+/**
+ * Performance metrics for a single run in comparison.
+ */
+export const PerformanceMetricsSchema = z.object({
+  /** End-to-end latency statistics */
+  latency: LatencyStatsSchema,
+  /** Time to first response statistics (optional, not all adapters support) */
+  firstResponse: LatencyStatsSchema.optional(),
+  /** Sum of all run durations in milliseconds */
+  totalDuration: z.number(),
+})
+
+/** Performance metrics type */
+export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
+
+/**
+ * Reliability metrics for a single run in comparison.
+ */
+export const ReliabilityMetricsSchema = z.object({
+  /** Count of runs with toolErrors=true */
+  toolErrors: z.number(),
+  /** Percentage of runs with tool errors */
+  toolErrorRate: z.number(),
+  /** Count of runs that hit timeout */
+  timeouts: z.number(),
+  /** Percentage of runs that hit timeout */
+  timeoutRate: z.number(),
+  /** Percentage of runs that completed successfully */
+  completionRate: z.number(),
+})
+
+/** Reliability metrics type */
+export type ReliabilityMetrics = z.infer<typeof ReliabilityMetricsSchema>
+
+/**
+ * Trajectory info for a single run in comparison.
+ */
+export const TrajectoryInfoSchema = z.object({
+  /** Trajectory richness level */
+  richness: TrajectoryRichnessSchema,
+  /** Average trajectory steps per run */
+  avgStepCount: z.number(),
+})
+
+/** Trajectory info type */
+export type TrajectoryInfo = z.infer<typeof TrajectoryInfoSchema>
+
+/**
+ * Per-prompt comparison entry for head-to-head drill-down.
+ */
+export const PromptComparisonSchema = z.object({
+  /** Prompt identifier */
+  id: z.string(),
+  /** Run label of the winner, or null if tie */
+  winner: z.string().nullable(),
+  /** Scores by run label */
+  scores: z.record(z.string(), z.number()),
+  /** Latencies by run label in milliseconds */
+  latencies: z.record(z.string(), z.number()),
+  /** Whether each run had errors */
+  hadErrors: z.record(z.string(), z.boolean()),
+})
+
+/** Prompt comparison type */
+export type PromptComparison = z.infer<typeof PromptComparisonSchema>
+
+/**
+ * Pairwise win/loss/tie statistics between two runs.
+ */
+export const PairwiseComparisonSchema = z.object({
+  /** First run label */
+  runA: z.string(),
+  /** Second run label */
+  runB: z.string(),
+  /** Number of prompts where A won */
+  aWins: z.number(),
+  /** Number of prompts where B won */
+  bWins: z.number(),
+  /** Number of prompts where A and B tied */
+  ties: z.number(),
+})
+
+/** Pairwise comparison type */
+export type PairwiseComparison = z.infer<typeof PairwiseComparisonSchema>
+
+/**
+ * Head-to-head comparison section.
+ */
+export const HeadToHeadSchema = z.object({
+  /** Per-prompt breakdown for drill-down */
+  prompts: z.array(PromptComparisonSchema),
+  /** Pairwise win rates between runs */
+  pairwise: z.array(PairwiseComparisonSchema),
+})
+
+/** Head-to-head type */
+export type HeadToHead = z.infer<typeof HeadToHeadSchema>
+
+/**
+ * Metadata for the comparison report.
+ */
+export const ComparisonMetaSchema = z.object({
+  /** ISO timestamp when report was generated */
+  generatedAt: z.string(),
+  /** Run labels included in comparison */
+  runs: z.array(z.string()),
+  /** Total prompts compared */
+  promptCount: z.number(),
+  /** Prompts where all runs completed */
+  promptsWithAllRuns: z.number(),
+})
+
+/** Comparison meta type */
+export type ComparisonMeta = z.infer<typeof ComparisonMetaSchema>
+
+/**
+ * Holistic comparison report schema.
+ *
+ * @remarks
+ * Aggregates comparison output across all dimensions:
+ * - Quality: pass rates, scores, distributions
+ * - Performance: latency percentiles
+ * - Reliability: error rates, completion rates
+ * - Head-to-head: per-prompt winners, pairwise stats
+ *
+ * Note: Tool usage analysis is NOT included because adapter formats vary.
+ * Different adapters provide different `trajectoryRichness` levels and
+ * the `tool_call.name` field often contains tool use IDs rather than
+ * human-readable names.
+ */
+export const ComparisonReportSchema = z.object({
+  /** Report metadata */
+  meta: ComparisonMetaSchema,
+  /** Quality metrics by run label */
+  quality: z.record(z.string(), QualityMetricsSchema),
+  /** Performance metrics by run label */
+  performance: z.record(z.string(), PerformanceMetricsSchema),
+  /** Reliability metrics by run label */
+  reliability: z.record(z.string(), ReliabilityMetricsSchema),
+  /** Trajectory info by run label */
+  trajectoryInfo: z.record(z.string(), TrajectoryInfoSchema),
+  /** Head-to-head comparison details */
+  headToHead: HeadToHeadSchema,
+})
+
+/** Comparison report type */
+export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
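Because the report is expressed as zod schemas, downstream tooling can validate a report file before consuming it. A small sketch, assuming the root entry re-exports the schemas module and a hypothetical `comparison-report.json` path:

```ts
import { ComparisonReportSchema } from '@plaited/agent-eval-harness'

// Parse and validate a generated report; throws on shape mismatch.
const report = ComparisonReportSchema.parse(await Bun.file('comparison-report.json').json())

for (const [label, q] of Object.entries(report.quality)) {
  console.log(`${label}: avgScore=${q.avgScore.toFixed(2)}, passRate=${q.passRate}%`)
}
```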
package/src/schemas.ts CHANGED
@@ -35,10 +35,17 @@ export {
   CaptureResultSchema,
   type CategoryDistribution,
   CategoryDistributionSchema,
+  // Comparison report types
+  type ComparisonMeta,
+  ComparisonMetaSchema,
+  type ComparisonReport,
+  ComparisonReportSchema,
   EnvVariableSchema,
   type Grader,
   type GraderResult,
   GraderResultSchema,
+  type HeadToHead,
+  HeadToHeadSchema,
   HttpHeaderSchema,
   type IndexedStep,
   type JsonRpcError,
@@ -55,14 +62,28 @@ export {
   JsonRpcResponseSchema,
   type JsonRpcSuccessResponse,
   JsonRpcSuccessResponseSchema,
+  type LatencyStats,
+  LatencyStatsSchema,
   type McpServerConfig,
   McpServerHttpSchema,
   McpServerSchema,
   McpServerStdioSchema,
   MessageStepSchema,
+  type PairwiseComparison,
+  PairwiseComparisonSchema,
+  type PerformanceMetrics,
+  PerformanceMetricsSchema,
   PlanStepSchema,
   type PromptCase,
   PromptCaseSchema,
+  type PromptComparison,
+  PromptComparisonSchema,
+  type QualityMetrics,
+  QualityMetricsSchema,
+  type ReliabilityMetrics,
+  ReliabilityMetricsSchema,
+  type ScoreDistribution,
+  ScoreDistributionSchema,
   type Session,
   SessionSchema,
   type SummaryResult,
@@ -73,6 +94,8 @@ export {
   ToolCallStepSchema,
   type ToolInput,
   ToolInputSchema,
+  type TrajectoryInfo,
+  TrajectoryInfoSchema,
   type TrajectoryRichness,
   TrajectoryRichnessSchema,
   type TrajectoryStep,