@plaited/agent-eval-harness 0.5.2 → 0.6.0
This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- package/README.md +4 -5
- package/bin/cli.ts +0 -2
- package/package.json +1 -1
- package/src/commands/balance.ts +0 -2
- package/src/commands/calibrate.ts +1 -2
- package/src/commands/capture.ts +1 -1
- package/src/commands/summarize.ts +1 -3
- package/src/commands/trials.ts +1 -1
- package/src/commands/validate-refs.ts +1 -2
- package/src/core/core.ts +1 -1
- package/src/core/loading.ts +77 -0
- package/src/core/output.ts +0 -1
- package/src/core.ts +4 -1
- package/src/graders/compare-statistical.ts +187 -0
- package/src/graders/compare-weighted.ts +112 -0
- package/src/graders/tests/compare-graders.spec.ts +293 -0
- package/src/graders.ts +19 -0
- package/src/headless/headless-cli.ts +0 -2
- package/src/headless/headless-session-manager.ts +4 -1
- package/src/pipeline/compare.ts +512 -70
- package/src/pipeline/extract.ts +1 -1
- package/src/pipeline/format.ts +0 -1
- package/src/pipeline/grade.ts +1 -1
- package/src/pipeline/pipeline.ts +2 -1
- package/src/pipeline/pipeline.types.ts +29 -1
- package/src/pipeline/run.ts +5 -3
- package/src/schemas/grader-loader.ts +9 -1
- package/src/schemas/schemas-cli.ts +0 -7
- package/src/schemas/schemas.ts +211 -0
- package/src/schemas.ts +23 -0
package/README.md
CHANGED

@@ -48,7 +48,7 @@ Pre-built schemas are available in `.claude/skills/headless-adapters/schemas/` f
 | `extract <raw> --schema <path>` | Parse raw output into trajectories |
 | `grade <results> --grader <path>` | Apply grader to extracted results |
 | `format <results> --style <style>` | Convert to markdown, csv, or jsonl |
-| `compare <run1> <run2
+| `compare <run1> <run2>...` | Compare runs (aggregate report) |

 ### Examples

@@ -76,9 +76,8 @@ cat prompts.jsonl | \
 bunx @plaited/agent-eval-harness grade -g ./grader.ts | \
 bunx @plaited/agent-eval-harness format -f markdown > report.md

-# Compare
-bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl
-  --grader ./compare-grader.ts -o comparison.jsonl
+# Compare runs (built-in strategies: weighted, statistical, custom)
+bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
 ```

 ## Skills for AI Agents

@@ -117,7 +116,7 @@ CLI tool for capturing agent trajectories, optimized for TypeScript/JavaScript p
 | `extract` | Parse raw output into trajectories |
 | `grade` | Apply grader to extracted results |
 | `format` | Convert to markdown, csv, or jsonl |
-| `compare` | Compare
+| `compare` | Compare runs (aggregate report) |

 **Use cases:**
 - Capturing trajectories for downstream evaluation (Braintrust, custom scorers)
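The old README example above references a user-supplied `./compare-grader.ts` for the custom strategy. For orientation, here is a minimal sketch of what such a module might look like; the per-run fields it reads (`score.score`, `duration`) and the `{ rankings, reasoning }` return shape are inferred from the built-in graders added in this release (see `src/graders/` below), not a confirmed public contract.

```ts
// compare-grader.ts — hypothetical custom comparison strategy (sketch only).
// The per-run fields read here (score.score, duration) are assumptions
// inferred from the built-in graders in this release.
type RunSummary = { score?: { score?: number }; duration?: number }

export const grade = async ({ runs }: { runs: Record<string, RunSummary> }) => {
  // Rank by grader score, breaking ties by latency (faster wins).
  const ranked = Object.entries(runs)
    .map(([label, run]) => ({
      label,
      score: run.score?.score ?? 0,
      duration: run.duration ?? Number.POSITIVE_INFINITY,
    }))
    .sort((a, b) => b.score - a.score || a.duration - b.duration)

  return {
    rankings: ranked.map((r, i) => ({ run: r.label, rank: i + 1, score: r.score })),
    reasoning: 'Ranked by grader score; ties broken by latency',
  }
}
```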
package/bin/cli.ts
CHANGED

@@ -30,7 +30,6 @@ import { schemasCli } from '../src/schemas/schemas-cli.ts'
 const [command, ...args] = Bun.argv.slice(2)

 const printHelp = () => {
-  // biome-ignore lint/suspicious/noConsole: CLI help output
   console.log(`
 agent-eval-harness - CLI tool for agent evaluation

@@ -144,7 +143,6 @@ const main = async () => {
     case '-v':
     case '--version': {
       const { version } = await import('../package.json')
-      // biome-ignore lint/suspicious/noConsole: CLI version output
       console.log(version)
       break
     }
package/package.json
CHANGED
package/src/commands/balance.ts
CHANGED

@@ -171,7 +171,6 @@ export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -216,7 +215,6 @@ export const balance = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness balance <prompts.jsonl> [options]

package/src/commands/calibrate.ts
CHANGED

@@ -218,6 +218,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
       output: result.output,
       hint: result.hint,
       trajectory: result.trajectory,
+      metadata: result.metadata,
     })
   }

@@ -231,7 +232,6 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), markdown)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(markdown)
   }

@@ -260,7 +260,6 @@ export const calibrate = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness calibrate <results.jsonl> [options]

package/src/commands/capture.ts
CHANGED

@@ -230,6 +230,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       output,
       hint: promptCase.hint,
       trajectory,
+      metadata: promptCase.metadata,
     })
   }

@@ -309,7 +310,6 @@ export const capture = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]

package/src/commands/summarize.ts
CHANGED

@@ -42,7 +42,7 @@ export const formatSummary = (result: CaptureResult): SummaryResult => {
     id: result.id,
     input: inputText,
     output: result.output,
-    toolCalls: result.trajectory.
+    toolCalls: result.trajectory.flatMap((s) => (s.type === 'tool_call' ? [s.name] : [])),
     duration: result.timing.end - result.timing.start,
   }
 }

@@ -160,7 +160,6 @@ export const runSummarize = async (config: SummarizeConfig): Promise<string> =>
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -188,7 +187,6 @@ export const summarize = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness summarize <results.jsonl> [options]

package/src/commands/trials.ts
CHANGED

@@ -216,6 +216,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
       output,
       hint: promptCase.hint,
       trajectory,
+      metadata: promptCase.metadata,
     })
     entry.pass = graderResult.pass
     entry.score = graderResult.score

@@ -310,7 +311,6 @@ export const trials = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]

package/src/commands/validate-refs.ts
CHANGED

@@ -71,6 +71,7 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
       output: prompt.reference as string,
       hint: prompt.hint,
       trajectory: [], // No trajectory for reference validation
+      metadata: prompt.metadata,
     })

     results.push({

@@ -91,7 +92,6 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
   if (outputPath) {
     await Bun.write(resolvePath(outputPath), output)
   } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
     console.log(output)
   }

@@ -131,7 +131,6 @@ export const validateRefs = async (args: string[]): Promise<void> => {
   })

   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
 Usage: agent-eval-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]

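Across calibrate, capture, trials, and validate-refs the change is the same: the prompt case's `metadata` is now threaded into the object handed to the grader. A minimal sketch of a per-result grader that could take advantage of it, assuming the grader receives `{ output, hint, trajectory, metadata }` and returns `{ pass, score }` (shapes inferred from these hunks, not a confirmed signature):

```ts
// grader.ts — hypothetical per-result grader (sketch only).
// Input and result shapes are inferred from the fields these commands pass
// (output, hint, trajectory, metadata) and read back (pass, score).
type GraderInput = {
  output: string
  hint?: string
  trajectory: unknown[]
  metadata?: Record<string, unknown>
}

export const grade = async ({ output, hint, metadata }: GraderInput) => {
  // Example: let per-prompt metadata tighten the pass criterion.
  const mustInclude = metadata?.mustInclude
  const required = typeof mustInclude === 'string' ? mustInclude : hint
  const pass = required ? output.includes(required) : output.length > 0
  return { pass, score: pass ? 1 : 0 }
}
```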
package/src/core/core.ts
CHANGED

@@ -11,7 +11,7 @@
  */

 // Loading utilities
-export { loadJsonl, loadPrompts, loadResults } from './loading.ts'
+export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
 // Output utilities
 export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
 // Trajectory utilities
package/src/core/loading.ts
CHANGED

@@ -94,3 +94,80 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
     }
   })
 }
+
+// ============================================================================
+// Streaming Loading
+// ============================================================================
+
+/**
+ * Stream capture results from a JSONL file.
+ *
+ * @remarks
+ * Memory-efficient alternative to loadResults for large files.
+ * Yields results one at a time using an async generator.
+ *
+ * @param path - Path to the results.jsonl file
+ * @yields Parsed and validated capture results
+ * @throws Error if file cannot be read or any line is invalid
+ *
+ * @public
+ */
+export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
+  const file = Bun.file(path)
+  const text = await file.text()
+  const lines = text.split('\n')
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i]?.trim()
+    if (!line) continue
+
+    try {
+      yield CaptureResultSchema.parse(JSON.parse(line))
+    } catch (error) {
+      throw new Error(`Invalid result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+}
+
+/**
+ * Build an indexed map of results by ID using streaming.
+ *
+ * @remarks
+ * Memory-efficient for the compare command. Loads results into a Map
+ * keyed by ID for O(1) lookups without holding raw file content.
+ *
+ * For very large files (10k+ results), this is more memory-efficient than
+ * loading everything into an array and then building an index.
+ *
+ * @param path - Path to the results.jsonl file
+ * @returns Map of result ID to CaptureResult
+ *
+ * @public
+ */
+export const buildResultsIndex = async (path: string): Promise<Map<string, CaptureResult>> => {
+  const index = new Map<string, CaptureResult>()
+
+  for await (const result of streamResults(path)) {
+    index.set(result.id, result)
+  }
+
+  return index
+}
+
+/**
+ * Count lines in a JSONL file without loading content.
+ *
+ * @remarks
+ * Useful for detecting large files that should use streaming mode.
+ * Uses byte-level scanning for efficiency.
+ *
+ * @param path - Path to the JSONL file
+ * @returns Number of non-empty lines
+ *
+ * @public
+ */
+export const countLines = async (path: string): Promise<number> => {
+  const file = Bun.file(path)
+  const text = await file.text()
+  return text.split('\n').filter((line) => line.trim()).length
+}
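A short usage sketch for the new loading helpers; the relative import path is illustrative (the re-exports added to `src/core.ts` below suggest they are reachable from the core entry), and the 10k threshold mirrors the remark in `buildResultsIndex` rather than any documented cutoff.

```ts
// Sketch: streaming a large results file instead of loading it whole.
// The import specifier is illustrative; the helpers are those added above.
import { buildResultsIndex, countLines, streamResults } from './src/core.ts'

const path = 'results.jsonl'

// Decide whether to stream based on line count.
const total = await countLines(path)

if (total > 10_000) {
  for await (const result of streamResults(path)) {
    console.log(result.id) // handle one CaptureResult at a time
  }
} else {
  // Build an id -> result index for O(1) lookups (as the compare command does).
  const index = await buildResultsIndex(path)
  console.log(index.size)
}
```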
package/src/core/output.ts
CHANGED
package/src/core.ts
CHANGED

@@ -8,6 +8,9 @@
  */

 export {
+  // Loading
+  buildResultsIndex,
+  countLines,
   // Trajectory
   detectTrajectoryRichness,
   extractContent,

@@ -18,11 +21,11 @@ export {
   getInputPreview,
   hasToolErrors,
   headTailPreview,
-  // Loading
   loadJsonl,
   loadPrompts,
   loadResults,
   logProgress,
   resolvePath,
+  streamResults,
   writeOutput,
 } from './core/core.ts'
package/src/graders/compare-statistical.ts
ADDED

@@ -0,0 +1,187 @@
+/**
+ * Built-in statistical significance comparison grader.
+ *
+ * @remarks
+ * Uses bootstrap sampling to compute confidence intervals for score estimates.
+ * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs).
+ *
+ * Bootstrap iterations can be customized via environment variable:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
+
+/** Default number of bootstrap iterations */
+const DEFAULT_ITERATIONS = 1000
+
+/**
+ * Bootstrap confidence interval result.
+ */
+type BootstrapResult = {
+  /** Estimated mean from bootstrap */
+  mean: number
+  /** 95% confidence interval [lower, upper] */
+  ci95: [number, number]
+}
+
+/**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param iterations - Number of bootstrap iterations
+ * @returns Bootstrap mean and 95% confidence interval
+ */
+const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
+  if (samples.length === 0) {
+    return { mean: 0, ci95: [0, 0] }
+  }
+
+  if (samples.length === 1) {
+    const value = samples[0] ?? 0
+    return { mean: value, ci95: [value, value] }
+  }
+
+  const means: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement - we know samples.length > 1 at this point
+    const resampled = Array.from(
+      { length: samples.length },
+      () => samples[Math.floor(Math.random() * samples.length)] as number,
+    )
+
+    // Compute mean of resampled data
+    const sum = resampled.reduce((acc, val) => acc + val, 0)
+    means.push(sum / resampled.length)
+  }
+
+  // Sort means for percentile calculation
+  means.sort((a, b) => a - b)
+
+  // 95% CI: 2.5th and 97.5th percentiles
+  const lowerIdx = Math.floor(iterations * 0.025)
+  const upperIdx = Math.floor(iterations * 0.975)
+
+  return {
+    mean: means[Math.floor(iterations / 2)] ?? 0,
+    ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Get bootstrap iterations from environment variable.
+ *
+ * @returns Number of bootstrap iterations
+ */
+const getIterationsFromEnv = (): number => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return DEFAULT_ITERATIONS
+
+  const parsed = Number.parseInt(envValue, 10)
+  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+}
+
+/**
+ * Statistical significance comparison grader.
+ *
+ * @remarks
+ * Compares runs using bootstrap sampling to determine if differences
+ * are statistically significant. When confidence intervals don't overlap,
+ * the difference is flagged as significant (p<0.05).
+ *
+ * **Single-sample limitation:** When comparing individual prompts, each run
+ * provides only one score sample. Bootstrap with a single sample yields a
+ * degenerate CI of `[value, value]`. This grader is most useful when:
+ * - Aggregating results across multiple prompts
+ * - Using with the full comparison report (which combines per-prompt comparisons)
+ *
+ * For single-prompt comparisons, consider the weighted grader instead.
+ *
+ * @public
+ */
+export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const iterations = getIterationsFromEnv()
+
+  // Collect scores for each run
+  const runStats = Object.entries(runs).map(([label, run]) => {
+    // Use grader score if available, otherwise 0
+    const score = run.score?.score ?? 0
+
+    // For single-prompt comparison, we only have one sample
+    // In practice, this grader is most useful when aggregating across prompts
+    const stats = bootstrap([score], iterations)
+
+    return { label, score, stats }
+  })
+
+  // Sort by bootstrap mean descending
+  const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+
+  // Check if winner is statistically significant
+  // CIs don't overlap = significant difference (approximately p<0.05)
+  let isSignificant = false
+  const first = sorted[0]
+  const second = sorted[1]
+  if (first && second) {
+    // Non-overlapping: first's lower bound > second's upper bound
+    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+  }
+
+  const reasoning = isSignificant
+    ? `Winner "${first?.label}" is statistically significant (p<0.05, non-overlapping 95% CIs)`
+    : 'No statistically significant difference between top runs (overlapping 95% CIs)'
+
+  return {
+    rankings: sorted.map((s, i) => ({
+      run: s.label,
+      rank: i + 1,
+      score: s.stats.mean,
+    })),
+    reasoning,
+  }
+}
+
+/**
+ * Create a statistical grader with custom iteration count.
+ *
+ * @param iterations - Number of bootstrap iterations
+ * @returns Comparison grader function
+ *
+ * @public
+ */
+export const createStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): ComparisonGrader => {
+  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const runStats = Object.entries(runs).map(([label, run]) => {
+      const score = run.score?.score ?? 0
+      const stats = bootstrap([score], iterations)
+      return { label, score, stats }
+    })
+
+    const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+
+    let isSignificant = false
+    const first = sorted[0]
+    const second = sorted[1]
+    if (first && second) {
+      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+    }
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.stats.mean,
+      })),
+      reasoning: isSignificant
+        ? `Winner "${first?.label}" is statistically significant (p<0.05)`
+        : 'No statistically significant difference between top runs',
+    }
+  }
+}
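A usage sketch for the statistical grader: the `runs` payload is hypothetical, since per the code above only `score.score` is consumed and the full `ComparisonGraderInput` type is not shown in this diff (hence the cast). Note that a single score per run yields a degenerate `[v, v]` confidence interval, as the doc comment warns.

```ts
// Sketch: invoking the statistical grader with a custom iteration count.
// The runs payload is hypothetical; only score.score is read by this grader.
import { grade } from './compare-statistical.ts'

process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'

const { rankings, reasoning } = await grade({
  runs: {
    baseline: { score: { score: 0.62 } },
    candidate: { score: { score: 0.81 } },
  },
} as Parameters<typeof grade>[0]) // cast: full ComparisonGraderInput type not shown in this diff

console.log(rankings)  // candidate ranks 1st, baseline 2nd (sorted by bootstrap mean)
console.log(reasoning) // reports whether the 95% CIs overlap
```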
package/src/graders/compare-weighted.ts
ADDED

@@ -0,0 +1,112 @@
+/**
+ * Built-in weighted multi-dimensional comparison grader.
+ *
+ * @remarks
+ * Configurable weights for quality, latency, and reliability.
+ * Default strategy when no `--grader` is specified for the compare command.
+ *
+ * Weights can be customized via environment variables:
+ * - `COMPARE_QUALITY` (default: 0.5)
+ * - `COMPARE_LATENCY` (default: 0.3)
+ * - `COMPARE_RELIABILITY` (default: 0.2)
+ *
+ * @packageDocumentation
+ */
+
+import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
+
+/**
+ * Weight configuration for comparison dimensions.
+ */
+export type Weights = {
+  /** Weight for quality (pass/score) - how much correctness matters */
+  quality: number
+  /** Weight for latency - how much speed matters */
+  latency: number
+  /** Weight for reliability - how much error-free execution matters */
+  reliability: number
+}
+
+/** Default weights: quality=0.5, latency=0.3, reliability=0.2 */
+export const DEFAULT_WEIGHTS: Weights = {
+  quality: 0.5,
+  latency: 0.3,
+  reliability: 0.2,
+}
+
+/**
+ * Read weights from environment variables with fallback to defaults.
+ *
+ * @returns Weights configuration
+ */
+export const getWeightsFromEnv = (): Weights => {
+  const quality = Number.parseFloat(process.env.COMPARE_QUALITY ?? String(DEFAULT_WEIGHTS.quality))
+  const latency = Number.parseFloat(process.env.COMPARE_LATENCY ?? String(DEFAULT_WEIGHTS.latency))
+  const reliability = Number.parseFloat(process.env.COMPARE_RELIABILITY ?? String(DEFAULT_WEIGHTS.reliability))
+
+  return {
+    quality: Number.isNaN(quality) ? DEFAULT_WEIGHTS.quality : quality,
+    latency: Number.isNaN(latency) ? DEFAULT_WEIGHTS.latency : latency,
+    reliability: Number.isNaN(reliability) ? DEFAULT_WEIGHTS.reliability : reliability,
+  }
+}
+
+/**
+ * Create a weighted comparison grader with custom weights.
+ *
+ * @param weights - Weight configuration for comparison dimensions
+ * @returns Comparison grader function
+ *
+ * @public
+ */
+export const createWeightedGrader = (weights: Weights = DEFAULT_WEIGHTS): ComparisonGrader => {
+  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const scores = Object.entries(runs).map(([label, run]) => {
+      // Quality score: use grader score if available, otherwise 0
+      // Note: run.score is only present if the result was graded
+      const qualityScore = run.score?.score ?? 0
+
+      // Latency score: inverse relationship (faster = better)
+      // Normalize: 1 / (1 + duration/1000) gives ~0.5 at 1s, ~0.1 at 10s
+      const duration = run.duration ?? 10000
+      const latencyScore = 1 / (1 + duration / 1000)
+
+      // Reliability score: 1 if no errors, 0 if errors
+      const hasErrors = run.toolErrors ?? false
+      const reliabilityScore = hasErrors ? 0 : 1
+
+      // Weighted combination
+      const weighted =
+        qualityScore * weights.quality + latencyScore * weights.latency + reliabilityScore * weights.reliability
+
+      return { label, weighted, qualityScore, latencyScore, reliabilityScore }
+    })
+
+    // Sort by weighted score descending (highest = best)
+    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.weighted,
+      })),
+      reasoning: `Weighted: quality=${weights.quality}, latency=${weights.latency}, reliability=${weights.reliability}`,
+    }
+  }
+}
+
+/**
+ * Default weighted comparison grader using environment or default weights.
+ *
+ * @remarks
+ * This is the default grader used when `--strategy weighted` is specified
+ * or when no strategy is specified for the compare command.
+ *
+ * @public
+ */
+export const grade: ComparisonGrader = async (input: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const weights = getWeightsFromEnv()
+  const grader = createWeightedGrader(weights)
+  return grader(input)
+}
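For reference, a worked sketch of the weighted formula above with the default weights; the `runs` payload is hypothetical and carries only the fields this grader reads (`score.score`, `duration`, `toolErrors`), and the cast covers the full input type not shown in this diff.

```ts
// Sketch: default weights (0.5 / 0.3 / 0.2) applied to two hypothetical runs.
import { createWeightedGrader, DEFAULT_WEIGHTS } from './compare-weighted.ts'

const grader = createWeightedGrader(DEFAULT_WEIGHTS)

// runA: quality 0.8, 2s latency, no tool errors
//   0.8*0.5 + (1 / (1 + 2))*0.3 + 1*0.2 = 0.4 + 0.1 + 0.2 = 0.70
// runB: quality 1.0, 9s latency, tool errors present
//   1.0*0.5 + (1 / (1 + 9))*0.3 + 0*0.2 = 0.5 + 0.03 + 0 = 0.53
const { rankings, reasoning } = await grader({
  runs: {
    runA: { score: { score: 0.8 }, duration: 2000, toolErrors: false },
    runB: { score: { score: 1.0 }, duration: 9000, toolErrors: true },
  },
} as Parameters<typeof grader>[0]) // cast: full ComparisonGraderInput type not shown in this diff

console.log(rankings)  // runA ranks first (~0.70), runB second (~0.53)
console.log(reasoning) // "Weighted: quality=0.5, latency=0.3, reliability=0.2"
```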