npm - @alis-build/harness-eval - Versions diffs - 0.1.0 - Mend

@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/LICENSE +201 -0
package/README.md +700 -0
package/dist/adapters/claude-code/index.d.ts +3 -0
package/dist/adapters/claude-code/index.js +2 -0
package/dist/build-DsVJ_UeU.js +1396 -0
package/dist/build-DsVJ_UeU.js.map +1 -0
package/dist/cardinality-DlE44e-4.js +31 -0
package/dist/cardinality-DlE44e-4.js.map +1 -0
package/dist/claude-code-ycT0JQZF.js +563 -0
package/dist/claude-code-ycT0JQZF.js.map +1 -0
package/dist/cli/bin.d.ts +1 -0
package/dist/cli/bin.js +623 -0
package/dist/cli/bin.js.map +1 -0
package/dist/config/loader.d.ts +2 -0
package/dist/config/loader.js +2 -0
package/dist/index-6Z17eKZx.d.ts +72 -0
package/dist/index.d.ts +725 -0
package/dist/index.js +5 -0
package/dist/loader-BCnFJ8rm.js +717 -0
package/dist/loader-BCnFJ8rm.js.map +1 -0
package/dist/loader-DTvoVfN0.d.ts +33 -0
package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
package/dist/runner/suite.d.ts +2 -0
package/dist/runner/suite.js +2 -0
package/dist/suite-BoOvK_lq.d.ts +7 -0
package/dist/suite-chj0j22j.js +684 -0
package/dist/suite-chj0j22j.js.map +1 -0
package/dist/types-B9H4IZtA.d.ts +305 -0
package/dist/types-BQol062t.d.ts +292 -0
package/package.json +74 -0
package/schemas/eval-interchange-agent-trace.schema.json +322 -0
package/schemas/eval-interchange-proto-instance.schema.json +106 -0
package/schemas/eval-interchange.schema.json +140 -0
package/schemas/eval-run-envelope.schema.json +2195 -0
package/schemas/trajectory-view.schema.json +441 -0

package/dist/types-BQol062t.d.ts ADDED Viewed

@@ -0,0 +1,292 @@
+import { a as HarnessAdapter, d as ToolCall, f as TrajectoryView, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics } from "./types-B9H4IZtA.js";
+//#region src/types/assertions.d.ts
+/**
+ * A tool name pattern. Either a literal name, or a glob with `*` wildcards.
+ *
+ * The object form (`{ pattern: "..." }`) exists only for YAML disambiguation —
+ * in YAML, a bare string is the default. Internally they are equivalent.
+ *
+ * @example
+ *   "mcp__api__search_skills"      // literal match
+ *   "mcp__api__*"                  // any tool in mcp__api namespace
+ *   "mcp__*"                       // any MCP tool
+ *   "*"                            // any tool at all
+ */
+type ToolPattern = string | {
+  pattern: string;
+};
+/**
+ * Cardinality spec for `called` assertions.
+ *
+ * Format: `"<op> <n>"` with `op` ∈ {`==`, `!=`, `>=`, `<=`, `>`, `<`}.
+ * Default (when omitted): `">= 1"`.
+ *
+ * Parsed lazily by `src/assertions/cardinality.ts`.
+ */
+type Cardinality = string;
+/**
+ * Argument-matching predicate.
+ *
+ * The predicate language is recursive. Three flavours:
+ *   - Leaf:     `{ equals: "x" }`, `{ contains: "foo" }`, etc.
+ *   - Compound: `{ all_of: [...] }`, `{ any_of: [...] }`, `{ not: ... }`.
+ *   - Object:   `{ field1: <predicate>, field2: <predicate>, ... }` — descend
+ *               into object fields. Each field's value is itself a Predicate.
+ *
+ * Disambiguation: a single-key object whose key matches a known leaf or
+ * compound operator is treated as a leaf/compound predicate. Otherwise it
+ * is treated as an object predicate (field name = key).
+ *
+ * Known limitation: if your tool's arg schema has a field literally named
+ * `equals`, `contains`, etc., you must wrap it: `{ equals: { equals: "x" } }`.
+ * In practice this never happens for MCP tools.
+ */
+type Predicate = LeafPredicate | CompoundPredicate | ObjectPredicate;
+type LeafPredicate = {
+  equals: unknown;
+} | {
+  contains: string;
+} | {
+  not_contains: string;
+} | {
+  regex: string;
+} | {
+  gte: number;
+} | {
+  lte: number;
+} | {
+  gt: number;
+} | {
+  lt: number;
+} | {
+  one_of: unknown[];
+};
+type CompoundPredicate = {
+  any_of: Predicate[];
+} | {
+  all_of: Predicate[];
+} | {
+  not: Predicate;
+};
+/** Object-shaped predicate. Field values may be sub-predicates or scalar shortcuts. */
+type ObjectPredicate = {
+  [field: string]: Predicate | string | number | boolean | null;
+};
+/**
+ * The full assertion language. Each variant is evaluated by a corresponding
+ * function in the `src/assertions/*.ts` modules.
+ *
+ * Grouped by concern for readability:
+ *   1. Tool-call presence and ordering
+ *   2. Tool-call argument matching
+ *   3. Behavior (efficiency, finishing, blind-answering)
+ *   4. Response text
+ *   5. Compound (logical operators)
+ *   6. Escape hatch (arbitrary TypeScript predicate)
+ */
+type Assertion = {
+  type: "called";
+  tool: ToolPattern;
+  times?: Cardinality;
+} | {
+  type: "not_called";
+  tool: ToolPattern;
+} | {
+  type: "called_any_of";
+  tools: ToolPattern[];
+} | {
+  type: "called_all_of";
+  tools: ToolPattern[];
+} | {
+  type: "called_before";
+  first: ToolPattern;
+  then: ToolPattern;
+} | {
+  type: "sequence";
+  tools: ToolPattern[];
+  strict?: boolean;
+} | {
+  type: "called_with";
+  tool: ToolPattern;
+  args: Predicate;
+} | {
+  type: "responded_without_tool_calls";
+} | {
+  type: "iterations_within";
+  max: number;
+} | {
+  type: "cost_within_usd";
+  max: number;
+} | {
+  type: "duration_within_ms";
+  max: number;
+} | {
+  type: "finished_with";
+  reasons: string | string[];
+} | {
+  type: "response_contains";
+  text: string;
+} | {
+  type: "response_not_contains";
+  text: string;
+} | {
+  type: "response_matches";
+  pattern: string;
+  flags?: string;
+} | {
+  type: "all_of";
+  assertions: Assertion[];
+} | {
+  type: "any_of";
+  assertions: Assertion[];
+} | {
+  type: "not";
+  assertion: Assertion;
+} | {
+  type: "predicate";
+  fn: (view: TrajectoryView) => boolean;
+  description?: string;
+};
+/** An assertion plus the pass-rate threshold it must meet across repetitions. */
+interface ThresholdedAssertion {
+  assertion: Assertion;
+  /**
+   * Minimum pass rate across repetitions for this assertion to be considered
+   * passing. Range 0..1. Default 1.0 (strict — every rep must pass).
+   */
+  threshold?: number;
+}
+/**
+ * Result of evaluating a single assertion.
+ *
+ * `children` is populated for compound assertions (`all_of`/`any_of`/`not`)
+ * so the reporter can render a tree showing which leaf caused a failure.
+ * `matches` carries the tool calls that satisfied (or could have satisfied)
+ * the assertion — useful for diagnostic output.
+ */
+interface AssertionResult {
+  passed: boolean;
+  /** Short human-readable name, e.g. `"called(mcp__api__search_skills, >= 1)"`. */
+  description: string;
+  /** Diagnostic detail. Always populated; explains the pass/fail. */
+  details: string;
+  /** Tool calls that satisfied the assertion (omitted when irrelevant). */
+  matches?: ToolCall[];
+  /** Sub-results for compound assertions. */
+  children?: AssertionResult[];
+}
+//#endregion
+//#region src/runner/types.d.ts
+interface TestCase {
+  id: string;
+  prompt: string;
+  category?: string;
+  notes?: string;
+  assertions: ThresholdedAssertion[];
+  /** Natural-language outcome checks for LLM grading (see `harness-eval grade`). */
+  expectations?: string[];
+  /** Reference tool-call trajectory for metric computation. */
+  reference_trajectory?: Array<{
+    tool_name: string;
+    tool_input: unknown;
+  }>;
+  /** Human ratings keyed by metric name for judge calibration. */
+  human_ratings?: Record<string, number>;
+  repetitions?: number;
+  config?: SuiteConfig;
+}
+interface MatrixCell {
+  label: string;
+  config: SuiteConfig;
+  axes?: Record<string, string>;
+}
+interface TestSuite {
+  /** Harness adapter id. Default: `claude-code`. */
+  adapter?: string;
+  cases: TestCase[];
+  matrix: MatrixCell[];
+  defaultConfig?: SuiteConfig;
+}
+interface RunSuiteOptions {
+  /** Maximum concurrent harness processes across the entire suite. Default 4. */
+  maxConcurrent?: number;
+  /** Harness adapter to run. Defaults to registry default (`claude-code`). */
+  adapter?: HarnessAdapter;
+  onProgress?: ProgressCallback;
+  signal?: AbortSignal;
+}
+type ProgressCallback = (event: ProgressEvent) => void;
+type ProgressEvent = {
+  kind: "suite-start";
+  totalReps: number;
+} | {
+  kind: "rep-start";
+  caseId: string;
+  cellLabel: string;
+  repIndex: number;
+} | {
+  kind: "rep-complete";
+  caseId: string;
+  cellLabel: string;
+  repIndex: number;
+  ok: boolean;
+  durationMs: number;
+  toolCallCount?: number;
+  assertionResults?: AssertionResult[];
+  errorMessage?: string;
+} | {
+  kind: "cell-complete";
+  report: CellReport;
+} | {
+  kind: "suite-complete";
+  report: SuiteReport;
+};
+interface RepetitionResult {
+  repetitionIndex: number;
+  adapterResult: AdapterResult | null;
+  error: RepetitionError | null;
+  assertionResults: AssertionResult[];
+  durationMs: number;
+}
+interface RepetitionError {
+  message: string;
+  diagnostics: Partial<AdapterDiagnostics>;
+}
+interface CellReport {
+  caseId: string;
+  category?: string;
+  notes?: string;
+  /** Eval prompt (copied for grading without re-loading the suite). */
+  prompt?: string;
+  /** Outcome expectations for LLM grading. */
+  expectations?: string[];
+  /** Reference tool-call trajectory for metric computation. */
+  reference_trajectory?: Array<{
+    tool_name: string;
+    tool_input: unknown;
+  }>;
+  /** Human ratings keyed by metric name for judge calibration. */
+  human_ratings?: Record<string, number>;
+  cell: MatrixCell;
+  repetitions: RepetitionResult[];
+  assertionStats: AssertionStat[];
+  adapterErrors: number;
+  passed: boolean;
+}
+interface AssertionStat {
+  description: string;
+  threshold: number;
+  passedCount: number;
+  evaluatedCount: number;
+  passRate: number;
+  meetsThreshold: boolean;
+}
+interface SuiteReport {
+  startedAt: string;
+  durationMs: number;
+  cells: CellReport[];
+}
+//#endregion
+export { ObjectPredicate as _, ProgressEvent as a, ToolPattern as b, RunSuiteOptions as c, TestSuite as d, Assertion as f, LeafPredicate as g, CompoundPredicate as h, ProgressCallback as i, SuiteReport as l, Cardinality as m, CellReport as n, RepetitionError as o, AssertionResult as p, MatrixCell as r, RepetitionResult as s, AssertionStat as t, TestCase as u, Predicate as v, ThresholdedAssertion as y };
+//# sourceMappingURL=types-BQol062t.d.ts.map

package/package.json ADDED Viewed

@@ -0,0 +1,74 @@
+{
+    "name": "@alis-build/harness-eval",
+    "version": "0.1.0",
+    "description": "Harness-level eval framework for measuring AI coding agent tool-selection behavior",
+    "type": "module",
+    "main": "./dist/index.js",
+    "types": "./dist/index.d.ts",
+    "author": "www.alisx.com",
+    "license": "Apache-2.0",
+    "engines": {
+        "node": ">=22.12.0"
+    },
+    "repository": {
+        "type": "git",
+        "url": "git+https://github.com/alis-build/harness-eval-ts.git"
+    },
+    "homepage": "https://github.com/alis-build/harness-eval-ts#readme",
+    "bugs": {
+        "url": "https://github.com/alis-build/harness-eval-ts/issues"
+    },
+    "exports": {
+        ".": {
+            "types": "./dist/index.d.ts",
+            "import": "./dist/index.js"
+        },
+        "./adapters/claude-code": {
+            "types": "./dist/adapters/claude-code/index.d.ts",
+            "import": "./dist/adapters/claude-code/index.js"
+        },
+        "./runner": {
+            "types": "./dist/runner/suite.d.ts",
+            "import": "./dist/runner/suite.js"
+        },
+        "./config": {
+            "types": "./dist/config/loader.d.ts",
+            "import": "./dist/config/loader.js"
+        }
+    },
+    "files": [
+        "dist",
+        "schemas",
+        "README.md",
+        "LICENSE"
+    ],
+    "scripts": {
+        "generate-schemas": "tsx src/schemas/generate.ts",
+        "build": "pnpm run generate-schemas && tsdown",
+        "prepack": "pnpm run build",
+        "prepublishOnly": "pnpm run build",
+        "watch": "tsdown --watch",
+        "clean": "rm -rf dist",
+        "test": "vitest run",
+        "test:watch": "vitest",
+        "typecheck": "tsc --noEmit"
+    },
+    "bin": {
+        "harness-eval": "./dist/cli/bin.js"
+    },
+    "dependencies": {
+        "yaml": "^2.6.0",
+        "zod": "^4.4.3"
+    },
+    "devDependencies": {
+        "@types/node": "^22.12.0",
+        "tsdown": "^0.22.3",
+        "tsx": "^4.22.4",
+        "typescript": "^5.6.0",
+        "vitest": "^2.1.0"
+    },
+    "publishConfig": {
+        "access": "public"
+    },
+    "packageManager": "pnpm@11.3.0"
+}

package/schemas/eval-interchange-agent-trace.schema.json ADDED Viewed

@@ -0,0 +1,322 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#AgentTrace",
+  "title": "AgentTrace",
+  "description": "Full multi-turn agent execution trace.",
+  "type": "object",
+  "properties": {
+    "agents": {
+      "$ref": "#/$defs/__schema0"
+    },
+    "turns": {
+      "$ref": "#/$defs/__schema13"
+    }
+  },
+  "required": [
+    "agents",
+    "turns"
+  ],
+  "additionalProperties": false,
+  "$defs": {
+    "__schema0": {
+      "type": "object",
+      "propertyNames": {
+        "$ref": "#/$defs/__schema1"
+      },
+      "additionalProperties": {
+        "$ref": "#/$defs/AgentConfig"
+      },
+      "description": "Agent configurations keyed by agent id."
+    },
+    "__schema1": {
+      "type": "string"
+    },
+    "AgentConfig": {
+      "type": "object",
+      "properties": {
+        "agent_id": {
+          "$ref": "#/$defs/__schema2"
+        },
+        "agent_type": {
+          "$ref": "#/$defs/__schema3"
+        },
+        "description": {
+          "$ref": "#/$defs/__schema5"
+        },
+        "instruction": {
+          "$ref": "#/$defs/__schema7"
+        },
+        "tools": {
+          "$ref": "#/$defs/__schema9"
+        },
+        "sub_agents": {
+          "$ref": "#/$defs/__schema11"
+        }
+      },
+      "required": [
+        "agent_id"
+      ],
+      "additionalProperties": false,
+      "title": "AgentConfig",
+      "description": "Static configuration for one agent in a trace."
+    },
+    "__schema2": {
+      "type": "string",
+      "description": "Stable agent identifier."
+    },
+    "__schema3": {
+      "description": "Agent type or role.",
+      "$ref": "#/$defs/__schema4"
+    },
+    "__schema4": {
+      "type": "string"
+    },
+    "__schema5": {
+      "description": "Human-readable agent description.",
+      "$ref": "#/$defs/__schema6"
+    },
+    "__schema6": {
+      "type": "string"
+    },
+    "__schema7": {
+      "description": "System instruction for the agent.",
+      "$ref": "#/$defs/__schema8"
+    },
+    "__schema8": {
+      "type": "string"
+    },
+    "__schema9": {
+      "description": "Tools available to this agent.",
+      "$ref": "#/$defs/__schema10"
+    },
+    "__schema10": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Tool name."
+          }
+        },
+        "required": [
+          "name"
+        ],
+        "additionalProperties": false
+      }
+    },
+    "__schema11": {
+      "description": "Sub-agent identifiers when using multi-agent setups.",
+      "$ref": "#/$defs/__schema12"
+    },
+    "__schema12": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "__schema13": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/ConversationTurn"
+      },
+      "description": "Chronological conversation turns."
+    },
+    "ConversationTurn": {
+      "type": "object",
+      "properties": {
+        "turn_index": {
+          "$ref": "#/$defs/__schema14"
+        },
+        "turn_id": {
+          "$ref": "#/$defs/__schema15"
+        },
+        "events": {
+          "$ref": "#/$defs/__schema17"
+        }
+      },
+      "required": [
+        "turn_index",
+        "events"
+      ],
+      "additionalProperties": false,
+      "title": "ConversationTurn",
+      "description": "One turn in a multi-turn agent conversation."
+    },
+    "__schema14": {
+      "type": "integer",
+      "minimum": -9007199254740991,
+      "maximum": 9007199254740991,
+      "description": "Zero-based turn index."
+    },
+    "__schema15": {
+      "description": "Optional stable turn identifier.",
+      "$ref": "#/$defs/__schema16"
+    },
+    "__schema16": {
+      "type": "string"
+    },
+    "__schema17": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/AgentEvent"
+      },
+      "description": "Events in chronological order."
+    },
+    "AgentEvent": {
+      "type": "object",
+      "properties": {
+        "author": {
+          "$ref": "#/$defs/__schema18"
+        },
+        "content": {
+          "$ref": "#/$defs/__schema19"
+        },
+        "event_time": {
+          "$ref": "#/$defs/__schema27"
+        },
+        "state_delta": {
+          "$ref": "#/$defs/__schema29"
+        },
+        "active_tools": {
+          "$ref": "#/$defs/__schema31"
+        }
+      },
+      "required": [
+        "author",
+        "content"
+      ],
+      "additionalProperties": false,
+      "title": "AgentEvent",
+      "description": "One event in a multi-turn agent trace."
+    },
+    "__schema18": {
+      "type": "string",
+      "description": "Agent id or user identifier for this event."
+    },
+    "__schema19": {
+      "type": "object",
+      "properties": {
+        "parts": {
+          "$ref": "#/$defs/__schema20"
+        }
+      },
+      "required": [
+        "parts"
+      ],
+      "additionalProperties": false,
+      "description": "Structured event content."
+    },
+    "__schema20": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/ContentPart"
+      },
+      "description": "Content parts for this event."
+    },
+    "ContentPart": {
+      "type": "object",
+      "properties": {
+        "text": {
+          "$ref": "#/$defs/__schema21"
+        },
+        "function_call": {
+          "$ref": "#/$defs/__schema23"
+        },
+        "function_response": {
+          "$ref": "#/$defs/__schema25"
+        }
+      },
+      "additionalProperties": false,
+      "title": "ContentPart",
+      "description": "One part of agent event content (text, function_call, or function_response)."
+    },
+    "__schema21": {
+      "description": "Plain text content.",
+      "$ref": "#/$defs/__schema22"
+    },
+    "__schema22": {
+      "type": "string"
+    },
+    "__schema23": {
+      "description": "Function call emitted by the agent.",
+      "$ref": "#/$defs/__schema24"
+    },
+    "__schema24": {
+      "type": "object",
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Function or tool name."
+        },
+        "args": {
+          "description": "Function arguments."
+        }
+      },
+      "required": [
+        "name",
+        "args"
+      ],
+      "additionalProperties": false
+    },
+    "__schema25": {
+      "description": "Function response from tool execution.",
+      "$ref": "#/$defs/__schema26"
+    },
+    "__schema26": {
+      "type": "object",
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Function or tool name."
+        },
+        "response": {
+          "description": "Function result payload."
+        }
+      },
+      "required": [
+        "name",
+        "response"
+      ],
+      "additionalProperties": false
+    },
+    "__schema27": {
+      "description": "ISO 8601 timestamp when the event occurred.",
+      "$ref": "#/$defs/__schema28"
+    },
+    "__schema28": {
+      "type": "string"
+    },
+    "__schema29": {
+      "description": "Session state changes associated with this event.",
+      "$ref": "#/$defs/__schema30"
+    },
+    "__schema30": {
+      "type": "object",
+      "propertyNames": {
+        "type": "string"
+      },
+      "additionalProperties": {}
+    },
+    "__schema31": {
+      "description": "Tools available to the agent at event time.",
+      "$ref": "#/$defs/__schema32"
+    },
+    "__schema32": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Tool name."
+          }
+        },
+        "required": [
+          "name"
+        ],
+        "additionalProperties": false
+      }
+    }
+  }
+}