npm - open-multi-agent-kit - Versions diffs - 0.78.1 → 0.78.3 - Mend

open-multi-agent-kit 0.78.1 → 0.78.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

package/CHANGELOG.md +37 -0
package/MATURITY.md +4 -0
package/README.md +70 -1
package/dist/benchmark/contracts.d.ts +116 -0
package/dist/benchmark/contracts.js +6 -0
package/dist/benchmark/fixtures.d.ts +11 -0
package/dist/benchmark/fixtures.js +121 -0
package/dist/benchmark/harness.d.ts +13 -0
package/dist/benchmark/harness.js +191 -0
package/dist/benchmark/shadow-mode.d.ts +17 -0
package/dist/benchmark/shadow-mode.js +96 -0
package/dist/cli/register-spec-agent-goal-commands.js +45 -0
package/dist/cli/release-promotion-gate.d.ts +14 -0
package/dist/cli/release-promotion-gate.js +71 -0
package/dist/cli/v2/release-commands.d.ts +29 -0
package/dist/cli/v2/release-commands.js +95 -0
package/dist/commands/chat/native-root-loop.js +14 -1
package/dist/commands/chat/slash/commands/session.js +19 -1
package/dist/commands/goal-interview.d.ts +18 -0
package/dist/commands/goal-interview.js +396 -0
package/dist/commands/merge.js +102 -56
package/dist/contracts/interview.d.ts +106 -0
package/dist/contracts/interview.js +9 -0
package/dist/contracts/provider-health.d.ts +37 -0
package/dist/contracts/provider-health.js +49 -1
package/dist/evidence/evidence-trust-score.d.ts +101 -0
package/dist/evidence/evidence-trust-score.js +408 -0
package/dist/evidence/index.d.ts +6 -0
package/dist/evidence/index.js +3 -0
package/dist/evidence/proof-trust-cli.d.ts +8 -0
package/dist/evidence/proof-trust-cli.js +27 -0
package/dist/evidence/proof-trust.d.ts +14 -0
package/dist/evidence/proof-trust.js +381 -0
package/dist/evidence/regression-proof-matrix.d.ts +42 -0
package/dist/evidence/regression-proof-matrix.js +72 -0
package/dist/goal/intent-frame.d.ts +6 -0
package/dist/goal/intent-frame.js +21 -9
package/dist/goal/interview-assimilation.d.ts +13 -0
package/dist/goal/interview-assimilation.js +383 -0
package/dist/goal/interview-question-bank.d.ts +11 -0
package/dist/goal/interview-question-bank.js +225 -0
package/dist/goal/interview-scoring.d.ts +31 -0
package/dist/goal/interview-scoring.js +187 -0
package/dist/goal/interview-session.d.ts +25 -0
package/dist/goal/interview-session.js +116 -0
package/dist/input/input-envelope.d.ts +22 -0
package/dist/input/input-envelope.js +1 -0
package/dist/orchestration/merge-arbiter.d.ts +91 -0
package/dist/orchestration/merge-arbiter.js +376 -0
package/dist/providers/health.d.ts +3 -0
package/dist/providers/health.js +46 -0
package/dist/providers/index.d.ts +1 -0
package/dist/providers/index.js +1 -0
package/dist/providers/provider-health.d.ts +8 -1
package/dist/providers/provider-health.js +39 -0
package/dist/providers/provider-task-runner.js +31 -0
package/dist/providers/provider.d.ts +2 -0
package/dist/providers/router.js +87 -3
package/dist/providers/types.d.ts +4 -0
package/dist/runtime/advanced-control-loop.d.ts +60 -0
package/dist/runtime/advanced-control-loop.js +136 -0
package/dist/runtime/agent-runtime.d.ts +10 -0
package/dist/runtime/blast-radius.d.ts +10 -0
package/dist/runtime/blast-radius.js +14 -0
package/dist/runtime/contracts/evidence.d.ts +87 -0
package/dist/runtime/contracts/evidence.js +7 -0
package/dist/runtime/contracts/router-v2.d.ts +44 -0
package/dist/runtime/contracts/router-v2.js +4 -0
package/dist/runtime/contracts/weakness-remediation.d.ts +67 -0
package/dist/runtime/contracts/weakness-remediation.js +36 -0
package/dist/runtime/kimi-api-runtime.js +59 -1
package/dist/runtime/proof-bundle-trust.d.ts +74 -0
package/dist/runtime/proof-bundle-trust.js +100 -0
package/dist/runtime/provider-maturity-gate.d.ts +43 -0
package/dist/runtime/provider-maturity-gate.js +129 -0
package/dist/runtime/public-surface.d.ts +93 -0
package/dist/runtime/public-surface.js +146 -0
package/dist/runtime/router-v2-scoring.d.ts +11 -0
package/dist/runtime/router-v2-scoring.js +151 -0
package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
package/dist/runtime/tool-dispatch-contracts.js +42 -2
package/dist/runtime/weakness-remediation-index.d.ts +27 -0
package/dist/runtime/weakness-remediation-index.js +37 -0
package/dist/safety/enforcement-engine.d.ts +89 -0
package/dist/safety/enforcement-engine.js +279 -0
package/dist/safety/tool-authority-gate.d.ts +40 -0
package/dist/safety/tool-authority-gate.js +92 -0
package/dist/schema/evidence.schema.d.ts +2 -2
package/dist/schema/proof-bundle.schema.d.ts +28 -28
package/dist/util/clipboard-image.d.ts +49 -0
package/dist/util/clipboard-image.js +263 -0
package/docs/2026-06-09/critical-issues.md +20 -0
package/docs/2026-06-09/improvements.md +14 -0
package/docs/2026-06-09/init-checklist.md +25 -0
package/docs/2026-06-09/plan.md +20 -0
package/docs/benchmark-design.md +122 -0
package/docs/github-organic-promotion.md +127 -0
package/docs/native-root-runtime-algorithms.md +301 -0
package/package.json +8 -4
package/readmeasset/ASSET_INDEX.md +1 -0
package/templates/skills/agents/omk-agent-reach-websearch/SKILL.md +55 -0
package/templates/skills/kimi/omk-agent-reach-websearch/SKILL.md +55 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,42 @@
 # Changelog
+## v0.78.2 — Regression Proof Matrix, deep interview, clipboard image paste, and README hardening (2026-06-09)
+### Overview
+This release adds the Regression Proof Matrix (Algorithm 9) as a release-defense gate, ships the deep interview and clipboard image paste features, and hardens README links to be package-safe.
+### Added
+- **Regression Proof Matrix (Algorithm 9)** — release-defense gate that verifies Algorithms 1–8 are alive via tests, proof bundles, decision traces, and CLI surfaces. `scripts/regression-proof-matrix.mjs --json` evaluates coverage topology, test linkage, proof-bundle trust, and CLI reachability, returning a JSON verdict with per-algorithm coverage and reasons.
+- `src/evidence/regression-proof-matrix.ts` engine with configurable coverage and proof-trust thresholds (default `TAU_EVIDENCE` = 0.75), plus `test/regression-proof-matrix.test.mjs` unit coverage.
+- Proof bundle `011-regression-proof-matrix` under `proof/verified-runs/` with evidence, decisions, verify JSON, and `sha256sums.txt`.
+- `omk goal interview [input]` and `omk goal refine <goal-id>` commands under the existing `goal` group, adding an evidence-driven clarification step before planning.
+- Deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions (`informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05`), and computes a completeness score from assimilated answers.
+- Spec-delta assimilation that folds interview answers into a structured `GoalSpec` with conflict resolution, selectable depth (`light|standard|deep`, auto-selected by ambiguity when omitted), and `--write-spec` persistence.
+- `omk.interview.v1` JSON contract (`schemas/omk.interview.v1.schema.json`) plus the `omk.interview-delta.v1` spec-delta envelope.
+- Per-session interview artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) under `.omk/goals/<goalId>/interviews/<sessionId>/` (or `.omk/interviews/<sessionId>/` before `--write-spec`).
+- Clipboard image paste support: `/paste` slash command in chat REPL, `--image` flag on `omk goal interview`, cross-platform clipboard reader (macOS/Linux/Windows), `InputAttachment` type for multimodal image handling.
+- GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
+### Changed
+- README install and badge links now use package-safe `open-multi-agent-kit` example URLs instead of the unavailable `@omk/cli` scope.
+- `MATURITY.md` and `docs/native-root-runtime-algorithms.md` clarify that the Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
+### Commits
+```
+1504eae chore(release): bump v0.78.2
+3874558 docs(readme): use package-safe example links
+cb673e3 docs(readme): clarify regression proof matrix boundary
+278cdf4 docs(proof): clarify regression matrix release boundary
+285c68c Feat/regression proof matrix (#15)
+4701243 feat(runtime): send clipboard images as multimodal content parts
+78a31eb feat(clipboard): add image paste support for chat and goal interview
+69d65c6 feat(goal): add deep interview refinement
+```
 ## v0.78.1 — package alignment, JSON contract envelopes, and adaptive runtime algorithms (2026-06-07)
 ### Overview

package/MATURITY.md CHANGED Viewed

@@ -63,6 +63,10 @@ Current source version: v0.78.1 (`v1.2` runtime contract family)
 | `omk research` | Core runtime web research wrapper; depends on Kimi tool availability. |
 | `omk open-design-agent` | Local Open Design CLI bridge. |
+## Regression Proof Matrix Claim Boundary
+Regression Proof Matrix is a release-defense gate, not a stable-release claim. Stable promotion still requires full `npm test`, live provider maturity data, and a minimal verified demo pass.
 ## Automation Contract Status
 | Area | Current state | Next hardening |

package/README.md CHANGED Viewed

@@ -23,6 +23,7 @@
 <p align="center">
   <a href="#install">Install</a> ·
   <a href="#quick-start">Quick start</a> ·
+  <a href="#who-is-this-for">Who is this for?</a> ·
   <a href="#current-runtime-algorithm">Runtime algorithm</a> ·
   <a href="docs/getting-started.md">Docs</a> ·
   <a href="readmeasset/ASSET_INDEX.md">Visual assets</a>
@@ -30,6 +31,14 @@
 `OMK` (`omk`) turns a coding goal into a bounded, evidence-gated agent run.
+Use OMK when one coding agent is not enough: route Codex, OpenCode, Kimi, DeepSeek, Qwen, OpenRouter, and local runtimes through one evidence-gated control loop.
+## Who is this for?
+- Developers running multiple coding agents from the terminal.
+- Teams that need MCP-scoped agent execution instead of unrestricted tool access.
+- Agent builders who want routing, fallback, evidence gates, telemetry, and replay.
 > Current package source target: `open-multi-agent-kit@0.78.1`.
 > Public package name: `open-multi-agent-kit` (`@omk/cli` is not the active npm package).
 > Runtime contract family: `v1.2` (contract family, not a stable npm `1.x` release).
@@ -38,7 +47,7 @@
 ## Quickstart (3 minutes)
-A beginner reads this, runs four commands, and succeeds.
+A beginner reads this, runs four commands, and reaches an initialized OMK chat/doctor flow.
 ```bash
 npm i -g open-multi-agent-kit
@@ -47,6 +56,11 @@ omk doctor
 omk chat
 ```
+## Examples for agent tooling lists
+- [Codex MCP evidence run](https://github.com/dmae97/open-multi-agent-kit/tree/main/examples/codex-mcp-evidence-run): project-scoped MCP setup plus evidence-gated DAG dry run.
+- [Provider fallback](https://github.com/dmae97/open-multi-agent-kit/tree/main/examples/provider-fallback): `--provider auto` routing with parallel worker planning.
 ## Current release reality
 - The public npm line is `open-multi-agent-kit@0.78.x`. Published npm `latest` is `0.78.0`;
@@ -58,6 +72,8 @@ omk chat
   lanes are scoped by the provider-maturity contract.
 - Safety and evidence claims apply to the exact adapter, command, and verification gate that
   produced them.
+- Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
+  Stable promotion still requires full tests, live provider maturity data, and a minimal verified demo pass.
 ## Why OMK
@@ -192,6 +208,59 @@ Kimi worker prompts use stdin with `--input-format text` where that adapter path
 Goal → DAG plan → parallel lanes → evidence bundle → verify gate → merge / replay / inspect
 ```
+## Goal lifecycle
+`omk goal` turns a raw goal into a planned, evidence-gated run. The **OMK Deep Interview** is an uncertainty reducer that clarifies the goal before planning, so the DAG is compiled from a structured spec instead of a vague prompt.
+Recommended flow:
+```bash
+omk goal interview "<raw goal>" --depth deep --write-spec
+omk goal plan <goal-id>
+omk goal run <goal-id> --provider auto --approval-policy interactive
+omk goal verify <goal-id>
+```
+### `omk goal interview [input]`
+Runs a deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions, assimilates answers into a structured spec delta, computes a completeness score, and (with `--write-spec`) creates or updates a `GoalSpec`. Question ranking is deterministic:
+```text
+score = informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05
+```
+| Option                     | Purpose                                                         |
+| -------------------------- | -------------------------------------------------------------- |
+| `--goal-id <id>`           | Target an existing goal.                                       |
+| `--mode <create\|refine>`  | Create a new spec or refine an existing one.                   |
+| `--depth <light\|standard\|deep>` | Interview depth; omit to auto-select by ambiguity.      |
+| `--max-questions <n>`      | Cap the number of ranked questions.                           |
+| `--answers <file>`         | Supply answers non-interactively.                             |
+| `--write-spec`             | Persist the spec delta into a `GoalSpec`.                     |
+| `--json`                   | Emit the `omk.interview.v1` JSON contract.                    |
+### `omk goal refine <goal-id>`
+Applies the latest interview spec delta to a goal and optionally replans.
+| Option                  | Purpose                                          |
+| ----------------------- | ------------------------------------------------ |
+| `--from-interview <id>` | Source interview session (default: latest).      |
+| `--plan`                | Replan the goal after applying the delta.         |
+| `--json`                | Emit machine-readable output.                     |
+Answers file format (`--answers answers.json`):
+```json
+{
+  "answers": [
+    { "questionId": "q-success-criteria", "answer": "..." }
+  ]
+}
+```
+Session artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) are written under `.omk/goals/<goalId>/interviews/<sessionId>/`, or `.omk/interviews/<sessionId>/` before `--write-spec`.
 ## What OMK controls
 | Surface            | What OMK does                                                                                           |

package/dist/benchmark/contracts.d.ts ADDED Viewed

@@ -0,0 +1,116 @@
+/**
+ * Benchmark contracts — omk.benchmark.v1
+ *
+ * Reproducible evaluation surface for OMK control plane tasks.
+ */
+import type { AttemptStatus, RuntimeId } from "../evidence/attempt-record.js";
+import type { RuntimeRouterDecisionV2, RuntimeScoreV2 } from "../runtime/contracts/router-v2.js";
+import type { RuntimeRouteDecision } from "../runtime/runtime-router.js";
+export declare const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1"; // Schema identifier; BenchmarkSummary.schemaVersion is typed as this literal.
+export type BenchmarkTaskCategory = "read-only-repo-qa" | "small-bug-fix" | "failing-test-repair" | "multi-file-refactor" | "cli-command-task" | "dependency-update" | "merge-conflict-task" | "security-sensitive-task" | "provider-failure-fallback" | "quota-auth-failure-fallback"; // The ten categories a benchmark task can belong to.
+export interface BenchmarkTask { // One benchmark task together with the attempts recorded for it.
+    readonly taskId: string; // Synthetic fixtures use `bench-<category>-<zero-padded index>`.
+    readonly category: BenchmarkTaskCategory;
+    readonly intent: string; // Synthetic fixtures use the category with "-" replaced by "_".
+    readonly description: string;
+    readonly treeHash: string; // Reproducibility pin, copied from the generator arguments for synthetic tasks.
+    readonly seed: number; // Reproducibility pin (the suite-level seed, not a per-task seed).
+    readonly providerConfigHash: string; // Reproducibility pin.
+    readonly omkVersion: string; // Reproducibility pin.
+    readonly worktreePath?: string; // Not set by the synthetic generator.
+    readonly relevantFiles: readonly string[]; // The harness turns each path into a placeholder file entry of its context capsule.
+    readonly expectedOutcome: "success" | "failure" | "fallback";
+    readonly recordedAttempts: readonly BenchmarkAttemptStub[]; // Ordered; the harness treats the LAST entry as the final outcome.
+}
+export interface BenchmarkAttemptStub { // A single recorded (or synthetic) attempt at a task.
+    readonly attemptId: string; // Synthetic fixtures use `<taskId>__<attemptNumber>`.
+    readonly runtime: RuntimeId;
+    readonly model: string;
+    readonly provider: string;
+    readonly status: AttemptStatus; // The harness reads "success" as solved and "cancelled" as a rollback.
+    readonly latencyMs: number; // Summed over all attempts of a task by the harness.
+    readonly inputTokensEstimated: number;
+    readonly outputTokensEstimated: number;
+    readonly costUsdEstimated: number; // Summed over all attempts of a task by the harness.
+    readonly evidenceResults: readonly { // One entry per evidence gate; each becomes one evidence item in the harness.
+        gate: string;
+        passed: boolean;
+    }[];
+    readonly changedFiles: readonly string[]; // Paths starting with "/" that do not contain "worktree" are flagged as a sandbox violation by the harness.
+    readonly commandsRun: readonly string[];
+    readonly summary: string; // The last attempt's summary is the output text scored by the evidence trust engine.
+    readonly error?: string; // Synthetic fixtures set "simulated failure" on every non-success attempt.
+}
+export interface BenchmarkRunResult { // Per-task metrics; field notes describe how runBenchmarkSuite fills them.
+    readonly taskId: string;
+    readonly solved: boolean; // True when the last recorded attempt has status "success".
+    readonly evidenceTrustScore: number; // Score returned by the evidence trust engine for the task.
+    readonly falseDone: boolean; // Not solved, yet the evidence trust verdict was "pass".
+    readonly fallbackUsed: boolean; // More than one attempt was recorded.
+    readonly fallbackSucceeded: boolean; // fallbackUsed && solved.
+    readonly routerRegret: number; // Taken from the shadow record's regretV2.
+    readonly costUsd: number; // Sum of costUsdEstimated over all attempts.
+    readonly latencyMs: number; // Sum of latencyMs over all attempts.
+    readonly rolledBack: boolean; // At least one attempt has status "cancelled".
+    readonly sandboxViolations: number; // 0 or 1 per task as currently computed, not a per-file count.
+    readonly attemptCount: number;
+    readonly decisions: readonly BenchmarkDecisionRecord[];
+}
+export interface BenchmarkDecisionRecord { // One routing decision; produced by ShadowModeEngine.toBenchmarkDecision.
+    readonly component: "runtime-router-v1" | "runtime-router-v2" | "provider-router"; // Which router the decision is attributed to.
+    readonly selectedRuntime: string;
+    readonly bestAvailableRuntime: string;
+    readonly regret: number; // NOTE(review): presumably the score gap between best and selected runtime — confirm in shadow-mode.js.
+    readonly reason: string;
+    readonly scoresV2?: readonly RuntimeScoreV2[]; // NOTE(review): presumably only present for router v2 decisions — confirm.
+}
+export interface BenchmarkSummary { // Suite-level report; runBenchmarkSuite writes it as JSON and returns it.
+    readonly schemaVersion: typeof BENCHMARK_SCHEMA_VERSION;
+    readonly runId: string; // Copied from BenchmarkConfig.runId; also the output file's base name.
+    readonly startedAt: string; // ISO-8601 timestamp.
+    readonly completedAt: string; // ISO-8601 timestamp.
+    readonly durationMs: number; // Rounded elapsed time of the run.
+    readonly treeHash: string; // config.pinTreeHash, or "synthetic" when not pinned.
+    readonly seed: number; // config.pinSeed, or 42 when not pinned.
+    readonly providerConfigHash: string; // config.pinProviderConfigHash, or a hash of the runtime ids.
+    readonly omkVersion: string; // npm_package_version from the environment, or "0.0.0".
+    readonly mode: "shadow" | "live";
+    readonly totalTasks: number; // Number of tasks evaluated after category filtering.
+    readonly solvedCount: number;
+    readonly solveRate: number; // solvedCount / totalTasks; 0 when there are no tasks.
+    readonly evidenceTrustScoreMean: number; // Mean over all tasks; 0 when there are no tasks.
+    readonly falseDoneRate: number; // Share of tasks flagged falseDone; 0 when there are no tasks.
+    readonly fallbackSuccessRate: number; // Among tasks that used fallback, the share that ended solved; 0 when none used fallback.
+    readonly routerRegretMean: number;
+    readonly costPerSolvedTask: number; // Total cost of ALL tasks divided by solvedCount; 0 when nothing was solved.
+    readonly p95LatencyMs: number; // Element at index floor(n * 0.95) of the ascending per-task latencies.
+    readonly rollbackRate: number;
+    readonly sandboxViolationCount: number; // Sum of per-task sandboxViolations (each 0 or 1).
+    readonly results: readonly BenchmarkRunResult[];
+}
+export interface ShadowModeRecord { // Side-by-side router v1 / v2 outcome for one task node.
+    readonly taskId: string;
+    readonly nodeId: string; // The benchmark harness passes the task id here as well.
+    readonly intent: string;
+    readonly v1Decision: RuntimeRouteDecision | null; // Nullable by type; NOTE(review): presumably null when v1 produced no decision — confirm.
+    readonly v2Decision: RuntimeRouterDecisionV2 | null; // Nullable by type; NOTE(review): presumably null when v2 produced no decision — confirm.
+    readonly regretV1: number;
+    readonly regretV2: number; // Reported by the harness as the task's routerRegret.
+    readonly disagreement: boolean; // NOTE(review): presumably true when v1 and v2 selected different runtimes — confirm.
+    readonly timestamp: string;
+}
+export interface BenchmarkConfig { // Settings for one runBenchmarkSuite invocation.
+    readonly mode: "shadow" | "live"; // "shadow" generates synthetic traces; any other mode loads recorded traces from tasksDir.
+    readonly tasksDir: string; // Only read when mode is not "shadow".
+    readonly outputDir: string; // Created recursively; the summary is written to `<outputDir>/<runId>.json`.
+    readonly runId: string;
+    readonly maxConcurrency: number; // NOTE(review): not read by runBenchmarkSuite, which processes tasks sequentially.
+    readonly pinTreeHash?: string; // Defaults to "synthetic".
+    readonly pinSeed?: number; // Defaults to 42.
+    readonly pinProviderConfigHash?: string; // Defaults to a hash of the supplied runtime ids.
+    readonly categories?: readonly BenchmarkTaskCategory[]; // When non-empty, only tasks in these categories are evaluated.
+}
+export interface BenchmarkFixture { // A set of tasks as returned by generateSyntheticTraces / loadRecordedTraces.
+    readonly tasks: readonly BenchmarkTask[];
+    readonly version: string; // Both loaders set this to DEFAULT_FIXTURE_VERSION.
+}

package/dist/benchmark/contracts.js ADDED Viewed

@@ -0,0 +1,6 @@
+/**
+ * Benchmark contracts — omk.benchmark.v1
+ *
+ * Reproducible evaluation surface for OMK control plane tasks.
+ */
+export const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1"; // Runtime value of the schema identifier declared in contracts.d.ts.

package/dist/benchmark/fixtures.d.ts ADDED Viewed

@@ -0,0 +1,11 @@
+/**
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
+ *
+ * All synthetic traces are deterministic given a seed.
+ */
+import type { BenchmarkFixture } from "./contracts.js";
+export declare const DEFAULT_FIXTURE_VERSION = "1.0.0"; // Version stamped on every fixture produced by this module.
+export declare function generateSyntheticTraces(countPerCategory: number, seed: number, omkVersion: string, treeHash: string, providerConfigHash: string): BenchmarkFixture; // Emits countPerCategory tasks for each task category.
+export declare function loadRecordedTraces(dir: string): Promise<BenchmarkFixture>; // Parses every *.json file directly inside dir as one task.
+export declare function hashConfig(obj: unknown): string; // First 16 hex characters of SHA-256 over JSON.stringify(obj).
+export declare function computeTreeHash(): string; // Placeholder: the shipped implementation returns "unknown".

package/dist/benchmark/fixtures.js ADDED Viewed

@@ -0,0 +1,121 @@
+/**
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
+ *
+ * All synthetic traces are deterministic given a seed.
+ */
+import { createHash } from "node:crypto";
+import { readFile, readdir } from "node:fs/promises";
+import { join } from "node:path";
+export const DEFAULT_FIXTURE_VERSION = "1.0.0"; // Stamped on fixtures by generateSyntheticTraces and loadRecordedTraces.
+function seededRandom(seed) {
+    let s = seed;
+    return () => {
+        s = (s * 16807 + 0) % 2147483647;
+        return (s - 1) / 2147483646;
+    };
+}
+function pick(arr, rng) { // Returns one element of arr chosen by a single rng() draw.
+    return arr[Math.floor(rng() * arr.length)]; // Yields undefined for an empty array or when rng() falls outside [0, 1).
+}
+const CATEGORIES = [ // Categories iterated, in this order, by generateSyntheticTraces.
+    "read-only-repo-qa",
+    "small-bug-fix",
+    "failing-test-repair",
+    "multi-file-refactor",
+    "cli-command-task",
+    "dependency-update",
+    "merge-conflict-task",
+    "security-sensitive-task",
+    "provider-failure-fallback",
+    "quota-auth-failure-fallback",
+];
+const RUNTIME_IDS = ["kimi-wire", "kimi-print", "openai-compatible", "deepseek", "local"]; // Runtimes a synthetic attempt can be assigned to.
+function makeAttemptStub(taskId, category, attemptNumber, rng, outcomeOverride) { // Builds one synthetic attempt; the order of rng() draws below determines the output for a given seed.
+    const runtime = pick([...RUNTIME_IDS], rng);
+    const statusBase = outcomeOverride ?? pick(["success", "success", "failure", "fallback"], rng); // Override wins; otherwise "success" has 2-in-4 weight. No draw is consumed when an override is given.
+    const status = statusBase === "fallback" ? "runtime_failed" : statusBase === "success" ? "success" : "evidence_failed"; // Maps the outcome label onto an attempt status.
+    const latencyMs = Math.floor(500 + rng() * 8000); // 500..8499 ms
+    const inputTokens = Math.floor(1000 + rng() * 15000); // 1000..15999 tokens
+    const outputTokens = Math.floor(200 + rng() * 5000); // 200..5199 tokens
+    const costUsd = parseFloat((inputTokens * 0.000002 + outputTokens * 0.000006).toFixed(6)); // Fixed per-token rates, rounded to 6 decimals.
+    const evidenceGates = category === "security-sensitive-task" // Gate set depends only on the category.
+        ? ["test", "lint", "audit", "review"]
+        : category === "cli-command-task"
+            ? ["command", "stdout-match"]
+            : ["test", "lint", "diff"];
+    const evidenceResults = evidenceGates.map((gate) => ({
+        gate,
+        passed: status === "success" ? true : rng() > 0.3, // Successful attempts pass every gate (no draw); otherwise each gate passes when its draw exceeds 0.3.
+    }));
+    return {
+        attemptId: `${taskId}__${attemptNumber}`,
+        runtime,
+        model: "default",
+        provider: runtime.split("-")[0], // Text before the first "-" of the runtime id, e.g. "kimi-wire" -> "kimi".
+        status,
+        latencyMs,
+        inputTokensEstimated: inputTokens,
+        outputTokensEstimated: outputTokens,
+        costUsdEstimated: costUsd,
+        evidenceResults,
+        changedFiles: category === "read-only-repo-qa" ? [] : [`src/${taskId}.ts`], // Read-only tasks change nothing.
+        commandsRun: ["npm test", "npm run lint"],
+        summary: `${category} attempt ${attemptNumber}`,
+        error: status !== "success" ? "simulated failure" : undefined,
+    };
+}
+function makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash) { // Builds one synthetic task with one or two recorded attempts.
+    const rng = seededRandom(seed + index * 7919); // Per-task stream. NOTE(review): category is not mixed in, so equal indices in different categories draw the same sequence.
+    const taskId = `bench-${category}-${String(index).padStart(3, "0")}`;
+    const expectedOutcome = pick(["success", "success", "failure", "fallback"], rng); // "success" has 2-in-4 weight.
+    const attempts = [];
+    const attemptCount = expectedOutcome === "fallback" ? 2 : 1; // Fallback tasks get a second attempt.
+    for (let i = 1; i <= attemptCount; i++) {
+        attempts.push(makeAttemptStub(taskId, category, i, rng, i === 1 ? undefined : "success")); // First attempt's status is drawn at random, later ones are forced to "success". NOTE(review): the first draw is independent of expectedOutcome, so the two can disagree.
+    }
+    return {
+        taskId,
+        category,
+        intent: category.replace(/-/g, "_"),
+        description: `Synthetic ${category} task #${index}`,
+        treeHash,
+        seed, // The suite-level seed, not the per-task RNG seed.
+        providerConfigHash,
+        omkVersion,
+        relevantFiles: [`src/${taskId}.ts`],
+        expectedOutcome,
+        recordedAttempts: attempts,
+    };
+}
+export function generateSyntheticTraces(countPerCategory, seed, omkVersion, treeHash, providerConfigHash) { // Produces countPerCategory tasks for every entry of CATEGORIES, grouped by category.
+    const tasks = [];
+    for (const category of CATEGORIES) {
+        for (let i = 0; i < countPerCategory; i++) { // Index restarts at 0 for each category.
+            tasks.push(makeTask(i, category, seed, omkVersion, treeHash, providerConfigHash));
+        }
+    }
+    return { tasks, version: DEFAULT_FIXTURE_VERSION };
+}
+export async function loadRecordedTraces(dir) {
+    const files = (await readdir(dir).catch(() => []))
+        .filter((f) => f.endsWith(".json"))
+        .map((f) => join(dir, f));
+    const tasks = [];
+    for (const file of files) {
+        const raw = await readFile(file, "utf-8");
+        const parsed = JSON.parse(raw);
+        tasks.push(parsed);
+    }
+    return { tasks, version: DEFAULT_FIXTURE_VERSION };
+}
+export function hashConfig(obj) {
+    return createHash("sha256")
+        .update(JSON.stringify(obj))
+        .digest("hex")
+        .slice(0, 16);
+}
+export function computeTreeHash() { // Placeholder that always returns "unknown"; takes no input and reads no repository state.
+    // In real usage this would be `git rev-parse HEAD`.
+    // Benchmark harness supplies the actual commit hash.
+    return "unknown";
+}

package/dist/benchmark/harness.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
+ */
+import type { BenchmarkConfig, BenchmarkSummary } from "./contracts.js";
+import type { AgentRuntime } from "../runtime/agent-runtime.js";
+import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
+export interface HarnessOptions { // Input to runBenchmarkSuite.
+    readonly config: BenchmarkConfig;
+    readonly runtimes: AgentRuntime[]; // Handed to the shadow-mode engine; their ids also feed the default provider config hash.
+    readonly history?: EvidenceHistoryEntry[]; // Defaults to an empty array when omitted.
+}
+export declare function runBenchmarkSuite(options: HarnessOptions): Promise<BenchmarkSummary>; // Evaluates all tasks, writes `<outputDir>/<runId>.json`, and resolves with the same summary.
+export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";

package/dist/benchmark/harness.js ADDED Viewed

@@ -0,0 +1,191 @@
+/**
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
+ */
+import { mkdir, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+import { performance } from "node:perf_hooks";
+import { generateSyntheticTraces, loadRecordedTraces, hashConfig } from "./fixtures.js";
+import { createShadowModeEngine } from "./shadow-mode.js";
+import { createEvidenceTrustScoreV2Engine } from "../evidence/evidence-trust-score.js";
+function capsuleFromTask(task) { // Wraps a benchmark task in a context-capsule-shaped object filled with fixed placeholder values.
+    return {
+        runId: task.taskId, // The task id doubles as run id and node id.
+        nodeId: task.taskId,
+        goal: task.description,
+        system: "Benchmark system prompt",
+        task: task.description,
+        dependencySummaries: [],
+        relevantFiles: task.relevantFiles.map((path) => ({ // File contents are not read; every file gets the same placeholder slice.
+            path,
+            startLine: 1,
+            endLine: 10,
+            content: "// synthetic",
+        })),
+        graphMemory: [],
+        priorAttempts: [],
+        evidenceRequirements: [],
+        budget: { maxInputTokens: 8000, reservedOutputTokens: 4096, maxFileTokens: 4096, maxToolResultTokens: 2048, maxMemoryFacts: 10, compression: "lossless-ish" }, // Fixed budget used for every benchmark task.
+        node: {
+            id: task.taskId,
+            name: task.description,
+            role: "coder",
+            dependsOn: [],
+            status: "running",
+            retries: 0,
+            maxRetries: 1,
+        },
+    };
+}
+function mapGateToKind(gate) { // Translates an evidence gate name into an evidence item kind.
+    switch (gate) {
+        case "test": return "test";
+        case "lint": return "command"; // Lint is reported as a command-kind item.
+        case "audit": return "audit";
+        case "review": return "review";
+        case "command": return "command";
+        case "stdout-match": return "trace";
+        case "diff": return "diff";
+        default: return "trace"; // Unknown gate names fall back to "trace".
+    }
+}
+function attemptToEvidenceItem(attempt) { // Returns an ARRAY with one evidence item per gate result of the attempt (despite the singular name).
+    return attempt.evidenceResults.map((ev) => ({
+        id: `${attempt.attemptId}-${ev.gate}`,
+        kind: mapGateToKind(ev.gate),
+        source: "runner",
+        description: ev.gate,
+        verdict: (ev.passed ? "pass" : "fail"),
+        timestamp: new Date().toISOString(), // Wall-clock time of conversion, so this field differs between otherwise identical runs.
+        confidence: 0.9, // Same fixed confidence for every item.
+        linkedFilePaths: [...attempt.changedFiles], // Copied so items do not share the attempt's array.
+    }));
+}
+export async function runBenchmarkSuite(options) { // Loads tasks, scores each one from its recorded attempts, aggregates suite metrics, writes the summary JSON and returns it.
+    const startedAt = new Date().toISOString();
+    const startedMs = performance.now();
+    const { config, runtimes, history = [] } = options;
+    await mkdir(config.outputDir, { recursive: true });
+    // Load tasks
+    const tasks = [];
+    if (config.mode === "shadow") { // Shadow mode uses generated traces; nothing is read from tasksDir.
+        const version = process.env.npm_package_version ?? "0.0.0";
+        const treeHash = config.pinTreeHash ?? "synthetic";
+        const seed = config.pinSeed ?? 42;
+        const providerHash = config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id));
+        const fixture = generateSyntheticTraces(2, seed, version, treeHash, providerHash); // Two synthetic tasks per category.
+        tasks.push(...fixture.tasks);
+    }
+    else {
+        const fixture = await loadRecordedTraces(config.tasksDir);
+        tasks.push(...fixture.tasks);
+    }
+    if (config.categories && config.categories.length > 0) { // Optional category filter, applied in place.
+        const allowed = new Set(config.categories);
+        const filtered = tasks.filter((t) => allowed.has(t.category));
+        tasks.length = 0;
+        tasks.push(...filtered);
+    }
+    const shadowEngine = createShadowModeEngine({ runtimes, history });
+    const etsEngine = createEvidenceTrustScoreV2Engine();
+    const results = [];
+    for (const task of tasks) { // Tasks are processed one at a time; config.maxConcurrency is not consulted.
+        const capsule = capsuleFromTask(task);
+        const shadowRecord = shadowEngine.evaluate(task.taskId, task.taskId, capsule); // The task id is passed as both taskId and nodeId.
+        const decisions = shadowEngine.toBenchmarkDecision(shadowRecord);
+        // Simulate execution using recorded attempts
+        const lastAttempt = task.recordedAttempts[task.recordedAttempts.length - 1]; // undefined when no attempts were recorded.
+        const solved = lastAttempt?.status === "success";
+        const fallbackUsed = task.recordedAttempts.length > 1;
+        const fallbackSucceeded = fallbackUsed && solved;
+        const rolledBack = task.recordedAttempts.some((a) => a.status === "cancelled");
+        const sandboxViolations = task.recordedAttempts.some((a) => a.changedFiles.some((f) => f.startsWith("/") && !f.includes("worktree"))) // NOTE(review): yields 1 per task at most, however many files or attempts violate.
+            ? 1
+            : 0;
+        // ETS v2 evaluation
+        const allEvidence = task.recordedAttempts.flatMap((a) => attemptToEvidenceItem(a)); // Evidence from every attempt, not only the last.
+        const etsResult = await etsEngine.evaluate({
+            output: lastAttempt?.summary ?? "",
+            taskType: task.category.includes("security") ? "security" : "feature",
+            risk: task.category.includes("security") ? "critical" : "medium",
+            runArtifacts: {
+                items: allEvidence,
+                meta: {
+                    runId: task.taskId,
+                    nodeId: task.taskId,
+                    provider: lastAttempt?.provider ?? "unknown",
+                    model: lastAttempt?.model ?? "unknown",
+                    cwd: "[repo-root]",
+                    treeHashBefore: task.treeHash, // Before and after hashes are the same value: no tree change is simulated.
+                    treeHashAfter: task.treeHash,
+                    commandHash: hashConfig(task.recordedAttempts.map((a) => a.commandsRun)),
+                    timestamp: new Date().toISOString(),
+                    command: task.recordedAttempts.map((a) => a.commandsRun.join("; ")).join(" || "), // Commands joined by "; " within an attempt and " || " between attempts.
+                },
+            },
+            dependencyGraphFiles: task.relevantFiles,
+        });
+        const falseDone = !solved && etsResult.verdict === "pass"; // Evidence says "pass" although the task was not solved.
+        const totalLatency = task.recordedAttempts.reduce((s, a) => s + a.latencyMs, 0);
+        const totalCost = task.recordedAttempts.reduce((s, a) => s + a.costUsdEstimated, 0);
+        results.push({
+            taskId: task.taskId,
+            solved,
+            evidenceTrustScore: etsResult.score,
+            falseDone,
+            fallbackUsed,
+            fallbackSucceeded,
+            routerRegret: shadowRecord.regretV2, // Only the v2 regret is reported per task.
+            costUsd: totalCost,
+            latencyMs: totalLatency,
+            rolledBack,
+            sandboxViolations,
+            attemptCount: task.recordedAttempts.length,
+            decisions,
+        });
+    }
+    const completedAt = new Date().toISOString();
+    const durationMs = Math.round(performance.now() - startedMs);
+    const solvedCount = results.filter((r) => r.solved).length;
+    const totalTasks = results.length;
+    const solveRate = totalTasks > 0 ? solvedCount / totalTasks : 0; // Every rate below is 0 when its denominator is 0.
+    const evidenceMean = totalTasks > 0 ? results.reduce((s, r) => s + r.evidenceTrustScore, 0) / totalTasks : 0;
+    const falseDoneRate = totalTasks > 0 ? results.filter((r) => r.falseDone).length / totalTasks : 0;
+    const fallbackAttempts = results.filter((r) => r.fallbackUsed);
+    const fallbackSuccessRate = fallbackAttempts.length > 0
+        ? fallbackAttempts.filter((r) => r.fallbackSucceeded).length / fallbackAttempts.length
+        : 0;
+    const routerRegretMean = totalTasks > 0 ? results.reduce((s, r) => s + r.routerRegret, 0) / totalTasks : 0;
+    const costPerSolved = solvedCount > 0 ? results.reduce((s, r) => s + r.costUsd, 0) / solvedCount : 0; // Cost of ALL tasks (solved or not) divided by the number solved.
+    const latencies = results.map((r) => r.latencyMs).sort((a, b) => a - b);
+    const p95Latency = latencies.length > 0 ? latencies[Math.floor(latencies.length * 0.95)] ?? latencies[latencies.length - 1] : 0; // NOTE(review): index floor(n * 0.95) selects the maximum whenever n < 20 (nearest-rank p95 would be ceil(n * 0.95) - 1), and the `??` fallback is never reached because the index is always < n.
+    const rollbackRate = totalTasks > 0 ? results.filter((r) => r.rolledBack).length / totalTasks : 0;
+    const sandboxViolationCount = results.reduce((s, r) => s + r.sandboxViolations, 0); // Effectively the number of tasks with a violation.
+    const summary = {
+        schemaVersion: "omk.benchmark.v1", // NOTE(review): repeats the literal exported as BENCHMARK_SCHEMA_VERSION in contracts.js.
+        runId: config.runId,
+        startedAt,
+        completedAt,
+        durationMs,
+        treeHash: config.pinTreeHash ?? "synthetic", // Pins and defaults are recomputed from config here; in non-shadow mode they are not taken from the loaded tasks.
+        seed: config.pinSeed ?? 42,
+        providerConfigHash: config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id)),
+        omkVersion: process.env.npm_package_version ?? "0.0.0",
+        mode: config.mode,
+        totalTasks,
+        solvedCount,
+        solveRate,
+        evidenceTrustScoreMean: evidenceMean,
+        falseDoneRate,
+        fallbackSuccessRate,
+        routerRegretMean,
+        costPerSolvedTask: costPerSolved,
+        p95LatencyMs: p95Latency,
+        rollbackRate,
+        sandboxViolationCount,
+        results,
+    };
+    const outPath = join(config.outputDir, `${config.runId}.json`); // runId is used verbatim as the file name.
+    await writeFile(outPath, JSON.stringify(summary, null, 2), "utf-8");
+    return summary;
+}
+export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";

package/dist/benchmark/shadow-mode.d.ts ADDED Viewed

@@ -0,0 +1,17 @@
+/**
+ * Shadow Mode Engine — side-by-side router v1/v2 recording.
+ */
+import type { AgentRuntime } from "../runtime/agent-runtime.js";
+import type { ContextCapsule } from "../runtime/context-capsule.js";
+import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
+import type { ShadowModeRecord, BenchmarkDecisionRecord } from "./contracts.js";
+export interface ShadowModeOptions { // Input to createShadowModeEngine.
+    readonly runtimes: AgentRuntime[]; // The benchmark harness forwards its own runtimes list here.
+    readonly history: EvidenceHistoryEntry[]; // Required here; the benchmark harness passes an empty array when it has no history.
+}
+export interface ShadowModeEngine { // Engine returned by createShadowModeEngine.
+    evaluate(taskId: string, nodeId: string, capsule: ContextCapsule): ShadowModeRecord; // Synchronous; returns one v1/v2 comparison record for the capsule.
+    toBenchmarkDecision(record: ShadowModeRecord): BenchmarkDecisionRecord[]; // Converts a record into the decision entries stored on a BenchmarkRunResult.
+}
+export declare function createShadowModeEngine(options: ShadowModeOptions): ShadowModeEngine; // Factory for the engine; also re-exported from harness.js.
+export declare function computeRouterRegret(candidates: AgentRuntime[], intent: string, history: EvidenceHistoryEntry[], selectedId: string): number; // NOTE(review): presumably the regret of choosing selectedId over the best candidate — confirm in shadow-mode.js. Also re-exported from harness.js.