npm - open-multi-agent-kit - Versions diffs - 0.78.2 → 0.78.5 - Mend

open-multi-agent-kit 0.78.2 → 0.78.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/CHANGELOG.md +27 -2
package/MATURITY.md +2 -2
package/README.md +4 -4
package/dist/benchmark/contracts.d.ts +116 -0
package/dist/benchmark/contracts.js +6 -0
package/dist/benchmark/fixtures.d.ts +11 -0
package/dist/benchmark/fixtures.js +124 -0
package/dist/benchmark/harness.d.ts +13 -0
package/dist/benchmark/harness.js +191 -0
package/dist/benchmark/shadow-mode.d.ts +17 -0
package/dist/benchmark/shadow-mode.js +95 -0
package/dist/cli/release-promotion-gate.js +14 -4
package/dist/commands/merge.js +102 -56
package/dist/contracts/provider-health.d.ts +37 -0
package/dist/contracts/provider-health.js +49 -1
package/dist/evidence/evidence-trust-score.d.ts +101 -0
package/dist/evidence/evidence-trust-score.js +408 -0
package/dist/evidence/index.d.ts +2 -0
package/dist/evidence/index.js +1 -0
package/dist/native/linux-x64/omk-safety +0 -0
package/dist/orchestration/merge-arbiter.d.ts +91 -0
package/dist/orchestration/merge-arbiter.js +376 -0
package/dist/providers/health.d.ts +3 -0
package/dist/providers/health.js +46 -0
package/dist/providers/index.d.ts +1 -0
package/dist/providers/index.js +1 -0
package/dist/providers/provider-health.d.ts +8 -1
package/dist/providers/provider-health.js +39 -0
package/dist/providers/provider-task-runner.js +31 -0
package/dist/providers/provider.d.ts +2 -0
package/dist/providers/router.js +80 -3
package/dist/providers/types.d.ts +4 -0
package/dist/runtime/contracts/weakness-remediation.d.ts +6 -0
package/dist/runtime/provider-maturity-gate.d.ts +2 -0
package/dist/runtime/provider-maturity-gate.js +26 -0
package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
package/dist/runtime/tool-dispatch-contracts.js +42 -2
package/dist/runtime/weakness-remediation-index.d.ts +1 -1
package/dist/runtime/weakness-remediation-index.js +1 -1
package/dist/safety/enforcement-engine.d.ts +89 -0
package/dist/safety/enforcement-engine.js +279 -0
package/dist/safety/tool-authority-gate.d.ts +40 -0
package/dist/safety/tool-authority-gate.js +92 -0
package/dist/schema/evidence.schema.d.ts +2 -2
package/dist/schema/proof-bundle.schema.d.ts +2 -2
package/docs/benchmark-design.md +122 -0
package/docs/getting-started.md +1 -1
package/docs/provider-maturity.md +1 -1
package/docs/versioning.md +3 -3
package/package.json +7 -3

package/dist/safety/enforcement-engine.js ADDED Viewed

@@ -0,0 +1,279 @@
+/**
+ * Policy / Sandbox Enforcement Engine v2
+ *
+ * Capability lattice with conservative policy combination.
+ * effectivePolicy = minByAuthority(userPolicy, repoPolicy, providerPolicy, adapterPolicy, riskPolicy)
+ *
+ * Conservative by default. Any ambiguity → block.
+ */
+import { createHash } from "node:crypto";
+/**
+ * Every capability the lattice tracks, in canonical order.
+ * Functions in this module iterate this list when combining policy layers
+ * and when building the blocked / approval-required arrays, so those arrays
+ * follow this ordering.
+ */
+export const ALL_CAPABILITIES = [
+    "read",
+    "write",
+    "shell",
+    "network",
+    "secret_read",
+    "secret_write",
+    "merge",
+    "publish",
+];
+// ---------------------------------------------------------------------------
+// Authority ranking (higher = more permissive)
+// ---------------------------------------------------------------------------
+/**
+ * Numeric rank of each authority level; a higher rank is more permissive.
+ * The table has no prototype, so inherited names such as "constructor" or
+ * "toString" are not mistaken for levels and resolve to `undefined`.
+ */
+const AUTHORITY_RANK = Object.freeze(Object.assign(Object.create(null), {
+    none: 0,
+    advisory: 1,
+    direct: 2,
+    full: 3,
+}));
+/**
+ * Look up the numeric rank of an authority level.
+ *
+ * @param {string} level - Authority level name ("none", "advisory", "direct" or "full").
+ * @returns {number | undefined} The rank, or `undefined` when `level` is not a known level.
+ */
+export function rankOf(level) {
+    return AUTHORITY_RANK[level];
+}
+// ---------------------------------------------------------------------------
+// Defaults
+// ---------------------------------------------------------------------------
+/** Level a capability has before any policy layer restricts it. */
+const DEFAULT_CAPABILITY_LEVEL = "full";
+/** Sandbox mode in effect before any policy layer names one. */
+const DEFAULT_SANDBOX_MODE = "unrestricted";
+/** Approval policy in effect before any policy layer names one. */
+const DEFAULT_APPROVAL_POLICY = "yolo";
+/**
+ * Create a fresh lattice in which every capability sits at the default level.
+ *
+ * @returns A new object keyed by capability name; each call returns a
+ *   separate object, so callers may mutate the result.
+ */
+export function defaultLattice() {
+    const capabilityNames = [
+        "read",
+        "write",
+        "shell",
+        "network",
+        "secret_read",
+        "secret_write",
+        "merge",
+        "publish",
+    ];
+    const lattice = {};
+    for (const name of capabilityNames) {
+        lattice[name] = DEFAULT_CAPABILITY_LEVEL;
+    }
+    return lattice;
+}
+// ---------------------------------------------------------------------------
+// minByAuthority — conservative combination
+// ---------------------------------------------------------------------------
+/**
+ * Combine policy layers by keeping, for every capability, the most
+ * restrictive (lowest-ranked) authority level that any layer requests.
+ *
+ * - A capability no layer mentions stays at the default level ("full").
+ * - The most restrictive sandbox mode and approval policy named by any
+ *   layer win.
+ * - `undefined` / `null` values mean "no opinion" and are skipped.
+ * - Any other value that is not a recognised level, sandbox mode or approval
+ *   policy is ambiguous and collapses to the most restrictive setting
+ *   ("none", "read-only", "block") instead of being silently ignored.
+ *
+ * @param layers - Iterable of policy layers; each provides `source` and
+ *   `lattice`, and optionally `sandboxMode` / `approvalPolicy`.
+ * @returns `{ lattice, sandboxMode, approvalPolicy, sources }`, where
+ *   `sources` lists every layer's `source` in input order.
+ */
+export function combinePoliciesByMinAuthority(layers) {
+    const activeSources = [];
+    // Start from the most permissive values and only ever tighten them.
+    const lattice = defaultLattice();
+    let sandboxMode = DEFAULT_SANDBOX_MODE;
+    let approvalPolicy = DEFAULT_APPROVAL_POLICY;
+    for (const layer of layers) {
+        activeSources.push(layer.source);
+        for (const cap of ALL_CAPABILITIES) {
+            const level = layer.lattice[cap];
+            if (level == null) {
+                continue;
+            }
+            const rank = AUTHORITY_RANK[level];
+            if (typeof rank !== "number") {
+                // Unknown level (typo, inherited property name, wrong type): fail closed.
+                lattice[cap] = "none";
+            }
+            else if (rank < AUTHORITY_RANK[lattice[cap]]) {
+                lattice[cap] = level;
+            }
+        }
+        if (layer.sandboxMode != null) {
+            const rank = sandboxModeRank(layer.sandboxMode);
+            if (typeof rank !== "number") {
+                // Unknown sandbox mode: fall back to the tightest sandbox.
+                sandboxMode = "read-only";
+            }
+            else if (rank < sandboxModeRank(sandboxMode)) {
+                sandboxMode = layer.sandboxMode;
+            }
+        }
+        if (layer.approvalPolicy != null) {
+            const rank = approvalPolicyRank(layer.approvalPolicy);
+            if (typeof rank !== "number") {
+                // Unknown approval policy: fall back to blocking.
+                approvalPolicy = "block";
+            }
+            else if (rank < approvalPolicyRank(approvalPolicy)) {
+                approvalPolicy = layer.approvalPolicy;
+            }
+        }
+    }
+    return {
+        lattice,
+        sandboxMode,
+        approvalPolicy,
+        sources: activeSources,
+    };
+}
+// ---------------------------------------------------------------------------
+// Ranking helpers for sandbox mode and approval policy
+// ---------------------------------------------------------------------------
+/**
+ * Rank a sandbox mode from most restrictive (0) to most permissive (3).
+ *
+ * @param mode - Sandbox mode name.
+ * @returns The rank, or `undefined` when `mode` is not one of the four known modes.
+ */
+function sandboxModeRank(mode) {
+    // Position in this list is the rank.
+    const orderedModes = ["read-only", "network-isolated", "workspace-write", "unrestricted"];
+    const position = orderedModes.indexOf(mode);
+    return position === -1 ? undefined : position;
+}
+/**
+ * Rank an approval policy from most restrictive (0) to most permissive (3).
+ *
+ * @param policy - Approval policy name.
+ * @returns The rank, or `undefined` when `policy` is not one of the four known policies.
+ */
+function approvalPolicyRank(policy) {
+    const rankByPolicy = new Map([
+        ["block", 0],
+        ["interactive", 1],
+        ["auto", 2],
+        ["yolo", 3],
+    ]);
+    return rankByPolicy.get(policy);
+}
+// ---------------------------------------------------------------------------
+// Derive blocked / approval-required capabilities from combined policy
+// ---------------------------------------------------------------------------
+/**
+ * Derive the enforcement proof (blocked and approval-gated capabilities)
+ * from a combined policy.
+ *
+ * Rules, applied in this order:
+ * 1. read-only sandbox blocks write, shell, network, merge, publish and
+ *    secret_write.
+ * 2. network-isolated sandbox blocks network.
+ * 3. Any capability with level "none" is blocked.
+ * 4. Any capability with level "advisory" requires approval.
+ * 5. block policy blocks everything except read.
+ * 6. interactive policy requires approval for every non-read capability
+ *    that is not already blocked.
+ * 7. yolo policy clears all approval requirements but keeps blocks.
+ * 8. auto policy leaves the advisory-derived approval requirements as is.
+ *
+ * A capability never appears in both output lists: blocked wins.
+ *
+ * @param combined - `{ lattice, sandboxMode, approvalPolicy, sources }`.
+ * @returns `{ sandboxMode, enforcedBy, blockedCapabilities, approvalRequired,
+ *   policyHash }`; both capability lists follow ALL_CAPABILITIES order.
+ */
+export function computeEnforcementProof(combined) {
+    const blocked = new Set();
+    const approvalRequired = new Set();
+    const { lattice, sandboxMode, approvalPolicy, sources } = combined;
+    // Sandbox hard floors
+    if (sandboxMode === "read-only") {
+        for (const cap of ["write", "shell", "network", "merge", "publish", "secret_write"]) {
+            blocked.add(cap);
+        }
+    }
+    if (sandboxMode === "network-isolated") {
+        blocked.add("network");
+    }
+    // Per-capability levels
+    for (const cap of ALL_CAPABILITIES) {
+        const level = lattice[cap];
+        if (level === "none") {
+            blocked.add(cap);
+        }
+        else if (level === "advisory") {
+            approvalRequired.add(cap);
+        }
+    }
+    // Approval policy overrides
+    if (approvalPolicy === "block") {
+        for (const cap of ALL_CAPABILITIES) {
+            if (cap !== "read")
+                blocked.add(cap);
+        }
+    }
+    else if (approvalPolicy === "interactive") {
+        for (const cap of ALL_CAPABILITIES) {
+            if (cap !== "read" && !blocked.has(cap)) {
+                approvalRequired.add(cap);
+            }
+        }
+    }
+    else if (approvalPolicy === "yolo") {
+        // yolo removes approval requirements (but keeps blocks)
+        for (const cap of ALL_CAPABILITIES) {
+            approvalRequired.delete(cap);
+        }
+    }
+    // auto: advisory-level capabilities still need approval; full = allow
+    // (approvalRequired already contains advisory-level caps)
+    // Filtering ALL_CAPABILITIES (rather than spreading the sets) fixes the output order.
+    const blockedCapabilities = ALL_CAPABILITIES.filter((c) => blocked.has(c));
+    // A blocked capability is dropped from the approval list: blocked wins.
+    const approvalRequiredCapabilities = ALL_CAPABILITIES.filter((c) => approvalRequired.has(c) && !blocked.has(c));
+    return {
+        sandboxMode,
+        enforcedBy: [...sources],
+        blockedCapabilities,
+        approvalRequired: approvalRequiredCapabilities,
+        policyHash: hashCombinedPolicy(combined),
+    };
+}
+// ---------------------------------------------------------------------------
+// Policy hash (deterministic, no secrets)
+// ---------------------------------------------------------------------------
+/**
+ * Fingerprint a combined policy.
+ *
+ * Serialises `lattice`, `sandboxMode`, `approvalPolicy` and `sources` (in that
+ * order, ignoring any other property) as JSON and hashes the text with SHA-256.
+ *
+ * @param combined - Combined policy object.
+ * @returns The first 16 hex characters of the digest.
+ */
+function hashCombinedPolicy(combined) {
+    const { lattice, sandboxMode, approvalPolicy, sources } = combined;
+    const serialized = JSON.stringify({ lattice, sandboxMode, approvalPolicy, sources });
+    const hasher = createHash("sha256");
+    hasher.update(serialized);
+    return hasher.digest("hex").slice(0, 16);
+}
+// ---------------------------------------------------------------------------
+// Adapter enforcement check
+// ---------------------------------------------------------------------------
+/**
+ * Structural check of an enforcement proof supplied by a runtime/adapter.
+ *
+ * The proof is accepted when it is a non-null object with a non-empty string
+ * `policyHash`, a non-empty `enforcedBy` array, array-valued
+ * `blockedCapabilities` and `approvalRequired`, and a known `sandboxMode`.
+ * Array contents are not inspected.
+ *
+ * @param proof - Candidate value of unknown shape.
+ * @returns `true` when every structural check passes, otherwise `false`.
+ */
+export function hasValidEnforcementProof(proof) {
+    if (proof === null || typeof proof !== "object") {
+        return false;
+    }
+    const { policyHash, enforcedBy, blockedCapabilities, approvalRequired, sandboxMode } = proof;
+    const hasHash = typeof policyHash === "string" && policyHash.length > 0;
+    const hasEnforcers = Array.isArray(enforcedBy) && enforcedBy.length > 0;
+    const hasCapabilityLists = Array.isArray(blockedCapabilities) && Array.isArray(approvalRequired);
+    return hasHash && hasEnforcers && hasCapabilityLists && isSandboxMode(sandboxMode);
+}
+/** Return `true` when `v` is exactly one of the four known sandbox mode names. */
+function isSandboxMode(v) {
+    const knownModes = ["read-only", "workspace-write", "network-isolated", "unrestricted"];
+    return knownModes.includes(v);
+}
+// ---------------------------------------------------------------------------
+// Convenience: build a PolicyLayer from legacy authority levels
+// ---------------------------------------------------------------------------
+/**
+ * Build a policy layer from the legacy write/shell authority pair.
+ *
+ * - A truthy `writeAuthority` sets the write, merge and publish levels.
+ * - A truthy `shellAuthority` sets the shell level and lowers merge and
+ *   publish to the more restrictive of their current level and the shell level.
+ * - `sandboxMode` and `approvalPolicy` are copied through unchanged.
+ *
+ * @param source - Identifier stored as the layer's `source`.
+ * @param options - `{ writeAuthority, shellAuthority, sandboxMode, approvalPolicy }`.
+ * @returns `{ source, lattice, sandboxMode, approvalPolicy }`.
+ */
+export function policyLayerFromLegacyAuthorities(source, options) {
+    const { writeAuthority, shellAuthority, sandboxMode, approvalPolicy } = options;
+    const lattice = {};
+    if (writeAuthority) {
+        for (const cap of ["write", "merge", "publish"]) {
+            lattice[cap] = writeAuthority;
+        }
+    }
+    if (shellAuthority) {
+        lattice.shell = shellAuthority;
+        // merge and publish are capped by both the write and the shell authority
+        for (const cap of ["merge", "publish"]) {
+            lattice[cap] = minLevel(lattice[cap], shellAuthority);
+        }
+    }
+    return { source, lattice, sandboxMode, approvalPolicy };
+}
+/**
+ * Pick the more restrictive of two authority levels.
+ *
+ * @param a - First level; `undefined` means "not set", in which case `b` is returned.
+ * @param b - Second level.
+ * @returns `a` when it is set and ranks no higher than `b`, otherwise `b`.
+ */
+function minLevel(a, b) {
+    if (a !== undefined && AUTHORITY_RANK[a] <= AUTHORITY_RANK[b]) {
+        return a;
+    }
+    return b;
+}
+/**
+ * Translate a lattice capability into the coarser tool-operation class.
+ *
+ * "publish" folds into "write", and "secret_read" / "secret_write" fold into
+ * "secret"; every other capability maps to the operation of the same name.
+ *
+ * @param cap - Capability name.
+ * @returns The operation class, or `undefined` for an unknown capability.
+ */
+export function capabilityToToolOp(cap) {
+    const opByCapability = new Map([
+        ["read", "read"],
+        ["write", "write"],
+        ["publish", "write"],
+        ["shell", "shell"],
+        ["merge", "merge"],
+        ["network", "network"],
+        ["secret_read", "secret"],
+        ["secret_write", "secret"],
+    ]);
+    return opByCapability.get(cap);
+}

package/dist/safety/tool-authority-gate.d.ts CHANGED Viewed

@@ -60,3 +60,43 @@ export declare function mapToolNameToOp(toolName: string): ToolOp;
  * @see ToolAuthorityContext for the decision inputs and ordering rules.
  */
 export declare function decideToolAuthority(ctx: ToolAuthorityContext): ToolAuthorityDecision;
+import type { CapabilityLattice, CapabilityLevel, EnforcementProof, SandboxCapability } from "./enforcement-engine.js";
+/**
+ * Extended operation class for v2 gate.
+ * Adds "network" and "secret" ops so the lattice can express finer
+ * restrictions without weakening the existing 4-class gate.
+ */
+export type ToolOpV2 = ToolOp | "network" | "secret";
+/** Authority context enriched with enforcement proof. */
+export interface ToolAuthorityContextV2 extends ToolAuthorityContext {
+    /** v2 enforcement proof — required for authority lanes. */
+    readonly enforcementProof?: EnforcementProof;
+    /** Full capability lattice when available. */
+    readonly lattice?: Readonly<CapabilityLattice>;
+}
+/**
+ * Map a tool operation class to the single lattice capability it is checked
+ * against. This is a name mapping only; no lattice or level is consulted.
+ */
+export declare function toolOpToCapability(op: ToolOpV2): SandboxCapability;
+/**
+ * Build a ToolAuthorityContext from an enforcement proof.
+ * Bridges the v2 lattice into the legacy gate.
+ */
+export declare function buildToolAuthorityContextFromProof(op: ToolOpV2, proof: EnforcementProof, tty: boolean): ToolAuthorityContext;
+/**
+ * Decide using v2 enforcement proof when available, else fall back to legacy.
+ */
+export declare function decideToolAuthorityV2(ctx: ToolAuthorityContextV2): ToolAuthorityDecision;
+/**
+ * Look up the level recorded for a capability in an optional lattice.
+ */
+export declare function effectiveCapabilityLevel(cap: SandboxCapability, lattice?: Readonly<CapabilityLattice>): CapabilityLevel;
+/**
+ * Adapter-enforced capability resolution.
+ * Runtimes without enforcement proof cannot enter authority lanes.
+ */
+export declare function isOperationAllowedByProof(op: ToolOpV2, proof: EnforcementProof | undefined): boolean;
+/**
+ * Returns true when the operation requires explicit approval per the proof.
+ */
+export declare function isOperationApprovalRequiredByProof(op: ToolOpV2, proof: EnforcementProof | undefined): boolean;

package/dist/safety/tool-authority-gate.js CHANGED Viewed

@@ -106,3 +106,95 @@ export function decideToolAuthority(ctx) {
     // interactive: ask only when a TTY is attached; non-TTY ask = deny-by-default.
     return ctx.tty ? "ask" : "block";
 }
+/**
+ * Map a tool operation class to the single lattice capability it is checked
+ * against. This is a name mapping only; no lattice or level is consulted.
+ *
+ * "secret" maps to "secret_write"; every other known operation maps to the
+ * capability of the same name.
+ *
+ * @param op - Operation class.
+ * @returns The capability name, or `undefined` for an unknown operation.
+ */
+export function toolOpToCapability(op) {
+    if (op === "secret") {
+        return "secret_write";
+    }
+    const sameNamedOps = ["read", "write", "shell", "merge", "network"];
+    return sameNamedOps.includes(op) ? op : undefined;
+}
+/**
+ * Translate an enforcement proof into the context object consumed by the
+ * legacy gate (`decideToolAuthority`).
+ *
+ * - `approvalPolicy` is "block" when the operation's capability is blocked,
+ *   "interactive" when it requires approval, otherwise "auto".
+ * - `writeAuthority` reflects the write and publish capabilities and
+ *   `shellAuthority` the shell capability: "none" if blocked, "advisory" if
+ *   approval is required, otherwise "full".
+ * - Every sandbox mode other than "read-only" is reported as "workspace-write".
+ * - "network" and "secret" operations are presented to the legacy gate as "shell".
+ *
+ * @param op - Operation class being decided.
+ * @param proof - Enforcement proof with `blockedCapabilities`,
+ *   `approvalRequired` and `sandboxMode`.
+ * @param tty - Passed through unchanged.
+ * @returns `{ op, writeAuthority, shellAuthority, approvalPolicy, sandboxMode, tty }`.
+ */
+export function buildToolAuthorityContextFromProof(op, proof, tty) {
+    const anyBlocked = (...caps) => caps.some((c) => proof.blockedCapabilities.includes(c));
+    const anyNeedsApproval = (...caps) => caps.some((c) => proof.approvalRequired.includes(c));
+    // Both lists are consulted up front, then collapsed into one legacy level.
+    const legacyAuthority = (...caps) => {
+        const isBlocked = anyBlocked(...caps);
+        const needsApproval = anyNeedsApproval(...caps);
+        if (isBlocked) {
+            return "none";
+        }
+        return needsApproval ? "advisory" : "full";
+    };
+    const cap = toolOpToCapability(op);
+    let approvalPolicy = "auto";
+    if (anyBlocked(cap)) {
+        approvalPolicy = "block";
+    }
+    else if (anyNeedsApproval(cap)) {
+        approvalPolicy = "interactive";
+    }
+    const writeAuthority = legacyAuthority("write", "publish");
+    const shellAuthority = legacyAuthority("shell");
+    const sandboxMode = proof.sandboxMode === "read-only" ? "read-only" : "workspace-write";
+    const legacyOp = op === "network" || op === "secret" ? "shell" : op;
+    return {
+        op: legacyOp,
+        writeAuthority,
+        shellAuthority,
+        approvalPolicy,
+        sandboxMode,
+        tty,
+    };
+}
+/**
+ * Run the legacy gate, feeding it a context derived from the enforcement
+ * proof when `ctx.enforcementProof` is truthy and `ctx` itself otherwise.
+ *
+ * @param ctx - v2 authority context.
+ * @returns Whatever `decideToolAuthority` returns for the chosen context.
+ */
+export function decideToolAuthorityV2(ctx) {
+    const { enforcementProof } = ctx;
+    const effectiveCtx = enforcementProof
+        ? buildToolAuthorityContextFromProof(ctx.op, enforcementProof, ctx.tty)
+        : ctx;
+    return decideToolAuthority(effectiveCtx);
+}
+/**
+ * Look up the level recorded for a capability in an optional lattice.
+ *
+ * @param cap - Capability name.
+ * @param lattice - Optional lattice keyed by capability name.
+ * @returns The recorded level, or "full" when the lattice is missing or has
+ *   no (non-nullish) entry for `cap`.
+ */
+export function effectiveCapabilityLevel(cap, lattice) {
+    const recorded = lattice ? lattice[cap] : undefined;
+    return recorded ?? "full";
+}
+/**
+ * Report whether an operation is permitted by an enforcement proof.
+ *
+ * Fails closed: a missing proof or an operation class that does not map to
+ * a known capability is denied. Previously an unrecognised operation mapped
+ * to `undefined`, was never found in `blockedCapabilities`, and so was
+ * reported as allowed.
+ *
+ * This checks blocking only; whether approval is also needed is answered by
+ * `isOperationApprovalRequiredByProof`.
+ *
+ * @param op - Operation class.
+ * @param proof - Enforcement proof, or `undefined` when the runtime supplied none.
+ * @returns `true` only when a proof exists, the operation is recognised, and
+ *   its capability is not listed in `proof.blockedCapabilities`.
+ */
+export function isOperationAllowedByProof(op, proof) {
+    if (!proof) {
+        return false;
+    }
+    const cap = toolOpToCapability(op);
+    if (cap === undefined) {
+        // Unknown operation class: nothing in the proof can vouch for it.
+        return false;
+    }
+    return !proof.blockedCapabilities.includes(cap);
+}
+/**
+ * Report whether an operation needs explicit approval per an enforcement proof.
+ *
+ * Fails closed: a missing proof or an operation class that does not map to
+ * a known capability requires approval. Previously an unrecognised
+ * operation mapped to `undefined`, was never found in `approvalRequired`,
+ * and so was reported as needing no approval.
+ *
+ * @param op - Operation class.
+ * @param proof - Enforcement proof, or `undefined` when the runtime supplied none.
+ * @returns `true` when there is no proof, the operation is unrecognised, or
+ *   its capability is listed in `proof.approvalRequired`.
+ */
+export function isOperationApprovalRequiredByProof(op, proof) {
+    if (!proof) {
+        return true;
+    }
+    const cap = toolOpToCapability(op);
+    if (cap === undefined) {
+        // Unknown operation class: never wave it through without approval.
+        return true;
+    }
+    return proof.approvalRequired.includes(cap);
+}

package/dist/schema/evidence.schema.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ export declare const EvidenceRecordSchema: z.ZodObject<{
     observedAt: z.ZodString;
     message: z.ZodOptional<z.ZodString>;
 }, "strip", z.ZodTypeAny, {
-    status: "failed" | "skipped" | "blocked" | "missing" | "passed";
+    status: "failed" | "skipped" | "missing" | "blocked" | "passed";
     kind: "file-exists" | "custom" | "summary-present" | "command-passes" | "git-diff-non-empty" | "marker-present" | "screenshot-present";
     required: boolean;
     runId: string;
@@ -27,7 +27,7 @@ export declare const EvidenceRecordSchema: z.ZodObject<{
     path?: string | undefined;
     nodeId?: string | undefined;
 }, {
-    status: "failed" | "skipped" | "blocked" | "missing" | "passed";
+    status: "failed" | "skipped" | "missing" | "blocked" | "passed";
     kind: "file-exists" | "custom" | "summary-present" | "command-passes" | "git-diff-non-empty" | "marker-present" | "screenshot-present";
     required: boolean;
     runId: string;

package/dist/schema/proof-bundle.schema.d.ts CHANGED Viewed

@@ -114,10 +114,10 @@ export declare const ProofBundleSchema: z.ZodObject<{
     };
     providerPolicy: string;
     omkVersion: string;
+    verdict: "failed" | "partial" | "passed";
     proofId: string;
     runtimeVersion: "v1.2";
     scenario: "no-kimi-smoke" | "evidence-block" | "fallback-route" | "dag-dependent-block" | "replay-inspect" | "example-generation" | "doctor-provider" | "native-safety" | "contract-version-smoke";
-    verdict: "failed" | "partial" | "passed";
     knownLimitations: string[];
     checksums: Record<string, string>;
 }, {
@@ -141,10 +141,10 @@ export declare const ProofBundleSchema: z.ZodObject<{
     };
     providerPolicy: string;
     omkVersion: string;
+    verdict: "failed" | "partial" | "passed";
     proofId: string;
     runtimeVersion: "v1.2";
     scenario: "no-kimi-smoke" | "evidence-block" | "fallback-route" | "dag-dependent-block" | "replay-inspect" | "example-generation" | "doctor-provider" | "native-safety" | "contract-version-smoke";
-    verdict: "failed" | "partial" | "passed";
     knownLimitations: string[];
     checksums: Record<string, string>;
 }>;

package/docs/benchmark-design.md ADDED Viewed

@@ -0,0 +1,122 @@
+# OMK Control Plane Replay Benchmark Design
+## 1. Purpose
+Design a reproducible benchmark suite that measures OMK control plane
+performance across 10 representative task categories. The benchmark runs in
+**shadow mode** (recorded traces, no live LLM calls) for baseline
+reproducibility, with optional **live-evaluation mode** for regression
+testing against real providers.
+## 2. Task Categories
+| # | Category | Intent | Description |
+|---|----------|--------|-------------|
+| 1 | read-only repo Q&A | research | Agent answers questions about codebase structure |
+| 2 | small bug fix | debugging | Single-file typo / logic fix |
+| 3 | failing test repair | debugging | Update implementation to satisfy failing test |
+| 4 | multi-file refactor | refactor | Rename/move symbols across 3+ files |
+| 5 | CLI command task | shell-operation | Execute and verify CLI output |
+| 6 | dependency update | coding | Bump package version, fix breaking changes |
+| 7 | merge-conflict task | merge | Resolve git merge conflict automatically |
+| 8 | security-sensitive task | review | Patch vulnerability with audit trail |
+| 9 | provider failure fallback | debugging | Primary provider fails; fallback succeeds |
+| 10 | quota/auth failure fallback | debugging | Quota/auth error triggers provider switch |
+## 3. Metrics
+| Metric | Definition | Source |
+|--------|-----------|--------|
+| solve_rate | passed_tasks / total_tasks | harness result |
+| evidence_trust_score | ETS v2 score per task | evidence-trust-score engine |
+| false_done_rate | tasks claiming success with failing evidence / total tasks | harness+ETS |
+| fallback_success_rate | fallback attempts that succeed / total fallback attempts | router decision trace |
+| router_regret | best_available_runtime_score − selected_runtime_score | shadow-mode diff |
+| cost_per_solved_task | Σ costUsdEstimated / solved_count | attempt records |
+| p95_latency | 95th percentile of task latencyMs | attempt records |
+| rollback_rate | tasks rolled back / total tasks | decision trace |
+| sandbox_violation_count | tasks with unexpected file writes outside worktree | sandbox audit |
+**router_regret** is computed in shadow mode by scoring every candidate
+runtime for each routing decision and subtracting the selected runtime’s
+composite score from the highest composite score among the candidates.
+## 4. Reproducibility Contract
+Every benchmark run must pin:
+- **treeHash**: git commit SHA of the repo under test
+- **seed**: PRNG seed for synthetic fixture generation
+- **providerConfigHash**: hash of the runtime provider configuration
+- **omkVersion**: package version
+- **benchmarkSchemaVersion**: `omk.benchmark.v1`
+Shadow-mode runs use pre-recorded `BenchmarkTrace` fixtures (see
+`src/benchmark/fixtures.ts`). Live-evaluation mode records new traces into
+`.omk/benchmarks/<runId>/`.
+## 5. Shadow Mode
+Shadow mode runs router v1 and v2 side-by-side on identical inputs:
+1. Load a `BenchmarkTask` fixture.
+2. Run `createRuntimeRouter` (v1) and `createRouterV2ScoringEngine` (v2).
+3. Record both decisions into `ShadowModeRecord`.
+4. Compute `router_regret` for each.
+5. Diff v1/v2 selections and log disagreements.
+No LLM API calls are made. Runtime `runNode` is replaced with a stub that
+returns the recorded outcome from the fixture.
+## 6. Benchmark Harness Lifecycle
+```
+loadConfig() → discoverTasks() → for each task:
+  setupWorktree() → runTask() → evaluateEvidence() → teardown()
+→ computeSummary() → writeJsonReport()
+```
+The harness integrates with `scripts/run-tests.mjs` via:
+```bash
+node scripts/run-benchmark.mjs --shadow --summary-json .omk/benchmarks/latest.json
+```
+## 7. CI Integration
+A new `benchmark` job runs after `fast-gate` passes on merges to the `main`
+branch and on a nightly cron schedule. It:
+1. Checks out the repo at the merge commit.
+2. Runs `npm run benchmark:shadow`.
+3. Uploads `.omk/benchmarks/latest.json` as artifact.
+4. Fails if `solve_rate < 0.85` or `false_done_rate > 0.05`.
+## 8. Directory Layout
+```
+src/benchmark/
+  contracts.ts      # BenchmarkTask, BenchmarkResult, BenchmarkSummary
+  harness.ts        # runBenchmarkSuite(), runBenchmarkTask()
+  shadow-mode.ts    # ShadowModeEngine, computeRouterRegret()
+  fixtures.ts       # generateSyntheticTraces(), loadRecordedTraces()
+scripts/
+  run-benchmark.mjs # CLI entrypoint
+test/
+  benchmark-harness.test.mjs
+.omk/benchmarks/
+  sample-run.json   # example output
+```
+## 9. Extending the Benchmark
+To add a new task category:
+1. Add intent mapping in `src/benchmark/fixtures.ts`.
+2. Create a fixture under `test/benchmark-fixtures/`.
+3. Add an evaluation rule in `src/benchmark/harness.ts`.
+4. Register the category in `scripts/run-benchmark.mjs`.
+## 10. Risks & Mitigations
+| Risk | Mitigation |
+|------|-----------|
+| Fixture drift (codebase changes) | Pin treeHash; auto-regenerate fixtures in CI if drift detected |
+| Shadow mode not representative of live behavior | Weekly live-evaluation job with small sample |
+| Metrics gaming (fake evidence) | ETS v2 gaming penalty + runner-source requirement |
+| Secret leakage in recorded traces | Redact with `redactTrace()` before persistence |

package/docs/getting-started.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # Getting Started
-Source release target: `open-multi-agent-kit@0.78.1` (`pre-1.0`).
+Source release target: `open-multi-agent-kit@0.78.5` (`pre-1.0`).
 ## Prerequisites

package/docs/provider-maturity.md CHANGED Viewed

@@ -4,7 +4,7 @@ This page documents provider status for the current source tree.
 ## Current source target
-- Package version: `0.78.1`
+- Package version: `0.78.5`
 - Runtime contract family: `v1.2`
 - Release channel: `pre-1.0`

package/docs/versioning.md CHANGED Viewed

@@ -4,11 +4,11 @@ OMK uses two version fields in release artifacts:
 | Field | Current value | Source | Meaning |
 | --- | --- | --- | --- |
-| Package version | `0.78.1` | `package.json`, `package-lock.json` | npm/package source version. |
+| Package version | `0.78.5` | `package.json`, `package-lock.json` | npm/package source version. |
 | Runtime version | `v1.2` | `src/version.ts`, JSON schemas | Contract/runtime family used by OMK envelopes. |
 | Release channel | `pre-1.0` | `src/version.ts` | Pre-1.0 package channel. |
-`0.78.1` is the package source version for the `v1.2` runtime contract family.
+`0.78.5` is the package source version for the `v1.2` runtime contract family.
 Use `v1.2` only for runtime contracts; do not substitute it for the package version.
 ## Contract versions
@@ -44,6 +44,6 @@ The `version --json` command emits one `omk.contract.v1` envelope whose data pay
 ## Documentation rules
-- Use `0.78.1` when referring to the current package source version.
+- Use `0.78.5` when referring to the current package source version.
 - Use `v1.2` only for the runtime contract family.
 - Keep historical changelog entries unchanged unless the text is not clearly historical.

package/package.json CHANGED Viewed

@@ -1,9 +1,10 @@
 {
   "name": "open-multi-agent-kit",
-  "version": "0.78.2",
+  "version": "0.78.5",
   "description": "Provider-neutral multi-agent control plane for coding workflows: route agents, verify evidence, orchestrate MCP-aware DAGs, and control the loop from the omk CLI.",
   "type": "module",
   "bin": {
+    "omk": "dist/cli.js",
     "omk-project-mcp": "dist/mcp/omk-project-server.js",
     "omk-acp": "dist/mcp/acp-server.js",
     "omk-mcp-host": "dist/mcp/host.js"
@@ -32,7 +33,7 @@
     "version:check": "node scripts/check-version-consistency.mjs",
     "contract:check": "npm run build:clean && npm run schema:check && node scripts/check-json-stdout.mjs",
     "verify:contracts": "npm run contract:check",
-    "release:check": "node scripts/release-gate.mjs",
+    "release:check": "npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack && OMK_RELEASE_DEMO=1 node scripts/release-gate.mjs",
     "release:full": "npm run verify && npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack",
     "regression:matrix": "node scripts/regression-proof-matrix.mjs",
     "release:rc": "npm run verify && npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack",
@@ -47,7 +48,10 @@
     "native:no-kimi:turn": "OMK_MCP_PREFLIGHT=off OMK_PROJECT_ROOT=\"$PWD\" node --test test/no-kimi-native-turn.test.mjs",
     "no-kimi:default-surface": "node scripts/no-kimi-default-surface.mjs",
     "test:no-kimi:runtime-routing": "node --test --test-timeout=300000 test/provider-router.test.mjs test/runtime-router.test.mjs && node --test --test-timeout=300000 --test-name-pattern='authority provider write-risk routes|configured non-Kimi authority|implicit default authority|unavailable DeepSeek|external direct fallback|Super OMK config|scoreProviders includes|computeProviderRouteScore|runtime fallback' test/provider-routing.test.mjs && node --test test/no-kimi-cli-hud-surface.test.mjs test/v2-regression.test.mjs",
-    "legacy-identity:check": "node scripts/no-legacy-identity-surface.mjs"
+    "legacy-identity:check": "node scripts/no-legacy-identity-surface.mjs",
+    "benchmark:shadow": "node scripts/run-benchmark.mjs --shadow",
+    "benchmark:live": "node scripts/run-benchmark.mjs --live",
+    "benchmark:ci": "node scripts/run-benchmark.mjs --shadow --summary-json proof/benchmarks/ci-run.json"
   },
   "keywords": [
     "agent-runtime",