open-multi-agent-kit 0.78.2 → 0.78.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +27 -2
  2. package/MATURITY.md +2 -2
  3. package/README.md +4 -4
  4. package/dist/benchmark/contracts.d.ts +116 -0
  5. package/dist/benchmark/contracts.js +6 -0
  6. package/dist/benchmark/fixtures.d.ts +11 -0
  7. package/dist/benchmark/fixtures.js +124 -0
  8. package/dist/benchmark/harness.d.ts +13 -0
  9. package/dist/benchmark/harness.js +191 -0
  10. package/dist/benchmark/shadow-mode.d.ts +17 -0
  11. package/dist/benchmark/shadow-mode.js +95 -0
  12. package/dist/cli/release-promotion-gate.js +14 -4
  13. package/dist/commands/merge.js +102 -56
  14. package/dist/contracts/provider-health.d.ts +37 -0
  15. package/dist/contracts/provider-health.js +49 -1
  16. package/dist/evidence/evidence-trust-score.d.ts +101 -0
  17. package/dist/evidence/evidence-trust-score.js +408 -0
  18. package/dist/evidence/index.d.ts +2 -0
  19. package/dist/evidence/index.js +1 -0
  20. package/dist/native/linux-x64/omk-safety +0 -0
  21. package/dist/orchestration/merge-arbiter.d.ts +91 -0
  22. package/dist/orchestration/merge-arbiter.js +376 -0
  23. package/dist/providers/health.d.ts +3 -0
  24. package/dist/providers/health.js +46 -0
  25. package/dist/providers/index.d.ts +1 -0
  26. package/dist/providers/index.js +1 -0
  27. package/dist/providers/provider-health.d.ts +8 -1
  28. package/dist/providers/provider-health.js +39 -0
  29. package/dist/providers/provider-task-runner.js +31 -0
  30. package/dist/providers/provider.d.ts +2 -0
  31. package/dist/providers/router.js +80 -3
  32. package/dist/providers/types.d.ts +4 -0
  33. package/dist/runtime/contracts/weakness-remediation.d.ts +6 -0
  34. package/dist/runtime/provider-maturity-gate.d.ts +2 -0
  35. package/dist/runtime/provider-maturity-gate.js +26 -0
  36. package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
  37. package/dist/runtime/tool-dispatch-contracts.js +42 -2
  38. package/dist/runtime/weakness-remediation-index.d.ts +1 -1
  39. package/dist/runtime/weakness-remediation-index.js +1 -1
  40. package/dist/safety/enforcement-engine.d.ts +89 -0
  41. package/dist/safety/enforcement-engine.js +279 -0
  42. package/dist/safety/tool-authority-gate.d.ts +40 -0
  43. package/dist/safety/tool-authority-gate.js +92 -0
  44. package/dist/schema/evidence.schema.d.ts +2 -2
  45. package/dist/schema/proof-bundle.schema.d.ts +2 -2
  46. package/docs/benchmark-design.md +122 -0
  47. package/docs/getting-started.md +1 -1
  48. package/docs/provider-maturity.md +1 -1
  49. package/docs/versioning.md +3 -3
  50. package/package.json +7 -3
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Policy / Sandbox Enforcement Engine v2
3
+ *
4
+ * Capability lattice with conservative policy combination.
5
+ * effectivePolicy = minByAuthority(userPolicy, repoPolicy, providerPolicy, adapterPolicy, riskPolicy)
6
+ *
7
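+ * Conservative combination: for every setting the most restrictive layer wins. Design intent: any ambiguity → block; note that settings no layer mentions keep the permissive starting defaults ("full", "unrestricted", "yolo").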
8
+ */
9
+ import { createHash } from "node:crypto";
10
/**
 * Every capability the sandbox lattice can express, in canonical order.
 * Helpers in this module iterate this list, so its order is also the order
 * in which capabilities appear in their output arrays.
 */
export const ALL_CAPABILITIES = [
    // Workspace access
    "read", "write",
    // Execution and connectivity
    "shell", "network",
    // Secret handling
    "secret_read", "secret_write",
    // Repository-level actions
    "merge", "publish",
];
20
+ // ---------------------------------------------------------------------------
21
+ // Authority ranking (higher = more permissive)
22
+ // ---------------------------------------------------------------------------
23
/**
 * Numeric rank of each authority level; a higher rank is more permissive.
 */
const AUTHORITY_RANK = {
    none: 0,
    advisory: 1,
    direct: 2,
    full: 3,
};
/**
 * Return the numeric rank of an authority level.
 *
 * Unrecognised values rank as 0 (the rank of "none") so that comparisons on
 * malformed input stay numeric and resolve to the most restrictive level.
 * Only own keys of the rank table count: inherited members such as
 * "constructor" or "toString" are treated as unrecognised.
 *
 * @param {string} level - Authority level name ("none" | "advisory" | "direct" | "full").
 * @returns {number} Rank in the range 0..3.
 */
export function rankOf(level) {
    return Object.hasOwn(AUTHORITY_RANK, level)
        ? AUTHORITY_RANK[level]
        : AUTHORITY_RANK.none;
}
32
+ // ---------------------------------------------------------------------------
33
+ // Defaults
34
+ // ---------------------------------------------------------------------------
35
// Starting values used before any policy layer is applied. Each one carries
// the highest rank on its scale, so a layer value can only tighten it.
const DEFAULT_CAPABILITY_LEVEL = "full";
const DEFAULT_SANDBOX_MODE = "unrestricted";
const DEFAULT_APPROVAL_POLICY = "yolo";
/**
 * Build a fresh lattice in which every capability sits at the default level.
 *
 * @returns {object} A new plain object on every call, keyed by capability name.
 */
export function defaultLattice() {
    const capabilityNames = [
        "read",
        "write",
        "shell",
        "network",
        "secret_read",
        "secret_write",
        "merge",
        "publish",
    ];
    return Object.fromEntries(capabilityNames.map((name) => [name, DEFAULT_CAPABILITY_LEVEL]));
}
50
+ // ---------------------------------------------------------------------------
51
+ // minByAuthority — conservative combination
52
+ // ---------------------------------------------------------------------------
53
/**
 * Combine policy layers by keeping, for every setting, the most restrictive
 * (lowest-ranked) value any layer expresses.
 *
 * - Capabilities, sandbox mode and approval policy each start at their
 *   default and can only be tightened by a layer.
 * - A setting a layer leaves `undefined` or `null` is "no opinion" and is
 *   skipped; a layer without a `lattice` has no per-capability opinion.
 * - A value a layer does set but that is not a recognised level / mode /
 *   policy is ambiguous, so it collapses to the most restrictive value
 *   ("none", "read-only", "block") instead of being silently ignored.
 *
 * @param {Iterable<object>} layers - Policy layers of the shape
 *   `{ source, lattice?, sandboxMode?, approvalPolicy? }`.
 * @returns {{lattice: object, sandboxMode: string, approvalPolicy: string, sources: Array}}
 *   The combined policy; `sources` lists each layer's `source` in input order.
 */
export function combinePoliciesByMinAuthority(layers) {
    const lattice = defaultLattice();
    const activeSources = [];
    let sandboxMode = DEFAULT_SANDBOX_MODE;
    let approvalPolicy = DEFAULT_APPROVAL_POLICY;
    for (const layer of layers) {
        activeSources.push(layer.source);
        const layerLattice = layer.lattice ?? {};
        for (const cap of ALL_CAPABILITIES) {
            const declared = layerLattice[cap];
            if (declared == null) {
                continue;
            }
            // Unknown level names fail closed rather than being dropped.
            const level = Object.hasOwn(AUTHORITY_RANK, declared) ? declared : "none";
            if (AUTHORITY_RANK[level] < AUTHORITY_RANK[lattice[cap]]) {
                lattice[cap] = level;
            }
        }
        if (layer.sandboxMode != null) {
            const modeRank = sandboxModeRank(layer.sandboxMode);
            if (modeRank === undefined) {
                // Unrecognised mode: fall back to the tightest sandbox.
                sandboxMode = "read-only";
            }
            else if (modeRank < sandboxModeRank(sandboxMode)) {
                sandboxMode = layer.sandboxMode;
            }
        }
        if (layer.approvalPolicy != null) {
            const policyRank = approvalPolicyRank(layer.approvalPolicy);
            if (policyRank === undefined) {
                // Unrecognised policy: fall back to the tightest policy.
                approvalPolicy = "block";
            }
            else if (policyRank < approvalPolicyRank(approvalPolicy)) {
                approvalPolicy = layer.approvalPolicy;
            }
        }
    }
    return {
        lattice,
        sandboxMode,
        approvalPolicy,
        sources: activeSources,
    };
}
96
+ // ---------------------------------------------------------------------------
97
+ // Ranking helpers for sandbox mode and approval policy
98
+ // ---------------------------------------------------------------------------
99
/**
 * Rank a sandbox mode; a lower rank is more restrictive.
 *
 * @param {string} mode - Sandbox mode name.
 * @returns {number|undefined} 0 for "read-only" up to 3 for "unrestricted",
 *   or `undefined` when the value is not one of the four known modes.
 */
function sandboxModeRank(mode) {
    const modesByRank = ["read-only", "network-isolated", "workspace-write", "unrestricted"];
    const position = modesByRank.indexOf(mode);
    return position === -1 ? undefined : position;
}
111
/**
 * Rank an approval policy; a lower rank is more restrictive.
 *
 * @param {string} policy - Approval policy name.
 * @returns {number|undefined} 0 for "block" up to 3 for "yolo", or
 *   `undefined` when the value is not one of the four known policies.
 */
function approvalPolicyRank(policy) {
    const policiesByRank = ["block", "interactive", "auto", "yolo"];
    const position = policiesByRank.indexOf(policy);
    return position === -1 ? undefined : position;
}
123
+ // ---------------------------------------------------------------------------
124
+ // Derive blocked / approval-required capabilities from combined policy
125
+ // ---------------------------------------------------------------------------
126
/**
 * Derive an enforcement proof (blocked capabilities, approval-gated
 * capabilities and a policy hash) from a combined policy.
 *
 * Evaluation order:
 *  1. Sandbox floors: "read-only" blocks write, shell, network, merge,
 *     publish and secret_write; "network-isolated" blocks network.
 *  2. Lattice levels: "none" blocks the capability, "advisory" marks it as
 *     needing approval.
 *  3. Approval policy:
 *     - "block" blocks every capability except read;
 *     - "interactive" requires approval for every non-read capability that
 *       is not already blocked;
 *     - "yolo" clears all approval requirements (blocks are kept);
 *     - any other value (e.g. "auto") leaves the advisory-derived approval
 *       requirements untouched.
 *
 * A blocked capability is never also reported as approval-required, and
 * both output lists follow the order of ALL_CAPABILITIES.
 *
 * NOTE(review): under "yolo" an "advisory" level produces no approval
 * requirement — confirm this is intended when "yolo" is only the fallback
 * value rather than an explicit choice.
 *
 * @param {object} combined - `{ lattice, sandboxMode, approvalPolicy, sources }`.
 * @returns {object} `{ sandboxMode, enforcedBy, blockedCapabilities, approvalRequired, policyHash }`.
 */
export function computeEnforcementProof(combined) {
    const { lattice, sandboxMode, approvalPolicy, sources } = combined;
    const denied = new Set();
    const gated = new Set();
    // Step 1 — sandbox hard floors.
    let sandboxFloor = [];
    if (sandboxMode === "read-only") {
        sandboxFloor = ["write", "shell", "network", "merge", "publish", "secret_write"];
    }
    else if (sandboxMode === "network-isolated") {
        sandboxFloor = ["network"];
    }
    sandboxFloor.forEach((cap) => denied.add(cap));
    // Step 2 — per-capability lattice levels.
    ALL_CAPABILITIES.forEach((cap) => {
        switch (lattice[cap]) {
            case "none":
                denied.add(cap);
                break;
            case "advisory":
                gated.add(cap);
                break;
            default:
                break;
        }
    });
    // Step 3 — approval policy overrides.
    switch (approvalPolicy) {
        case "block":
            ALL_CAPABILITIES
                .filter((cap) => cap !== "read")
                .forEach((cap) => denied.add(cap));
            break;
        case "interactive":
            ALL_CAPABILITIES
                .filter((cap) => cap !== "read" && !denied.has(cap))
                .forEach((cap) => gated.add(cap));
            break;
        case "yolo":
            // Only capabilities from ALL_CAPABILITIES are ever added, so
            // clearing the set removes exactly those entries.
            gated.clear();
            break;
        default:
            break;
    }
    return {
        sandboxMode,
        enforcedBy: [...sources],
        blockedCapabilities: ALL_CAPABILITIES.filter((cap) => denied.has(cap)),
        approvalRequired: ALL_CAPABILITIES.filter((cap) => gated.has(cap) && !denied.has(cap)),
        policyHash: hashCombinedPolicy(combined),
    };
}
192
+ // ---------------------------------------------------------------------------
193
+ // Policy hash (deterministic, no secrets)
194
+ // ---------------------------------------------------------------------------
195
/**
 * Compute a short fingerprint of a combined policy.
 *
 * The lattice is serialised with its keys in a canonical order — known
 * capabilities in ALL_CAPABILITIES order, then any other keys sorted — so two
 * policies with the same content hash identically regardless of the order in
 * which their lattice keys were inserted. A lattice whose keys are already in
 * ALL_CAPABILITIES order serialises exactly as it would without this step.
 * `sources` is hashed in the order given.
 *
 * @param {object} combined - `{ lattice, sandboxMode, approvalPolicy, sources }`.
 * @returns {string} First 16 hex characters of the SHA-256 digest of the
 *   JSON-serialised policy.
 */
function hashCombinedPolicy(combined) {
    const { lattice } = combined;
    let canonicalLattice = lattice;
    if (lattice !== null && typeof lattice === "object" && !Array.isArray(lattice)) {
        const presentKeys = Object.keys(lattice);
        const knownKeys = ALL_CAPABILITIES.filter((cap) => presentKeys.includes(cap));
        const otherKeys = presentKeys.filter((key) => !ALL_CAPABILITIES.includes(key)).sort();
        canonicalLattice = Object.fromEntries([...knownKeys, ...otherKeys].map((key) => [key, lattice[key]]));
    }
    const payload = JSON.stringify({
        lattice: canonicalLattice,
        sandboxMode: combined.sandboxMode,
        approvalPolicy: combined.approvalPolicy,
        sources: combined.sources,
    });
    return createHash("sha256").update(payload).digest("hex").slice(0, 16);
}
204
+ // ---------------------------------------------------------------------------
205
+ // Adapter enforcement check
206
+ // ---------------------------------------------------------------------------
207
/**
 * Structural check for an enforcement proof supplied by a runtime/adapter.
 *
 * A value passes when it is a non-null object with a non-empty string
 * `policyHash`, a non-empty `enforcedBy` array, array-valued
 * `blockedCapabilities` and `approvalRequired`, and a recognised
 * `sandboxMode`. The hash value and the array contents are not inspected.
 *
 * @param {unknown} proof - Candidate proof.
 * @returns {boolean} `true` when the shape is acceptable.
 */
export function hasValidEnforcementProof(proof) {
    if (proof === null || typeof proof !== "object") {
        return false;
    }
    const candidate = proof;
    const hasHash = typeof candidate.policyHash === "string" && candidate.policyHash.length > 0;
    if (!hasHash) {
        return false;
    }
    const hasEnforcers = Array.isArray(candidate.enforcedBy) && candidate.enforcedBy.length > 0;
    if (!hasEnforcers) {
        return false;
    }
    const hasCapabilityLists = Array.isArray(candidate.blockedCapabilities) && Array.isArray(candidate.approvalRequired);
    return hasCapabilityLists && isSandboxMode(candidate.sandboxMode);
}
227
/**
 * Report whether a value is one of the four recognised sandbox mode names.
 *
 * @param {unknown} v - Value to test.
 * @returns {boolean}
 */
function isSandboxMode(v) {
    const recognisedModes = ["read-only", "workspace-write", "network-isolated", "unrestricted"];
    return recognisedModes.includes(v);
}
230
+ // ---------------------------------------------------------------------------
231
+ // Convenience: build a PolicyLayer from legacy authority levels
232
+ // ---------------------------------------------------------------------------
233
/**
 * Build a policy layer from the legacy write/shell authority pair.
 *
 * - A write authority is applied to write, merge and publish.
 * - A shell authority is applied to shell, and merge/publish are lowered to
 *   it when it is the more restrictive of the two (or taken as-is when no
 *   write authority was given).
 * - `sandboxMode` and `approvalPolicy` are passed through unchanged.
 *
 * @param {string} source - Label recorded as the layer's `source`.
 * @param {object} options - `{ writeAuthority?, shellAuthority?, sandboxMode?, approvalPolicy? }`.
 * @returns {object} `{ source, lattice, sandboxMode, approvalPolicy }`.
 */
export function policyLayerFromLegacyAuthorities(source, options) {
    const { writeAuthority, shellAuthority, sandboxMode, approvalPolicy } = options;
    const lattice = {};
    if (writeAuthority) {
        for (const capability of ["write", "merge", "publish"]) {
            lattice[capability] = writeAuthority;
        }
    }
    if (shellAuthority) {
        lattice.shell = shellAuthority;
        for (const capability of ["merge", "publish"]) {
            lattice[capability] = minLevel(lattice[capability], shellAuthority);
        }
    }
    return { source, lattice, sandboxMode, approvalPolicy };
}
252
/**
 * Pick the lower-ranked (more restrictive) of two authority levels.
 *
 * @param {string|undefined} a - First level; `undefined` means "not set".
 * @param {string} b - Second level.
 * @returns {string} `a` when it is set and ranks at or below `b`, otherwise `b`.
 */
function minLevel(a, b) {
    const keepFirst = a !== undefined && AUTHORITY_RANK[a] <= AUTHORITY_RANK[b];
    if (keepFirst) {
        return a;
    }
    return b;
}
257
/**
 * Translate a lattice capability into the coarser tool-operation class used
 * by the gate. Several capabilities share one class: "publish" is reported
 * as "write", and both secret capabilities as "secret".
 *
 * @param {string} cap - Capability name.
 * @returns {string|undefined} The operation class, or `undefined` for a
 *   value that is not a known capability.
 */
export function capabilityToToolOp(cap) {
    const opByCapability = new Map([
        ["read", "read"],
        ["write", "write"],
        ["publish", "write"],
        ["shell", "shell"],
        ["merge", "merge"],
        ["network", "network"],
        ["secret_read", "secret"],
        ["secret_write", "secret"],
    ]);
    return opByCapability.get(cap);
}
@@ -60,3 +60,43 @@ export declare function mapToolNameToOp(toolName: string): ToolOp;
60
60
  * @see ToolAuthorityContext for the decision inputs and ordering rules.
61
61
  */
62
62
  export declare function decideToolAuthority(ctx: ToolAuthorityContext): ToolAuthorityDecision;
63
+ import type { CapabilityLattice, CapabilityLevel, EnforcementProof, SandboxCapability } from "./enforcement-engine.js";
64
+ /**
65
+ * Extended operation class for v2 gate.
66
+ * Adds "network" and "secret" ops so the lattice can express finer
67
+ * restrictions without weakening the existing 4-class gate.
68
+ */
69
+ export type ToolOpV2 = ToolOp | "network" | "secret";
70
+ /** Authority context enriched with enforcement proof. */
71
+ export interface ToolAuthorityContextV2 extends ToolAuthorityContext {
72
+ /** v2 enforcement proof — required for authority lanes; when omitted, decideToolAuthorityV2 falls back to the legacy decision. */
73
+ readonly enforcementProof?: EnforcementProof;
74
+ /** Full capability lattice when available. */
75
+ readonly lattice?: Readonly<CapabilityLattice>;
76
+ }
77
+ /**
78
+ * Map a tool operation class to the sandbox capability that governs it (no lattice lookup is performed).
79
+ */
80
+ export declare function toolOpToCapability(op: ToolOpV2): SandboxCapability;
81
+ /**
82
+ * Build a ToolAuthorityContext from an enforcement proof.
83
+ * Bridges the v2 lattice into the legacy gate.
84
+ */
85
+ export declare function buildToolAuthorityContextFromProof(op: ToolOpV2, proof: EnforcementProof, tty: boolean): ToolAuthorityContext;
86
+ /**
87
+ * Decide using v2 enforcement proof when available, else fall back to legacy.
88
+ */
89
+ export declare function decideToolAuthorityV2(ctx: ToolAuthorityContextV2): ToolAuthorityDecision;
90
+ /**
91
+ * Pure v2 capability check.
92
+ */
93
+ export declare function effectiveCapabilityLevel(cap: SandboxCapability, lattice?: Readonly<CapabilityLattice>): CapabilityLevel;
94
+ /**
95
+ * Adapter-enforced capability resolution.
96
+ * Runtimes without enforcement proof cannot enter authority lanes.
97
+ */
98
+ export declare function isOperationAllowedByProof(op: ToolOpV2, proof: EnforcementProof | undefined): boolean;
99
+ /**
100
+ * Returns true when the operation requires explicit approval per the proof.
101
+ */
102
+ export declare function isOperationApprovalRequiredByProof(op: ToolOpV2, proof: EnforcementProof | undefined): boolean;
@@ -106,3 +106,95 @@ export function decideToolAuthority(ctx) {
106
106
  // interactive: ask only when a TTY is attached; non-TTY ask = deny-by-default.
107
107
  return ctx.tty ? "ask" : "block";
108
108
  }
109
/**
 * Map a tool operation class to the sandbox capability that governs it.
 *
 * The four legacy classes and "network" map to the capability of the same
 * name; the coarse "secret" class maps to "secret_write". No lattice is
 * consulted here.
 *
 * @param {string} op - Operation class.
 * @returns {string|undefined} The capability name, or `undefined` for a
 *   value that is not a known operation class.
 */
export function toolOpToCapability(op) {
    const capabilityByOp = new Map([
        ["read", "read"],
        ["write", "write"],
        ["shell", "shell"],
        ["merge", "merge"],
        ["network", "network"],
        ["secret", "secret_write"],
    ]);
    return capabilityByOp.get(op);
}
128
/**
 * Translate an enforcement proof into the context object the legacy gate
 * takes, for one operation.
 *
 * - `approvalPolicy` reflects the operation's own capability: "block" when
 *   blocked, "interactive" when approval is required, otherwise "auto".
 * - `writeAuthority` is derived from the write and publish capabilities,
 *   `shellAuthority` from shell: "none" when blocked, "advisory" when
 *   approval is required, otherwise "full".
 * - `sandboxMode` stays "read-only" only for a read-only proof; every other
 *   mode is reported as "workspace-write".
 * - The "network" and "secret" classes are passed on as "shell".
 *
 * @param {string} op - Operation class being decided.
 * @param {object} proof - Enforcement proof with `blockedCapabilities`,
 *   `approvalRequired` and `sandboxMode`.
 * @param {boolean} tty - Passed through unchanged.
 * @returns {object} `{ op, writeAuthority, shellAuthority, approvalPolicy, sandboxMode, tty }`.
 */
export function buildToolAuthorityContextFromProof(op, proof, tty) {
    const isBlocked = (name) => proof.blockedCapabilities.includes(name);
    const needsApproval = (name) => proof.approvalRequired.includes(name);
    const toAuthority = (blockedFlag, approvalFlag) => {
        if (blockedFlag) {
            return "none";
        }
        return approvalFlag ? "advisory" : "full";
    };
    const cap = toolOpToCapability(op);
    let approvalPolicy = "auto";
    if (isBlocked(cap)) {
        approvalPolicy = "block";
    }
    else if (needsApproval(cap)) {
        approvalPolicy = "interactive";
    }
    // All four flags are computed up front, regardless of the branch above.
    const writeBlocked = ["write", "publish"].some(isBlocked);
    const shellBlocked = isBlocked("shell");
    const writeApproval = ["write", "publish"].some(needsApproval);
    const shellApproval = needsApproval("shell");
    const legacyOp = ["network", "secret"].includes(op) ? "shell" : op;
    return {
        op: legacyOp,
        writeAuthority: toAuthority(writeBlocked, writeApproval),
        shellAuthority: toAuthority(shellBlocked, shellApproval),
        approvalPolicy,
        sandboxMode: proof.sandboxMode === "read-only" ? "read-only" : "workspace-write",
        tty,
    };
}
164
/**
 * Decide tool authority, preferring the v2 enforcement proof.
 *
 * When the context carries an enforcement proof, a legacy context is built
 * from it and handed to the legacy gate; otherwise the context itself is
 * handed to the legacy gate unchanged.
 *
 * @param {object} ctx - Authority context, optionally with `enforcementProof`.
 * @returns {*} Whatever the legacy gate returns for the chosen context.
 */
export function decideToolAuthorityV2(ctx) {
    const { enforcementProof } = ctx;
    const gateInput = enforcementProof
        ? buildToolAuthorityContextFromProof(ctx.op, enforcementProof, ctx.tty)
        : ctx;
    return decideToolAuthority(gateInput);
}
174
/**
 * Look up a capability's level in a lattice.
 *
 * @param {string} cap - Capability name.
 * @param {object} [lattice] - Lattice to consult.
 * @returns {string} The level stored for `cap`, or "full" when no lattice is
 *   given or the lattice holds no (non-nullish) value for `cap`.
 */
export function effectiveCapabilityLevel(cap, lattice) {
    const declared = lattice ? lattice[cap] : undefined;
    return declared ?? "full";
}
182
/**
 * Report whether an enforcement proof permits an operation.
 *
 * The check is deny-by-default. It returns `false` when there is no proof,
 * when the proof has no `blockedCapabilities` array, when the operation does
 * not map to any capability, or when the mapped capability is listed as
 * blocked.
 *
 * @param {string} op - Operation class.
 * @param {object|undefined} proof - Enforcement proof, if any.
 * @returns {boolean} `true` only when the operation maps to a capability
 *   that the proof does not block.
 */
export function isOperationAllowedByProof(op, proof) {
    if (!proof || !Array.isArray(proof.blockedCapabilities)) {
        return false;
    }
    const cap = toolOpToCapability(op);
    // An operation with no capability mapping cannot be matched against the
    // block list, so it must not be treated as allowed.
    if (cap === undefined) {
        return false;
    }
    return !proof.blockedCapabilities.includes(cap);
}
192
/**
 * Report whether an operation needs explicit approval according to a proof.
 *
 * The check errs on the side of asking. It returns `true` when there is no
 * proof, when the proof has no `approvalRequired` array, when the operation
 * does not map to any capability, or when the mapped capability is listed as
 * approval-required.
 *
 * @param {string} op - Operation class.
 * @param {object|undefined} proof - Enforcement proof, if any.
 * @returns {boolean} `false` only when the operation maps to a capability
 *   that the proof does not list as approval-required.
 */
export function isOperationApprovalRequiredByProof(op, proof) {
    if (!proof || !Array.isArray(proof.approvalRequired)) {
        return true;
    }
    const cap = toolOpToCapability(op);
    // An operation with no capability mapping cannot be matched against the
    // approval list, so approval is demanded rather than waived.
    if (cap === undefined) {
        return true;
    }
    return proof.approvalRequired.includes(cap);
}
@@ -14,7 +14,7 @@ export declare const EvidenceRecordSchema: z.ZodObject<{
14
14
  observedAt: z.ZodString;
15
15
  message: z.ZodOptional<z.ZodString>;
16
16
  }, "strip", z.ZodTypeAny, {
17
- status: "failed" | "skipped" | "blocked" | "missing" | "passed";
17
+ status: "failed" | "skipped" | "missing" | "blocked" | "passed";
18
18
  kind: "file-exists" | "custom" | "summary-present" | "command-passes" | "git-diff-non-empty" | "marker-present" | "screenshot-present";
19
19
  required: boolean;
20
20
  runId: string;
@@ -27,7 +27,7 @@ export declare const EvidenceRecordSchema: z.ZodObject<{
27
27
  path?: string | undefined;
28
28
  nodeId?: string | undefined;
29
29
  }, {
30
- status: "failed" | "skipped" | "blocked" | "missing" | "passed";
30
+ status: "failed" | "skipped" | "missing" | "blocked" | "passed";
31
31
  kind: "file-exists" | "custom" | "summary-present" | "command-passes" | "git-diff-non-empty" | "marker-present" | "screenshot-present";
32
32
  required: boolean;
33
33
  runId: string;
@@ -114,10 +114,10 @@ export declare const ProofBundleSchema: z.ZodObject<{
114
114
  };
115
115
  providerPolicy: string;
116
116
  omkVersion: string;
117
+ verdict: "failed" | "partial" | "passed";
117
118
  proofId: string;
118
119
  runtimeVersion: "v1.2";
119
120
  scenario: "no-kimi-smoke" | "evidence-block" | "fallback-route" | "dag-dependent-block" | "replay-inspect" | "example-generation" | "doctor-provider" | "native-safety" | "contract-version-smoke";
120
- verdict: "failed" | "partial" | "passed";
121
121
  knownLimitations: string[];
122
122
  checksums: Record<string, string>;
123
123
  }, {
@@ -141,10 +141,10 @@ export declare const ProofBundleSchema: z.ZodObject<{
141
141
  };
142
142
  providerPolicy: string;
143
143
  omkVersion: string;
144
+ verdict: "failed" | "partial" | "passed";
144
145
  proofId: string;
145
146
  runtimeVersion: "v1.2";
146
147
  scenario: "no-kimi-smoke" | "evidence-block" | "fallback-route" | "dag-dependent-block" | "replay-inspect" | "example-generation" | "doctor-provider" | "native-safety" | "contract-version-smoke";
147
- verdict: "failed" | "partial" | "passed";
148
148
  knownLimitations: string[];
149
149
  checksums: Record<string, string>;
150
150
  }>;
@@ -0,0 +1,122 @@
1
+ # OMK Control Plane Replay Benchmark Design
2
+
3
+ ## 1. Purpose
4
+
5
+ Design a reproducible benchmark suite that measures OMK control plane
6
+ performance across 10 representative task categories. The benchmark runs in
7
+ **shadow mode** (recorded traces, no live LLM calls) for baseline
8
+ reproducibility, with optional **live-evaluation mode** for regression
9
+ testing against real providers.
10
+
11
+ ## 2. Task Categories
12
+
13
+ | # | Category | Intent | Description |
14
+ |---|----------|--------|-------------|
15
+ | 1 | read-only repo Q&A | research | Agent answers questions about codebase structure |
16
+ | 2 | small bug fix | debugging | Single-file typo / logic fix |
17
+ | 3 | failing test repair | debugging | Update implementation to satisfy failing test |
18
+ | 4 | multi-file refactor | refactor | Rename/move symbols across 3+ files |
19
+ | 5 | CLI command task | shell-operation | Execute and verify CLI output |
20
+ | 6 | dependency update | coding | Bump package version, fix breaking changes |
21
+ | 7 | merge-conflict task | merge | Resolve git merge conflict automatically |
22
+ | 8 | security-sensitive task | review | Patch vulnerability with audit trail |
23
+ | 9 | provider failure fallback | debugging | Primary provider fails; fallback succeeds |
24
+ | 10 | quota/auth failure fallback | debugging | Quota/auth error triggers provider switch |
25
+
26
+ ## 3. Metrics
27
+
28
+ | Metric | Definition | Source |
29
+ |--------|-----------|--------|
30
+ | solve_rate | passed_tasks / total_tasks | harness result |
31
+ | evidence_trust_score | ETS v2 score per task | evidence-trust-score engine |
32
+ | false_done_rate | tasks claiming success with failing evidence / total_tasks | harness + ETS |
33
+ | fallback_success_rate | fallback attempts that succeed / total fallback attempts | router decision trace |
34
+ | router_regret | best_available_runtime_score − selected_runtime_score | shadow-mode diff |
35
+ | cost_per_solved_task | Σ costUsdEstimated / solved_count | attempt records |
36
+ | p95_latency | 95th percentile of task latencyMs | attempt records |
37
+ | rollback_rate | tasks rolled back / total tasks | decision trace |
38
+ | sandbox_violation_count | tasks with unexpected file writes outside worktree | sandbox audit |
39
+
40
+ **router_regret** is computed in shadow mode by scoring all candidates for
41
+ every decision and comparing the selected runtime’s composite against the
42
+ maximum composite.
43
+
44
+ ## 4. Reproducibility Contract
45
+
46
+ Every benchmark run must pin:
47
+ - **treeHash**: git commit SHA of the repo under test
48
+ - **seed**: PRNG seed for synthetic fixture generation
49
+ - **providerConfigHash**: hash of the runtime provider configuration
50
+ - **omkVersion**: package version
51
+ - **benchmarkSchemaVersion**: `omk.benchmark.v1`
52
+
53
+ Shadow-mode runs use pre-recorded `BenchmarkTrace` fixtures (see
54
+ `src/benchmark/fixtures.ts`). Live-evaluation mode records new traces into
55
+ `.omk/benchmarks/<runId>/`.
56
+
57
+ ## 5. Shadow Mode
58
+
59
+ Shadow mode runs router v1 and v2 side-by-side on identical inputs:
60
+ 1. Load a `BenchmarkTask` fixture.
61
+ 2. Run `createRuntimeRouter` (v1) and `createRouterV2ScoringEngine` (v2).
62
+ 3. Record both decisions into `ShadowModeRecord`.
63
+ 4. Compute `router_regret` for each.
64
+ 5. Diff v1/v2 selections and log disagreements.
65
+
66
+ No LLM API calls are made. Runtime `runNode` is replaced with a stub that
67
+ returns the recorded outcome from the fixture.
68
+
69
+ ## 6. Benchmark Harness Lifecycle
70
+
71
+ ```
72
+ loadConfig() → discoverTasks() → for each task:
73
+ setupWorktree() → runTask() → evaluateEvidence() → teardown()
74
+ → computeSummary() → writeJsonReport()
75
+ ```
76
+
77
+ The harness integrates with `scripts/run-tests.mjs` via:
78
+ ```bash
79
+ node scripts/run-benchmark.mjs --shadow --summary-json .omk/benchmarks/latest.json
80
+ ```
81
+
82
+ ## 7. CI Integration
83
+
84
+ A new `benchmark` job runs after `fast-gate` passes, both on merges to the
85
+ `main` branch and on a nightly cron schedule. It:
86
+ 1. Checks out the repo at the merge commit.
87
+ 2. Runs `npm run benchmark:shadow`.
88
+ 3. Uploads `.omk/benchmarks/latest.json` as artifact.
89
+ 4. Fails if `solve_rate < 0.85` or `false_done_rate > 0.05`.
90
+
91
+ ## 8. Directory Layout
92
+
93
+ ```
94
+ src/benchmark/
95
+ contracts.ts # BenchmarkTask, BenchmarkResult, BenchmarkSummary
96
+ harness.ts # runBenchmarkSuite(), runBenchmarkTask()
97
+ shadow-mode.ts # ShadowModeEngine, computeRouterRegret()
98
+ fixtures.ts # generateSyntheticTraces(), loadRecordedTraces()
99
+ scripts/
100
+ run-benchmark.mjs # CLI entrypoint
101
+ test/
102
+ benchmark-harness.test.mjs
103
+ .omk/benchmarks/
104
+ sample-run.json # example output
105
+ ```
106
+
107
+ ## 9. Extending the Benchmark
108
+
109
+ To add a new task category:
110
+ 1. Add intent mapping in `src/benchmark/fixtures.ts`.
111
+ 2. Create a fixture under `test/benchmark-fixtures/`.
112
+ 3. Add an evaluation rule in `src/benchmark/harness.ts`.
113
+ 4. Register the category in `scripts/run-benchmark.mjs`.
114
+
115
+ ## 10. Risks & Mitigations
116
+
117
+ | Risk | Mitigation |
118
+ |------|-----------|
119
+ | Fixture drift (codebase changes) | Pin treeHash; auto-regenerate fixtures in CI if drift detected |
120
+ | Shadow mode not representative of live behavior | Weekly live-evaluation job with small sample |
121
+ | Metrics gaming (fake evidence) | ETS v2 gaming penalty + runner-source requirement |
122
+ | Secret leakage in recorded traces | Redact with `redactTrace()` before persistence |
@@ -1,6 +1,6 @@
1
1
  # Getting Started
2
2
 
3
- Source release target: `open-multi-agent-kit@0.78.1` (`pre-1.0`).
3
+ Source release target: `open-multi-agent-kit@0.78.5` (`pre-1.0`).
4
4
 
5
5
  ## Prerequisites
6
6
 
@@ -4,7 +4,7 @@ This page documents provider status for the current source tree.
4
4
 
5
5
  ## Current source target
6
6
 
7
- - Package version: `0.78.1`
7
+ - Package version: `0.78.5`
8
8
  - Runtime contract family: `v1.2`
9
9
  - Release channel: `pre-1.0`
10
10
 
@@ -4,11 +4,11 @@ OMK uses two version fields in release artifacts:
4
4
 
5
5
  | Field | Current value | Source | Meaning |
6
6
  | --- | --- | --- | --- |
7
- | Package version | `0.78.1` | `package.json`, `package-lock.json` | npm/package source version. |
7
+ | Package version | `0.78.5` | `package.json`, `package-lock.json` | npm/package source version. |
8
8
  | Runtime version | `v1.2` | `src/version.ts`, JSON schemas | Contract/runtime family used by OMK envelopes. |
9
9
  | Release channel | `pre-1.0` | `src/version.ts` | Pre-1.0 package channel. |
10
10
 
11
- `0.78.1` is the package source version for the `v1.2` runtime contract family.
11
+ `0.78.5` is the package source version for the `v1.2` runtime contract family.
12
12
  Use `v1.2` only for runtime contracts; do not substitute it for the package version.
13
13
 
14
14
  ## Contract versions
@@ -44,6 +44,6 @@ The `version --json` command emits one `omk.contract.v1` envelope whose data pay
44
44
 
45
45
  ## Documentation rules
46
46
 
47
- - Use `0.78.1` when referring to the current package source version.
47
+ - Use `0.78.5` when referring to the current package source version.
48
48
  - Use `v1.2` only for the runtime contract family.
49
49
  - Keep historical changelog entries unchanged unless the text is not clearly historical.
package/package.json CHANGED
@@ -1,9 +1,10 @@
1
1
  {
2
2
  "name": "open-multi-agent-kit",
3
- "version": "0.78.2",
3
+ "version": "0.78.5",
4
4
  "description": "Provider-neutral multi-agent control plane for coding workflows: route agents, verify evidence, orchestrate MCP-aware DAGs, and control the loop from the omk CLI.",
5
5
  "type": "module",
6
6
  "bin": {
7
+ "omk": "dist/cli.js",
7
8
  "omk-project-mcp": "dist/mcp/omk-project-server.js",
8
9
  "omk-acp": "dist/mcp/acp-server.js",
9
10
  "omk-mcp-host": "dist/mcp/host.js"
@@ -32,7 +33,7 @@
32
33
  "version:check": "node scripts/check-version-consistency.mjs",
33
34
  "contract:check": "npm run build:clean && npm run schema:check && node scripts/check-json-stdout.mjs",
34
35
  "verify:contracts": "npm run contract:check",
35
- "release:check": "node scripts/release-gate.mjs",
36
+ "release:check": "npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack && OMK_RELEASE_DEMO=1 node scripts/release-gate.mjs",
36
37
  "release:full": "npm run verify && npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack",
37
38
  "regression:matrix": "node scripts/regression-proof-matrix.mjs",
38
39
  "release:rc": "npm run verify && npm run verify:no-kimi && npm run contract:check && npm run schema:check && npm run version:check && npm run proof:check && npm run smoke:execution && npm run native:build && npm run pack:dry && npm run audit:package && npm run smoke:pack",
@@ -47,7 +48,10 @@
47
48
  "native:no-kimi:turn": "OMK_MCP_PREFLIGHT=off OMK_PROJECT_ROOT=\"$PWD\" node --test test/no-kimi-native-turn.test.mjs",
48
49
  "no-kimi:default-surface": "node scripts/no-kimi-default-surface.mjs",
49
50
  "test:no-kimi:runtime-routing": "node --test --test-timeout=300000 test/provider-router.test.mjs test/runtime-router.test.mjs && node --test --test-timeout=300000 --test-name-pattern='authority provider write-risk routes|configured non-Kimi authority|implicit default authority|unavailable DeepSeek|external direct fallback|Super OMK config|scoreProviders includes|computeProviderRouteScore|runtime fallback' test/provider-routing.test.mjs && node --test test/no-kimi-cli-hud-surface.test.mjs test/v2-regression.test.mjs",
50
- "legacy-identity:check": "node scripts/no-legacy-identity-surface.mjs"
51
+ "legacy-identity:check": "node scripts/no-legacy-identity-surface.mjs",
52
+ "benchmark:shadow": "node scripts/run-benchmark.mjs --shadow",
53
+ "benchmark:live": "node scripts/run-benchmark.mjs --live",
54
+ "benchmark:ci": "node scripts/run-benchmark.mjs --shadow --summary-json proof/benchmarks/ci-run.json"
51
55
  },
52
56
  "keywords": [
53
57
  "agent-runtime",