open-multi-agent-kit 0.78.1 → 0.78.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/CHANGELOG.md +37 -0
  2. package/MATURITY.md +4 -0
  3. package/README.md +70 -1
  4. package/dist/benchmark/contracts.d.ts +116 -0
  5. package/dist/benchmark/contracts.js +6 -0
  6. package/dist/benchmark/fixtures.d.ts +11 -0
  7. package/dist/benchmark/fixtures.js +121 -0
  8. package/dist/benchmark/harness.d.ts +13 -0
  9. package/dist/benchmark/harness.js +191 -0
  10. package/dist/benchmark/shadow-mode.d.ts +17 -0
  11. package/dist/benchmark/shadow-mode.js +96 -0
  12. package/dist/cli/register-spec-agent-goal-commands.js +45 -0
  13. package/dist/cli/release-promotion-gate.d.ts +14 -0
  14. package/dist/cli/release-promotion-gate.js +71 -0
  15. package/dist/cli/v2/release-commands.d.ts +29 -0
  16. package/dist/cli/v2/release-commands.js +95 -0
  17. package/dist/commands/chat/native-root-loop.js +14 -1
  18. package/dist/commands/chat/slash/commands/session.js +19 -1
  19. package/dist/commands/goal-interview.d.ts +18 -0
  20. package/dist/commands/goal-interview.js +396 -0
  21. package/dist/commands/merge.js +102 -56
  22. package/dist/contracts/interview.d.ts +106 -0
  23. package/dist/contracts/interview.js +9 -0
  24. package/dist/contracts/provider-health.d.ts +37 -0
  25. package/dist/contracts/provider-health.js +49 -1
  26. package/dist/evidence/evidence-trust-score.d.ts +101 -0
  27. package/dist/evidence/evidence-trust-score.js +408 -0
  28. package/dist/evidence/index.d.ts +6 -0
  29. package/dist/evidence/index.js +3 -0
  30. package/dist/evidence/proof-trust-cli.d.ts +8 -0
  31. package/dist/evidence/proof-trust-cli.js +27 -0
  32. package/dist/evidence/proof-trust.d.ts +14 -0
  33. package/dist/evidence/proof-trust.js +381 -0
  34. package/dist/evidence/regression-proof-matrix.d.ts +42 -0
  35. package/dist/evidence/regression-proof-matrix.js +72 -0
  36. package/dist/goal/intent-frame.d.ts +6 -0
  37. package/dist/goal/intent-frame.js +21 -9
  38. package/dist/goal/interview-assimilation.d.ts +13 -0
  39. package/dist/goal/interview-assimilation.js +383 -0
  40. package/dist/goal/interview-question-bank.d.ts +11 -0
  41. package/dist/goal/interview-question-bank.js +225 -0
  42. package/dist/goal/interview-scoring.d.ts +31 -0
  43. package/dist/goal/interview-scoring.js +187 -0
  44. package/dist/goal/interview-session.d.ts +25 -0
  45. package/dist/goal/interview-session.js +116 -0
  46. package/dist/input/input-envelope.d.ts +22 -0
  47. package/dist/input/input-envelope.js +1 -0
  48. package/dist/orchestration/merge-arbiter.d.ts +91 -0
  49. package/dist/orchestration/merge-arbiter.js +376 -0
  50. package/dist/providers/health.d.ts +3 -0
  51. package/dist/providers/health.js +46 -0
  52. package/dist/providers/index.d.ts +1 -0
  53. package/dist/providers/index.js +1 -0
  54. package/dist/providers/provider-health.d.ts +8 -1
  55. package/dist/providers/provider-health.js +39 -0
  56. package/dist/providers/provider-task-runner.js +31 -0
  57. package/dist/providers/provider.d.ts +2 -0
  58. package/dist/providers/router.js +87 -3
  59. package/dist/providers/types.d.ts +4 -0
  60. package/dist/runtime/advanced-control-loop.d.ts +60 -0
  61. package/dist/runtime/advanced-control-loop.js +136 -0
  62. package/dist/runtime/agent-runtime.d.ts +10 -0
  63. package/dist/runtime/blast-radius.d.ts +10 -0
  64. package/dist/runtime/blast-radius.js +14 -0
  65. package/dist/runtime/contracts/evidence.d.ts +87 -0
  66. package/dist/runtime/contracts/evidence.js +7 -0
  67. package/dist/runtime/contracts/router-v2.d.ts +44 -0
  68. package/dist/runtime/contracts/router-v2.js +4 -0
  69. package/dist/runtime/contracts/weakness-remediation.d.ts +67 -0
  70. package/dist/runtime/contracts/weakness-remediation.js +36 -0
  71. package/dist/runtime/kimi-api-runtime.js +59 -1
  72. package/dist/runtime/proof-bundle-trust.d.ts +74 -0
  73. package/dist/runtime/proof-bundle-trust.js +100 -0
  74. package/dist/runtime/provider-maturity-gate.d.ts +43 -0
  75. package/dist/runtime/provider-maturity-gate.js +129 -0
  76. package/dist/runtime/public-surface.d.ts +93 -0
  77. package/dist/runtime/public-surface.js +146 -0
  78. package/dist/runtime/router-v2-scoring.d.ts +11 -0
  79. package/dist/runtime/router-v2-scoring.js +151 -0
  80. package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
  81. package/dist/runtime/tool-dispatch-contracts.js +42 -2
  82. package/dist/runtime/weakness-remediation-index.d.ts +27 -0
  83. package/dist/runtime/weakness-remediation-index.js +37 -0
  84. package/dist/safety/enforcement-engine.d.ts +89 -0
  85. package/dist/safety/enforcement-engine.js +279 -0
  86. package/dist/safety/tool-authority-gate.d.ts +40 -0
  87. package/dist/safety/tool-authority-gate.js +92 -0
  88. package/dist/schema/evidence.schema.d.ts +2 -2
  89. package/dist/schema/proof-bundle.schema.d.ts +28 -28
  90. package/dist/util/clipboard-image.d.ts +49 -0
  91. package/dist/util/clipboard-image.js +263 -0
  92. package/docs/2026-06-09/critical-issues.md +20 -0
  93. package/docs/2026-06-09/improvements.md +14 -0
  94. package/docs/2026-06-09/init-checklist.md +25 -0
  95. package/docs/2026-06-09/plan.md +20 -0
  96. package/docs/benchmark-design.md +122 -0
  97. package/docs/github-organic-promotion.md +127 -0
  98. package/docs/native-root-runtime-algorithms.md +301 -0
  99. package/package.json +8 -4
  100. package/readmeasset/ASSET_INDEX.md +1 -0
  101. package/templates/skills/agents/omk-agent-reach-websearch/SKILL.md +55 -0
  102. package/templates/skills/kimi/omk-agent-reach-websearch/SKILL.md +55 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,42 @@
1
1
  # Changelog
2
2
 
3
+ ## v0.78.2 — Regression Proof Matrix, deep interview, clipboard image paste, and README hardening (2026-06-09)
4
+
5
+ ### Overview
6
+
7
+ This release adds the Regression Proof Matrix (Algorithm 9) as a release-defense gate, ships the deep interview and clipboard image paste features, and hardens README links to be package-safe.
8
+
9
+ ### Added
10
+
11
+ - **Regression Proof Matrix (Algorithm 9)** — release-defense gate that verifies Algorithms 1–8 are alive via tests, proof bundles, decision traces, and CLI surfaces. `scripts/regression-proof-matrix.mjs --json` evaluates coverage topology, test linkage, proof-bundle trust, and CLI reachability, returning a JSON verdict with per-algorithm coverage and reasons.
12
+ - `src/evidence/regression-proof-matrix.ts` engine with configurable coverage and proof-trust thresholds (default `TAU_EVIDENCE` = 0.75), plus `test/regression-proof-matrix.test.mjs` unit coverage.
13
+ - Proof bundle `011-regression-proof-matrix` under `proof/verified-runs/` with evidence, decisions, verify JSON, and `sha256sums.txt`.
14
+ - `omk goal interview [input]` and `omk goal refine <goal-id>` commands under the existing `goal` group, adding an evidence-driven clarification step before planning.
15
+ - Deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions (`informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05`), and computes a completeness score from assimilated answers.
16
+ - Spec-delta assimilation that folds interview answers into a structured `GoalSpec` with conflict resolution, selectable depth (`light|standard|deep`, auto-selected by ambiguity when omitted), and `--write-spec` persistence.
17
+ - `omk.interview.v1` JSON contract (`schemas/omk.interview.v1.schema.json`) plus the `omk.interview-delta.v1` spec-delta envelope.
18
+ - Per-session interview artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) under `.omk/goals/<goalId>/interviews/<sessionId>/` (or `.omk/interviews/<sessionId>/` before `--write-spec`).
19
+ - Clipboard image paste support: `/paste` slash command in chat REPL, `--image` flag on `omk goal interview`, cross-platform clipboard reader (macOS/Linux/Windows), `InputAttachment` type for multimodal image handling.
20
+ - GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
21
+
22
+ ### Changed
23
+
24
+ - README install and badge links now use package-safe `open-multi-agent-kit` example URLs instead of the unavailable `@omk/cli` scope.
25
+ - `MATURITY.md` and `docs/native-root-runtime-algorithms.md` clarify that the Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
26
+
27
+ ### Commits
28
+
29
+ ```
30
+ 1504eae chore(release): bump v0.78.2
31
+ 3874558 docs(readme): use package-safe example links
32
+ cb673e3 docs(readme): clarify regression proof matrix boundary
33
+ 278cdf4 docs(proof): clarify regression matrix release boundary
34
+ 285c68c Feat/regression proof matrix (#15)
35
+ 4701243 feat(runtime): send clipboard images as multimodal content parts
36
+ 78a31eb feat(clipboard): add image paste support for chat and goal interview
37
+ 69d65c6 feat(goal): add deep interview refinement
38
+ ```
39
+
3
40
  ## v0.78.1 — package alignment, JSON contract envelopes, and adaptive runtime algorithms (2026-06-07)
4
41
 
5
42
  ### Overview
package/MATURITY.md CHANGED
@@ -63,6 +63,10 @@ Current source version: v0.78.1 (`v1.2` runtime contract family)
63
63
  | `omk research` | Core runtime web research wrapper; depends on Kimi tool availability. |
64
64
  | `omk open-design-agent` | Local Open Design CLI bridge. |
65
65
 
66
+ ## Regression Proof Matrix Claim Boundary
67
+
68
+ Regression Proof Matrix is a release-defense gate, not a stable-release claim. Stable promotion still requires full `npm test`, live provider maturity data, and a minimal verified demo pass.
69
+
66
70
  ## Automation Contract Status
67
71
 
68
72
  | Area | Current state | Next hardening |
package/README.md CHANGED
@@ -23,6 +23,7 @@
23
23
  <p align="center">
24
24
  <a href="#install">Install</a> ·
25
25
  <a href="#quick-start">Quick start</a> ·
26
+ <a href="#who-is-this-for">Who is this for?</a> ·
26
27
  <a href="#current-runtime-algorithm">Runtime algorithm</a> ·
27
28
  <a href="docs/getting-started.md">Docs</a> ·
28
29
  <a href="readmeasset/ASSET_INDEX.md">Visual assets</a>
@@ -30,6 +31,14 @@
30
31
 
31
32
  `OMK` (`omk`) turns a coding goal into a bounded, evidence-gated agent run.
32
33
 
34
+ Use OMK when one coding agent is not enough: route Codex, OpenCode, Kimi, DeepSeek, Qwen, OpenRouter, and local runtimes through one evidence-gated control loop.
35
+
36
+ ## Who is this for?
37
+
38
+ - Developers running multiple coding agents from the terminal.
39
+ - Teams that need MCP-scoped agent execution instead of unrestricted tool access.
40
+ - Agent builders who want routing, fallback, evidence gates, telemetry, and replay.
41
+
33
42
  > Current package source target: `open-multi-agent-kit@0.78.1`.
34
43
  > Public package name: `open-multi-agent-kit` (`@omk/cli` is not the active npm package).
35
44
  > Runtime contract family: `v1.2` (contract family, not a stable npm `1.x` release).
@@ -38,7 +47,7 @@
38
47
 
39
48
  ## Quickstart (3 minutes)
40
49
 
41
- A beginner reads this, runs four commands, and succeeds.
50
+ A beginner reads this, runs four commands, and reaches an initialized OMK chat/doctor flow.
42
51
 
43
52
  ```bash
44
53
  npm i -g open-multi-agent-kit
@@ -47,6 +56,11 @@ omk doctor
47
56
  omk chat
48
57
  ```
49
58
 
59
+ ## Examples for agent tooling lists
60
+
61
+ - [Codex MCP evidence run](https://github.com/dmae97/open-multi-agent-kit/tree/main/examples/codex-mcp-evidence-run): project-scoped MCP setup plus evidence-gated DAG dry run.
62
+ - [Provider fallback](https://github.com/dmae97/open-multi-agent-kit/tree/main/examples/provider-fallback): `--provider auto` routing with parallel worker planning.
63
+
50
64
  ## Current release reality
51
65
 
52
66
  - The public npm line is `open-multi-agent-kit@0.78.x`. Published npm `latest` is `0.78.0`;
@@ -58,6 +72,8 @@ omk chat
58
72
  lanes are scoped by the provider-maturity contract.
59
73
  - Safety and evidence claims apply to the exact adapter, command, and verification gate that
60
74
  produced them.
75
+ - Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
76
+ Stable promotion still requires full tests, live provider maturity data, and a minimal verified demo pass.
61
77
 
62
78
  ## Why OMK
63
79
 
@@ -192,6 +208,59 @@ Kimi worker prompts use stdin with `--input-format text` where that adapter path
192
208
  Goal → DAG plan → parallel lanes → evidence bundle → verify gate → merge / replay / inspect
193
209
  ```
194
210
 
211
+ ## Goal lifecycle
212
+
213
+ `omk goal` turns a raw goal into a planned, evidence-gated run. The **OMK Deep Interview** is an uncertainty reducer that clarifies the goal before planning, so the DAG is compiled from a structured spec instead of a vague prompt.
214
+
215
+ Recommended flow:
216
+
217
+ ```bash
218
+ omk goal interview "<raw goal>" --depth deep --write-spec
219
+ omk goal plan <goal-id>
220
+ omk goal run <goal-id> --provider auto --approval-policy interactive
221
+ omk goal verify <goal-id>
222
+ ```
223
+
224
+ ### `omk goal interview [input]`
225
+
226
+ Runs a deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions, assimilates answers into a structured spec delta, computes a completeness score, and (with `--write-spec`) creates or updates a `GoalSpec`. Question ranking is deterministic:
227
+
228
+ ```text
229
+ score = informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05
230
+ ```
231
+
232
+ | Option | Purpose |
233
+ | -------------------------- | -------------------------------------------------------------- |
234
+ | `--goal-id <id>` | Target an existing goal. |
235
+ | `--mode <create\|refine>` | Create a new spec or refine an existing one. |
236
+ | `--depth <light\|standard\|deep>` | Interview depth; omit to auto-select by ambiguity. |
237
+ | `--max-questions <n>` | Cap the number of ranked questions. |
238
+ | `--answers <file>` | Supply answers non-interactively. |
239
+ | `--write-spec` | Persist the spec delta into a `GoalSpec`. |
240
+ | `--json` | Emit the `omk.interview.v1` JSON contract. |
241
+
242
+ ### `omk goal refine <goal-id>`
243
+
244
+ Applies the latest interview spec delta to a goal and optionally replans.
245
+
246
+ | Option | Purpose |
247
+ | ----------------------- | ------------------------------------------------ |
248
+ | `--from-interview <id>` | Source interview session (default: latest). |
249
+ | `--plan` | Replan the goal after applying the delta. |
250
+ | `--json` | Emit machine-readable output. |
251
+
252
+ Answers file format for `omk goal interview --answers answers.json`:
253
+
254
+ ```json
255
+ {
256
+ "answers": [
257
+ { "questionId": "q-success-criteria", "answer": "..." }
258
+ ]
259
+ }
260
+ ```
261
+
262
+ Session artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) are written under `.omk/goals/<goalId>/interviews/<sessionId>/`, or `.omk/interviews/<sessionId>/` before `--write-spec`.
263
+
195
264
  ## What OMK controls
196
265
 
197
266
  | Surface | What OMK does |
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Benchmark contracts — omk.benchmark.v1
3
+ *
4
+ * Reproducible evaluation surface for OMK control plane tasks.
5
+ */
6
+ import type { AttemptStatus, RuntimeId } from "../evidence/attempt-record.js";
7
+ import type { RuntimeRouterDecisionV2, RuntimeScoreV2 } from "../runtime/contracts/router-v2.js";
8
+ import type { RuntimeRouteDecision } from "../runtime/runtime-router.js";
9
/** Schema identifier used as the `schemaVersion` of a {@link BenchmarkSummary}. */
export declare const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1";
/** Closed set of categories a benchmark task can be filed under. */
export type BenchmarkTaskCategory =
    | "read-only-repo-qa"
    | "small-bug-fix"
    | "failing-test-repair"
    | "multi-file-refactor"
    | "cli-command-task"
    | "dependency-update"
    | "merge-conflict-task"
    | "security-sensitive-task"
    | "provider-failure-fallback"
    | "quota-auth-failure-fallback";
11
/**
 * One benchmark task together with the attempts recorded for it.
 */
export interface BenchmarkTask {
    /** Identifier of the task. */
    readonly taskId: string;
    /** Category the task belongs to. */
    readonly category: BenchmarkTaskCategory;
    /** Intent label of the task. */
    readonly intent: string;
    /** Human-readable description of the task. */
    readonly description: string;
    /** Tree hash recorded with the task. */
    readonly treeHash: string;
    /** Seed recorded with the task. */
    readonly seed: number;
    /** Provider configuration hash recorded with the task. */
    readonly providerConfigHash: string;
    /** OMK version recorded with the task. */
    readonly omkVersion: string;
    /** Optional worktree location. */
    readonly worktreePath?: string;
    /** Paths of the files relevant to the task. */
    readonly relevantFiles: ReadonlyArray<string>;
    /** Outcome the task is expected to end with. */
    readonly expectedOutcome: "success" | "failure" | "fallback";
    /** Attempts recorded for this task, in order. */
    readonly recordedAttempts: ReadonlyArray<BenchmarkAttemptStub>;
}
25
/**
 * Recorded (or synthetic) outcome of a single attempt at a benchmark task.
 */
export interface BenchmarkAttemptStub {
    /** Identifier of the attempt. */
    readonly attemptId: string;
    /** Runtime that executed the attempt. */
    readonly runtime: RuntimeId;
    /** Model name used for the attempt. */
    readonly model: string;
    /** Provider name used for the attempt. */
    readonly provider: string;
    /** Final status of the attempt. */
    readonly status: AttemptStatus;
    /** Latency of the attempt in milliseconds. */
    readonly latencyMs: number;
    /** Estimated number of input tokens. */
    readonly inputTokensEstimated: number;
    /** Estimated number of output tokens. */
    readonly outputTokensEstimated: number;
    /** Estimated cost in USD. */
    readonly costUsdEstimated: number;
    /** Per-gate evidence outcomes: gate name plus whether it passed. */
    readonly evidenceResults: ReadonlyArray<{
        gate: string;
        passed: boolean;
    }>;
    /** Paths of the files the attempt changed. */
    readonly changedFiles: ReadonlyArray<string>;
    /** Commands the attempt ran. */
    readonly commandsRun: ReadonlyArray<string>;
    /** Short summary of the attempt. */
    readonly summary: string;
    /** Error text, when the attempt recorded one. */
    readonly error?: string;
}
44
/**
 * Per-task result row of a benchmark run.
 */
export interface BenchmarkRunResult {
    /** Identifier of the evaluated task. */
    readonly taskId: string;
    /** Whether the task counts as solved. */
    readonly solved: boolean;
    /** Evidence trust score computed for the task. */
    readonly evidenceTrustScore: number;
    /** Whether the task was flagged as a false "done". */
    readonly falseDone: boolean;
    /** Whether a fallback attempt was used. */
    readonly fallbackUsed: boolean;
    /** Whether the fallback ended in a solved task. */
    readonly fallbackSucceeded: boolean;
    /** Router regret attributed to the task. */
    readonly routerRegret: number;
    /** Cost of the task in USD. */
    readonly costUsd: number;
    /** Latency of the task in milliseconds. */
    readonly latencyMs: number;
    /** Whether the task was rolled back. */
    readonly rolledBack: boolean;
    /** Number of sandbox violations counted for the task. */
    readonly sandboxViolations: number;
    /** Number of attempts made for the task. */
    readonly attemptCount: number;
    /** Routing decisions recorded for the task. */
    readonly decisions: ReadonlyArray<BenchmarkDecisionRecord>;
}
59
/**
 * A single routing decision captured for a benchmark task.
 */
export interface BenchmarkDecisionRecord {
    /** Router component that made the decision. */
    readonly component:
        | "runtime-router-v1"
        | "runtime-router-v2"
        | "provider-router";
    /** Runtime the component selected. */
    readonly selectedRuntime: string;
    /** Runtime considered the best available choice. */
    readonly bestAvailableRuntime: string;
    /** Regret value recorded for the decision. */
    readonly regret: number;
    /** Explanation recorded for the decision. */
    readonly reason: string;
    /** Optional v2 per-runtime scores. */
    readonly scoresV2?: ReadonlyArray<RuntimeScoreV2>;
}
67
/**
 * Aggregate report for one benchmark run.
 */
export interface BenchmarkSummary {
    /** Always the `omk.benchmark.v1` schema identifier. */
    readonly schemaVersion: typeof BENCHMARK_SCHEMA_VERSION;
    /** Identifier of the run. */
    readonly runId: string;
    /** Start time of the run. */
    readonly startedAt: string;
    /** Completion time of the run. */
    readonly completedAt: string;
    /** Duration of the run in milliseconds. */
    readonly durationMs: number;
    /** Tree hash the run is attributed to. */
    readonly treeHash: string;
    /** Seed the run is attributed to. */
    readonly seed: number;
    /** Provider configuration hash the run is attributed to. */
    readonly providerConfigHash: string;
    /** OMK version the run is attributed to. */
    readonly omkVersion: string;
    /** Mode the run was executed in. */
    readonly mode: "shadow" | "live";
    /** Number of tasks evaluated. */
    readonly totalTasks: number;
    /** Number of tasks that were solved. */
    readonly solvedCount: number;
    /** Rate of solved tasks. */
    readonly solveRate: number;
    /** Mean evidence trust score. */
    readonly evidenceTrustScoreMean: number;
    /** Rate of tasks flagged as false "done". */
    readonly falseDoneRate: number;
    /** Rate of successful fallbacks. */
    readonly fallbackSuccessRate: number;
    /** Mean router regret. */
    readonly routerRegretMean: number;
    /** Cost per solved task. */
    readonly costPerSolvedTask: number;
    /** 95th-percentile latency in milliseconds. */
    readonly p95LatencyMs: number;
    /** Rate of rolled-back tasks. */
    readonly rollbackRate: number;
    /** Total number of sandbox violations. */
    readonly sandboxViolationCount: number;
    /** Per-task result rows. */
    readonly results: ReadonlyArray<BenchmarkRunResult>;
}
91
/**
 * Side-by-side record of the v1 and v2 router decisions for one task node.
 */
export interface ShadowModeRecord {
    /** Identifier of the task. */
    readonly taskId: string;
    /** Identifier of the node. */
    readonly nodeId: string;
    /** Intent label of the record. */
    readonly intent: string;
    /** Decision of the v1 router, or `null`. */
    readonly v1Decision: RuntimeRouteDecision | null;
    /** Decision of the v2 router, or `null`. */
    readonly v2Decision: RuntimeRouterDecisionV2 | null;
    /** Regret recorded for the v1 decision. */
    readonly regretV1: number;
    /** Regret recorded for the v2 decision. */
    readonly regretV2: number;
    /** Disagreement flag — presumably set when v1 and v2 differ; confirm in shadow-mode.js. */
    readonly disagreement: boolean;
    /** Timestamp of the record. */
    readonly timestamp: string;
}
102
/**
 * Settings for a benchmark run.
 */
export interface BenchmarkConfig {
    /** Mode to run the benchmark in. */
    readonly mode: "shadow" | "live";
    /** Directory holding task files. */
    readonly tasksDir: string;
    /** Directory for benchmark output. */
    readonly outputDir: string;
    /** Identifier of the run. */
    readonly runId: string;
    /** Maximum concurrency setting. */
    readonly maxConcurrency: number;
    /** Optional pinned tree hash. */
    readonly pinTreeHash?: string;
    /** Optional pinned seed. */
    readonly pinSeed?: number;
    /** Optional pinned provider configuration hash. */
    readonly pinProviderConfigHash?: string;
    /** Optional list of task categories. */
    readonly categories?: ReadonlyArray<BenchmarkTaskCategory>;
}
113
/**
 * A set of benchmark tasks plus the version of the fixture format.
 */
export interface BenchmarkFixture {
    /** Tasks contained in the fixture. */
    readonly tasks: ReadonlyArray<BenchmarkTask>;
    /** Fixture format version. */
    readonly version: string;
}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Benchmark contracts — omk.benchmark.v1
3
+ *
4
+ * Reproducible evaluation surface for OMK control plane tasks.
5
+ */
6
/** Schema identifier of the benchmark contract family. */
export const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1";
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
3
+ *
4
+ * All synthetic traces are deterministic given a seed.
5
+ */
6
+ import type { BenchmarkFixture } from "./contracts.js";
7
/** Version string stamped on every fixture produced by this module. */
export declare const DEFAULT_FIXTURE_VERSION = "1.0.0";
/**
 * Generates synthetic benchmark tasks.
 *
 * @param countPerCategory Number of tasks to generate per category.
 * @param seed Seed for the deterministic trace generation.
 * @param omkVersion OMK version for the generated tasks.
 * @param treeHash Tree hash for the generated tasks.
 * @param providerConfigHash Provider configuration hash for the generated tasks.
 */
export declare function generateSyntheticTraces(
    countPerCategory: number,
    seed: number,
    omkVersion: string,
    treeHash: string,
    providerConfigHash: string,
): BenchmarkFixture;
/**
 * Loads recorded traces from `dir`.
 *
 * @param dir Directory to load from.
 */
export declare function loadRecordedTraces(dir: string): Promise<BenchmarkFixture>;
/**
 * Returns a hash string for the given configuration value.
 */
export declare function hashConfig(obj: unknown): string;
/**
 * Returns a tree hash string.
 */
export declare function computeTreeHash(): string;
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
3
+ *
4
+ * All synthetic traces are deterministic given a seed.
5
+ */
6
+ import { createHash } from "node:crypto";
7
+ import { readFile, readdir } from "node:fs/promises";
8
+ import { join } from "node:path";
9
/** Version string stamped on every fixture produced by this module. */
export const DEFAULT_FIXTURE_VERSION = "1.0.0";
10
/**
 * Creates a deterministic pseudo-random generator (Lehmer / Park-Miller
 * "minimal standard": multiplier 16807, modulus 2^31 - 1).
 *
 * The seed is folded into the valid state range [1, 2^31 - 2] before use.
 * Without that step a seed of 0 (or any multiple of the modulus) pins the
 * state at 0 forever, and a negative seed produces negative states; in both
 * cases the generator returns negative numbers, which makes `pick` index
 * outside the array. Integer seeds that were already inside the valid range
 * produce exactly the same sequence as before.
 *
 * @param seed Any number; the fractional part is truncated. Zero, NaN and
 *   infinite seeds fall back to the state 2^31 - 2.
 * @returns A function that yields the next value in [0, 1) on every call.
 */
function seededRandom(seed) {
    const MODULUS = 2147483647; // 2^31 - 1
    const MULTIPLIER = 16807;
    let state = Math.trunc(seed) % MODULUS;
    if (Number.isNaN(state) || state === 0) {
        // 0 is a fixed point of the recurrence; NaN covers NaN/Infinity seeds.
        state = MODULUS - 1;
    }
    else if (state < 0) {
        // JavaScript's % keeps the sign of the dividend; shift into [1, MODULUS - 1].
        state += MODULUS;
    }
    return () => {
        state = (state * MULTIPLIER) % MODULUS;
        return (state - 1) / (MODULUS - 1);
    };
}
17
/**
 * Returns one element of `arr`, chosen by scaling the next `rng()` value
 * by the array length and flooring it to an index.
 */
function pick(arr, rng) {
    const position = Math.floor(rng() * arr.length);
    return arr[position];
}
20
/** Task categories for which synthetic traces are generated, in generation order. */
const CATEGORIES = [
    "read-only-repo-qa", "small-bug-fix", "failing-test-repair", "multi-file-refactor",
    "cli-command-task", "dependency-update", "merge-conflict-task", "security-sensitive-task",
    "provider-failure-fallback", "quota-auth-failure-fallback",
];
/** Runtime ids a synthetic attempt may be attributed to. */
const RUNTIME_IDS = [
    "kimi-wire",
    "kimi-print",
    "openai-compatible",
    "deepseek",
    "local",
];
33
/**
 * Builds one synthetic attempt record for a task.
 *
 * Random draws happen in a fixed order (runtime, outcome when no override is
 * given, latency, input tokens, output tokens, then one draw per evidence
 * gate for non-successful attempts), so the result is fully determined by
 * the state of `rng`.
 *
 * @param taskId Task the attempt belongs to; also used to build the attempt id.
 * @param category Task category; selects the evidence gates.
 * @param attemptNumber Ordinal of the attempt within the task.
 * @param rng Random source yielding values in [0, 1).
 * @param outcomeOverride Optional forced outcome ("success", "failure" or
 *   "fallback"); when omitted the outcome is drawn from `rng`.
 * @returns The synthetic attempt stub.
 */
function makeAttemptStub(taskId, category, attemptNumber, rng, outcomeOverride) {
    const runtime = pick(RUNTIME_IDS, rng);
    const outcome = outcomeOverride ?? pick(["success", "success", "failure", "fallback"], rng);
    // "fallback" is recorded as a runtime failure; any non-success outcome
    // other than that is recorded as an evidence failure.
    let status;
    if (outcome === "fallback") {
        status = "runtime_failed";
    }
    else if (outcome === "success") {
        status = "success";
    }
    else {
        status = "evidence_failed";
    }
    const succeeded = status === "success";
    const latencyMs = Math.floor(500 + rng() * 8000);
    const inputTokensEstimated = Math.floor(1000 + rng() * 15000);
    const outputTokensEstimated = Math.floor(200 + rng() * 5000);
    const rawCost = inputTokensEstimated * 0.000002 + outputTokensEstimated * 0.000006;
    const costUsdEstimated = Number(rawCost.toFixed(6));
    let gates;
    switch (category) {
        case "security-sensitive-task":
            gates = ["test", "lint", "audit", "review"];
            break;
        case "cli-command-task":
            gates = ["command", "stdout-match"];
            break;
        default:
            gates = ["test", "lint", "diff"];
    }
    const evidenceResults = [];
    for (const gate of gates) {
        // Successful attempts pass every gate without consuming a random draw.
        evidenceResults.push({ gate, passed: succeeded || rng() > 0.3 });
    }
    return {
        attemptId: `${taskId}__${attemptNumber}`,
        runtime,
        model: "default",
        provider: runtime.split("-")[0],
        status,
        latencyMs,
        inputTokensEstimated,
        outputTokensEstimated,
        costUsdEstimated,
        evidenceResults,
        changedFiles: category === "read-only-repo-qa" ? [] : [`src/${taskId}.ts`],
        commandsRun: ["npm test", "npm run lint"],
        summary: `${category} attempt ${attemptNumber}`,
        error: succeeded ? undefined : "simulated failure",
    };
}
67
/**
 * Builds one synthetic benchmark task with its recorded attempts.
 *
 * The task gets its own generator seeded with `seed + index * 7919`. A task
 * whose expected outcome is "fallback" records two attempts (the second one
 * forced to succeed); every other task records a single attempt.
 *
 * NOTE(review): the generator seed does not include `category`, so tasks
 * with the same index share one random stream across all categories.
 * NOTE(review): the first attempt's outcome is drawn independently of
 * `expectedOutcome`; confirm that this is intended.
 *
 * @param index Zero-based index of the task within its category.
 * @param category Category of the task.
 * @param seed Base seed of the fixture.
 * @param omkVersion OMK version to record on the task.
 * @param treeHash Tree hash to record on the task.
 * @param providerConfigHash Provider configuration hash to record on the task.
 * @returns The synthetic task.
 */
function makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash) {
    const rng = seededRandom(seed + index * 7919);
    const paddedIndex = String(index).padStart(3, "0");
    const taskId = `bench-${category}-${paddedIndex}`;
    const expectedOutcome = pick(["success", "success", "failure", "fallback"], rng);
    const attemptCount = expectedOutcome === "fallback" ? 2 : 1;
    const recordedAttempts = Array.from({ length: attemptCount }, (_, offset) => {
        // Only the first attempt draws its own outcome.
        const forcedOutcome = offset === 0 ? undefined : "success";
        return makeAttemptStub(taskId, category, offset + 1, rng, forcedOutcome);
    });
    return {
        taskId,
        category,
        intent: category.split("-").join("_"),
        description: `Synthetic ${category} task #${index}`,
        treeHash,
        seed,
        providerConfigHash,
        omkVersion,
        relevantFiles: [`src/${taskId}.ts`],
        expectedOutcome,
        recordedAttempts,
    };
}
90
/**
 * Generates a synthetic fixture: `countPerCategory` tasks for every entry of
 * `CATEGORIES`, grouped by category in declaration order.
 *
 * @param countPerCategory Number of tasks to create per category.
 * @param seed Base seed forwarded to every task.
 * @param omkVersion OMK version recorded on each task.
 * @param treeHash Tree hash recorded on each task.
 * @param providerConfigHash Provider configuration hash recorded on each task.
 * @returns The fixture, stamped with `DEFAULT_FIXTURE_VERSION`.
 */
export function generateSyntheticTraces(countPerCategory, seed, omkVersion, treeHash, providerConfigHash) {
    const tasks = CATEGORIES.flatMap((category) => {
        const tasksForCategory = [];
        for (let index = 0; index < countPerCategory; index += 1) {
            tasksForCategory.push(makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash));
        }
        return tasksForCategory;
    });
    return { tasks, version: DEFAULT_FIXTURE_VERSION };
}
99
/**
 * Loads recorded benchmark tasks from the `.json` files directly inside `dir`.
 *
 * File names are sorted before loading so that the task order (and therefore
 * the order of the benchmark results) does not depend on the order in which
 * the file system happens to list the directory.
 *
 * NOTE(review): parsed files are used as tasks without schema validation.
 *
 * @param dir Directory to scan. A directory that cannot be listed is treated
 *   as empty (best effort), matching the previous behaviour.
 * @returns A fixture with one task per `.json` file, stamped with
 *   `DEFAULT_FIXTURE_VERSION`.
 * @throws {SyntaxError} When a file does not contain valid JSON; the message
 *   names the offending file.
 */
export async function loadRecordedTraces(dir) {
    const entries = await readdir(dir).catch(() => []);
    const files = entries
        .filter((name) => name.endsWith(".json"))
        .sort()
        .map((name) => join(dir, name));
    const tasks = [];
    for (const file of files) {
        const raw = await readFile(file, "utf-8");
        let parsed;
        try {
            parsed = JSON.parse(raw);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            throw new SyntaxError(`Invalid benchmark trace JSON in ${file}: ${reason}`);
        }
        tasks.push(parsed);
    }
    return { tasks, version: DEFAULT_FIXTURE_VERSION };
}
111
/**
 * Returns a short fingerprint of a configuration value: the first 16 hex
 * characters of the SHA-256 digest of its JSON serialization.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols, which previously made `update()` throw a
 * `TypeError`. Those inputs are now hashed as the literal text "undefined".
 * Hashes of every JSON-serializable value are unchanged.
 *
 * NOTE: the serialization keeps property insertion order, so two objects
 * with the same entries in a different order hash differently.
 *
 * @param obj Value to fingerprint.
 * @returns A 16-character lowercase hex string.
 */
export function hashConfig(obj) {
    const serialized = JSON.stringify(obj) ?? "undefined";
    return createHash("sha256")
        .update(serialized)
        .digest("hex")
        .slice(0, 16);
}
117
/**
 * Placeholder tree hash.
 *
 * Always returns the literal "unknown". A real implementation would be
 * `git rev-parse HEAD`; the benchmark harness is expected to supply the
 * actual commit hash instead.
 */
export function computeTreeHash() {
    const placeholder = "unknown";
    return placeholder;
}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
3
+ */
4
+ import type { BenchmarkConfig, BenchmarkSummary } from "./contracts.js";
5
+ import type { AgentRuntime } from "../runtime/agent-runtime.js";
6
+ import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
7
/**
 * Inputs of a benchmark suite run.
 */
export interface HarnessOptions {
    /** Benchmark configuration. */
    readonly config: BenchmarkConfig;
    /** Runtimes available to the benchmark. */
    readonly runtimes: Array<AgentRuntime>;
    /** Optional evidence history entries. */
    readonly history?: Array<EvidenceHistoryEntry>;
}
12
/**
 * Runs the benchmark suite described by `options`.
 *
 * @param options Configuration, runtimes and optional evidence history.
 * @returns The benchmark summary.
 */
export declare function runBenchmarkSuite(
    options: HarnessOptions,
): Promise<BenchmarkSummary>;
13
+ export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
3
+ */
4
+ import { mkdir, writeFile } from "node:fs/promises";
5
+ import { join } from "node:path";
6
+ import { performance } from "node:perf_hooks";
7
+ import { generateSyntheticTraces, loadRecordedTraces, hashConfig } from "./fixtures.js";
8
+ import { createShadowModeEngine } from "./shadow-mode.js";
9
+ import { createEvidenceTrustScoreV2Engine } from "../evidence/evidence-trust-score.js";
10
/**
 * Wraps a benchmark task in the context-capsule shape handed to the shadow
 * router engine.
 *
 * The task id is used as run id, node id and DAG node id; the task
 * description is used as goal, task text and node name. Every relevant file
 * becomes a synthetic ten-line slice. All remaining fields are fixed
 * benchmark defaults.
 *
 * @param task Task providing `taskId`, `description` and `relevantFiles`.
 * @returns The capsule object.
 */
function capsuleFromTask(task) {
    const { taskId, description } = task;
    const relevantFiles = [];
    for (const path of task.relevantFiles) {
        relevantFiles.push({ path, startLine: 1, endLine: 10, content: "// synthetic" });
    }
    const budget = {
        maxInputTokens: 8000,
        reservedOutputTokens: 4096,
        maxFileTokens: 4096,
        maxToolResultTokens: 2048,
        maxMemoryFacts: 10,
        compression: "lossless-ish",
    };
    const node = {
        id: taskId,
        name: description,
        role: "coder",
        dependsOn: [],
        status: "running",
        retries: 0,
        maxRetries: 1,
    };
    return {
        runId: taskId,
        nodeId: taskId,
        goal: description,
        system: "Benchmark system prompt",
        task: description,
        dependencySummaries: [],
        relevantFiles,
        graphMemory: [],
        priorAttempts: [],
        evidenceRequirements: [],
        budget,
        node,
    };
}
39
/**
 * Evidence kind for each known gate name. A Map is used (rather than a plain
 * object) so that names such as "constructor" do not resolve to inherited
 * properties.
 */
const EVIDENCE_KIND_BY_GATE = new Map([
    ["test", "test"],
    ["lint", "command"],
    ["audit", "audit"],
    ["review", "review"],
    ["command", "command"],
    ["stdout-match", "trace"],
    ["diff", "diff"],
]);
/**
 * Translates an evidence gate name into an evidence item kind.
 *
 * @param gate Gate name as recorded on an attempt.
 * @returns The matching kind, or "trace" for any gate that is not listed.
 */
function mapGateToKind(gate) {
    const kind = EVIDENCE_KIND_BY_GATE.get(gate);
    return kind === undefined ? "trace" : kind;
}
51
/**
 * Converts the gate results of one attempt into evidence items for the
 * evidence trust score engine.
 *
 * Each item is attributed to the "runner" source with a fixed confidence of
 * 0.9, is linked to the attempt's changed files, and is stamped with the
 * current wall-clock time.
 *
 * @param attempt Attempt providing `attemptId`, `evidenceResults` and
 *   `changedFiles`.
 * @returns One evidence item per gate result, in the recorded order.
 */
function attemptToEvidenceItem(attempt) {
    const items = [];
    for (const { gate, passed } of attempt.evidenceResults) {
        items.push({
            id: `${attempt.attemptId}-${gate}`,
            kind: mapGateToKind(gate),
            source: "runner",
            description: gate,
            verdict: passed ? "pass" : "fail",
            timestamp: new Date().toISOString(),
            confidence: 0.9,
            // Copy so that items never share the attempt's own array.
            linkedFilePaths: Array.from(attempt.changedFiles),
        });
    }
    return items;
}
63
/**
 * Returns the nearest-rank percentile of an ascending-sorted list: the value
 * at ordinal rank `ceil(fraction * n)`.
 *
 * @param sortedAscending Values sorted from smallest to largest.
 * @param fraction Percentile as a fraction, e.g. 0.95.
 * @returns The percentile value, or 0 for an empty list.
 */
function nearestRankPercentile(sortedAscending, fraction) {
    const count = sortedAscending.length;
    if (count === 0) {
        return 0;
    }
    const rank = Math.ceil(count * fraction);
    const index = Math.min(count - 1, Math.max(0, rank - 1));
    return sortedAscending[index];
}
/**
 * Runs the benchmark suite, aggregates the metrics and writes the summary
 * to `<outputDir>/<runId>.json`.
 *
 * In "shadow" mode two synthetic tasks per category are generated; in any
 * other mode recorded traces are loaded from `config.tasksDir`. Tasks are
 * never executed: each task is routed through the shadow router engine and
 * scored from its recorded attempts. Tasks are evaluated one after another;
 * `config.maxConcurrency` is not consulted here.
 *
 * The p95 latency uses the nearest-rank definition. The previous index
 * `floor(n * 0.95)` pointed one element too far whenever `n * 0.95` is an
 * integer, so the default 20 synthetic tasks reported the maximum latency.
 *
 * @param options Benchmark configuration, candidate runtimes and optional
 *   evidence history (defaults to an empty list).
 * @returns The summary object that was written to disk.
 */
export async function runBenchmarkSuite(options) {
    const startedAt = new Date().toISOString();
    const startedMs = performance.now();
    const { config, runtimes, history = [] } = options;
    await mkdir(config.outputDir, { recursive: true });
    // Reproducibility pins, resolved once and reused for fixture and summary.
    const omkVersion = process.env.npm_package_version ?? "0.0.0";
    const treeHash = config.pinTreeHash ?? "synthetic";
    const seed = config.pinSeed ?? 42;
    const providerConfigHash = config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id));
    // Load tasks
    const fixture = config.mode === "shadow"
        ? generateSyntheticTraces(2, seed, omkVersion, treeHash, providerConfigHash)
        : await loadRecordedTraces(config.tasksDir);
    let tasks = [...fixture.tasks];
    if (config.categories && config.categories.length > 0) {
        const allowed = new Set(config.categories);
        tasks = tasks.filter((t) => allowed.has(t.category));
    }
    const shadowEngine = createShadowModeEngine({ runtimes, history });
    const etsEngine = createEvidenceTrustScoreV2Engine();
    const results = [];
    for (const task of tasks) {
        const attempts = task.recordedAttempts;
        const capsule = capsuleFromTask(task);
        const shadowRecord = shadowEngine.evaluate(task.taskId, task.taskId, capsule);
        const decisions = shadowEngine.toBenchmarkDecision(shadowRecord);
        // Simulate execution using recorded attempts: the last one decides.
        const lastAttempt = attempts[attempts.length - 1];
        const solved = lastAttempt?.status === "success";
        const fallbackUsed = attempts.length > 1;
        const fallbackSucceeded = fallbackUsed && solved;
        const rolledBack = attempts.some((a) => a.status === "cancelled");
        // An absolute changed path that does not mention "worktree" counts as
        // one violation for the task, however many such paths there are.
        const escapedSandbox = attempts.some((a) => a.changedFiles.some((f) => f.startsWith("/") && !f.includes("worktree")));
        const sandboxViolations = escapedSandbox ? 1 : 0;
        // ETS v2 evaluation
        const securityTask = task.category.includes("security");
        const allEvidence = attempts.flatMap((a) => attemptToEvidenceItem(a));
        const etsResult = await etsEngine.evaluate({
            output: lastAttempt?.summary ?? "",
            taskType: securityTask ? "security" : "feature",
            risk: securityTask ? "critical" : "medium",
            runArtifacts: {
                items: allEvidence,
                meta: {
                    runId: task.taskId,
                    nodeId: task.taskId,
                    provider: lastAttempt?.provider ?? "unknown",
                    model: lastAttempt?.model ?? "unknown",
                    cwd: "[repo-root]",
                    treeHashBefore: task.treeHash,
                    treeHashAfter: task.treeHash,
                    commandHash: hashConfig(attempts.map((a) => a.commandsRun)),
                    timestamp: new Date().toISOString(),
                    command: attempts.map((a) => a.commandsRun.join("; ")).join(" || "),
                },
            },
            dependencyGraphFiles: task.relevantFiles,
        });
        // "False done": the evidence verdict passes although the task is unsolved.
        const falseDone = !solved && etsResult.verdict === "pass";
        const totalLatency = attempts.reduce((s, a) => s + a.latencyMs, 0);
        const totalCost = attempts.reduce((s, a) => s + a.costUsdEstimated, 0);
        results.push({
            taskId: task.taskId,
            solved,
            evidenceTrustScore: etsResult.score,
            falseDone,
            fallbackUsed,
            fallbackSucceeded,
            routerRegret: shadowRecord.regretV2,
            costUsd: totalCost,
            latencyMs: totalLatency,
            rolledBack,
            sandboxViolations,
            attemptCount: attempts.length,
            decisions,
        });
    }
    const completedAt = new Date().toISOString();
    const durationMs = Math.round(performance.now() - startedMs);
    const totalTasks = results.length;
    // Share of all tasks; 0 when nothing was evaluated.
    const perTask = (amount) => (totalTasks > 0 ? amount / totalTasks : 0);
    const solvedCount = results.filter((r) => r.solved).length;
    const solveRate = perTask(solvedCount);
    const evidenceMean = perTask(results.reduce((s, r) => s + r.evidenceTrustScore, 0));
    const falseDoneRate = perTask(results.filter((r) => r.falseDone).length);
    const fallbackAttempts = results.filter((r) => r.fallbackUsed);
    const fallbackSuccessRate = fallbackAttempts.length > 0
        ? fallbackAttempts.filter((r) => r.fallbackSucceeded).length / fallbackAttempts.length
        : 0;
    const routerRegretMean = perTask(results.reduce((s, r) => s + r.routerRegret, 0));
    // Total spend across all tasks (solved or not) divided by the solved count.
    const costPerSolved = solvedCount > 0 ? results.reduce((s, r) => s + r.costUsd, 0) / solvedCount : 0;
    const latencies = results.map((r) => r.latencyMs).sort((a, b) => a - b);
    const p95Latency = nearestRankPercentile(latencies, 0.95);
    const rollbackRate = perTask(results.filter((r) => r.rolledBack).length);
    const sandboxViolationCount = results.reduce((s, r) => s + r.sandboxViolations, 0);
    const summary = {
        schemaVersion: "omk.benchmark.v1",
        runId: config.runId,
        startedAt,
        completedAt,
        durationMs,
        treeHash,
        seed,
        providerConfigHash,
        omkVersion,
        mode: config.mode,
        totalTasks,
        solvedCount,
        solveRate,
        evidenceTrustScoreMean: evidenceMean,
        falseDoneRate,
        fallbackSuccessRate,
        routerRegretMean,
        costPerSolvedTask: costPerSolved,
        p95LatencyMs: p95Latency,
        rollbackRate,
        sandboxViolationCount,
        results,
    };
    const outPath = join(config.outputDir, `${config.runId}.json`);
    await writeFile(outPath, JSON.stringify(summary, null, 2), "utf-8");
    return summary;
}
191
+ export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Shadow Mode Engine — side-by-side router v1/v2 recording.
3
+ */
4
+ import type { AgentRuntime } from "../runtime/agent-runtime.js";
5
+ import type { ContextCapsule } from "../runtime/context-capsule.js";
6
+ import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
7
+ import type { ShadowModeRecord, BenchmarkDecisionRecord } from "./contracts.js";
8
/**
 * Inputs needed to build a shadow mode engine.
 */
export interface ShadowModeOptions {
    /** Runtimes available to the engine. */
    readonly runtimes: Array<AgentRuntime>;
    /** Evidence history entries available to the engine. */
    readonly history: Array<EvidenceHistoryEntry>;
}
12
/**
 * Engine that records router v1 and v2 decisions side by side.
 */
export interface ShadowModeEngine {
    /**
     * Produces the shadow record for one task node.
     *
     * @param taskId Identifier of the task.
     * @param nodeId Identifier of the node.
     * @param capsule Context capsule describing the node.
     */
    evaluate(
        taskId: string,
        nodeId: string,
        capsule: ContextCapsule,
    ): ShadowModeRecord;
    /**
     * Converts a shadow record into benchmark decision records.
     *
     * @param record Record returned by {@link ShadowModeEngine.evaluate}.
     */
    toBenchmarkDecision(record: ShadowModeRecord): BenchmarkDecisionRecord[];
}
16
/**
 * Creates a shadow mode engine from the given options.
 */
export declare function createShadowModeEngine(
    options: ShadowModeOptions,
): ShadowModeEngine;
/**
 * Computes a router regret value for a selected runtime.
 *
 * NOTE(review): the regret formula lives in shadow-mode.js, which is not
 * shown here; consult the implementation for the exact definition.
 *
 * @param candidates Runtimes that were available for selection.
 * @param intent Intent label of the task.
 * @param history Evidence history entries.
 * @param selectedId Identifier of the runtime that was selected.
 */
export declare function computeRouterRegret(
    candidates: AgentRuntime[],
    intent: string,
    history: EvidenceHistoryEntry[],
    selectedId: string,
): number;