open-multi-agent-kit 0.78.2 → 0.78.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +27 -2
  2. package/MATURITY.md +2 -2
  3. package/README.md +4 -4
  4. package/dist/benchmark/contracts.d.ts +116 -0
  5. package/dist/benchmark/contracts.js +6 -0
  6. package/dist/benchmark/fixtures.d.ts +11 -0
  7. package/dist/benchmark/fixtures.js +124 -0
  8. package/dist/benchmark/harness.d.ts +13 -0
  9. package/dist/benchmark/harness.js +191 -0
  10. package/dist/benchmark/shadow-mode.d.ts +17 -0
  11. package/dist/benchmark/shadow-mode.js +95 -0
  12. package/dist/cli/release-promotion-gate.js +14 -4
  13. package/dist/commands/merge.js +102 -56
  14. package/dist/contracts/provider-health.d.ts +37 -0
  15. package/dist/contracts/provider-health.js +49 -1
  16. package/dist/evidence/evidence-trust-score.d.ts +101 -0
  17. package/dist/evidence/evidence-trust-score.js +408 -0
  18. package/dist/evidence/index.d.ts +2 -0
  19. package/dist/evidence/index.js +1 -0
  20. package/dist/native/linux-x64/omk-safety +0 -0
  21. package/dist/orchestration/merge-arbiter.d.ts +91 -0
  22. package/dist/orchestration/merge-arbiter.js +376 -0
  23. package/dist/providers/health.d.ts +3 -0
  24. package/dist/providers/health.js +46 -0
  25. package/dist/providers/index.d.ts +1 -0
  26. package/dist/providers/index.js +1 -0
  27. package/dist/providers/provider-health.d.ts +8 -1
  28. package/dist/providers/provider-health.js +39 -0
  29. package/dist/providers/provider-task-runner.js +31 -0
  30. package/dist/providers/provider.d.ts +2 -0
  31. package/dist/providers/router.js +80 -3
  32. package/dist/providers/types.d.ts +4 -0
  33. package/dist/runtime/contracts/weakness-remediation.d.ts +6 -0
  34. package/dist/runtime/provider-maturity-gate.d.ts +2 -0
  35. package/dist/runtime/provider-maturity-gate.js +26 -0
  36. package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
  37. package/dist/runtime/tool-dispatch-contracts.js +42 -2
  38. package/dist/runtime/weakness-remediation-index.d.ts +1 -1
  39. package/dist/runtime/weakness-remediation-index.js +1 -1
  40. package/dist/safety/enforcement-engine.d.ts +89 -0
  41. package/dist/safety/enforcement-engine.js +279 -0
  42. package/dist/safety/tool-authority-gate.d.ts +40 -0
  43. package/dist/safety/tool-authority-gate.js +92 -0
  44. package/dist/schema/evidence.schema.d.ts +2 -2
  45. package/dist/schema/proof-bundle.schema.d.ts +2 -2
  46. package/docs/benchmark-design.md +122 -0
  47. package/docs/getting-started.md +1 -1
  48. package/docs/provider-maturity.md +1 -1
  49. package/docs/versioning.md +3 -3
  50. package/package.json +7 -3
package/CHANGELOG.md CHANGED
@@ -1,16 +1,41 @@
1
1
  # Changelog
2
2
 
3
- ## Unreleased — Deep Interview + Clipboard Image Paste
3
+ ## v0.78.2 — Regression Proof Matrix, deep interview, clipboard image paste, and README hardening (2026-06-09)
4
+
5
+ ### Overview
6
+
7
+ This release adds the Regression Proof Matrix (Algorithm 9) as a release-defense gate, ships the deep interview and clipboard image paste features, and hardens README links to be package-safe.
4
8
 
5
9
  ### Added
6
10
 
11
+ - **Regression Proof Matrix (Algorithm 9)** — release-defense gate that verifies Algorithms 1–8 are alive via tests, proof bundles, decision traces, and CLI surfaces. `scripts/regression-proof-matrix.mjs --json` evaluates coverage topology, test linkage, proof-bundle trust, and CLI reachability, returning a JSON verdict with per-algorithm coverage and reasons.
12
+ - `src/evidence/regression-proof-matrix.ts` engine with configurable coverage and proof-trust thresholds (default `TAU_EVIDENCE` = 0.75), plus `test/regression-proof-matrix.test.mjs` unit coverage.
13
+ - Proof bundle `011-regression-proof-matrix` under `proof/verified-runs/` with evidence, decisions, verify JSON, and `sha256sums.txt`.
7
14
  - `omk goal interview [input]` and `omk goal refine <goal-id>` commands under the existing `goal` group, adding an evidence-driven clarification step before planning.
8
15
  - Deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions (`informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05`), and computes a completeness score from assimilated answers.
9
16
  - Spec-delta assimilation that folds interview answers into a structured `GoalSpec` with conflict resolution, selectable depth (`light|standard|deep`, auto-selected by ambiguity when omitted), and `--write-spec` persistence.
10
17
  - `omk.interview.v1` JSON contract (`schemas/omk.interview.v1.schema.json`) plus the `omk.interview-delta.v1` spec-delta envelope.
11
18
  - Per-session interview artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) under `.omk/goals/<goalId>/interviews/<sessionId>/` (or `.omk/interviews/<sessionId>/` before `--write-spec`).
12
- - GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
13
19
  - Clipboard image paste support: `/paste` slash command in chat REPL, `--image` flag on `omk goal interview`, cross-platform clipboard reader (macOS/Linux/Windows), `InputAttachment` type for multimodal image handling.
20
+ - GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
21
+
22
+ ### Changed
23
+
24
+ - README install and badge links now use package-safe `open-multi-agent-kit` example URLs instead of the unavailable `@omk/cli` scope.
25
+ - `MATURITY.md` and `docs/native-root-runtime-algorithms.md` clarify that the Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
26
+
27
+ ### Commits
28
+
29
+ ```
30
+ 1504eae chore(release): bump v0.78.2
31
+ 3874558 docs(readme): use package-safe example links
32
+ cb673e3 docs(readme): clarify regression proof matrix boundary
33
+ 278cdf4 docs(proof): clarify regression matrix release boundary
34
+ 285c68c Feat/regression proof matrix (#15)
35
+ 4701243 feat(runtime): send clipboard images as multimodal content parts
36
+ 78a31eb feat(clipboard): add image paste support for chat and goal interview
37
+ 69d65c6 feat(goal): add deep interview refinement
38
+ ```
14
39
 
15
40
  ## v0.78.1 — package alignment, JSON contract envelopes, and adaptive runtime algorithms (2026-06-07)
16
41
 
package/MATURITY.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # OMK Command Maturity Matrix
2
2
 
3
3
  Last updated: 2026-06-07
4
- Current source version: v0.78.1 (`v1.2` runtime contract family)
4
+ Current source version: v0.78.5 (`v1.2` runtime contract family)
5
5
 
6
6
  | Level | Meaning |
7
7
  |-------|---------|
@@ -76,6 +76,6 @@ Regression Proof Matrix is a release-defense gate, not a stable-release claim. S
76
76
  | Native runtime safety | OMK owns the root-orchestrator direction, but native chat must still lock turn-risk inference, approval/sandbox propagation, authority resolution, provider health probes, and DeepSeek read-only enforcement before stable provider-neutral claims. | Treat `docs/native-root-runtime-hardening.md` and `.omk/specs/native-orchestrator-phase1/` as the active hardening contract. |
77
77
  | MCP diagnostics | `mcp list/doctor/test` exist; invalid project/global MCP JSON now fails visibly through diagnostics without exposing config contents. | Add machine-readable MCP JSON and structured failure categories for command resolution, timeout, permission, and server health. |
78
78
  | Skills and harness templates | `omk skill` exposes current core/TypeScript/review packs, while init templates document project MCP scope, runtime skills, portable `.agents/skills`, and run-scoped harness manifests. | Keep external-inspired skills compact, source-linked, and non-vendored; verify install/sync through `skill-command` tests and package audit. |
79
- | Release docs and site | README, CHANGELOG, MATURITY, ROADMAP, versioning docs, provider-maturity docs, package audit, and release-gate commands distinguish the `0.78.x` public package line from the `v1.2` runtime contract family while documenting alpha/experimental surfaces, current harness templates, provider limits, and the public project repository at `https://github.com/dmae97/open-multi-agent-kit`. | Treat `npm run release:check`, native safety packaging, tarball install smoke, and CI evidence on the exact commit as the publish/deploy gate before claiming `0.78.1` published or release-ready. |
79
+ | Release docs and site | README, CHANGELOG, MATURITY, ROADMAP, versioning docs, provider-maturity docs, package audit, and release-gate commands distinguish the `0.78.x` public package line from the `v1.2` runtime contract family while documenting alpha/experimental surfaces, current harness templates, provider limits, and the public project repository at `https://github.com/dmae97/open-multi-agent-kit`. | Treat `npm run release:check`, native safety packaging, tarball install smoke, and CI evidence on the exact commit as the publish/deploy gate before claiming `0.78.5` published or release-ready. |
80
80
  | Public proof bundles | `omk.proof-bundle.v1`, `npm run proof:check`, `npm run proof:index`, and ten scoped RC hardening bundles now cover no-Kimi, provider/doctor, fallback routing, native safety, contract/version, evidence-block, replay/inspect, and graph-audit axes. Proof integrity includes runId/commit/evidence/decision linkage and per-bundle `sha256sums.txt` artifact hashes. | Keep strengthening proof authenticity with sanitized repo-relative artifacts, non-empty known limitations, and broader provider fallback variants. |
81
81
  | Goal planner | Goal lifecycle exists, including continue, generated plan/evidence criteria, and verification. | Expand planner quality scoring and release evidence. |
package/README.md CHANGED
@@ -39,7 +39,7 @@ Use OMK when one coding agent is not enough: route Codex, OpenCode, Kimi, DeepSe
39
39
  - Teams that need MCP-scoped agent execution instead of unrestricted tool access.
40
40
  - Agent builders who want routing, fallback, evidence gates, telemetry, and replay.
41
41
 
42
- > Current package source target: `open-multi-agent-kit@0.78.1`.
42
+ > Current package source target: `open-multi-agent-kit@0.78.5`.
43
43
  > Public package name: `open-multi-agent-kit` (`@omk/cli` is not the active npm package).
44
44
  > Runtime contract family: `v1.2` (contract family, not a stable npm `1.x` release).
45
45
  > Release channel: `pre-1.0`.
@@ -63,8 +63,8 @@ omk chat
63
63
 
64
64
  ## Current release reality
65
65
 
66
- - The public npm line is `open-multi-agent-kit@0.78.x`. Published npm `latest` is `0.78.0`;
67
- source/target is `0.78.1` and is published only after the release workflow passes on the tagged commit.
66
+ - The public npm line is `open-multi-agent-kit@0.78.x`. Published npm `latest` is `0.78.5`;
67
+ source/target is `0.78.5` and is published only after the release workflow passes on the tagged commit.
68
68
  - The `v1.2` label in docs is a runtime contract family for the source tree, not a claim that
69
69
  an npm `1.2.x` stable release exists.
70
70
  - Provider support is intentionally uneven: Kimi remains the most mature authority path;
@@ -305,7 +305,7 @@ The npm package is intentionally package-safe:
305
305
  | Contract | Value |
306
306
  | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
307
307
  | Package | [`open-multi-agent-kit`](https://www.npmjs.com/package/open-multi-agent-kit) |
308
- | Version | `0.78.1` |
308
+ | Version | `0.78.5` |
309
309
  | Runtime contract family | `v1.2` |
310
310
  | Bins | `omk`, `omk-project-mcp`, `omk-acp`, `omk-mcp-host` |
311
311
  | Packaged docs | `README.md`, `docs/`, `SECURITY.md`, `ROADMAP.md`, `MATURITY.md`, `DESIGN.md` |
package/dist/benchmark/contracts.d.ts ADDED
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Benchmark contracts — omk.benchmark.v1
3
+ *
4
+ * Reproducible evaluation surface for OMK control plane tasks.
5
+ */
6
+ import type { AttemptStatus, RuntimeId } from "../evidence/attempt-record.js";
7
+ import type { RuntimeRouterDecisionV2, RuntimeScoreV2 } from "../runtime/contracts/router-v2.js";
8
+ import type { RuntimeRouteDecision } from "../runtime/runtime-router.js";
9
+ export declare const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1";
10
+ export type BenchmarkTaskCategory = "read-only-repo-qa" | "small-bug-fix" | "failing-test-repair" | "multi-file-refactor" | "cli-command-task" | "dependency-update" | "merge-conflict-task" | "security-sensitive-task" | "provider-failure-fallback" | "quota-auth-failure-fallback";
11
+ export interface BenchmarkTask {
12
+ readonly taskId: string;
13
+ readonly category: BenchmarkTaskCategory;
14
+ readonly intent: string;
15
+ readonly description: string;
16
+ readonly treeHash: string;
17
+ readonly seed: number;
18
+ readonly providerConfigHash: string;
19
+ readonly omkVersion: string;
20
+ readonly worktreePath?: string;
21
+ readonly relevantFiles: readonly string[];
22
+ readonly expectedOutcome: "success" | "failure" | "fallback";
23
+ readonly recordedAttempts: readonly BenchmarkAttemptStub[];
24
+ }
25
+ export interface BenchmarkAttemptStub {
26
+ readonly attemptId: string;
27
+ readonly runtime: RuntimeId;
28
+ readonly model: string;
29
+ readonly provider: string;
30
+ readonly status: AttemptStatus;
31
+ readonly latencyMs: number;
32
+ readonly inputTokensEstimated: number;
33
+ readonly outputTokensEstimated: number;
34
+ readonly costUsdEstimated: number;
35
+ readonly evidenceResults: readonly {
36
+ gate: string;
37
+ passed: boolean;
38
+ }[];
39
+ readonly changedFiles: readonly string[];
40
+ readonly commandsRun: readonly string[];
41
+ readonly summary: string;
42
+ readonly error?: string;
43
+ }
44
+ export interface BenchmarkRunResult {
45
+ readonly taskId: string;
46
+ readonly solved: boolean;
47
+ readonly evidenceTrustScore: number;
48
+ readonly falseDone: boolean;
49
+ readonly fallbackUsed: boolean;
50
+ readonly fallbackSucceeded: boolean;
51
+ readonly routerRegret: number;
52
+ readonly costUsd: number;
53
+ readonly latencyMs: number;
54
+ readonly rolledBack: boolean;
55
+ readonly sandboxViolations: number;
56
+ readonly attemptCount: number;
57
+ readonly decisions: readonly BenchmarkDecisionRecord[];
58
+ }
59
+ export interface BenchmarkDecisionRecord {
60
+ readonly component: "runtime-router-v1" | "runtime-router-v2" | "provider-router";
61
+ readonly selectedRuntime: string;
62
+ readonly bestAvailableRuntime: string;
63
+ readonly regret: number;
64
+ readonly reason: string;
65
+ readonly scoresV2?: readonly RuntimeScoreV2[];
66
+ }
67
+ export interface BenchmarkSummary {
68
+ readonly schemaVersion: typeof BENCHMARK_SCHEMA_VERSION;
69
+ readonly runId: string;
70
+ readonly startedAt: string;
71
+ readonly completedAt: string;
72
+ readonly durationMs: number;
73
+ readonly treeHash: string;
74
+ readonly seed: number;
75
+ readonly providerConfigHash: string;
76
+ readonly omkVersion: string;
77
+ readonly mode: "shadow" | "live";
78
+ readonly totalTasks: number;
79
+ readonly solvedCount: number;
80
+ readonly solveRate: number;
81
+ readonly evidenceTrustScoreMean: number;
82
+ readonly falseDoneRate: number;
83
+ readonly fallbackSuccessRate: number;
84
+ readonly routerRegretMean: number;
85
+ readonly costPerSolvedTask: number;
86
+ readonly p95LatencyMs: number;
87
+ readonly rollbackRate: number;
88
+ readonly sandboxViolationCount: number;
89
+ readonly results: readonly BenchmarkRunResult[];
90
+ }
91
+ export interface ShadowModeRecord {
92
+ readonly taskId: string;
93
+ readonly nodeId: string;
94
+ readonly intent: string;
95
+ readonly v1Decision: RuntimeRouteDecision | null;
96
+ readonly v2Decision: RuntimeRouterDecisionV2 | null;
97
+ readonly regretV1: number;
98
+ readonly regretV2: number;
99
+ readonly disagreement: boolean;
100
+ readonly timestamp: string;
101
+ }
102
+ export interface BenchmarkConfig {
103
+ readonly mode: "shadow" | "live";
104
+ readonly tasksDir: string;
105
+ readonly outputDir: string;
106
+ readonly runId: string;
107
+ readonly maxConcurrency: number;
108
+ readonly pinTreeHash?: string;
109
+ readonly pinSeed?: number;
110
+ readonly pinProviderConfigHash?: string;
111
+ readonly categories?: readonly BenchmarkTaskCategory[];
112
+ }
113
+ export interface BenchmarkFixture {
114
+ readonly tasks: readonly BenchmarkTask[];
115
+ readonly version: string;
116
+ }
package/dist/benchmark/contracts.js ADDED
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Benchmark contracts — omk.benchmark.v1
3
+ *
4
+ * Reproducible evaluation surface for OMK control plane tasks.
5
+ */
6
+ export const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1";
package/dist/benchmark/fixtures.d.ts ADDED
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
3
+ *
4
+ * All synthetic traces are deterministic given a seed.
5
+ */
6
+ import type { BenchmarkFixture } from "./contracts.js";
7
+ export declare const DEFAULT_FIXTURE_VERSION = "1.0.0";
8
+ export declare function generateSyntheticTraces(countPerCategory: number, seed: number, omkVersion: string, treeHash: string, providerConfigHash: string): BenchmarkFixture;
9
+ export declare function loadRecordedTraces(dir: string): Promise<BenchmarkFixture>;
10
+ export declare function hashConfig(obj: unknown): string;
11
+ export declare function computeTreeHash(): string;
package/dist/benchmark/fixtures.js ADDED
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Benchmark fixtures — synthetic trace generation + recorded trace loader.
3
+ *
4
+ * All synthetic traces are deterministic given a seed.
5
+ */
6
+ import { createHash } from "node:crypto";
7
+ import { readFile, readdir } from "node:fs/promises";
8
+ import { join } from "node:path";
9
+ export const DEFAULT_FIXTURE_VERSION = "1.0.0";
10
+ function seededRandom(seed) {
11
+ let s = seed;
12
+ return () => {
13
+ s = (s * 16807 + 0) % 2147483647;
14
+ return (s - 1) / 2147483646;
15
+ };
16
+ }
17
+ function pick(arr, rng) {
18
+ return arr[Math.floor(rng() * arr.length)];
19
+ }
20
+ const CATEGORIES = [
21
+ "read-only-repo-qa",
22
+ "small-bug-fix",
23
+ "failing-test-repair",
24
+ "multi-file-refactor",
25
+ "cli-command-task",
26
+ "dependency-update",
27
+ "merge-conflict-task",
28
+ "security-sensitive-task",
29
+ "provider-failure-fallback",
30
+ "quota-auth-failure-fallback",
31
+ ];
32
+ const RUNTIME_IDS = ["kimi-wire", "kimi-print", "openai-compatible", "deepseek", "local"];
33
+ function makeAttemptStub(taskId, category, attemptNumber, rng, outcomeOverride) {
34
+ const runtime = pick([...RUNTIME_IDS], rng);
35
+ const statusBase = outcomeOverride ?? pick(["success", "success", "failure", "fallback"], rng);
36
+ const status = statusBase === "fallback" ? "runtime_failed" : statusBase === "success" ? "success" : "evidence_failed";
37
+ const latencyMs = Math.floor(500 + rng() * 8000);
38
+ const inputTokens = Math.floor(1000 + rng() * 15000);
39
+ const outputTokens = Math.floor(200 + rng() * 5000);
40
+ const costUsd = parseFloat((inputTokens * 0.000002 + outputTokens * 0.000006).toFixed(6));
41
+ const evidenceGates = category === "security-sensitive-task"
42
+ ? ["test", "lint", "audit", "review"]
43
+ : category === "cli-command-task"
44
+ ? ["command", "stdout-match"]
45
+ : ["test", "lint", "diff"];
46
+ const evidenceResults = evidenceGates.map((gate) => ({
47
+ gate,
48
+ passed: status === "success" ? true : rng() > 0.3,
49
+ }));
50
+ return {
51
+ attemptId: `${taskId}__${attemptNumber}`,
52
+ runtime,
53
+ model: "default",
54
+ provider: runtime.split("-")[0],
55
+ status,
56
+ latencyMs,
57
+ inputTokensEstimated: inputTokens,
58
+ outputTokensEstimated: outputTokens,
59
+ costUsdEstimated: costUsd,
60
+ evidenceResults,
61
+ changedFiles: category === "read-only-repo-qa" ? [] : [`src/${taskId}.ts`],
62
+ commandsRun: ["npm test", "npm run lint"],
63
+ summary: `${category} attempt ${attemptNumber}`,
64
+ error: status !== "success" ? "simulated failure" : undefined,
65
+ };
66
+ }
67
+ function makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash) {
68
+ const rng = seededRandom(seed + index * 7919);
69
+ const taskId = `bench-${category}-${String(index).padStart(3, "0")}`;
70
+ const expectedOutcome = category === "provider-failure-fallback" || category === "quota-auth-failure-fallback"
71
+ ? "fallback"
72
+ : "success";
73
+ const attempts = [];
74
+ const attemptCount = expectedOutcome === "fallback" ? 2 : 1;
75
+ for (let i = 1; i <= attemptCount; i++) {
76
+ const outcome = expectedOutcome === "fallback" && i === 1 ? "fallback" : "success";
77
+ attempts.push(makeAttemptStub(taskId, category, i, rng, outcome));
78
+ }
79
+ return {
80
+ taskId,
81
+ category,
82
+ intent: category.replace(/-/g, "_"),
83
+ description: `Synthetic ${category} task #${index}`,
84
+ treeHash,
85
+ seed,
86
+ providerConfigHash,
87
+ omkVersion,
88
+ relevantFiles: [`src/${taskId}.ts`],
89
+ expectedOutcome,
90
+ recordedAttempts: attempts,
91
+ };
92
+ }
93
+ export function generateSyntheticTraces(countPerCategory, seed, omkVersion, treeHash, providerConfigHash) {
94
+ const tasks = [];
95
+ for (const category of CATEGORIES) {
96
+ for (let i = 0; i < countPerCategory; i++) {
97
+ tasks.push(makeTask(i, category, seed, omkVersion, treeHash, providerConfigHash));
98
+ }
99
+ }
100
+ return { tasks, version: DEFAULT_FIXTURE_VERSION };
101
+ }
102
+ export async function loadRecordedTraces(dir) {
103
+ const files = (await readdir(dir).catch(() => []))
104
+ .filter((f) => f.endsWith(".json"))
105
+ .map((f) => join(dir, f));
106
+ const tasks = [];
107
+ for (const file of files) {
108
+ const raw = await readFile(file, "utf-8");
109
+ const parsed = JSON.parse(raw);
110
+ tasks.push(parsed);
111
+ }
112
+ return { tasks, version: DEFAULT_FIXTURE_VERSION };
113
+ }
114
+ export function hashConfig(obj) {
115
+ return createHash("sha256")
116
+ .update(JSON.stringify(obj))
117
+ .digest("hex")
118
+ .slice(0, 16);
119
+ }
120
+ export function computeTreeHash() {
121
+ // In real usage this would be `git rev-parse HEAD`.
122
+ // Benchmark harness supplies the actual commit hash.
123
+ return "unknown";
124
+ }
package/dist/benchmark/harness.d.ts ADDED
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
3
+ */
4
+ import type { BenchmarkConfig, BenchmarkSummary } from "./contracts.js";
5
+ import type { AgentRuntime } from "../runtime/agent-runtime.js";
6
+ import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
7
+ export interface HarnessOptions {
8
+ readonly config: BenchmarkConfig;
9
+ readonly runtimes: AgentRuntime[];
10
+ readonly history?: EvidenceHistoryEntry[];
11
+ }
12
+ export declare function runBenchmarkSuite(options: HarnessOptions): Promise<BenchmarkSummary>;
13
+ export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
package/dist/benchmark/harness.js ADDED
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Benchmark Harness — run benchmark suite, compute metrics, write report.
3
+ */
4
+ import { mkdir, writeFile } from "node:fs/promises";
5
+ import { join } from "node:path";
6
+ import { performance } from "node:perf_hooks";
7
+ import { generateSyntheticTraces, loadRecordedTraces, hashConfig } from "./fixtures.js";
8
+ import { createShadowModeEngine } from "./shadow-mode.js";
9
+ import { createEvidenceTrustScoreV2Engine } from "../evidence/evidence-trust-score.js";
10
+ function capsuleFromTask(task) {
11
+ return {
12
+ runId: task.taskId,
13
+ nodeId: task.taskId,
14
+ goal: task.description,
15
+ system: "Benchmark system prompt",
16
+ task: task.description,
17
+ dependencySummaries: [],
18
+ relevantFiles: task.relevantFiles.map((path) => ({
19
+ path,
20
+ startLine: 1,
21
+ endLine: 10,
22
+ content: "// synthetic",
23
+ })),
24
+ graphMemory: [],
25
+ priorAttempts: [],
26
+ evidenceRequirements: [],
27
+ budget: { maxInputTokens: 8000, reservedOutputTokens: 4096, maxFileTokens: 4096, maxToolResultTokens: 2048, maxMemoryFacts: 10, compression: "lossless-ish" },
28
+ node: {
29
+ id: task.taskId,
30
+ name: task.description,
31
+ role: "coder",
32
+ dependsOn: [],
33
+ status: "running",
34
+ retries: 0,
35
+ maxRetries: 1,
36
+ },
37
+ };
38
+ }
39
+ function mapGateToKind(gate) {
40
+ switch (gate) {
41
+ case "test": return "test";
42
+ case "lint": return "command";
43
+ case "audit": return "audit";
44
+ case "review": return "review";
45
+ case "command": return "command";
46
+ case "stdout-match": return "trace";
47
+ case "diff": return "diff";
48
+ default: return "trace";
49
+ }
50
+ }
51
+ function attemptToEvidenceItem(attempt) {
52
+ return attempt.evidenceResults.map((ev) => ({
53
+ id: `${attempt.attemptId}-${ev.gate}`,
54
+ kind: mapGateToKind(ev.gate),
55
+ source: "runner",
56
+ description: ev.gate,
57
+ verdict: (ev.passed ? "pass" : "fail"),
58
+ timestamp: new Date().toISOString(),
59
+ confidence: 0.9,
60
+ linkedFilePaths: [...attempt.changedFiles],
61
+ }));
62
+ }
63
+ export async function runBenchmarkSuite(options) {
64
+ const startedAt = new Date().toISOString();
65
+ const startedMs = performance.now();
66
+ const { config, runtimes, history = [] } = options;
67
+ await mkdir(config.outputDir, { recursive: true });
68
+ // Load tasks
69
+ const tasks = [];
70
+ if (config.mode === "shadow") {
71
+ const version = process.env.npm_package_version ?? "0.0.0";
72
+ const treeHash = config.pinTreeHash ?? "synthetic";
73
+ const seed = config.pinSeed ?? 42;
74
+ const providerHash = config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id));
75
+ const fixture = generateSyntheticTraces(2, seed, version, treeHash, providerHash);
76
+ tasks.push(...fixture.tasks);
77
+ }
78
+ else {
79
+ const fixture = await loadRecordedTraces(config.tasksDir);
80
+ tasks.push(...fixture.tasks);
81
+ }
82
+ if (config.categories && config.categories.length > 0) {
83
+ const allowed = new Set(config.categories);
84
+ const filtered = tasks.filter((t) => allowed.has(t.category));
85
+ tasks.length = 0;
86
+ tasks.push(...filtered);
87
+ }
88
+ const shadowEngine = createShadowModeEngine({ runtimes, history });
89
+ const etsEngine = createEvidenceTrustScoreV2Engine();
90
+ const results = [];
91
+ for (const task of tasks) {
92
+ const capsule = capsuleFromTask(task);
93
+ const shadowRecord = shadowEngine.evaluate(task.taskId, task.taskId, capsule);
94
+ const decisions = shadowEngine.toBenchmarkDecision(shadowRecord);
95
+ // Simulate execution using recorded attempts
96
+ const lastAttempt = task.recordedAttempts[task.recordedAttempts.length - 1];
97
+ const solved = lastAttempt?.status === "success";
98
+ const fallbackUsed = task.recordedAttempts.length > 1;
99
+ const fallbackSucceeded = fallbackUsed && solved;
100
+ const rolledBack = task.recordedAttempts.some((a) => a.status === "cancelled");
101
+ const sandboxViolations = task.recordedAttempts.some((a) => a.changedFiles.some((f) => f.startsWith("/") && !f.includes("worktree")))
102
+ ? 1
103
+ : 0;
104
+ // ETS v2 evaluation
105
+ const allEvidence = task.recordedAttempts.flatMap((a) => attemptToEvidenceItem(a));
106
+ const etsResult = await etsEngine.evaluate({
107
+ output: lastAttempt?.summary ?? "",
108
+ taskType: task.category.includes("security") ? "security" : "feature",
109
+ risk: task.category.includes("security") ? "critical" : "medium",
110
+ runArtifacts: {
111
+ items: allEvidence,
112
+ meta: {
113
+ runId: task.taskId,
114
+ nodeId: task.taskId,
115
+ provider: lastAttempt?.provider ?? "unknown",
116
+ model: lastAttempt?.model ?? "unknown",
117
+ cwd: "[repo-root]",
118
+ treeHashBefore: task.treeHash,
119
+ treeHashAfter: task.treeHash,
120
+ commandHash: hashConfig(task.recordedAttempts.map((a) => a.commandsRun)),
121
+ timestamp: new Date().toISOString(),
122
+ command: task.recordedAttempts.map((a) => a.commandsRun.join("; ")).join(" || "),
123
+ },
124
+ },
125
+ dependencyGraphFiles: task.relevantFiles,
126
+ });
127
+ const falseDone = !solved && etsResult.verdict === "pass";
128
+ const totalLatency = task.recordedAttempts.reduce((s, a) => s + a.latencyMs, 0);
129
+ const totalCost = task.recordedAttempts.reduce((s, a) => s + a.costUsdEstimated, 0);
130
+ results.push({
131
+ taskId: task.taskId,
132
+ solved,
133
+ evidenceTrustScore: etsResult.score,
134
+ falseDone,
135
+ fallbackUsed,
136
+ fallbackSucceeded,
137
+ routerRegret: shadowRecord.regretV2,
138
+ costUsd: totalCost,
139
+ latencyMs: totalLatency,
140
+ rolledBack,
141
+ sandboxViolations,
142
+ attemptCount: task.recordedAttempts.length,
143
+ decisions,
144
+ });
145
+ }
146
+ const completedAt = new Date().toISOString();
147
+ const durationMs = Math.round(performance.now() - startedMs);
148
+ const solvedCount = results.filter((r) => r.solved).length;
149
+ const totalTasks = results.length;
150
+ const solveRate = totalTasks > 0 ? solvedCount / totalTasks : 0;
151
+ const evidenceMean = totalTasks > 0 ? results.reduce((s, r) => s + r.evidenceTrustScore, 0) / totalTasks : 0;
152
+ const falseDoneRate = totalTasks > 0 ? results.filter((r) => r.falseDone).length / totalTasks : 0;
153
+ const fallbackAttempts = results.filter((r) => r.fallbackUsed);
154
+ const fallbackSuccessRate = fallbackAttempts.length > 0
155
+ ? fallbackAttempts.filter((r) => r.fallbackSucceeded).length / fallbackAttempts.length
156
+ : 0;
157
+ const routerRegretMean = totalTasks > 0 ? results.reduce((s, r) => s + r.routerRegret, 0) / totalTasks : 0;
158
+ const costPerSolved = solvedCount > 0 ? results.reduce((s, r) => s + r.costUsd, 0) / solvedCount : 0;
159
+ const latencies = results.map((r) => r.latencyMs).sort((a, b) => a - b);
160
+ const p95Latency = latencies.length > 0 ? latencies[Math.floor(latencies.length * 0.95)] ?? latencies[latencies.length - 1] : 0;
161
+ const rollbackRate = totalTasks > 0 ? results.filter((r) => r.rolledBack).length / totalTasks : 0;
162
+ const sandboxViolationCount = results.reduce((s, r) => s + r.sandboxViolations, 0);
163
+ const summary = {
164
+ schemaVersion: "omk.benchmark.v1",
165
+ runId: config.runId,
166
+ startedAt,
167
+ completedAt,
168
+ durationMs,
169
+ treeHash: config.pinTreeHash ?? "synthetic",
170
+ seed: config.pinSeed ?? 42,
171
+ providerConfigHash: config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id)),
172
+ omkVersion: process.env.npm_package_version ?? "0.0.0",
173
+ mode: config.mode,
174
+ totalTasks,
175
+ solvedCount,
176
+ solveRate,
177
+ evidenceTrustScoreMean: evidenceMean,
178
+ falseDoneRate,
179
+ fallbackSuccessRate,
180
+ routerRegretMean,
181
+ costPerSolvedTask: costPerSolved,
182
+ p95LatencyMs: p95Latency,
183
+ rollbackRate,
184
+ sandboxViolationCount,
185
+ results,
186
+ };
187
+ const outPath = join(config.outputDir, `${config.runId}.json`);
188
+ await writeFile(outPath, JSON.stringify(summary, null, 2), "utf-8");
189
+ return summary;
190
+ }
191
+ export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
package/dist/benchmark/shadow-mode.d.ts ADDED
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Shadow Mode Engine — side-by-side router v1/v2 recording.
3
+ */
4
+ import type { AgentRuntime } from "../runtime/agent-runtime.js";
5
+ import type { ContextCapsule } from "../runtime/context-capsule.js";
6
+ import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
7
+ import type { ShadowModeRecord, BenchmarkDecisionRecord } from "./contracts.js";
8
+ export interface ShadowModeOptions {
9
+ readonly runtimes: AgentRuntime[];
10
+ readonly history: EvidenceHistoryEntry[];
11
+ }
12
+ export interface ShadowModeEngine {
13
+ evaluate(taskId: string, nodeId: string, capsule: ContextCapsule): ShadowModeRecord;
14
+ toBenchmarkDecision(record: ShadowModeRecord): BenchmarkDecisionRecord[];
15
+ }
16
+ export declare function createShadowModeEngine(options: ShadowModeOptions): ShadowModeEngine;
17
+ export declare function computeRouterRegret(candidates: AgentRuntime[], intent: string, history: EvidenceHistoryEntry[], selectedId: string): number;