open-multi-agent-kit 0.78.2 → 0.78.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -2
- package/dist/benchmark/contracts.d.ts +116 -0
- package/dist/benchmark/contracts.js +6 -0
- package/dist/benchmark/fixtures.d.ts +11 -0
- package/dist/benchmark/fixtures.js +121 -0
- package/dist/benchmark/harness.d.ts +13 -0
- package/dist/benchmark/harness.js +191 -0
- package/dist/benchmark/shadow-mode.d.ts +17 -0
- package/dist/benchmark/shadow-mode.js +96 -0
- package/dist/commands/merge.js +102 -56
- package/dist/contracts/provider-health.d.ts +37 -0
- package/dist/contracts/provider-health.js +49 -1
- package/dist/evidence/evidence-trust-score.d.ts +101 -0
- package/dist/evidence/evidence-trust-score.js +408 -0
- package/dist/evidence/index.d.ts +2 -0
- package/dist/evidence/index.js +1 -0
- package/dist/orchestration/merge-arbiter.d.ts +91 -0
- package/dist/orchestration/merge-arbiter.js +376 -0
- package/dist/providers/health.d.ts +3 -0
- package/dist/providers/health.js +46 -0
- package/dist/providers/index.d.ts +1 -0
- package/dist/providers/index.js +1 -0
- package/dist/providers/provider-health.d.ts +8 -1
- package/dist/providers/provider-health.js +39 -0
- package/dist/providers/provider-task-runner.js +31 -0
- package/dist/providers/provider.d.ts +2 -0
- package/dist/providers/router.js +87 -3
- package/dist/providers/types.d.ts +4 -0
- package/dist/runtime/provider-maturity-gate.d.ts +2 -0
- package/dist/runtime/provider-maturity-gate.js +28 -0
- package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
- package/dist/runtime/tool-dispatch-contracts.js +42 -2
- package/dist/runtime/weakness-remediation-index.d.ts +1 -1
- package/dist/runtime/weakness-remediation-index.js +1 -1
- package/dist/safety/enforcement-engine.d.ts +89 -0
- package/dist/safety/enforcement-engine.js +279 -0
- package/dist/safety/tool-authority-gate.d.ts +40 -0
- package/dist/safety/tool-authority-gate.js +92 -0
- package/dist/schema/evidence.schema.d.ts +2 -2
- package/dist/schema/proof-bundle.schema.d.ts +2 -2
- package/docs/benchmark-design.md +122 -0
- package/package.json +5 -2
package/CHANGELOG.md
CHANGED
|
@@ -1,16 +1,41 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## v0.78.2 — Regression Proof Matrix, deep interview, clipboard image paste, and README hardening (2026-06-09)
|
|
4
|
+
|
|
5
|
+
### Overview
|
|
6
|
+
|
|
7
|
+
This release adds the Regression Proof Matrix (Algorithm 9) as a release-defense gate, ships the deep interview and clipboard image paste features, and hardens README links to be package-safe.
|
|
4
8
|
|
|
5
9
|
### Added
|
|
6
10
|
|
|
11
|
+
- **Regression Proof Matrix (Algorithm 9)** — release-defense gate that verifies Algorithms 1–8 are alive via tests, proof bundles, decision traces, and CLI surfaces. `scripts/regression-proof-matrix.mjs --json` evaluates coverage topology, test linkage, proof-bundle trust, and CLI reachability, returning a JSON verdict with per-algorithm coverage and reasons.
|
|
12
|
+
- `src/evidence/regression-proof-matrix.ts` engine with configurable coverage and proof-trust thresholds (default `TAU_EVIDENCE` = 0.75), plus `test/regression-proof-matrix.test.mjs` unit coverage.
|
|
13
|
+
- Proof bundle `011-regression-proof-matrix` under `proof/verified-runs/` with evidence, decisions, verify JSON, and `sha256sums.txt`.
|
|
7
14
|
- `omk goal interview [input]` and `omk goal refine <goal-id>` commands under the existing `goal` group, adding an evidence-driven clarification step before planning.
|
|
8
15
|
- Deterministic deep interview that scores goal ambiguity (`0..1`), ranks targeted questions (`informationGain*0.35 + riskReduction*0.25 + dagImpact*0.20 + evidenceImpact*0.15 - userCost*0.05`), and computes a completeness score from assimilated answers.
|
|
9
16
|
- Spec-delta assimilation that folds interview answers into a structured `GoalSpec` with conflict resolution, selectable depth (`light|standard|deep`, auto-selected by ambiguity when omitted), and `--write-spec` persistence.
|
|
10
17
|
- `omk.interview.v1` JSON contract (`schemas/omk.interview.v1.schema.json`) plus the `omk.interview-delta.v1` spec-delta envelope.
|
|
11
18
|
- Per-session interview artifacts (`interview.json`, `spec-delta.json`, `questions.md`, `answers.jsonl`, `interview-report.md`) under `.omk/goals/<goalId>/interviews/<sessionId>/` (or `.omk/interviews/<sessionId>/` before `--write-spec`).
|
|
12
|
-
- GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
|
|
13
19
|
- Clipboard image paste support: `/paste` slash command in chat REPL, `--image` flag on `omk goal interview`, cross-platform clipboard reader (macOS/Linux/Windows), `InputAttachment` type for multimodal image handling.
|
|
20
|
+
- GitHub organic growth kit: README first-screen positioning, runnable awesome-list examples, a 1280x640 social preview upload candidate, and reusable Topics/About/awesome-list PR copy in `docs/github-organic-promotion.md`.
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
- README install and badge links now use package-safe `open-multi-agent-kit` example URLs instead of the unavailable `@omk/cli` scope.
|
|
25
|
+
- `MATURITY.md` and `docs/native-root-runtime-algorithms.md` clarify that the Regression Proof Matrix is a release-defense coverage gate, not a stable-release claim.
|
|
26
|
+
|
|
27
|
+
### Commits
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
1504eae chore(release): bump v0.78.2
|
|
31
|
+
3874558 docs(readme): use package-safe example links
|
|
32
|
+
cb673e3 docs(readme): clarify regression proof matrix boundary
|
|
33
|
+
278cdf4 docs(proof): clarify regression matrix release boundary
|
|
34
|
+
285c68c Feat/regression proof matrix (#15)
|
|
35
|
+
4701243 feat(runtime): send clipboard images as multimodal content parts
|
|
36
|
+
78a31eb feat(clipboard): add image paste support for chat and goal interview
|
|
37
|
+
69d65c6 feat(goal): add deep interview refinement
|
|
38
|
+
```
|
|
14
39
|
|
|
15
40
|
## v0.78.1 — package alignment, JSON contract envelopes, and adaptive runtime algorithms (2026-06-07)
|
|
16
41
|
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark contracts — omk.benchmark.v1
|
|
3
|
+
*
|
|
4
|
+
* Reproducible evaluation surface for OMK control plane tasks.
|
|
5
|
+
*/
|
|
6
|
+
import type { AttemptStatus, RuntimeId } from "../evidence/attempt-record.js";
|
|
7
|
+
import type { RuntimeRouterDecisionV2, RuntimeScoreV2 } from "../runtime/contracts/router-v2.js";
|
|
8
|
+
import type { RuntimeRouteDecision } from "../runtime/runtime-router.js";
|
|
9
|
+
/** Schema identifier for benchmark output; `BenchmarkSummary.schemaVersion` is typed as this constant. */
export declare const BENCHMARK_SCHEMA_VERSION = "omk.benchmark.v1";
|
|
10
|
+
/** Closed set of task category labels, used by `BenchmarkTask.category` and by the `BenchmarkConfig.categories` filter. */
export type BenchmarkTaskCategory = "read-only-repo-qa" | "small-bug-fix" | "failing-test-repair" | "multi-file-refactor" | "cli-command-task" | "dependency-update" | "merge-conflict-task" | "security-sensitive-task" | "provider-failure-fallback" | "quota-auth-failure-fallback";
|
|
11
|
+
/**
 * One benchmark task together with the attempts recorded for it.
 *
 * The harness replays `recordedAttempts`: the status of the last attempt
 * decides whether the task counts as solved, and more than one attempt
 * counts as a fallback having been used.
 */
export interface BenchmarkTask {
|
|
12
|
+
readonly taskId: string;
|
|
13
|
+
readonly category: BenchmarkTaskCategory;
|
|
14
|
+
readonly intent: string;
|
|
15
|
+
readonly description: string;
|
|
16
|
+
readonly treeHash: string;
|
|
17
|
+
readonly seed: number;
|
|
18
|
+
readonly providerConfigHash: string;
|
|
19
|
+
readonly omkVersion: string;
|
|
20
|
+
readonly worktreePath?: string;
|
|
21
|
+
readonly relevantFiles: readonly string[];
|
|
22
|
+
readonly expectedOutcome: "success" | "failure" | "fallback";
|
|
23
|
+
readonly recordedAttempts: readonly BenchmarkAttemptStub[];
|
|
24
|
+
}
|
|
25
|
+
/**
 * Recorded outcome of a single attempt at a task.
 *
 * The harness sums `latencyMs` and `costUsdEstimated` across a task's
 * attempts and converts each `evidenceResults` entry into one evidence item
 * (verdict "pass" when `passed` is true, otherwise "fail").
 */
export interface BenchmarkAttemptStub {
|
|
26
|
+
readonly attemptId: string;
|
|
27
|
+
readonly runtime: RuntimeId;
|
|
28
|
+
readonly model: string;
|
|
29
|
+
readonly provider: string;
|
|
30
|
+
readonly status: AttemptStatus;
|
|
31
|
+
readonly latencyMs: number;
|
|
32
|
+
readonly inputTokensEstimated: number;
|
|
33
|
+
readonly outputTokensEstimated: number;
|
|
34
|
+
readonly costUsdEstimated: number;
|
|
35
|
+
readonly evidenceResults: readonly {
|
|
36
|
+
gate: string;
|
|
37
|
+
passed: boolean;
|
|
38
|
+
}[];
|
|
39
|
+
readonly changedFiles: readonly string[];
|
|
40
|
+
readonly commandsRun: readonly string[];
|
|
41
|
+
readonly summary: string;
|
|
42
|
+
readonly error?: string;
|
|
43
|
+
}
|
|
44
|
+
/**
 * Per-task result row produced by `runBenchmarkSuite`.
 *
 * `falseDone` is set when the task was not solved but the evidence verdict
 * was "pass"; `routerRegret` is the v2 regret from the shadow-mode record;
 * `costUsd` and `latencyMs` are sums over all recorded attempts.
 */
export interface BenchmarkRunResult {
|
|
45
|
+
readonly taskId: string;
|
|
46
|
+
readonly solved: boolean;
|
|
47
|
+
readonly evidenceTrustScore: number;
|
|
48
|
+
readonly falseDone: boolean;
|
|
49
|
+
readonly fallbackUsed: boolean;
|
|
50
|
+
readonly fallbackSucceeded: boolean;
|
|
51
|
+
readonly routerRegret: number;
|
|
52
|
+
readonly costUsd: number;
|
|
53
|
+
readonly latencyMs: number;
|
|
54
|
+
readonly rolledBack: boolean;
|
|
55
|
+
readonly sandboxViolations: number;
|
|
56
|
+
readonly attemptCount: number;
|
|
57
|
+
readonly decisions: readonly BenchmarkDecisionRecord[];
|
|
58
|
+
}
|
|
59
|
+
/**
 * One router decision as emitted by `ShadowModeEngine.toBenchmarkDecision`.
 *
 * The shadow engine emits "runtime-router-v1" and "runtime-router-v2"
 * records, and only the v2 record carries `scoresV2`.
 * NOTE(review): no producer of "provider-router" records is visible here.
 */
export interface BenchmarkDecisionRecord {
|
|
60
|
+
readonly component: "runtime-router-v1" | "runtime-router-v2" | "provider-router";
|
|
61
|
+
readonly selectedRuntime: string;
|
|
62
|
+
readonly bestAvailableRuntime: string;
|
|
63
|
+
readonly regret: number;
|
|
64
|
+
readonly reason: string;
|
|
65
|
+
readonly scoresV2?: readonly RuntimeScoreV2[];
|
|
66
|
+
}
|
|
67
|
+
/**
 * Aggregate report returned by `runBenchmarkSuite` and written as JSON to
 * `<outputDir>/<runId>.json`.
 *
 * As computed by the harness: `solveRate`, `falseDoneRate` and
 * `rollbackRate` are fractions of `totalTasks`; `fallbackSuccessRate` is a
 * fraction of the tasks that used a fallback; `costPerSolvedTask` is the
 * cost of all tasks divided by `solvedCount`. Each of these is 0 when its
 * denominator is 0.
 */
export interface BenchmarkSummary {
|
|
68
|
+
readonly schemaVersion: typeof BENCHMARK_SCHEMA_VERSION;
|
|
69
|
+
readonly runId: string;
|
|
70
|
+
readonly startedAt: string;
|
|
71
|
+
readonly completedAt: string;
|
|
72
|
+
readonly durationMs: number;
|
|
73
|
+
readonly treeHash: string;
|
|
74
|
+
readonly seed: number;
|
|
75
|
+
readonly providerConfigHash: string;
|
|
76
|
+
readonly omkVersion: string;
|
|
77
|
+
readonly mode: "shadow" | "live";
|
|
78
|
+
readonly totalTasks: number;
|
|
79
|
+
readonly solvedCount: number;
|
|
80
|
+
readonly solveRate: number;
|
|
81
|
+
readonly evidenceTrustScoreMean: number;
|
|
82
|
+
readonly falseDoneRate: number;
|
|
83
|
+
readonly fallbackSuccessRate: number;
|
|
84
|
+
readonly routerRegretMean: number;
|
|
85
|
+
readonly costPerSolvedTask: number;
|
|
86
|
+
readonly p95LatencyMs: number;
|
|
87
|
+
readonly rollbackRate: number;
|
|
88
|
+
readonly sandboxViolationCount: number;
|
|
89
|
+
readonly results: readonly BenchmarkRunResult[];
|
|
90
|
+
}
|
|
91
|
+
/**
 * Side-by-side record of what router v1 and router v2 chose for one task.
 *
 * A decision is `null` when that router threw, in which case the shadow
 * engine sets the matching regret to 1. `disagreement` is true when the two
 * selected runtime ids differ.
 */
export interface ShadowModeRecord {
|
|
92
|
+
readonly taskId: string;
|
|
93
|
+
readonly nodeId: string;
|
|
94
|
+
readonly intent: string;
|
|
95
|
+
readonly v1Decision: RuntimeRouteDecision | null;
|
|
96
|
+
readonly v2Decision: RuntimeRouterDecisionV2 | null;
|
|
97
|
+
readonly regretV1: number;
|
|
98
|
+
readonly regretV2: number;
|
|
99
|
+
readonly disagreement: boolean;
|
|
100
|
+
readonly timestamp: string;
|
|
101
|
+
}
|
|
102
|
+
/**
 * Settings for one benchmark run.
 *
 * In "shadow" mode the harness generates synthetic tasks; otherwise it loads
 * recorded tasks from `tasksDir`. The `pin*` fields override the harness
 * defaults ("synthetic", 42, and a hash of the runtime ids). A non-empty
 * `categories` list filters tasks by category.
 * NOTE(review): `maxConcurrency` is not read by the harness implementation
 * in this package version.
 */
export interface BenchmarkConfig {
|
|
103
|
+
readonly mode: "shadow" | "live";
|
|
104
|
+
readonly tasksDir: string;
|
|
105
|
+
readonly outputDir: string;
|
|
106
|
+
readonly runId: string;
|
|
107
|
+
readonly maxConcurrency: number;
|
|
108
|
+
readonly pinTreeHash?: string;
|
|
109
|
+
readonly pinSeed?: number;
|
|
110
|
+
readonly pinProviderConfigHash?: string;
|
|
111
|
+
readonly categories?: readonly BenchmarkTaskCategory[];
|
|
112
|
+
}
|
|
113
|
+
/** A set of benchmark tasks plus the fixture version string stamped by the fixture loader/generator. */
export interface BenchmarkFixture {
|
|
114
|
+
readonly tasks: readonly BenchmarkTask[];
|
|
115
|
+
readonly version: string;
|
|
116
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark fixtures — synthetic trace generation + recorded trace loader.
|
|
3
|
+
*
|
|
4
|
+
* All synthetic traces are deterministic given a seed.
|
|
5
|
+
*/
|
|
6
|
+
import type { BenchmarkFixture } from "./contracts.js";
|
|
7
|
+
/** Version string placed in the `version` field of every fixture this module returns. */
export declare const DEFAULT_FIXTURE_VERSION = "1.0.0";
|
|
8
|
+
/**
 * Build `countPerCategory` synthetic tasks for every benchmark category.
 * The output is deterministic for a given `seed`; the remaining arguments
 * are copied onto each generated task unchanged.
 */
export declare function generateSyntheticTraces(countPerCategory: number, seed: number, omkVersion: string, treeHash: string, providerConfigHash: string): BenchmarkFixture;
|
|
9
|
+
/**
 * Load every `*.json` file directly inside `dir`, treating each file as one
 * task. Resolves to a fixture with no tasks when the directory cannot be
 * read; rejects when a file cannot be read or is not valid JSON.
 */
export declare function loadRecordedTraces(dir: string): Promise<BenchmarkFixture>;
|
|
10
|
+
/** Return the first 16 hex characters of the SHA-256 digest of `JSON.stringify(obj)`. */
export declare function hashConfig(obj: unknown): string;
|
|
11
|
+
/** Placeholder: the shipped implementation always returns the string "unknown". */
export declare function computeTreeHash(): string;
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark fixtures — synthetic trace generation + recorded trace loader.
|
|
3
|
+
*
|
|
4
|
+
* All synthetic traces are deterministic given a seed.
|
|
5
|
+
*/
|
|
6
|
+
import { createHash } from "node:crypto";
|
|
7
|
+
import { readFile, readdir } from "node:fs/promises";
|
|
8
|
+
import { join } from "node:path";
|
|
9
|
+
export const DEFAULT_FIXTURE_VERSION = "1.0.0";
|
|
10
|
+
/**
 * Create a deterministic pseudo-random generator (Lehmer LCG with multiplier
 * 16807 and modulus 2^31 - 1).
 *
 * The generator only works while its state stays in [1, modulus - 1]. A
 * state of 0 never leaves 0 and would make every draw slightly negative,
 * so the seed is normalised into that range first:
 *   - fractional seeds are truncated,
 *   - the seed is reduced modulo the modulus,
 *   - negative residues are shifted up by the modulus,
 *   - 0 and non-finite seeds fall back to 1.
 * Positive integer seeds below the modulus produce the same sequence as
 * before this normalisation existed.
 *
 * @param {number} seed - Any number; see normalisation rules above.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function seededRandom(seed) {
  const MODULUS = 2147483647;
  const MULTIPLIER = 16807;

  let state = Math.trunc(Number(seed)) % MODULUS;
  if (!Number.isFinite(state)) {
    state = 1;
  }
  if (state < 0) {
    state += MODULUS;
  }
  if (state === 0) {
    state = 1;
  }

  return () => {
    // state * MULTIPLIER stays below 2^53, so the product is exact.
    state = (state * MULTIPLIER) % MODULUS;
    return (state - 1) / (MODULUS - 1);
  };
}
|
|
17
|
+
/**
 * Choose one element of `arr` using a single draw from `rng`.
 *
 * @param {Array} arr - Candidates to choose from.
 * @param {() => number} rng - Source of numbers; expected to return values in [0, 1).
 * @returns {*} The chosen element (`undefined` when `arr` is empty).
 */
function pick(arr, rng) {
  const index = Math.floor(rng() * arr.length);
  return arr[index];
}
|
|
20
|
+
/**
 * Benchmark task categories, in the order synthetic tasks are generated.
 * Frozen because the list is shared module state and is only ever read.
 */
const CATEGORIES = Object.freeze([
  "read-only-repo-qa",
  "small-bug-fix",
  "failing-test-repair",
  "multi-file-refactor",
  "cli-command-task",
  "dependency-update",
  "merge-conflict-task",
  "security-sensitive-task",
  "provider-failure-fallback",
  "quota-auth-failure-fallback",
]);

/**
 * Runtime identifiers a synthetic attempt can be attributed to.
 * Frozen for the same reason as CATEGORIES.
 */
const RUNTIME_IDS = Object.freeze([
  "kimi-wire",
  "kimi-print",
  "openai-compatible",
  "deepseek",
  "local",
]);
|
|
33
|
+
/**
 * Build one synthetic attempt record for a task.
 *
 * Draws from `rng` in a fixed order (runtime, outcome unless overridden,
 * latency, input tokens, output tokens, then one draw per evidence gate when
 * the attempt did not succeed), so the result is reproducible for a given
 * generator state.
 *
 * @param {string} taskId - Task the attempt belongs to.
 * @param {string} category - Task category; selects the evidence gates.
 * @param {number} attemptNumber - 1-based attempt index, used in the id and summary.
 * @param {() => number} rng - Random source returning values in [0, 1).
 * @param {"success"|"failure"|"fallback"} [outcomeOverride] - Forces the outcome instead of drawing it.
 * @returns {object} Attempt stub.
 */
function makeAttemptStub(taskId, category, attemptNumber, rng, outcomeOverride) {
  const runtime = pick([...RUNTIME_IDS], rng);
  const outcome = outcomeOverride ?? pick(["success", "success", "failure", "fallback"], rng);

  let status;
  if (outcome === "fallback") {
    status = "runtime_failed";
  } else if (outcome === "success") {
    status = "success";
  } else {
    status = "evidence_failed";
  }
  const succeeded = status === "success";

  const latencyMs = Math.floor(500 + rng() * 8000);
  const inputTokensEstimated = Math.floor(1000 + rng() * 15000);
  const outputTokensEstimated = Math.floor(200 + rng() * 5000);
  const rawCost = inputTokensEstimated * 0.000002 + outputTokensEstimated * 0.000006;
  const costUsdEstimated = parseFloat(rawCost.toFixed(6));

  let gates;
  switch (category) {
    case "security-sensitive-task":
      gates = ["test", "lint", "audit", "review"];
      break;
    case "cli-command-task":
      gates = ["command", "stdout-match"];
      break;
    default:
      gates = ["test", "lint", "diff"];
  }
  // A successful attempt passes every gate without consuming random draws.
  const evidenceResults = gates.map((gate) => ({
    gate,
    passed: succeeded || rng() > 0.3,
  }));

  const [provider] = runtime.split("-");
  const changedFiles = category === "read-only-repo-qa" ? [] : [`src/${taskId}.ts`];

  return {
    attemptId: `${taskId}__${attemptNumber}`,
    runtime,
    model: "default",
    provider,
    status,
    latencyMs,
    inputTokensEstimated,
    outputTokensEstimated,
    costUsdEstimated,
    evidenceResults,
    changedFiles,
    commandsRun: ["npm test", "npm run lint"],
    summary: `${category} attempt ${attemptNumber}`,
    error: succeeded ? undefined : "simulated failure",
  };
}
|
|
67
|
+
/**
 * Build one synthetic benchmark task.
 *
 * A task whose drawn expected outcome is "fallback" gets two attempts (the
 * second forced to "success"); every other task gets a single attempt.
 *
 * NOTE(review): the random stream is seeded from `seed` and `index` only,
 * so tasks that share an index draw the same sequence in every category —
 * confirm whether that is intended.
 *
 * @param {number} index - Position of the task within its category.
 * @param {string} category - Benchmark category label.
 * @param {number} seed - Base seed for the task's random stream.
 * @param {string} omkVersion - Copied onto the task.
 * @param {string} treeHash - Copied onto the task.
 * @param {string} providerConfigHash - Copied onto the task.
 * @returns {object} Synthetic task with its recorded attempts.
 */
function makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash) {
  const rng = seededRandom(seed + index * 7919);
  const paddedIndex = String(index).padStart(3, "0");
  const taskId = `bench-${category}-${paddedIndex}`;
  const expectedOutcome = pick(["success", "success", "failure", "fallback"], rng);

  const attemptCount = expectedOutcome === "fallback" ? 2 : 1;
  const recordedAttempts = [];
  let attemptNumber = 1;
  while (attemptNumber <= attemptCount) {
    const forcedOutcome = attemptNumber === 1 ? undefined : "success";
    recordedAttempts.push(makeAttemptStub(taskId, category, attemptNumber, rng, forcedOutcome));
    attemptNumber += 1;
  }

  return {
    taskId,
    category,
    intent: category.replace(/-/g, "_"),
    description: `Synthetic ${category} task #${index}`,
    treeHash,
    seed,
    providerConfigHash,
    omkVersion,
    relevantFiles: [`src/${taskId}.ts`],
    expectedOutcome,
    recordedAttempts,
  };
}
|
|
90
|
+
/**
 * Generate a synthetic fixture: `countPerCategory` tasks for every entry of
 * CATEGORIES, grouped by category in declaration order.
 *
 * @param {number} countPerCategory - Number of tasks to create per category.
 * @param {number} seed - Base seed passed to every task.
 * @param {string} omkVersion - Copied onto each task.
 * @param {string} treeHash - Copied onto each task.
 * @param {string} providerConfigHash - Copied onto each task.
 * @returns {{ tasks: object[], version: string }} Fixture stamped with DEFAULT_FIXTURE_VERSION.
 */
export function generateSyntheticTraces(countPerCategory, seed, omkVersion, treeHash, providerConfigHash) {
  const generated = [];
  CATEGORIES.forEach((category) => {
    let index = 0;
    while (index < countPerCategory) {
      generated.push(makeTask(index, category, seed, omkVersion, treeHash, providerConfigHash));
      index += 1;
    }
  });
  return { tasks: generated, version: DEFAULT_FIXTURE_VERSION };
}
|
|
99
|
+
/**
 * Load recorded benchmark tasks from a directory.
 *
 * Every `*.json` file directly inside `dir` is parsed as one task. Files are
 * processed in sorted name order so the resulting task order does not depend
 * on the order the file system happens to list entries in.
 *
 * @param {string} dir - Directory containing recorded task files.
 * @returns {Promise<{ tasks: object[], version: string }>} Fixture stamped
 *   with DEFAULT_FIXTURE_VERSION; `tasks` is empty when the directory cannot
 *   be read.
 * @throws {SyntaxError} When a file is not valid JSON; the message names the
 *   file and the original error is attached as `cause`.
 */
export async function loadRecordedTraces(dir) {
  // Best effort by design: an unreadable or missing directory means "no recorded tasks".
  const entries = await readdir(dir).catch(() => []);
  const files = entries
    .filter((name) => name.endsWith(".json"))
    .sort()
    .map((name) => join(dir, name));

  const tasks = [];
  for (const file of files) {
    const raw = await readFile(file, "utf-8");
    try {
      tasks.push(JSON.parse(raw));
    } catch (err) {
      throw new SyntaxError(`Invalid JSON in recorded trace ${file}: ${err.message}`, { cause: err });
    }
  }
  return { tasks, version: DEFAULT_FIXTURE_VERSION };
}
|
|
111
|
+
/**
 * Produce a short, stable fingerprint of a JSON-serialisable value.
 *
 * The value is serialised with `JSON.stringify`, so objects with the same
 * entries in a different key order produce different fingerprints.
 *
 * @param {unknown} obj - Value to fingerprint.
 * @returns {string} First 16 hex characters of the SHA-256 digest.
 */
export function hashConfig(obj) {
  // JSON.stringify returns undefined for undefined, functions and symbols,
  // which hash.update() rejects; hash a fixed token for those instead.
  const serialized = JSON.stringify(obj) ?? "undefined";
  return createHash("sha256").update(serialized).digest("hex").slice(0, 16);
}
|
|
117
|
+
/**
 * Return the tree hash to stamp on benchmark artifacts.
 *
 * This is a stub: real usage would report `git rev-parse HEAD`, and the
 * benchmark harness is expected to supply the actual commit hash itself.
 *
 * @returns {string} Always the placeholder "unknown".
 */
export function computeTreeHash() {
  const placeholderTreeHash = "unknown";
  return placeholderTreeHash;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark Harness — run benchmark suite, compute metrics, write report.
|
|
3
|
+
*/
|
|
4
|
+
import type { BenchmarkConfig, BenchmarkSummary } from "./contracts.js";
|
|
5
|
+
import type { AgentRuntime } from "../runtime/agent-runtime.js";
|
|
6
|
+
import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
|
|
7
|
+
/** Inputs to `runBenchmarkSuite`; `history` is treated as an empty list when omitted. */
export interface HarnessOptions {
|
|
8
|
+
readonly config: BenchmarkConfig;
|
|
9
|
+
readonly runtimes: AgentRuntime[];
|
|
10
|
+
readonly history?: EvidenceHistoryEntry[];
|
|
11
|
+
}
|
|
12
|
+
/**
 * Run the benchmark suite described by `options.config`, write the summary
 * as JSON to `<outputDir>/<runId>.json`, and resolve with that summary.
 */
export declare function runBenchmarkSuite(options: HarnessOptions): Promise<BenchmarkSummary>;
|
|
13
|
+
export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark Harness — run benchmark suite, compute metrics, write report.
|
|
3
|
+
*/
|
|
4
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { performance } from "node:perf_hooks";
|
|
7
|
+
import { generateSyntheticTraces, loadRecordedTraces, hashConfig } from "./fixtures.js";
|
|
8
|
+
import { createShadowModeEngine } from "./shadow-mode.js";
|
|
9
|
+
import { createEvidenceTrustScoreV2Engine } from "../evidence/evidence-trust-score.js";
|
|
10
|
+
/**
 * Wrap a benchmark task in the context-capsule shape the routers consume.
 *
 * Only `taskId`, `description` and `relevantFiles` are read from the task;
 * everything else is a fixed placeholder (synthetic file content, empty
 * memory and history, a fixed token budget, and a single "coder" node).
 *
 * @param {{ taskId: string, description: string, relevantFiles: readonly string[] }} task
 * @returns {object} Capsule whose run id and node id both equal the task id.
 */
function capsuleFromTask(task) {
  const { taskId, description } = task;

  const syntheticFiles = task.relevantFiles.map((path) => {
    return { path, startLine: 1, endLine: 10, content: "// synthetic" };
  });

  const budget = {
    maxInputTokens: 8000,
    reservedOutputTokens: 4096,
    maxFileTokens: 4096,
    maxToolResultTokens: 2048,
    maxMemoryFacts: 10,
    compression: "lossless-ish",
  };

  const node = {
    id: taskId,
    name: description,
    role: "coder",
    dependsOn: [],
    status: "running",
    retries: 0,
    maxRetries: 1,
  };

  return {
    runId: taskId,
    nodeId: taskId,
    goal: description,
    system: "Benchmark system prompt",
    task: description,
    dependencySummaries: [],
    relevantFiles: syntheticFiles,
    graphMemory: [],
    priorAttempts: [],
    evidenceRequirements: [],
    budget,
    node,
  };
}
|
|
39
|
+
/** Evidence kind for each known gate name; gates missing here map to "trace". */
const EVIDENCE_KIND_BY_GATE = new Map([
  ["test", "test"],
  ["lint", "command"],
  ["audit", "audit"],
  ["review", "review"],
  ["command", "command"],
  ["stdout-match", "trace"],
  ["diff", "diff"],
]);

/**
 * Translate an evidence gate name into an evidence item kind.
 *
 * @param {string} gate - Gate name recorded on an attempt.
 * @returns {string} Matching kind, or "trace" for any unrecognised gate.
 */
function mapGateToKind(gate) {
  return EVIDENCE_KIND_BY_GATE.get(gate) ?? "trace";
}
|
|
51
|
+
/**
 * Convert an attempt's gate results into evidence items, one per gate.
 *
 * Each item is stamped with the current time and a fixed confidence of 0.9,
 * and carries its own copy of the attempt's changed-file list.
 *
 * @param {{ attemptId: string, evidenceResults: {gate: string, passed: boolean}[], changedFiles: string[] }} attempt
 * @returns {object[]} Evidence items in the same order as `evidenceResults`.
 */
function attemptToEvidenceItem(attempt) {
  const items = [];
  for (const result of attempt.evidenceResults) {
    const gateName = result.gate;
    const verdict = result.passed ? "pass" : "fail";
    items.push({
      id: `${attempt.attemptId}-${gateName}`,
      kind: mapGateToKind(gateName),
      source: "runner",
      description: gateName,
      verdict,
      timestamp: new Date().toISOString(),
      confidence: 0.9,
      linkedFilePaths: [...attempt.changedFiles],
    });
  }
  return items;
}
|
|
63
|
+
/**
 * Nearest-rank percentile of an ascending-sorted list.
 *
 * @param {number[]} sortedValues - Values sorted ascending.
 * @param {number} percent - Integer percentile, e.g. 95.
 * @returns {number} The value at rank ceil(percent/100 * n), or 0 for an empty list.
 */
function nearestRankPercentile(sortedValues, percent) {
  const count = sortedValues.length;
  if (count === 0) {
    return 0;
  }
  // Integer arithmetic before the division keeps exact multiples exact.
  const rank = Math.ceil((percent * count) / 100);
  const clampedRank = Math.min(count, Math.max(1, rank));
  return sortedValues[clampedRank - 1];
}

/**
 * Run the benchmark suite, compute aggregate metrics and write the report.
 *
 * Tasks come from synthetic generation in "shadow" mode and from
 * `config.tasksDir` otherwise, optionally filtered by `config.categories`.
 * Nothing is executed: each task is scored from its recorded attempts, the
 * shadow engine records router decisions, and the evidence engine scores
 * the recorded gate results. Tasks are processed one at a time;
 * `config.maxConcurrency` is not consulted.
 *
 * The summary is written to `<outputDir>/<runId>.json` (the directory is
 * created if needed) and also returned.
 *
 * @param {{ config: object, runtimes: object[], history?: object[] }} options
 * @returns {Promise<object>} Benchmark summary (schema "omk.benchmark.v1").
 */
export async function runBenchmarkSuite(options) {
  const startedAt = new Date().toISOString();
  const startedMs = performance.now();
  const { config, runtimes, history = [] } = options;
  await mkdir(config.outputDir, { recursive: true });

  // Resolve pinned values once so task generation and the summary always agree.
  const omkVersion = process.env.npm_package_version ?? "0.0.0";
  const treeHash = config.pinTreeHash ?? "synthetic";
  const seed = config.pinSeed ?? 42;
  const providerConfigHash = config.pinProviderConfigHash ?? hashConfig(runtimes.map((r) => r.id));

  // Load tasks
  const fixture = config.mode === "shadow"
    ? generateSyntheticTraces(2, seed, omkVersion, treeHash, providerConfigHash)
    : await loadRecordedTraces(config.tasksDir);
  let tasks = [...fixture.tasks];
  if (config.categories && config.categories.length > 0) {
    const allowed = new Set(config.categories);
    tasks = tasks.filter((t) => allowed.has(t.category));
  }

  const shadowEngine = createShadowModeEngine({ runtimes, history });
  const etsEngine = createEvidenceTrustScoreV2Engine();
  const results = [];
  for (const task of tasks) {
    const capsule = capsuleFromTask(task);
    const shadowRecord = shadowEngine.evaluate(task.taskId, task.taskId, capsule);
    const decisions = shadowEngine.toBenchmarkDecision(shadowRecord);

    // Simulate execution using recorded attempts
    const attempts = task.recordedAttempts;
    const lastAttempt = attempts[attempts.length - 1];
    const solved = lastAttempt?.status === "success";
    const fallbackUsed = attempts.length > 1;
    const fallbackSucceeded = fallbackUsed && solved;
    const rolledBack = attempts.some((a) => a.status === "cancelled");
    // Heuristic: an absolute path that does not mention "worktree" counts as
    // a write outside the sandbox; at most one violation is counted per task.
    const escapedSandbox = attempts.some((a) => a.changedFiles.some((f) => f.startsWith("/") && !f.includes("worktree")));
    const sandboxViolations = escapedSandbox ? 1 : 0;

    // ETS v2 evaluation
    const isSecurityTask = task.category.includes("security");
    const allEvidence = attempts.flatMap((a) => attemptToEvidenceItem(a));
    const etsResult = await etsEngine.evaluate({
      output: lastAttempt?.summary ?? "",
      taskType: isSecurityTask ? "security" : "feature",
      risk: isSecurityTask ? "critical" : "medium",
      runArtifacts: {
        items: allEvidence,
        meta: {
          runId: task.taskId,
          nodeId: task.taskId,
          provider: lastAttempt?.provider ?? "unknown",
          model: lastAttempt?.model ?? "unknown",
          cwd: "[repo-root]",
          treeHashBefore: task.treeHash,
          treeHashAfter: task.treeHash,
          commandHash: hashConfig(attempts.map((a) => a.commandsRun)),
          timestamp: new Date().toISOString(),
          command: attempts.map((a) => a.commandsRun.join("; ")).join(" || "),
        },
      },
      dependencyGraphFiles: task.relevantFiles,
    });

    // "False done": evidence says pass although the recorded run did not succeed.
    const falseDone = !solved && etsResult.verdict === "pass";
    const totalLatency = attempts.reduce((s, a) => s + a.latencyMs, 0);
    const totalCost = attempts.reduce((s, a) => s + a.costUsdEstimated, 0);
    results.push({
      taskId: task.taskId,
      solved,
      evidenceTrustScore: etsResult.score,
      falseDone,
      fallbackUsed,
      fallbackSucceeded,
      routerRegret: shadowRecord.regretV2,
      costUsd: totalCost,
      latencyMs: totalLatency,
      rolledBack,
      sandboxViolations,
      attemptCount: attempts.length,
      decisions,
    });
  }

  const completedAt = new Date().toISOString();
  const durationMs = Math.round(performance.now() - startedMs);
  const solvedCount = results.filter((r) => r.solved).length;
  const totalTasks = results.length;
  const solveRate = totalTasks > 0 ? solvedCount / totalTasks : 0;
  const evidenceMean = totalTasks > 0 ? results.reduce((s, r) => s + r.evidenceTrustScore, 0) / totalTasks : 0;
  const falseDoneRate = totalTasks > 0 ? results.filter((r) => r.falseDone).length / totalTasks : 0;
  const fallbackAttempts = results.filter((r) => r.fallbackUsed);
  const fallbackSuccessRate = fallbackAttempts.length > 0
    ? fallbackAttempts.filter((r) => r.fallbackSucceeded).length / fallbackAttempts.length
    : 0;
  const routerRegretMean = totalTasks > 0 ? results.reduce((s, r) => s + r.routerRegret, 0) / totalTasks : 0;
  // Cost of every task (solved or not) spread over the solved ones.
  const costPerSolved = solvedCount > 0 ? results.reduce((s, r) => s + r.costUsd, 0) / solvedCount : 0;
  const latencies = results.map((r) => r.latencyMs).sort((a, b) => a - b);
  const p95Latency = nearestRankPercentile(latencies, 95);
  const rollbackRate = totalTasks > 0 ? results.filter((r) => r.rolledBack).length / totalTasks : 0;
  const sandboxViolationCount = results.reduce((s, r) => s + r.sandboxViolations, 0);

  const summary = {
    schemaVersion: "omk.benchmark.v1",
    runId: config.runId,
    startedAt,
    completedAt,
    durationMs,
    treeHash,
    seed,
    providerConfigHash,
    omkVersion,
    mode: config.mode,
    totalTasks,
    solvedCount,
    solveRate,
    evidenceTrustScoreMean: evidenceMean,
    falseDoneRate,
    fallbackSuccessRate,
    routerRegretMean,
    costPerSolvedTask: costPerSolved,
    p95LatencyMs: p95Latency,
    rollbackRate,
    sandboxViolationCount,
    results,
  };
  const outPath = join(config.outputDir, `${config.runId}.json`);
  await writeFile(outPath, JSON.stringify(summary, null, 2), "utf-8");
  return summary;
}
|
|
191
|
+
export { createShadowModeEngine, computeRouterRegret } from "./shadow-mode.js";
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shadow Mode Engine — side-by-side router v1/v2 recording.
|
|
3
|
+
*/
|
|
4
|
+
import type { AgentRuntime } from "../runtime/agent-runtime.js";
|
|
5
|
+
import type { ContextCapsule } from "../runtime/context-capsule.js";
|
|
6
|
+
import type { EvidenceHistoryEntry } from "../runtime/contracts/router-v2.js";
|
|
7
|
+
import type { ShadowModeRecord, BenchmarkDecisionRecord } from "./contracts.js";
|
|
8
|
+
/** Inputs to `createShadowModeEngine`: the candidate runtimes and the evidence history handed to both routers. */
export interface ShadowModeOptions {
|
|
9
|
+
readonly runtimes: AgentRuntime[];
|
|
10
|
+
readonly history: EvidenceHistoryEntry[];
|
|
11
|
+
}
|
|
12
|
+
/**
 * Engine that runs router v1 and router v2 on the same capsule
 * (`evaluate`) and converts the resulting record into benchmark decision
 * rows (`toBenchmarkDecision`, one row per router that produced a decision).
 */
export interface ShadowModeEngine {
|
|
13
|
+
evaluate(taskId: string, nodeId: string, capsule: ContextCapsule): ShadowModeRecord;
|
|
14
|
+
toBenchmarkDecision(record: ShadowModeRecord): BenchmarkDecisionRecord[];
|
|
15
|
+
}
|
|
16
|
+
/** Create a shadow-mode engine over the given runtimes and evidence history. */
export declare function createShadowModeEngine(options: ShadowModeOptions): ShadowModeEngine;
|
|
17
|
+
/**
 * Regret of having selected `selectedId`: the best composite score among
 * `candidates` minus the selected runtime's composite score, never negative.
 * Returns 0 when there are no scores; a `selectedId` with no score is
 * treated as having composite 0.
 */
export declare function computeRouterRegret(candidates: AgentRuntime[], intent: string, history: EvidenceHistoryEntry[], selectedId: string): number;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shadow Mode Engine — side-by-side router v1/v2 recording.
|
|
3
|
+
*/
|
|
4
|
+
import { createRuntimeRouter } from "../runtime/runtime-router.js";
|
|
5
|
+
import { createRouterV2ScoringEngine, scoreRuntimes } from "../runtime/router-v2-scoring.js";
|
|
6
|
+
/**
 * Create an engine that runs router v1 and router v2 side by side and
 * records what each would have chosen.
 *
 * @param {{ runtimes: object[], history: object[] }} options - Candidate
 *   runtimes and the evidence history passed to both routers.
 * @returns {{ evaluate: Function, toBenchmarkDecision: Function }}
 */
export function createShadowModeEngine(options) {
  const v1Router = createRuntimeRouter({ runtimes: options.runtimes });
  const v2Engine = createRouterV2ScoringEngine();

  /**
   * Regret of a selection: best composite minus the selected composite,
   * floored at 0. An id with no score counts as composite 0.
   */
  function computeRegret(scores, selectedId) {
    if (scores.length === 0) {
      return 0;
    }
    const best = Math.max(...scores.map((s) => s.composite));
    const selected = scores.find((s) => s.runtimeId === selectedId)?.composite ?? 0;
    return Math.max(0, best - selected);
  }

  /**
   * Runtime id with the highest composite score (first one wins ties), or
   * `fallbackId` when there are no scores. Uses the same "best" definition
   * as computeRegret, so it does not depend on the order of `scores`.
   */
  function topScoredRuntimeId(scores, fallbackId) {
    let top;
    for (const score of scores ?? []) {
      if (top === undefined || score.composite > top.composite) {
        top = score;
      }
    }
    return top?.runtimeId ?? fallbackId;
  }

  /**
   * Run both routers for one capsule.
   * A router that throws yields a null decision and a regret of 1.
   */
  function evaluate(taskId, nodeId, capsule) {
    const intent = v1Router.classifyIntent(capsule);

    let v1Decision = null;
    let regretV1 = 0;
    try {
      v1Decision = v1Router.selectByIntent(capsule, options.history);
      // v1 has no composite score of its own; derive one with fixed weights (sum 1.0).
      const v1Scores = v1Decision.scores.map((s) => ({
        runtimeId: s.runtime,
        composite: 0.35 * s.qualityScore +
          0.25 * s.evidencePassRate +
          0.15 * s.costScore +
          0.1 * s.latencyScore +
          0.15 * (1 - s.recentFailurePenalty),
      }));
      regretV1 = computeRegret(v1Scores, v1Decision.runtime.id);
    } catch {
      // Deliberate: a failing router is recorded as "no decision, maximal regret".
      v1Decision = null;
      regretV1 = 1;
    }

    let v2Decision = null;
    let regretV2 = 0;
    try {
      v2Decision = v2Engine.select(options.runtimes, intent, options.history);
      const v2Scores = v2Decision.scores.map((s) => ({
        runtimeId: s.runtimeId,
        composite: s.composite,
      }));
      regretV2 = computeRegret(v2Scores, v2Decision.runtime.id);
    } catch {
      // Deliberate: same policy as for v1.
      v2Decision = null;
      regretV2 = 1;
    }

    // Both routers failing compares undefined with undefined, i.e. no disagreement.
    const disagreement = v1Decision?.runtime.id !== v2Decision?.runtime.id;
    return {
      taskId,
      nodeId,
      intent,
      v1Decision,
      v2Decision,
      regretV1,
      regretV2,
      disagreement,
      timestamp: new Date().toISOString(),
    };
  }

  /**
   * Convert a shadow record into benchmark decision rows, one per router
   * that produced a decision. For both rows the "best available" runtime is
   * judged by the v2 scores, falling back to the router's own selection.
   */
  function toBenchmarkDecision(record) {
    const { v1Decision, v2Decision } = record;
    const out = [];
    if (v1Decision) {
      out.push({
        component: "runtime-router-v1",
        selectedRuntime: v1Decision.runtime.id,
        bestAvailableRuntime: topScoredRuntimeId(v2Decision?.scores, v1Decision.runtime.id),
        regret: record.regretV1,
        reason: v1Decision.reason,
      });
    }
    if (v2Decision) {
      out.push({
        component: "runtime-router-v2",
        selectedRuntime: v2Decision.runtime.id,
        bestAvailableRuntime: topScoredRuntimeId(v2Decision.scores, v2Decision.runtime.id),
        regret: record.regretV2,
        reason: v2Decision.reason,
        scoresV2: v2Decision.scores,
      });
    }
    return out;
  }

  return { evaluate, toBenchmarkDecision };
}
|
|
88
|
+
/**
 * Regret of having selected `selectedId` among `candidates`, judged by the
 * router v2 composite scores.
 *
 * @param {object[]} candidates - Runtimes to score.
 * @param {string} intent - Intent passed to the scorer.
 * @param {object[]} history - Evidence history passed to the scorer.
 * @param {string} selectedId - Runtime id that was actually chosen.
 * @returns {number} Best composite minus the selected runtime's composite,
 *   floored at 0; 0 when nothing was scored. A `selectedId` with no score
 *   counts as composite 0.
 */
export function computeRouterRegret(candidates, intent, history, selectedId) {
  const scores = scoreRuntimes(candidates, intent, history);
  if (scores.length === 0) {
    return 0;
  }
  const best = Math.max(...scores.map((s) => s.composite));
  const selected = scores.find((s) => s.runtimeId === selectedId)?.composite ?? 0;
  return Math.max(0, best - selected);
}
|