open-multi-agent-kit 0.78.2 → 0.78.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/CHANGELOG.md +27 -2
  2. package/dist/benchmark/contracts.d.ts +116 -0
  3. package/dist/benchmark/contracts.js +6 -0
  4. package/dist/benchmark/fixtures.d.ts +11 -0
  5. package/dist/benchmark/fixtures.js +121 -0
  6. package/dist/benchmark/harness.d.ts +13 -0
  7. package/dist/benchmark/harness.js +191 -0
  8. package/dist/benchmark/shadow-mode.d.ts +17 -0
  9. package/dist/benchmark/shadow-mode.js +96 -0
  10. package/dist/commands/merge.js +102 -56
  11. package/dist/contracts/provider-health.d.ts +37 -0
  12. package/dist/contracts/provider-health.js +49 -1
  13. package/dist/evidence/evidence-trust-score.d.ts +101 -0
  14. package/dist/evidence/evidence-trust-score.js +408 -0
  15. package/dist/evidence/index.d.ts +2 -0
  16. package/dist/evidence/index.js +1 -0
  17. package/dist/orchestration/merge-arbiter.d.ts +91 -0
  18. package/dist/orchestration/merge-arbiter.js +376 -0
  19. package/dist/providers/health.d.ts +3 -0
  20. package/dist/providers/health.js +46 -0
  21. package/dist/providers/index.d.ts +1 -0
  22. package/dist/providers/index.js +1 -0
  23. package/dist/providers/provider-health.d.ts +8 -1
  24. package/dist/providers/provider-health.js +39 -0
  25. package/dist/providers/provider-task-runner.js +31 -0
  26. package/dist/providers/provider.d.ts +2 -0
  27. package/dist/providers/router.js +87 -3
  28. package/dist/providers/types.d.ts +4 -0
  29. package/dist/runtime/provider-maturity-gate.d.ts +2 -0
  30. package/dist/runtime/provider-maturity-gate.js +28 -0
  31. package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
  32. package/dist/runtime/tool-dispatch-contracts.js +42 -2
  33. package/dist/runtime/weakness-remediation-index.d.ts +1 -1
  34. package/dist/runtime/weakness-remediation-index.js +1 -1
  35. package/dist/safety/enforcement-engine.d.ts +89 -0
  36. package/dist/safety/enforcement-engine.js +279 -0
  37. package/dist/safety/tool-authority-gate.d.ts +40 -0
  38. package/dist/safety/tool-authority-gate.js +92 -0
  39. package/dist/schema/evidence.schema.d.ts +2 -2
  40. package/dist/schema/proof-bundle.schema.d.ts +2 -2
  41. package/docs/benchmark-design.md +122 -0
  42. package/package.json +5 -2
@@ -11,6 +11,7 @@ import { getOmkResourceSettings } from "../util/resource-profile.js";
11
11
  import { defaultScopedRoleAgentFile, writeScopedAgentFile } from "../util/scoped-agent-file.js";
12
12
  import { createOmkJsonEnvelope } from "../util/json-envelope.js";
13
13
  import { emitJson } from "../util/cli-contract.js";
14
+ import { runMergeArbiter, } from "../orchestration/merge-arbiter.js";
14
15
  /**
15
16
  * JSON path for `omk merge --json`.
16
17
  * Read-only preview: resolves the run, collects worktree diffs (git diff +
@@ -138,65 +139,110 @@ export async function mergeCommand(options) {
138
139
  if (dryRun)
139
140
  console.log(style.orange("🟡 DRY RUN — no changes will be applied"));
140
141
  console.log("");
141
- // ── 1. Collect diffs from all worktrees ──
142
- const workers = [];
143
- for (const name of workerNames) {
144
- const wtPath = join(worktreesDir, name);
145
- const diffResult = await runShell("git", ["-C", wtPath, "diff", currentBranch], { timeout: 15000 });
146
- if (diffResult.failed || !diffResult.stdout.trim()) {
147
- console.log(style.gray(` ${name}: no changes`));
148
- continue;
149
- }
150
- const diff = diffResult.stdout;
151
- const diffLines = diff.split("\n").length;
152
- // Check apply-ability
153
- const applyCheck = await runShell("git", ["apply", "--check"], {
154
- cwd: root,
155
- input: diff,
156
- timeout: 15000,
142
+ let report;
143
+ let winner = null;
144
+ if (strategy === "arbiter") {
145
+ // ── Arbiter path ──
146
+ console.log(style.purple("Running merge arbiter..."));
147
+ const config = await readTextFile(join(root, ".omk", "config.toml"), "");
148
+ const arbiterResult = await runMergeArbiter(worktreesDir, currentBranch, root, config, {
149
+ threshold: 0.6,
150
+ testTimeoutMs: 120_000,
157
151
  });
158
- const canApply = !applyCheck.failed;
159
- workers.push({ name, path: wtPath, diff, diffLines, canApply });
160
- console.log(` ${style.purpleBold(name)} ${canApply ? style.mint("(clean)") : style.pink("(conflicts)")} ${style.gray(`${diffLines} lines`)}`);
161
- }
162
- if (workers.length === 0) {
163
- console.log(status.warn("No worker changes to merge."));
164
- return;
165
- }
166
- // ── 2. Reviewer scoring ──
167
- console.log("");
168
- console.log(style.purple("Scoring diffs with reviewer..."));
169
- for (const w of workers) {
170
- const score = await scoreDiff(w.diff, w.name);
171
- w.reviewScore = score.score;
172
- w.reviewReason = score.reason;
173
- const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
174
- console.log(` ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
175
- }
176
- // ── 3. Test verification in worktrees ──
177
- console.log("");
178
- console.log(style.purple("Running tests in worktrees..."));
179
- for (const w of workers) {
180
- const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
181
- cwd: w.path,
182
- timeout: 120_000,
152
+ // Map arbiter candidates back to WorkerDiff for reporting
153
+ const arbiterWorkers = arbiterResult.trace.steps
154
+ .filter((s) => s.step === "evidence-suite" || s.step === "score")
155
+ .map((s) => {
156
+ return {
157
+ name: s.candidateId.replace("candidate-", ""),
158
+ path: "",
159
+ diff: "",
160
+ diffLines: 0,
161
+ canApply: s.detail.includes("apply=true"),
162
+ reviewScore: 50,
163
+ reviewReason: s.detail,
164
+ testsPassed: s.detail.includes("tests=true"),
165
+ };
183
166
  });
184
- w.testsPassed = !testResult.failed;
185
- console.log(` ${w.name}: ${w.testsPassed ? style.mint("tests passed") : style.pink("tests failed")}`);
167
+ // De-duplicate by name
168
+ const workerMap = new Map();
169
+ for (const w of arbiterWorkers)
170
+ workerMap.set(w.name, w);
171
+ report = {
172
+ winner: arbiterResult.winner?.name ?? null,
173
+ reason: arbiterResult.rationale.summary,
174
+ conflicts: arbiterResult.rationale.conflicts,
175
+ filesApplied: 0,
176
+ dryRun,
177
+ workers: [...workerMap.values()],
178
+ };
179
+ if (arbiterResult.requiresHumanApproval) {
180
+ console.log(status.error(arbiterResult.rationale.humanApprovalReason ?? "No candidate meets threshold — human approval required."));
181
+ printReport(report);
182
+ process.exit(1);
183
+ }
184
+ winner = arbiterResult.winner ? { name: arbiterResult.winner.name, path: arbiterResult.winner.path, diff: arbiterResult.winner.diff, diffLines: arbiterResult.winner.diffLines, canApply: arbiterResult.winner.canApply, reviewScore: arbiterResult.winner.evidence.reviewerScore, reviewReason: arbiterResult.winner.evidence.reviewerReason, testsPassed: arbiterResult.winner.evidence.testsPassed } : null;
185
+ }
186
+ else {
187
+ // ── 1. Collect diffs from all worktrees ──
188
+ const workers = [];
189
+ for (const name of workerNames) {
190
+ const wtPath = join(worktreesDir, name);
191
+ const diffResult = await runShell("git", ["-C", wtPath, "diff", currentBranch], { timeout: 15000 });
192
+ if (diffResult.failed || !diffResult.stdout.trim()) {
193
+ console.log(style.gray(` ${name}: no changes`));
194
+ continue;
195
+ }
196
+ const diff = diffResult.stdout;
197
+ const diffLines = diff.split("\n").length;
198
+ // Check apply-ability
199
+ const applyCheck = await runShell("git", ["apply", "--check"], {
200
+ cwd: root,
201
+ input: diff,
202
+ timeout: 15000,
203
+ });
204
+ const canApply = !applyCheck.failed;
205
+ workers.push({ name, path: wtPath, diff, diffLines, canApply });
206
+ console.log(` ${style.purpleBold(name)} ${canApply ? style.mint("(clean)") : style.pink("(conflicts)")} ${style.gray(`${diffLines} lines`)}`);
207
+ }
208
+ if (workers.length === 0) {
209
+ console.log(status.warn("No worker changes to merge."));
210
+ return;
211
+ }
212
+ // ── 2. Reviewer scoring ──
213
+ console.log("");
214
+ console.log(style.purple("Scoring diffs with reviewer..."));
215
+ for (const w of workers) {
216
+ const score = await scoreDiff(w.diff, w.name);
217
+ w.reviewScore = score.score;
218
+ w.reviewReason = score.reason;
219
+ const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
220
+ console.log(` ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
221
+ }
222
+ // ── 3. Test verification in worktrees ──
223
+ console.log("");
224
+ console.log(style.purple("Running tests in worktrees..."));
225
+ for (const w of workers) {
226
+ const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
227
+ cwd: w.path,
228
+ timeout: 120_000,
229
+ });
230
+ w.testsPassed = !testResult.failed;
231
+ console.log(` ${w.name}: ${w.testsPassed ? style.mint("tests passed") : style.pink("tests failed")}`);
232
+ }
233
+ // ── 4. Select winner ──
234
+ console.log("");
235
+ console.log(style.purple("Selecting winner..."));
236
+ winner = selectWinner(workers, strategy);
237
+ report = {
238
+ winner: winner?.name ?? null,
239
+ reason: winner?.reviewReason ?? "No suitable candidate",
240
+ conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
241
+ filesApplied: 0,
242
+ dryRun,
243
+ workers,
244
+ };
186
245
  }
187
- // ── 4. Select winner ──
188
- console.log("");
189
- console.log(style.purple("Selecting winner..."));
190
- const winner = selectWinner(workers, strategy);
191
- // ── 5. Apply or preview ──
192
- const report = {
193
- winner: winner?.name ?? null,
194
- reason: winner?.reviewReason ?? "No suitable candidate",
195
- conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
196
- filesApplied: 0,
197
- dryRun,
198
- workers,
199
- };
200
246
  if (!winner) {
201
247
  console.log(status.error("No worker diff can be applied cleanly."));
202
248
  printReport(report);
@@ -10,6 +10,43 @@
10
10
  export type ProviderFailureKind = "none" | "runtime" | "auth" | "model" | "quota" | "policy" | "transient" | "unknown";
11
11
  /** Authority level a provider holds for a given capability lane. */
12
12
  export type ProviderAuthorityLevel = "none" | "advisory" | "direct" | "full";
13
+ /** Capability-vector state machine for a single provider dimension. */
14
+ export type ProviderCapabilityState = "missing" | "installed" | "auth_present" | "auth_valid" | "model_available" | "quota_available" | "sandbox_supported" | "tool_contract_verified" | "ready";
15
+ /** Ordinal ordering for capability states (higher = more mature). */
16
+ export declare const PROVIDER_CAPABILITY_ORDINAL: Readonly<Record<ProviderCapabilityState, number>>;
17
+ /** Provider health as a capability vector (Profiler v2). */
18
+ export interface ProviderHealthVector {
19
+ /** Provider id (e.g. "kimi", "deepseek", "codex"). */
20
+ provider: string;
21
+ /** Binary/runtime installation state. */
22
+ binary: ProviderCapabilityState;
23
+ /** Authentication state. */
24
+ auth: ProviderCapabilityState;
25
+ /** Model resolution state. */
26
+ model: ProviderCapabilityState;
27
+ /** Quota/balance state. */
28
+ quota: ProviderCapabilityState;
29
+ /** P50 latency in milliseconds (0 = unknown). */
30
+ latencyP50Ms: number;
31
+ /** P95 latency in milliseconds (0 = unknown). */
32
+ latencyP95Ms: number;
33
+ /** Whether the provider supports read operations. */
34
+ supportsRead: boolean;
35
+ /** Whether the provider supports write operations. */
36
+ supportsWrite: boolean;
37
+ /** Whether the provider supports shell execution. */
38
+ supportsShell: boolean;
39
+ /** Whether the provider supports sandboxed execution. */
40
+ supportsSandbox: boolean;
41
+ /** 7-day evidence pass rate [0, 1] (default 0.5 = no data). */
42
+ evidencePassRate7d: number;
43
+ /** Exponentially-weighted moving average of failures [0, 1] (0 = healthy). */
44
+ failureEwma: number;
45
+ }
46
+ /** Derive a backward-compatible `healthy` boolean from a capability vector. */
47
+ export declare function isHealthy(vector: ProviderHealthVector): boolean;
48
+ /** Convert the legacy {@link ProviderHealth} contract into a v2 capability vector. */
49
+ export declare function providerHealthToVector(health: ProviderHealth): ProviderHealthVector;
13
50
  /**
14
51
  * Normalized provider health snapshot.
15
52
  *
@@ -6,4 +6,52 @@
6
6
  * renaming any pre-existing keys. It never carries secret values — only
7
7
  * boolean signals (e.g. `authOk`) and non-sensitive remediation hints.
8
8
  */
9
- export {};
9
+ /** Ordinal ordering for capability states (higher = more mature). */
10
+ export const PROVIDER_CAPABILITY_ORDINAL = {
11
+ missing: 0,
12
+ installed: 1,
13
+ auth_present: 2,
14
+ auth_valid: 3,
15
+ model_available: 4,
16
+ quota_available: 5,
17
+ sandbox_supported: 6,
18
+ tool_contract_verified: 7,
19
+ ready: 8,
20
+ };
21
+ /** Derive a backward-compatible `healthy` boolean from a capability vector. */
22
+ export function isHealthy(vector) {
23
+ return (vector.binary === "ready" &&
24
+ vector.auth === "ready" &&
25
+ vector.model === "ready" &&
26
+ vector.quota === "ready");
27
+ }
28
+ /** Convert the legacy {@link ProviderHealth} contract into a v2 capability vector. */
29
+ export function providerHealthToVector(health) {
30
+ const binary = health.runtimeOk ? "ready" : "missing";
31
+ const auth = health.authOk
32
+ ? "ready"
33
+ : health.failureKind === "auth"
34
+ ? "auth_present"
35
+ : "missing";
36
+ const model = health.modelOk ? "ready" : "missing";
37
+ const quota = health.quotaOk
38
+ ? "ready"
39
+ : health.failureKind === "quota"
40
+ ? "auth_valid"
41
+ : "missing";
42
+ return {
43
+ provider: health.provider,
44
+ binary,
45
+ auth,
46
+ model,
47
+ quota,
48
+ latencyP50Ms: 0,
49
+ latencyP95Ms: 0,
50
+ supportsRead: true,
51
+ supportsWrite: health.writeAuthority !== "none" && health.writeAuthority !== "advisory",
52
+ supportsShell: health.shellAuthority !== "none",
53
+ supportsSandbox: health.shellAuthority !== "none",
54
+ evidencePassRate7d: health.failureKind === "none" ? 1.0 : 0.5,
55
+ failureEwma: health.failureKind === "none" ? 0 : 0.5,
56
+ };
57
+ }
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Evidence Trust Score (ETS) v2 — Algorithm 10
3
+ *
4
+ * Pipeline:
5
+ * ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
6
+ * → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
7
+ * → EvidenceTrustScore() → Pass | Warn | Fail
8
+ *
9
+ * Formula:
10
+ * ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
11
+ * + 0.15*provenance_integrity + 0.10*freshness
12
+ * - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
13
+ */
14
+ import type { EvidenceItem, EvidenceKind } from "../runtime/contracts/evidence.js";
15
+ /** A claim extracted from agent output. */
16
+ export interface EtsClaim {
17
+ readonly claimId: string;
18
+ readonly text: string;
19
+ readonly category: EtsClaimCategory;
20
+ readonly confidence: number;
21
+ }
22
+ export type EtsClaimCategory = "test" | "build" | "typecheck" | "lint" | "behavioral" | "security" | "performance" | "docs";
23
+ /** Task type that produced the output. */
24
+ export type EtsTaskType = "feature" | "bugfix" | "refactor" | "docs" | "test" | "review" | "security" | "release";
25
+ /** Risk tier for the task. */
26
+ export type EtsRiskTier = "low" | "medium" | "high" | "critical";
27
+ /** Required evidence for a claim. */
28
+ export interface RequiredEvidenceItem {
29
+ readonly evidenceId: string;
30
+ readonly kind: EvidenceKind;
31
+ readonly description: string;
32
+ readonly minConfidence: number;
33
+ }
34
+ /** Metadata about a run artifact. */
35
+ export interface RunArtifactMeta {
36
+ readonly runId: string;
37
+ readonly nodeId?: string;
38
+ readonly provider?: string;
39
+ readonly model?: string;
40
+ readonly cwd?: string;
41
+ readonly treeHashBefore?: string;
42
+ readonly treeHashAfter?: string;
43
+ readonly commandHash?: string;
44
+ readonly timestamp: string;
45
+ readonly command?: string;
46
+ }
47
+ /** Collected evidence with provenance. */
48
+ export interface CollectedEvidence {
49
+ readonly items: readonly EvidenceItem[];
50
+ readonly meta: RunArtifactMeta;
51
+ }
52
+ /** Result of verifying required vs collected evidence. */
53
+ export interface EvidenceVerificationResult {
54
+ readonly satisfied: readonly string[];
55
+ readonly missing: readonly string[];
56
+ readonly partial: readonly string[];
57
+ }
58
+ /** ETS v2 result. */
59
+ export interface EtsV2Result {
60
+ readonly score: number;
61
+ readonly reproducibility: number;
62
+ readonly independence: number;
63
+ readonly coverageRelevance: number;
64
+ readonly provenanceIntegrity: number;
65
+ readonly freshness: number;
66
+ readonly gamingPenalty: number;
67
+ readonly staleResultPenalty: number;
68
+ readonly unverifiableClaimPenalty: number;
69
+ readonly verdict: "pass" | "warn" | "fail";
70
+ readonly reasons: readonly string[];
71
+ }
72
+ /** ETS v2 engine. */
73
+ export interface EtsV2Engine {
74
+ evaluate(params: EtsV2Params): Promise<EtsV2Result>;
75
+ }
76
+ /** Input parameters for ETS v2 evaluation. */
77
+ export interface EtsV2Params {
78
+ readonly output: string;
79
+ readonly taskType: EtsTaskType;
80
+ readonly risk: EtsRiskTier;
81
+ readonly runArtifacts: CollectedEvidence;
82
+ readonly dependencyGraphFiles?: readonly string[];
83
+ readonly now?: string;
84
+ }
85
+ declare const WEIGHTS: {
86
+ readonly reproducibility: 0.3;
87
+ readonly independence: 0.25;
88
+ readonly coverageRelevance: 0.2;
89
+ readonly provenanceIntegrity: 0.15;
90
+ readonly freshness: 0.1;
91
+ };
92
+ export declare function extractClaims(output: string): readonly EtsClaim[];
93
+ export declare function requiredEvidenceForClaim(claim: EtsClaim, taskType: EtsTaskType, risk: EtsRiskTier): readonly RequiredEvidenceItem[];
94
+ export declare function collectEvidenceFromRunDir(runDir: string, meta: RunArtifactMeta): Promise<CollectedEvidence>;
95
+ export declare function verifyEvidence(required: readonly RequiredEvidenceItem[], collected: CollectedEvidence): EvidenceVerificationResult;
96
+ export interface EtsV2EngineOptions {
97
+ readonly customWeights?: Partial<typeof WEIGHTS>;
98
+ readonly now?: string;
99
+ }
100
+ export declare function createEvidenceTrustScoreV2Engine(options?: EtsV2EngineOptions): EtsV2Engine;
101
+ export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore };