open-multi-agent-kit 0.78.2 → 0.78.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -2
- package/MATURITY.md +2 -2
- package/README.md +4 -4
- package/dist/benchmark/contracts.d.ts +116 -0
- package/dist/benchmark/contracts.js +6 -0
- package/dist/benchmark/fixtures.d.ts +11 -0
- package/dist/benchmark/fixtures.js +124 -0
- package/dist/benchmark/harness.d.ts +13 -0
- package/dist/benchmark/harness.js +191 -0
- package/dist/benchmark/shadow-mode.d.ts +17 -0
- package/dist/benchmark/shadow-mode.js +95 -0
- package/dist/cli/release-promotion-gate.js +14 -4
- package/dist/commands/merge.js +102 -56
- package/dist/contracts/provider-health.d.ts +37 -0
- package/dist/contracts/provider-health.js +49 -1
- package/dist/evidence/evidence-trust-score.d.ts +101 -0
- package/dist/evidence/evidence-trust-score.js +408 -0
- package/dist/evidence/index.d.ts +2 -0
- package/dist/evidence/index.js +1 -0
- package/dist/native/linux-x64/omk-safety +0 -0
- package/dist/orchestration/merge-arbiter.d.ts +91 -0
- package/dist/orchestration/merge-arbiter.js +376 -0
- package/dist/providers/health.d.ts +3 -0
- package/dist/providers/health.js +46 -0
- package/dist/providers/index.d.ts +1 -0
- package/dist/providers/index.js +1 -0
- package/dist/providers/provider-health.d.ts +8 -1
- package/dist/providers/provider-health.js +39 -0
- package/dist/providers/provider-task-runner.js +31 -0
- package/dist/providers/provider.d.ts +2 -0
- package/dist/providers/router.js +80 -3
- package/dist/providers/types.d.ts +4 -0
- package/dist/runtime/contracts/weakness-remediation.d.ts +6 -0
- package/dist/runtime/provider-maturity-gate.d.ts +2 -0
- package/dist/runtime/provider-maturity-gate.js +26 -0
- package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
- package/dist/runtime/tool-dispatch-contracts.js +42 -2
- package/dist/runtime/weakness-remediation-index.d.ts +1 -1
- package/dist/runtime/weakness-remediation-index.js +1 -1
- package/dist/safety/enforcement-engine.d.ts +89 -0
- package/dist/safety/enforcement-engine.js +279 -0
- package/dist/safety/tool-authority-gate.d.ts +40 -0
- package/dist/safety/tool-authority-gate.js +92 -0
- package/dist/schema/evidence.schema.d.ts +2 -2
- package/dist/schema/proof-bundle.schema.d.ts +2 -2
- package/docs/benchmark-design.md +122 -0
- package/docs/getting-started.md +1 -1
- package/docs/provider-maturity.md +1 -1
- package/docs/versioning.md +3 -3
- package/package.json +7 -3
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shadow Mode Engine — side-by-side router v1/v2 recording.
|
|
3
|
+
*/
|
|
4
|
+
import { createRuntimeRouter } from "../runtime/runtime-router.js";
|
|
5
|
+
import { createRouterV2ScoringEngine, scoreRuntimes } from "../runtime/router-v2-scoring.js";
|
|
6
|
+
/**
 * Create a shadow-mode engine that sends every task through both router v1 and
 * router v2 and records the two decisions side by side.
 *
 * @param options - `options.runtimes` is handed to both routers and
 *   `options.history` is handed to both selection calls.
 * @returns An object exposing `evaluate` and `toBenchmarkDecision`.
 */
export function createShadowModeEngine(options) {
    const v1Router = createRuntimeRouter({ runtimes: options.runtimes });
    const v2Engine = createRouterV2ScoringEngine();

    /**
     * Runtime id of the entry with the highest `composite` (first entry wins ties),
     * or `undefined` for an empty list. Scanning instead of reading `scores[0]`
     * keeps the reported "best" runtime in agreement with `computeRegret`, which
     * also uses the true maximum and does not assume the list is sorted.
     */
    function bestRuntimeId(scores) {
        let top;
        for (const entry of scores) {
            if (top === undefined || entry.composite > top.composite) {
                top = entry;
            }
        }
        return top?.runtimeId;
    }

    /**
     * Regret of having picked `selectedId`: best composite minus the selected
     * runtime's composite, floored at 0. An empty list yields 0; a selected id
     * that is not in the list counts as composite 0.
     */
    function computeRegret(scores, selectedId) {
        if (scores.length === 0)
            return 0;
        const best = Math.max(...scores.map((s) => s.composite));
        const selected = scores.find((s) => s.runtimeId === selectedId)?.composite ?? 0;
        return Math.max(0, best - selected);
    }

    /**
     * Run both routers for one task node and return the comparison record.
     * A router that throws is recorded as a `null` decision with regret 1.
     * `classifyIntent` runs outside the try blocks, so its errors propagate.
     */
    function evaluate(taskId, nodeId, capsule) {
        const intent = v1Router.classifyIntent(capsule);

        let v1Decision = null;
        let regretV1 = 0;
        try {
            v1Decision = v1Router.selectByIntent(capsule, options.history);
            // v1 exposes sub-scores only; fold them into one composite (weights sum to 1.0).
            const v1Scores = v1Decision.scores.map((s) => ({
                runtimeId: s.runtime,
                composite: 0.35 * s.qualityScore +
                    0.25 * s.evidencePassRate +
                    0.15 * s.costScore +
                    0.1 * s.latencyScore +
                    0.15 * (1 - s.recentFailurePenalty),
            }));
            regretV1 = computeRegret(v1Scores, v1Decision.runtime.id);
        }
        catch {
            // Deliberate best-effort: a failing router must not abort the shadow run.
            v1Decision = null;
            regretV1 = 1;
        }

        let v2Decision = null;
        let regretV2 = 0;
        try {
            v2Decision = v2Engine.select(options.runtimes, intent, options.history);
            const v2Scores = v2Decision.scores.map((s) => ({
                runtimeId: s.runtimeId,
                composite: s.composite,
            }));
            regretV2 = computeRegret(v2Scores, v2Decision.runtime.id);
        }
        catch {
            // Same best-effort handling as for v1.
            v2Decision = null;
            regretV2 = 1;
        }

        // Both routers failing compares undefined with undefined, i.e. no disagreement.
        const disagreement = v1Decision?.runtime.id !== v2Decision?.runtime.id;
        return {
            taskId,
            nodeId,
            intent,
            v1Decision,
            v2Decision,
            regretV1,
            regretV2,
            disagreement,
            timestamp: new Date().toISOString(),
        };
    }

    /**
     * Convert a record produced by `evaluate` into benchmark decision entries:
     * one per router that produced a decision (v1 first, then v2). Both entries
     * take `bestAvailableRuntime` from the v2 scores and fall back to the
     * router's own selection when v2 offers none.
     */
    function toBenchmarkDecision(record) {
        const { v1Decision, v2Decision } = record;
        const v2Best = v2Decision ? bestRuntimeId(v2Decision.scores) : undefined;
        const out = [];
        if (v1Decision) {
            out.push({
                component: "runtime-router-v1",
                selectedRuntime: v1Decision.runtime.id,
                bestAvailableRuntime: v2Best ?? v1Decision.runtime.id,
                regret: record.regretV1,
                reason: v1Decision.reason,
            });
        }
        if (v2Decision) {
            out.push({
                component: "runtime-router-v2",
                selectedRuntime: v2Decision.runtime.id,
                bestAvailableRuntime: v2Best ?? v2Decision.runtime.id,
                regret: record.regretV2,
                reason: v2Decision.reason,
                scoresV2: v2Decision.scores,
            });
        }
        return out;
    }

    return { evaluate, toBenchmarkDecision };
}
|
|
88
|
+
/**
 * Regret of choosing `selectedId`, judged by scoring `candidates` with
 * `scoreRuntimes(candidates, intent, history)`.
 *
 * @returns The best composite minus the selected runtime's composite, floored
 *   at 0. Returns 0 when scoring yields no entries; a `selectedId` that is not
 *   among the scored runtimes counts as composite 0.
 */
export function computeRouterRegret(candidates, intent, history, selectedId) {
    const scored = scoreRuntimes(candidates, intent, history);
    if (scored.length === 0) {
        return 0;
    }
    // Pairwise Math.max keeps the same NaN propagation as a single spread call.
    const topComposite = scored.reduce((highest, entry) => Math.max(highest, entry.composite), -Infinity);
    const chosenEntry = scored.find((entry) => entry.runtimeId === selectedId);
    const chosenComposite = chosenEntry?.composite ?? 0;
    return Math.max(0, topComposite - chosenComposite);
}
|
|
@@ -12,6 +12,9 @@ export function createReleasePromotionGate() {
|
|
|
12
12
|
const w = RELEASE_GATE_WEIGHTS;
|
|
13
13
|
const demoRun = inputs.demoRun ?? false;
|
|
14
14
|
const maturity = inputs.maturity ?? inputs.providerMinimum ?? 0;
|
|
15
|
+
const versionConsistency = inputs.versionConsistency ?? inputs.semver ?? 1;
|
|
16
|
+
const liveBenchmarkPass = inputs.liveBenchmarkPass ?? false;
|
|
17
|
+
const sandboxViolationCount = inputs.sandboxViolationCount ?? Number.POSITIVE_INFINITY;
|
|
15
18
|
const rawScore = w.ci * inputs.ci +
|
|
16
19
|
w.build * (inputs.build ?? 0) +
|
|
17
20
|
w.types * (inputs.types ?? 0) +
|
|
@@ -20,11 +23,11 @@ export function createReleasePromotionGate() {
|
|
|
20
23
|
w.demo * (demoRun ? 1 : 0) +
|
|
21
24
|
w.proof * inputs.proofMedian +
|
|
22
25
|
w.maturity * maturity +
|
|
23
|
-
w.docs * inputs.docs -
|
|
26
|
+
w.docs * inputs.docs * versionConsistency -
|
|
24
27
|
w.regression * inputs.regressionSeverity;
|
|
25
28
|
const score = clamp01(rawScore);
|
|
26
29
|
const reasons = [];
|
|
27
|
-
const blocked = inputs.ci === 0 || inputs.freshInstallSmoke === 0 || !demoRun;
|
|
30
|
+
const blocked = inputs.ci === 0 || inputs.freshInstallSmoke === 0 || versionConsistency === 0 || !demoRun;
|
|
28
31
|
if (blocked) {
|
|
29
32
|
if (inputs.ci === 0) {
|
|
30
33
|
reasons.push("CI score is 0 (blocking)");
|
|
@@ -32,21 +35,28 @@ export function createReleasePromotionGate() {
|
|
|
32
35
|
if (inputs.freshInstallSmoke === 0) {
|
|
33
36
|
reasons.push("Fresh install smoke is 0 (blocking)");
|
|
34
37
|
}
|
|
38
|
+
if (versionConsistency === 0) {
|
|
39
|
+
reasons.push("Version/package/proof consistency is 0 (blocking)");
|
|
40
|
+
}
|
|
35
41
|
if (!demoRun) {
|
|
36
42
|
reasons.push("Minimal verified demo run failed or missing (blocking)");
|
|
37
43
|
}
|
|
38
44
|
}
|
|
45
|
+
const stableEligible = liveBenchmarkPass && sandboxViolationCount === 0;
|
|
39
46
|
let verdict;
|
|
40
47
|
if (blocked) {
|
|
41
48
|
verdict = "block";
|
|
42
49
|
}
|
|
43
|
-
else if (score >= 0.90 && inputs.proofMedian >= 0.85 && maturity >= 0.80) {
|
|
50
|
+
else if (score >= 0.90 && inputs.proofMedian >= 0.85 && maturity >= 0.80 && stableEligible) {
|
|
44
51
|
verdict = "stable";
|
|
45
|
-
reasons.push(`Score ${formatScore(score)} meets stable threshold (≥0.90) with proof≥0.85
|
|
52
|
+
reasons.push(`Score ${formatScore(score)} meets stable threshold (≥0.90) with proof≥0.85, maturity≥0.80, live benchmark pass, and sandbox violations=0`);
|
|
46
53
|
}
|
|
47
54
|
else if (score >= 0.75 && inputs.proofMedian >= 0.75) {
|
|
48
55
|
verdict = "pre-release";
|
|
49
56
|
reasons.push(`Score ${formatScore(score)} meets pre-release threshold (≥0.75) with proof≥0.75`);
|
|
57
|
+
if (score >= 0.90 && inputs.proofMedian >= 0.85 && maturity >= 0.80 && !stableEligible) {
|
|
58
|
+
reasons.push("Stable verdict withheld until live benchmark passes and sandboxViolationCount is 0");
|
|
59
|
+
}
|
|
50
60
|
}
|
|
51
61
|
else {
|
|
52
62
|
verdict = "block";
|
package/dist/commands/merge.js
CHANGED
|
@@ -11,6 +11,7 @@ import { getOmkResourceSettings } from "../util/resource-profile.js";
|
|
|
11
11
|
import { defaultScopedRoleAgentFile, writeScopedAgentFile } from "../util/scoped-agent-file.js";
|
|
12
12
|
import { createOmkJsonEnvelope } from "../util/json-envelope.js";
|
|
13
13
|
import { emitJson } from "../util/cli-contract.js";
|
|
14
|
+
import { runMergeArbiter, } from "../orchestration/merge-arbiter.js";
|
|
14
15
|
/**
|
|
15
16
|
* JSON path for `omk merge --json`.
|
|
16
17
|
* Read-only preview: resolves the run, collects worktree diffs (git diff +
|
|
@@ -138,65 +139,110 @@ export async function mergeCommand(options) {
|
|
|
138
139
|
if (dryRun)
|
|
139
140
|
console.log(style.orange("🟡 DRY RUN — no changes will be applied"));
|
|
140
141
|
console.log("");
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
const diff = diffResult.stdout;
|
|
151
|
-
const diffLines = diff.split("\n").length;
|
|
152
|
-
// Check apply-ability
|
|
153
|
-
const applyCheck = await runShell("git", ["apply", "--check"], {
|
|
154
|
-
cwd: root,
|
|
155
|
-
input: diff,
|
|
156
|
-
timeout: 15000,
|
|
142
|
+
let report;
|
|
143
|
+
let winner = null;
|
|
144
|
+
if (strategy === "arbiter") {
|
|
145
|
+
// ── Arbiter path ──
|
|
146
|
+
console.log(style.purple("Running merge arbiter..."));
|
|
147
|
+
const config = await readTextFile(join(root, ".omk", "config.toml"), "");
|
|
148
|
+
const arbiterResult = await runMergeArbiter(worktreesDir, currentBranch, root, config, {
|
|
149
|
+
threshold: 0.6,
|
|
150
|
+
testTimeoutMs: 120_000,
|
|
157
151
|
});
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
w.reviewReason = score.reason;
|
|
173
|
-
const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
|
|
174
|
-
console.log(` ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
|
|
175
|
-
}
|
|
176
|
-
// ── 3. Test verification in worktrees ──
|
|
177
|
-
console.log("");
|
|
178
|
-
console.log(style.purple("Running tests in worktrees..."));
|
|
179
|
-
for (const w of workers) {
|
|
180
|
-
const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
|
|
181
|
-
cwd: w.path,
|
|
182
|
-
timeout: 120_000,
|
|
152
|
+
// Map arbiter candidates back to WorkerDiff for reporting
|
|
153
|
+
const arbiterWorkers = arbiterResult.trace.steps
|
|
154
|
+
.filter((s) => s.step === "evidence-suite" || s.step === "score")
|
|
155
|
+
.map((s) => {
|
|
156
|
+
return {
|
|
157
|
+
name: s.candidateId.replace("candidate-", ""),
|
|
158
|
+
path: "",
|
|
159
|
+
diff: "",
|
|
160
|
+
diffLines: 0,
|
|
161
|
+
canApply: s.detail.includes("apply=true"),
|
|
162
|
+
reviewScore: 50,
|
|
163
|
+
reviewReason: s.detail,
|
|
164
|
+
testsPassed: s.detail.includes("tests=true"),
|
|
165
|
+
};
|
|
183
166
|
});
|
|
184
|
-
|
|
185
|
-
|
|
167
|
+
// De-duplicate by name
|
|
168
|
+
const workerMap = new Map();
|
|
169
|
+
for (const w of arbiterWorkers)
|
|
170
|
+
workerMap.set(w.name, w);
|
|
171
|
+
report = {
|
|
172
|
+
winner: arbiterResult.winner?.name ?? null,
|
|
173
|
+
reason: arbiterResult.rationale.summary,
|
|
174
|
+
conflicts: arbiterResult.rationale.conflicts,
|
|
175
|
+
filesApplied: 0,
|
|
176
|
+
dryRun,
|
|
177
|
+
workers: [...workerMap.values()],
|
|
178
|
+
};
|
|
179
|
+
if (arbiterResult.requiresHumanApproval) {
|
|
180
|
+
console.log(status.error(arbiterResult.rationale.humanApprovalReason ?? "No candidate meets threshold — human approval required."));
|
|
181
|
+
printReport(report);
|
|
182
|
+
process.exit(1);
|
|
183
|
+
}
|
|
184
|
+
winner = arbiterResult.winner ? { name: arbiterResult.winner.name, path: arbiterResult.winner.path, diff: arbiterResult.winner.diff, diffLines: arbiterResult.winner.diffLines, canApply: arbiterResult.winner.canApply, reviewScore: arbiterResult.winner.evidence.reviewerScore, reviewReason: arbiterResult.winner.evidence.reviewerReason, testsPassed: arbiterResult.winner.evidence.testsPassed } : null;
|
|
185
|
+
}
|
|
186
|
+
else {
|
|
187
|
+
// ── 1. Collect diffs from all worktrees ──
|
|
188
|
+
const workers = [];
|
|
189
|
+
for (const name of workerNames) {
|
|
190
|
+
const wtPath = join(worktreesDir, name);
|
|
191
|
+
const diffResult = await runShell("git", ["-C", wtPath, "diff", currentBranch], { timeout: 15000 });
|
|
192
|
+
if (diffResult.failed || !diffResult.stdout.trim()) {
|
|
193
|
+
console.log(style.gray(` ${name}: no changes`));
|
|
194
|
+
continue;
|
|
195
|
+
}
|
|
196
|
+
const diff = diffResult.stdout;
|
|
197
|
+
const diffLines = diff.split("\n").length;
|
|
198
|
+
// Check apply-ability
|
|
199
|
+
const applyCheck = await runShell("git", ["apply", "--check"], {
|
|
200
|
+
cwd: root,
|
|
201
|
+
input: diff,
|
|
202
|
+
timeout: 15000,
|
|
203
|
+
});
|
|
204
|
+
const canApply = !applyCheck.failed;
|
|
205
|
+
workers.push({ name, path: wtPath, diff, diffLines, canApply });
|
|
206
|
+
console.log(` ${style.purpleBold(name)} ${canApply ? style.mint("(clean)") : style.pink("(conflicts)")} ${style.gray(`${diffLines} lines`)}`);
|
|
207
|
+
}
|
|
208
|
+
if (workers.length === 0) {
|
|
209
|
+
console.log(status.warn("No worker changes to merge."));
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
// ── 2. Reviewer scoring ──
|
|
213
|
+
console.log("");
|
|
214
|
+
console.log(style.purple("Scoring diffs with reviewer..."));
|
|
215
|
+
for (const w of workers) {
|
|
216
|
+
const score = await scoreDiff(w.diff, w.name);
|
|
217
|
+
w.reviewScore = score.score;
|
|
218
|
+
w.reviewReason = score.reason;
|
|
219
|
+
const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
|
|
220
|
+
console.log(` ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
|
|
221
|
+
}
|
|
222
|
+
// ── 3. Test verification in worktrees ──
|
|
223
|
+
console.log("");
|
|
224
|
+
console.log(style.purple("Running tests in worktrees..."));
|
|
225
|
+
for (const w of workers) {
|
|
226
|
+
const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
|
|
227
|
+
cwd: w.path,
|
|
228
|
+
timeout: 120_000,
|
|
229
|
+
});
|
|
230
|
+
w.testsPassed = !testResult.failed;
|
|
231
|
+
console.log(` ${w.name}: ${w.testsPassed ? style.mint("tests passed") : style.pink("tests failed")}`);
|
|
232
|
+
}
|
|
233
|
+
// ── 4. Select winner ──
|
|
234
|
+
console.log("");
|
|
235
|
+
console.log(style.purple("Selecting winner..."));
|
|
236
|
+
winner = selectWinner(workers, strategy);
|
|
237
|
+
report = {
|
|
238
|
+
winner: winner?.name ?? null,
|
|
239
|
+
reason: winner?.reviewReason ?? "No suitable candidate",
|
|
240
|
+
conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
|
|
241
|
+
filesApplied: 0,
|
|
242
|
+
dryRun,
|
|
243
|
+
workers,
|
|
244
|
+
};
|
|
186
245
|
}
|
|
187
|
-
// ── 4. Select winner ──
|
|
188
|
-
console.log("");
|
|
189
|
-
console.log(style.purple("Selecting winner..."));
|
|
190
|
-
const winner = selectWinner(workers, strategy);
|
|
191
|
-
// ── 5. Apply or preview ──
|
|
192
|
-
const report = {
|
|
193
|
-
winner: winner?.name ?? null,
|
|
194
|
-
reason: winner?.reviewReason ?? "No suitable candidate",
|
|
195
|
-
conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
|
|
196
|
-
filesApplied: 0,
|
|
197
|
-
dryRun,
|
|
198
|
-
workers,
|
|
199
|
-
};
|
|
200
246
|
if (!winner) {
|
|
201
247
|
console.log(status.error("No worker diff can be applied cleanly."));
|
|
202
248
|
printReport(report);
|
|
@@ -10,6 +10,43 @@
|
|
|
10
10
|
export type ProviderFailureKind = "none" | "runtime" | "auth" | "model" | "quota" | "policy" | "transient" | "unknown";
|
|
11
11
|
/** Authority level a provider holds for a given capability lane. */
|
|
12
12
|
export type ProviderAuthorityLevel = "none" | "advisory" | "direct" | "full";
|
|
13
|
+
/**
 * State of one provider capability dimension in the capability-vector state
 * machine. Members are listed in ascending ordinal order.
 */
export type ProviderCapabilityState =
    | "missing"
    | "installed"
    | "auth_present"
    | "auth_valid"
    | "model_available"
    | "quota_available"
    | "sandbox_supported"
    | "tool_contract_verified"
    | "ready";
|
|
15
|
+
/** Ordinal ordering for capability states (higher = more mature). */
|
|
16
|
+
export declare const PROVIDER_CAPABILITY_ORDINAL: Readonly<Record<ProviderCapabilityState, number>>;
|
|
17
|
+
/**
 * Provider health expressed as a capability vector (Profiler v2): one state
 * per capability dimension plus latency, capability flags and reliability
 * figures.
 */
export interface ProviderHealthVector {
    /** Identifier of the provider, for example "kimi", "deepseek" or "codex". */
    provider: string;
    /** State of the binary/runtime installation. */
    binary: ProviderCapabilityState;
    /** State of authentication. */
    auth: ProviderCapabilityState;
    /** State of model resolution. */
    model: ProviderCapabilityState;
    /** State of quota/balance. */
    quota: ProviderCapabilityState;
    /** Median (P50) latency in milliseconds; 0 means unknown. */
    latencyP50Ms: number;
    /** 95th-percentile (P95) latency in milliseconds; 0 means unknown. */
    latencyP95Ms: number;
    /** True when the provider supports read operations. */
    supportsRead: boolean;
    /** True when the provider supports write operations. */
    supportsWrite: boolean;
    /** True when the provider supports shell execution. */
    supportsShell: boolean;
    /** True when the provider supports sandboxed execution. */
    supportsSandbox: boolean;
    /** Evidence pass rate over 7 days, in [0, 1]; 0.5 is the no-data default. */
    evidencePassRate7d: number;
    /** Exponentially-weighted moving average of failures, in [0, 1]; 0 is healthy. */
    failureEwma: number;
}
|
|
46
|
+
/** Derive a backward-compatible `healthy` boolean from a capability vector. */
|
|
47
|
+
export declare function isHealthy(vector: ProviderHealthVector): boolean;
|
|
48
|
+
/** Convert the legacy {@link ProviderHealth} contract into a v2 capability vector. */
|
|
49
|
+
export declare function providerHealthToVector(health: ProviderHealth): ProviderHealthVector;
|
|
13
50
|
/**
|
|
14
51
|
* Normalized provider health snapshot.
|
|
15
52
|
*
|
|
@@ -6,4 +6,52 @@
|
|
|
6
6
|
* renaming any pre-existing keys. It never carries secret values — only
|
|
7
7
|
* boolean signals (e.g. `authOk`) and non-sensitive remediation hints.
|
|
8
8
|
*/
|
|
9
|
-
|
|
9
|
+
/** Ordinal ordering for capability states (higher = more mature). */
|
|
10
|
+
export const PROVIDER_CAPABILITY_ORDINAL = {
|
|
11
|
+
missing: 0,
|
|
12
|
+
installed: 1,
|
|
13
|
+
auth_present: 2,
|
|
14
|
+
auth_valid: 3,
|
|
15
|
+
model_available: 4,
|
|
16
|
+
quota_available: 5,
|
|
17
|
+
sandbox_supported: 6,
|
|
18
|
+
tool_contract_verified: 7,
|
|
19
|
+
ready: 8,
|
|
20
|
+
};
|
|
21
|
+
/**
 * Collapse a capability vector into the backward-compatible `healthy` flag.
 *
 * @param vector - Capability vector to inspect.
 * @returns `true` only when `binary`, `auth`, `model` and `quota` are all
 *   exactly `"ready"`; any other state in any of the four yields `false`.
 */
export function isHealthy(vector) {
    const gatingStates = [vector.binary, vector.auth, vector.model, vector.quota];
    return gatingStates.every((state) => state === "ready");
}
|
|
28
|
+
/**
 * Translate a legacy {@link ProviderHealth} snapshot into a v2 capability vector.
 *
 * Mapping:
 * - `binary` and `model` are `"ready"` when `runtimeOk` / `modelOk` is truthy,
 *   otherwise `"missing"`.
 * - `auth` is `"ready"` when `authOk` is truthy, `"auth_present"` when the
 *   recorded failure kind is `"auth"`, otherwise `"missing"`.
 * - `quota` is `"ready"` when `quotaOk` is truthy, `"auth_valid"` when the
 *   recorded failure kind is `"quota"`, otherwise `"missing"`.
 * - Latencies are always 0 and `supportsRead` is always `true`.
 * - `evidencePassRate7d` / `failureEwma` are 1.0 / 0 when the failure kind is
 *   `"none"`, otherwise 0.5 / 0.5.
 *
 * @param health - Legacy health snapshot.
 * @returns The derived capability vector.
 */
export function providerHealthToVector(health) {
    const failure = health.failureKind;
    const noFailure = failure === "none";

    let auth = "missing";
    if (health.authOk) {
        auth = "ready";
    }
    else if (failure === "auth") {
        auth = "auth_present";
    }

    let quota = "missing";
    if (health.quotaOk) {
        quota = "ready";
    }
    else if (failure === "quota") {
        quota = "auth_valid";
    }

    // NOTE(review): "advisory" shell authority counts as shell support here, while
    // "advisory" write authority does not count as write support — confirm intent.
    const hasShellAuthority = health.shellAuthority !== "none";
    const hasWriteAuthority = !(health.writeAuthority === "none" || health.writeAuthority === "advisory");

    return {
        provider: health.provider,
        binary: health.runtimeOk ? "ready" : "missing",
        auth,
        model: health.modelOk ? "ready" : "missing",
        quota,
        latencyP50Ms: 0,
        latencyP95Ms: 0,
        supportsRead: true,
        supportsWrite: hasWriteAuthority,
        supportsShell: hasShellAuthority,
        // NOTE(review): sandbox support is derived from shell authority as well — confirm.
        supportsSandbox: hasShellAuthority,
        evidencePassRate7d: noFailure ? 1.0 : 0.5,
        failureEwma: noFailure ? 0 : 0.5,
    };
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evidence Trust Score (ETS) v2 — Algorithm 10
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
|
|
6
|
+
* → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
|
|
7
|
+
* → EvidenceTrustScore() → Pass | Warn | Fail
|
|
8
|
+
*
|
|
9
|
+
* Formula:
|
|
10
|
+
* ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
|
|
11
|
+
* + 0.15*provenance_integrity + 0.10*freshness
|
|
12
|
+
* - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
|
|
13
|
+
*/
|
|
14
|
+
import type { EvidenceItem, EvidenceKind } from "../runtime/contracts/evidence.js";
|
|
15
|
+
/** One claim pulled out of agent output by the claim-extraction step. */
export interface EtsClaim {
    /** Identifier of this claim. */
    readonly claimId: string;
    /** Text of the claim. */
    readonly text: string;
    /** Category the claim was classified under. */
    readonly category: EtsClaimCategory;
    /** Confidence attached to the claim. NOTE(review): presumably in [0, 1] — confirm. */
    readonly confidence: number;
}
|
|
22
|
+
/** Category assigned to an extracted claim. */
export type EtsClaimCategory =
    | "test"
    | "build"
    | "typecheck"
    | "lint"
    | "behavioral"
    | "security"
    | "performance"
    | "docs";
|
|
23
|
+
/** Kind of task whose output is being evaluated. */
export type EtsTaskType =
    | "feature"
    | "bugfix"
    | "refactor"
    | "docs"
    | "test"
    | "review"
    | "security"
    | "release";
|
|
25
|
+
/** Risk tier assigned to the task. */
export type EtsRiskTier =
    | "low"
    | "medium"
    | "high"
    | "critical";
|
|
27
|
+
/** A piece of evidence that a claim requires. */
export interface RequiredEvidenceItem {
    /** Identifier of the required evidence. */
    readonly evidenceId: string;
    /** Kind of evidence that is required. */
    readonly kind: EvidenceKind;
    /** Human-readable description of the requirement. */
    readonly description: string;
    /** Minimum confidence the evidence must reach. NOTE(review): presumably in [0, 1] — confirm. */
    readonly minConfidence: number;
}
|
|
34
|
+
/**
 * Metadata describing a run artifact. Only `runId` and `timestamp` are
 * mandatory; every other field is optional.
 */
export interface RunArtifactMeta {
    /** Identifier of the run. */
    readonly runId: string;
    /** Identifier of the node within the run, when known. */
    readonly nodeId?: string;
    /** Provider associated with the artifact, when known. */
    readonly provider?: string;
    /** Model associated with the artifact, when known. */
    readonly model?: string;
    /** Working directory, when known. */
    readonly cwd?: string;
    /** NOTE(review): presumably the tree hash taken before the run — confirm. */
    readonly treeHashBefore?: string;
    /** NOTE(review): presumably the tree hash taken after the run — confirm. */
    readonly treeHashAfter?: string;
    /** Hash of the command, when recorded. */
    readonly commandHash?: string;
    /** Timestamp of the artifact. NOTE(review): presumably ISO-8601 — confirm. */
    readonly timestamp: string;
    /** The command itself, when recorded. */
    readonly command?: string;
}
|
|
47
|
+
/** Evidence gathered for a run, together with the metadata describing where it came from. */
export interface CollectedEvidence {
    /** The evidence items that were collected. */
    readonly items: readonly EvidenceItem[];
    /** Provenance metadata for the run the items belong to. */
    readonly meta: RunArtifactMeta;
}
|
|
52
|
+
/**
 * Outcome of checking required evidence against collected evidence, split
 * into three buckets.
 * NOTE(review): entries are presumably `RequiredEvidenceItem.evidenceId` values — confirm.
 */
export interface EvidenceVerificationResult {
    /** Requirements that were satisfied. */
    readonly satisfied: readonly string[];
    /** Requirements with no matching evidence. */
    readonly missing: readonly string[];
    /** Requirements that were only partially met. */
    readonly partial: readonly string[];
}
|
|
58
|
+
/**
 * Result of an ETS v2 evaluation: the final score, the five weighted
 * components and three penalties named in the ETS formula, and the verdict.
 */
export interface EtsV2Result {
    /** Final Evidence Trust Score. */
    readonly score: number;
    /** Reproducibility component (weight 0.30 in the formula). */
    readonly reproducibility: number;
    /** Independence component (weight 0.25 in the formula). */
    readonly independence: number;
    /** Coverage-relevance component (weight 0.20 in the formula). */
    readonly coverageRelevance: number;
    /** Provenance-integrity component (weight 0.15 in the formula). */
    readonly provenanceIntegrity: number;
    /** Freshness component (weight 0.10 in the formula). */
    readonly freshness: number;
    /** Gaming penalty subtracted from the weighted sum. */
    readonly gamingPenalty: number;
    /** Stale-result penalty subtracted from the weighted sum. */
    readonly staleResultPenalty: number;
    /** Unverifiable-claim penalty subtracted from the weighted sum. */
    readonly unverifiableClaimPenalty: number;
    /** Overall outcome of the evaluation. */
    readonly verdict: "pass" | "warn" | "fail";
    /** Reasons accompanying the verdict. */
    readonly reasons: readonly string[];
}
|
|
72
|
+
/** Engine that computes an ETS v2 result. */
export interface EtsV2Engine {
    /**
     * Evaluate one output.
     *
     * @param params - Output, task context and collected run artifacts.
     * @returns A promise resolving to the score breakdown and verdict.
     */
    evaluate(params: EtsV2Params): Promise<EtsV2Result>;
}
|
|
76
|
+
/** Inputs to a single ETS v2 evaluation. */
export interface EtsV2Params {
    /** The output to evaluate; per the pipeline description, claims are extracted from it. */
    readonly output: string;
    /** Type of task that produced the output. */
    readonly taskType: EtsTaskType;
    /** Risk tier of the task. */
    readonly risk: EtsRiskTier;
    /** Evidence collected from the run's artifacts. */
    readonly runArtifacts: CollectedEvidence;
    /** NOTE(review): presumably the file paths in the task's dependency graph — confirm. */
    readonly dependencyGraphFiles?: readonly string[];
    /** NOTE(review): presumably a "current time" override as a timestamp string — confirm. */
    readonly now?: string;
}
|
|
85
|
+
/**
 * Default component weights of the ETS formula. The five weights are typed as
 * numeric literals and sum to 1.0.
 */
declare const WEIGHTS: {
    /** Weight of the reproducibility component. */
    readonly reproducibility: 0.3;
    /** Weight of the independence component. */
    readonly independence: 0.25;
    /** Weight of the coverage-relevance component. */
    readonly coverageRelevance: 0.2;
    /** Weight of the provenance-integrity component. */
    readonly provenanceIntegrity: 0.15;
    /** Weight of the freshness component. */
    readonly freshness: 0.1;
};
|
|
92
|
+
export declare function extractClaims(output: string): readonly EtsClaim[];
|
|
93
|
+
export declare function requiredEvidenceForClaim(claim: EtsClaim, taskType: EtsTaskType, risk: EtsRiskTier): readonly RequiredEvidenceItem[];
|
|
94
|
+
export declare function collectEvidenceFromRunDir(runDir: string, meta: RunArtifactMeta): Promise<CollectedEvidence>;
|
|
95
|
+
export declare function verifyEvidence(required: readonly RequiredEvidenceItem[], collected: CollectedEvidence): EvidenceVerificationResult;
|
|
96
|
+
/** Options accepted by {@link createEvidenceTrustScoreV2Engine}. */
export interface EtsV2EngineOptions {
    /**
     * Overrides for individual component weights, keyed like the default
     * weight table. Values are typed as `number`: `Partial<typeof WEIGHTS>`
     * would inherit the literal types `0.3`, `0.25`, … and so reject every
     * value except the defaults themselves.
     */
    readonly customWeights?: Partial<Record<keyof typeof WEIGHTS, number>>;
    /** NOTE(review): presumably a "current time" override as a timestamp string — confirm. */
    readonly now?: string;
}
|
|
100
|
+
export declare function createEvidenceTrustScoreV2Engine(options?: EtsV2EngineOptions): EtsV2Engine;
|
|
101
|
+
export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore };
|