open-multi-agent-kit 0.78.2 → 0.78.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -2
- package/MATURITY.md +2 -2
- package/README.md +4 -4
- package/dist/benchmark/contracts.d.ts +116 -0
- package/dist/benchmark/contracts.js +6 -0
- package/dist/benchmark/fixtures.d.ts +11 -0
- package/dist/benchmark/fixtures.js +124 -0
- package/dist/benchmark/harness.d.ts +13 -0
- package/dist/benchmark/harness.js +191 -0
- package/dist/benchmark/shadow-mode.d.ts +17 -0
- package/dist/benchmark/shadow-mode.js +95 -0
- package/dist/cli/release-promotion-gate.js +14 -4
- package/dist/commands/merge.js +102 -56
- package/dist/contracts/provider-health.d.ts +37 -0
- package/dist/contracts/provider-health.js +49 -1
- package/dist/evidence/evidence-trust-score.d.ts +101 -0
- package/dist/evidence/evidence-trust-score.js +408 -0
- package/dist/evidence/index.d.ts +2 -0
- package/dist/evidence/index.js +1 -0
- package/dist/native/linux-x64/omk-safety +0 -0
- package/dist/orchestration/merge-arbiter.d.ts +91 -0
- package/dist/orchestration/merge-arbiter.js +376 -0
- package/dist/providers/health.d.ts +3 -0
- package/dist/providers/health.js +46 -0
- package/dist/providers/index.d.ts +1 -0
- package/dist/providers/index.js +1 -0
- package/dist/providers/provider-health.d.ts +8 -1
- package/dist/providers/provider-health.js +39 -0
- package/dist/providers/provider-task-runner.js +31 -0
- package/dist/providers/provider.d.ts +2 -0
- package/dist/providers/router.js +80 -3
- package/dist/providers/types.d.ts +4 -0
- package/dist/runtime/contracts/weakness-remediation.d.ts +6 -0
- package/dist/runtime/provider-maturity-gate.d.ts +2 -0
- package/dist/runtime/provider-maturity-gate.js +26 -0
- package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
- package/dist/runtime/tool-dispatch-contracts.js +42 -2
- package/dist/runtime/weakness-remediation-index.d.ts +1 -1
- package/dist/runtime/weakness-remediation-index.js +1 -1
- package/dist/safety/enforcement-engine.d.ts +89 -0
- package/dist/safety/enforcement-engine.js +279 -0
- package/dist/safety/tool-authority-gate.d.ts +40 -0
- package/dist/safety/tool-authority-gate.js +92 -0
- package/dist/schema/evidence.schema.d.ts +2 -2
- package/dist/schema/proof-bundle.schema.d.ts +2 -2
- package/docs/benchmark-design.md +122 -0
- package/docs/getting-started.md +1 -1
- package/docs/provider-maturity.md +1 -1
- package/docs/versioning.md +3 -3
- package/package.json +7 -3
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evidence Trust Score (ETS) v2 — Algorithm 10
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
|
|
6
|
+
* → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
|
|
7
|
+
* → EvidenceTrustScore() → Pass | Warn | Fail
|
|
8
|
+
*
|
|
9
|
+
* Formula:
|
|
10
|
+
* ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
|
|
11
|
+
* + 0.15*provenance_integrity + 0.10*freshness
|
|
12
|
+
* - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
|
|
13
|
+
*/
|
|
14
|
+
import { readFile } from "node:fs/promises";
|
|
15
|
+
import { existsSync } from "node:fs";
|
|
16
|
+
import { join } from "node:path";
|
|
17
|
+
// ─── Constants ─────────────────────────────────────────────────────────────
|
|
18
|
+
// Default weight of each sub-score in the ETS formula. The five weights sum
// to 1.0. createEvidenceTrustScoreV2Engine spreads options.customWeights over
// this object, so callers can override individual entries per engine.
const WEIGHTS = {
    reproducibility: 0.30,
    independence: 0.25,
    coverageRelevance: 0.20,
    provenanceIntegrity: 0.15,
    freshness: 0.10,
};
// Evidence age, in hours, tolerated before an item counts as stale, keyed by
// risk tier. computeFreshness and computeStaleResultPenalty convert the value
// to milliseconds; a tier that is not listed here yields `undefined`.
const STALE_HOURS_BY_RISK = {
    low: 72,
    medium: 48,
    high: 24,
    critical: 6,
};
|
|
31
|
+
// One case-insensitive pattern per claim category. Each pattern is a single
// capture group wrapping the alternatives, so the full match and group 1 are
// always the same text.
const CLAIM_PATTERNS = [
    { category: "test", regex: /\b(tests?\s+pass(?:ed|es|ing)|test\s+coverage|all\s+tests?\s+(?:ok|green)|\bnpm\s+test|\bnode\s+--test)/i },
    { category: "build", regex: /\b(build\s+(?:ok|success|succeeded|pass(?:ed|es|ing))|npm\s+run\s+build|tsc\s+.*(?:no\s+error|success)|esbuild|vite\s+build)/i },
    { category: "typecheck", regex: /\b(typecheck\s+(?:ok|pass(?:ed|es|ing)|clean)|tsc\s+--noEmit|no\s+type\s+errors?)/i },
    { category: "lint", regex: /\b(lint\s+(?:ok|pass(?:ed|es|ing)|clean)|eslint.*(?:no\s+error|0\s+(?:problem|warning))|prettier.*check)/i },
    { category: "security", regex: /\b(secur(?:ity|e)\s+(?:ok|pass(?:ed|es|ing)|scan\s+(?:clean|passed))|secret.*scan|audit.*pass|vulnerability.*0)/i },
    { category: "performance", regex: /\b(performance\s+(?:ok|pass(?:ed|es|ing)|improved)|latency.*\d+ms|throughput)/i },
    { category: "docs", regex: /\b(docs?\s+(?:ok|pass(?:ed|es|ing)|updated)|readme.*updated|changelog.*updated)/i },
    { category: "behavioral", regex: /\b(fix(?:ed|es)\s+(?:bug|issue)|feature\s+(?:works?|implemented)|behavior\s+(?:correct|as\s+expected))/i },
];
// ─── Claim Extractor ───────────────────────────────────────────────────────
/**
 * Scans agent output for verifiable claims ("tests passed", "build ok", ...).
 *
 * Every occurrence of every category pattern is considered; occurrences whose
 * lower-cased text was already seen for the same category are skipped, so each
 * distinct claim text is reported once per category.
 *
 * @param output Text to scan. Anything that is not a non-empty string yields
 *   an empty result instead of throwing.
 * @returns A frozen array of `{ claimId, text, category, confidence }`, ordered
 *   by category (CLAIM_PATTERNS order) and then by position in `output`.
 */
export function extractClaims(output) {
    if (typeof output !== "string" || output.length === 0) {
        return Object.freeze([]);
    }
    const claims = [];
    const seen = new Set();
    let claimIndex = 0;
    for (const { category, regex } of CLAIM_PATTERNS) {
        // String.prototype.match with a non-global regex returns
        // [fullMatch, ...captureGroups], i.e. only the first occurrence.
        // matchAll on a global copy walks every occurrence; the shared pattern
        // object is left untouched so it never carries lastIndex state.
        const flags = regex.flags.includes("g") ? regex.flags : `${regex.flags}g`;
        const globalRegex = new RegExp(regex.source, flags);
        for (const occurrence of output.matchAll(globalRegex)) {
            const text = occurrence[0];
            const key = `${category}:${text.toLowerCase()}`;
            if (seen.has(key))
                continue;
            seen.add(key);
            claims.push({
                claimId: `claim-${category}-${claimIndex++}`,
                text,
                category,
                // Fixed confidence: pattern matching cannot tell how certain the claim is.
                confidence: 0.8,
            });
        }
    }
    return Object.freeze(claims);
}
|
|
65
|
+
// ─── Required Evidence ─────────────────────────────────────────────────────
|
|
66
|
+
/**
 * Lists the evidence a claim must be backed by.
 *
 * Every claim needs "command" and "trace" evidence plus the kinds mapped to
 * its category. High and critical risk add an audit-trail requirement, and
 * critical risk additionally adds a review requirement.
 *
 * @param claim Claim with at least `claimId` and `category`.
 * @param taskType Accepted for interface compatibility; not read here.
 * @param risk Risk tier; anything other than "critical", "high" or "medium"
 *   gets the lowest confidence floor (0.6).
 * @returns A frozen array of `{ evidenceId, kind, description, minConfidence }`.
 */
export function requiredEvidenceForClaim(claim, taskType, risk) {
    const extraKindsByCategory = {
        test: ["test", "metric"],
        build: ["metric"],
        typecheck: ["metric"],
        lint: ["metric", "audit"],
        security: ["audit", "screenshot"],
        performance: ["metric", "trace"],
        docs: ["diff", "screenshot"],
        behavioral: ["diff", "test"],
    };
    // Confidence floor applied to every per-kind requirement of this claim.
    let confidenceFloor;
    if (risk === "critical") {
        confidenceFloor = 0.95;
    }
    else if (risk === "high") {
        confidenceFloor = 0.85;
    }
    else if (risk === "medium") {
        confidenceFloor = 0.75;
    }
    else {
        confidenceFloor = 0.6;
    }
    const wantedKinds = ["command", "trace", ...(extraKindsByCategory[claim.category] ?? [])];
    const requirements = wantedKinds.map((kind, position) => ({
        evidenceId: `${claim.claimId}-req-${position}`,
        kind,
        description: `Required ${kind} evidence for ${claim.category} claim`,
        minConfidence: confidenceFloor,
    }));
    const isCritical = risk === "critical";
    // High/critical risk: an extra audit trail.
    if (isCritical || risk === "high") {
        requirements.push({
            evidenceId: `${claim.claimId}-req-audit`,
            kind: "audit",
            description: `Audit trail for ${risk} risk task`,
            minConfidence: 0.9,
        });
    }
    // Critical risk: review evidence on top.
    if (isCritical) {
        requirements.push({
            evidenceId: `${claim.claimId}-req-review`,
            kind: "review",
            description: `Review evidence for critical risk task`,
            minConfidence: 0.9,
        });
    }
    return Object.freeze(requirements);
}
|
|
108
|
+
// ─── Evidence Collector ────────────────────────────────────────────────────
|
|
109
|
+
/**
 * Reads `<runDir>/evidence.jsonl` and converts each JSON line into an
 * evidence item.
 *
 * Collection is best-effort: a missing file, a read failure, a blank line, a
 * line that is not valid JSON, a non-object value, or a record that
 * evidenceItemFromRecord rejects is skipped silently.
 *
 * @param runDir Directory expected to contain `evidence.jsonl`.
 * @param meta Run metadata, passed through unchanged.
 * @returns `{ items, meta }` where `items` is a frozen (possibly empty) array.
 */
export async function collectEvidenceFromRunDir(runDir, meta) {
    const gathered = [];
    const jsonlPath = join(runDir, "evidence.jsonl");
    if (existsSync(jsonlPath)) {
        try {
            const raw = await readFile(jsonlPath, "utf8");
            for (const rawLine of raw.split(/\r?\n/)) {
                if (rawLine.trim().length === 0) {
                    continue;
                }
                try {
                    const record = JSON.parse(rawLine);
                    if (!isObject(record)) {
                        continue;
                    }
                    const converted = evidenceItemFromRecord(record);
                    if (converted) {
                        gathered.push(converted);
                    }
                }
                catch { /* ignore parse errors */ }
            }
        }
        catch { /* ignore read errors */ }
    }
    return { items: Object.freeze(gathered), meta };
}
|
|
132
|
+
/**
 * Normalises one parsed evidence record into the internal item shape.
 *
 * @param record Plain object parsed from a line of evidence.jsonl.
 * @returns The normalised item, or `null` when `record.kind` or
 *   `record.status` is not a recognised value.
 */
function evidenceItemFromRecord(record) {
    const kind = parseEvidenceKind(record.kind);
    const verdict = parseEvidenceVerdict(record.status);
    if (kind && verdict) {
        // Explicit path list wins; otherwise fall back to a single `path`.
        let linkedFilePaths;
        if (Array.isArray(record.linkedFilePaths)) {
            linkedFilePaths = record.linkedFilePaths;
        }
        else if (record.path) {
            linkedFilePaths = [String(record.path)];
        }
        else {
            linkedFilePaths = [];
        }
        const hasNumericConfidence = typeof record.confidence === "number";
        return {
            id: String(record.evidenceId ?? record.id ?? ""),
            kind,
            source: String(record.source ?? record.nodeId ?? "unknown"),
            description: String(record.message ?? record.description ?? ""),
            verdict,
            // A record without any timestamp is stamped with the current time.
            timestamp: String(record.observedAt ?? record.timestamp ?? new Date().toISOString()),
            confidence: hasNumericConfidence ? record.confidence : 0.8,
            linkedTraceId: record.linkedTraceId ? String(record.linkedTraceId) : undefined,
            linkedFilePaths,
            metadata: record.metadata && isObject(record.metadata) ? record.metadata : undefined,
        };
    }
    return null;
}
|
|
154
|
+
/**
 * Validates an evidence kind.
 *
 * @param value Candidate kind (any type).
 * @returns `value` when it is exactly one of the eight known kind strings,
 *   otherwise `null`.
 */
function parseEvidenceKind(value) {
    switch (value) {
        case "test":
        case "diff":
        case "command":
        case "screenshot":
        case "trace":
        case "metric":
        case "audit":
        case "review":
            return value;
        default:
            return null;
    }
}
|
|
158
|
+
/**
 * Maps an evidence status onto a verdict.
 *
 * Schema statuses are translated ("passed" to "pass", "failed" to "fail";
 * "missing", "skipped" and "blocked" to "pending"); values that already are
 * verdicts pass through unchanged.
 *
 * @param value Candidate status (any type).
 * @returns "pass", "fail", "partial" or "pending", or `null` when the value
 *   is not recognised.
 */
function parseEvidenceVerdict(value) {
    switch (value) {
        // Evidence schema statuses.
        case "passed":
            return "pass";
        case "failed":
            return "fail";
        case "missing":
        case "skipped":
        case "blocked":
            return "pending";
        // Values that are verdicts already.
        case "pass":
        case "fail":
        case "partial":
        case "pending":
            return value;
        default:
            return null;
    }
}
|
|
169
|
+
// ─── Evidence Verifier ─────────────────────────────────────────────────────
|
|
170
|
+
/**
 * Checks each evidence requirement against the collected items.
 *
 * An item backs a requirement when it has the same kind, a confidence of at
 * least the requirement's `minConfidence`, and a verdict of "pass" or
 * "partial". A requirement is satisfied when at least one backing item
 * passed, partial when it is backed by partial verdicts only, and missing
 * when nothing backs it.
 *
 * @param required Requirements with `evidenceId`, `kind`, `minConfidence`.
 * @param collected Object whose `items` array holds the collected evidence.
 * @returns `{ satisfied, missing, partial }`, each a frozen array of
 *   requirement ids in input order.
 */
export function verifyEvidence(required, collected) {
    const buckets = { satisfied: [], missing: [], partial: [] };
    for (const requirement of required) {
        const backing = collected.items.filter((candidate) => {
            if (candidate.kind !== requirement.kind) {
                return false;
            }
            if (!(candidate.confidence >= requirement.minConfidence)) {
                return false;
            }
            return candidate.verdict === "pass" || candidate.verdict === "partial";
        });
        let bucket;
        if (backing.length === 0) {
            bucket = buckets.missing;
        }
        else {
            bucket = backing.some((candidate) => candidate.verdict === "pass")
                ? buckets.satisfied
                : buckets.partial;
        }
        bucket.push(requirement.evidenceId);
    }
    return {
        satisfied: Object.freeze(buckets.satisfied),
        missing: Object.freeze(buckets.missing),
        partial: Object.freeze(buckets.partial),
    };
}
|
|
190
|
+
// ─── Sub-score Computers ───────────────────────────────────────────────────
|
|
191
|
+
/**
 * Scores how reproducible a run is from the hashes recorded in its metadata.
 *
 * A non-empty `commandHash` contributes 0.4; non-empty `treeHashBefore` and
 * `treeHashAfter` contribute 0.3 each. The result is the earned share of the
 * total available.
 *
 * @param meta Run metadata.
 * @returns A value between 0 and 1.
 */
function computeReproducibility(meta) {
    // Order matters: it fixes the floating-point summation order.
    const contributions = [
        ["commandHash", 0.4],
        ["treeHashBefore", 0.3],
        ["treeHashAfter", 0.3],
    ];
    let earned = 0;
    let available = 0;
    for (const [field, weight] of contributions) {
        const recorded = meta[field];
        if (recorded && recorded.length > 0) {
            earned += weight;
        }
        available += weight;
    }
    if (available > 0) {
        return earned / available;
    }
    return 0;
}
|
|
211
|
+
/**
 * Fraction of collected items that count as independently produced.
 *
 * An item counts when its lower-cased source is one of "runner", "command",
 * "shell", "test" or "ci", or when its kind is "test", "command" or "metric".
 *
 * @param collected Object whose `items` array holds the collected evidence.
 * @returns A value between 0 and 1; 0 when there are no items.
 */
function computeIndependence(collected) {
    const total = collected.items.length;
    if (total === 0) {
        return 0;
    }
    const trustedSources = new Set(["runner", "command", "shell", "test", "ci"]);
    const trustedKinds = ["test", "command", "metric"];
    const independent = collected.items.filter((entry) => {
        // Lower-case first so the lookup is case-insensitive.
        const normalisedSource = entry.source.toLowerCase();
        return trustedSources.has(normalisedSource) || trustedKinds.includes(entry.kind);
    });
    return independent.length / total;
}
|
|
227
|
+
/**
 * Fraction of collected items that are tied to concrete files.
 *
 * An item counts when it lists at least one linked file path, or when
 * dependency-graph files were supplied and its description contains one of
 * them as a substring.
 *
 * @param collected Object whose `items` array holds the collected evidence.
 * @param dependencyGraphFiles Optional list of file paths to look for in
 *   item descriptions.
 * @returns A value between 0 and 1; 0 when there are no items.
 */
function computeCoverageRelevance(collected, dependencyGraphFiles) {
    const total = collected.items.length;
    if (total === 0) {
        return 0;
    }
    let relevant = 0;
    for (const entry of collected.items) {
        if (entry.linkedFilePaths.length > 0) {
            relevant += 1;
            continue;
        }
        const hasGraph = dependencyGraphFiles && dependencyGraphFiles.length > 0;
        if (hasGraph && dependencyGraphFiles.some((file) => entry.description.includes(file))) {
            relevant += 1;
        }
    }
    return relevant / total;
}
|
|
241
|
+
/**
 * Fraction of provenance fields present in the run metadata.
 *
 * Eight fields are checked: the seven core ones plus `nodeId`. A field is
 * present when its value is a non-empty string. `nodeId` is labelled optional
 * but still counts towards the denominator.
 *
 * @param meta Run metadata.
 * @returns A value between 0 and 1.
 */
function computeProvenanceIntegrity(meta) {
    const coreFields = [
        "runId",
        "provider",
        "model",
        "cwd",
        "treeHashBefore",
        "treeHashAfter",
        "commandHash",
    ];
    const optionalFields = ["nodeId"];
    const checked = coreFields.concat(optionalFields);
    const filled = checked.filter((name) => {
        const recorded = meta[name];
        return typeof recorded === "string" && recorded.length > 0;
    });
    return filled.length / checked.length;
}
|
|
262
|
+
/**
 * Average freshness of the collected items at the given reference time.
 *
 * Per item: 1.0 while its age is within the stale threshold of the risk tier
 * (STALE_HOURS_BY_RISK), then a linear decay to 0 over a further two
 * thresholds. Items whose timestamp lies in the future, or whose age cannot
 * be computed (unparseable timestamp or reference time), score 1.0.
 *
 * @param collected Object whose `items` array holds the collected evidence.
 * @param risk Risk tier used to look up the stale threshold.
 * @param nowIso Reference time as a date string.
 * @returns A value between 0 and 1; 0 when there are no items.
 */
function computeFreshness(collected, risk, nowIso) {
    const total = collected.items.length;
    if (total === 0) {
        return 0;
    }
    const referenceMs = new Date(nowIso).getTime();
    const thresholdMs = STALE_HOURS_BY_RISK[risk] * 60 * 60 * 1000;
    const scoreOf = (entry) => {
        const ageMs = referenceMs - new Date(entry.timestamp).getTime();
        // Future/now timestamp = fresh
        if (ageMs < 0 || Number.isNaN(ageMs)) {
            return 1.0;
        }
        if (ageMs <= thresholdMs) {
            return 1.0;
        }
        // Linear decay over next 2x threshold
        const decayWindowMs = thresholdMs * 2;
        return Math.max(0, 1 - (ageMs - thresholdMs) / decayWindowMs);
    };
    const sum = collected.items.reduce((running, entry) => running + scoreOf(entry), 0);
    return sum / total;
}
|
|
287
|
+
// ─── Penalty Computers ─────────────────────────────────────────────────────
|
|
288
|
+
/**
 * Penalty for output that looks like it games the score, i.e. claims that
 * are not backed by independently sourced, verified evidence.
 *
 * Components (the sum is capped at 0.3):
 *  - 0.15 when there are claims but no item from an independent source;
 *  - 0.10 when fewer than half as many requirements are satisfied as there
 *    are claims;
 *  - 0.10 when there is evidence and all of it is self-reported.
 *
 * A source is self-reported when it is "agent", "self" or "unknown". The
 * comparison ignores case and surrounding whitespace, in line with
 * computeIndependence, which also lower-cases sources. Otherwise a label
 * such as "Agent" would be treated as independent.
 *
 * @param claims Extracted claims (only the count is used).
 * @param collected Object whose `items` array holds the collected evidence.
 * @param verification Verification result; `satisfied.length` is read.
 * @returns A penalty between 0 and 0.3.
 */
function computeGamingPenalty(claims, collected, verification) {
    const selfReportedSources = new Set(["agent", "self", "unknown"]);
    const isSelfReported = (item) => selfReportedSources.has(String(item.source).trim().toLowerCase());
    const items = collected.items;
    const selfReportedCount = items.filter(isSelfReported).length;
    const independentCount = items.length - selfReportedCount;
    let penalty = 0;
    // Claims without a single independently sourced item.
    if (claims.length > 0 && independentCount === 0) {
        penalty += 0.15;
    }
    // Many claims but few verified; with no claims the ratio is neutral.
    const claimToVerifiedRatio = claims.length > 0 ? verification.satisfied.length / claims.length : 1;
    if (claimToVerifiedRatio < 0.5) {
        penalty += 0.1;
    }
    // Evidence exists, but every item is self-reported.
    if (items.length > 0 && selfReportedCount === items.length) {
        penalty += 0.1;
    }
    return Math.min(penalty, 0.3);
}
|
|
312
|
+
/**
 * Penalty for evidence older than the stale threshold of the risk tier.
 *
 * Each stale item costs 0.05, capped at 0.2 overall. Items whose age cannot
 * be computed (unparseable timestamp) are not counted as stale.
 *
 * @param collected Object whose `items` array holds the collected evidence.
 * @param risk Risk tier used to look up the threshold in STALE_HOURS_BY_RISK.
 * @param nowIso Reference time as a date string.
 * @returns A penalty between 0 and 0.2.
 */
function computeStaleResultPenalty(collected, risk, nowIso) {
    const referenceMs = new Date(nowIso).getTime();
    const thresholdMs = STALE_HOURS_BY_RISK[risk] * 60 * 60 * 1000;
    const staleItems = collected.items.filter((entry) => {
        const observedMs = new Date(entry.timestamp).getTime();
        return referenceMs - observedMs > thresholdMs;
    });
    return Math.min(staleItems.length * 0.05, 0.2);
}
|
|
325
|
+
/**
 * Penalty for claims whose required evidence is missing.
 *
 * Each missing requirement costs 0.05, capped at 0.3. Without any claims
 * there is nothing to verify and the penalty is 0.
 *
 * @param claims Extracted claims (only the count is used).
 * @param verification Verification result; `missing.length` is read.
 * @returns A penalty between 0 and 0.3.
 */
function computeUnverifiableClaimPenalty(claims, verification) {
    const hasClaims = claims.length !== 0;
    return hasClaims ? Math.min(verification.missing.length * 0.05, 0.3) : 0;
}
|
|
331
|
+
// ─── Verdict ───────────────────────────────────────────────────────────────
|
|
332
|
+
/**
 * Maps a trust score onto a verdict.
 *
 * @param score Trust score.
 * @returns "pass" at 0.75 and above, "warn" from 0.50 up to (not including)
 *   0.75, "fail" otherwise — including for NaN.
 */
function computeVerdict(score) {
    if (score >= 0.75) {
        return "pass";
    }
    return score >= 0.50 ? "warn" : "fail";
}
|
|
339
|
+
/**
 * Creates an Evidence Trust Score (ETS) v2 engine.
 *
 * @param options Optional settings:
 *   - `customWeights`: overrides for individual entries of WEIGHTS;
 *   - `now`: fixed reference time (date string) for every evaluation.
 * @returns An engine exposing `evaluate(params)`.
 *
 * `evaluate(params)` reads `params.output`, `params.taskType`, `params.risk`,
 * `params.runArtifacts` (`{ items, meta }`), and the optional
 * `params.dependencyGraphFiles` and `params.now`. It resolves to the overall
 * score (clamped to 0..1, rounded to three decimals), each sub-score and
 * penalty rounded to three decimals, the verdict, and a frozen list of
 * human-readable reasons.
 *
 * The reference time is resolved per evaluation: `params.now`, else
 * `options.now`, else the current time. Reading the clock here rather than
 * once at engine creation keeps a long-lived engine from scoring freshness
 * and staleness against the moment it was constructed.
 */
export function createEvidenceTrustScoreV2Engine(options) {
    const weights = { ...WEIGHTS, ...options?.customWeights };
    const fixedNow = options?.now;
    const round3 = (value) => Math.round(value * 1000) / 1000;
    return {
        async evaluate(params) {
            const nowIso = params.now ?? fixedNow ?? new Date().toISOString();
            const artifacts = params.runArtifacts;
            // Claims → required evidence → verification.
            const claims = extractClaims(params.output);
            const allRequired = [];
            for (const claim of claims) {
                allRequired.push(...requiredEvidenceForClaim(claim, params.taskType, params.risk));
            }
            const verification = verifyEvidence(allRequired, artifacts);
            // Sub-scores.
            const reproducibility = computeReproducibility(artifacts.meta);
            const independence = computeIndependence(artifacts);
            const coverageRelevance = computeCoverageRelevance(artifacts, params.dependencyGraphFiles);
            const provenanceIntegrity = computeProvenanceIntegrity(artifacts.meta);
            const freshness = computeFreshness(artifacts, params.risk, nowIso);
            // Penalties.
            const gamingPenalty = computeGamingPenalty(claims, artifacts, verification);
            const staleResultPenalty = computeStaleResultPenalty(artifacts, params.risk, nowIso);
            const unverifiableClaimPenalty = computeUnverifiableClaimPenalty(claims, verification);
            // Weighted sum minus penalties, rounded to 3 decimals and clamped to [0, 1].
            let score = weights.reproducibility * reproducibility +
                weights.independence * independence +
                weights.coverageRelevance * coverageRelevance +
                weights.provenanceIntegrity * provenanceIntegrity +
                weights.freshness * freshness -
                gamingPenalty -
                staleResultPenalty -
                unverifiableClaimPenalty;
            score = Math.max(0, Math.min(1, round3(score)));
            const reasons = [];
            if (reproducibility < 0.5)
                reasons.push("reproducibility below 0.5");
            if (independence < 0.5)
                reasons.push("independence below 0.5");
            if (coverageRelevance < 0.5)
                reasons.push("coverage_relevance below 0.5");
            if (provenanceIntegrity < 0.5)
                reasons.push("provenance_integrity below 0.5");
            if (freshness < 0.5)
                reasons.push("freshness below 0.5");
            if (gamingPenalty > 0)
                reasons.push(`gaming_penalty=${gamingPenalty.toFixed(3)}`);
            if (staleResultPenalty > 0)
                reasons.push(`stale_result_penalty=${staleResultPenalty.toFixed(3)}`);
            if (unverifiableClaimPenalty > 0)
                reasons.push(`unverifiable_claim_penalty=${unverifiableClaimPenalty.toFixed(3)}`);
            if (verification.missing.length > 0)
                reasons.push(`missing evidence: ${verification.missing.length} items`);
            const verdict = computeVerdict(score);
            return {
                score,
                reproducibility: round3(reproducibility),
                independence: round3(independence),
                coverageRelevance: round3(coverageRelevance),
                provenanceIntegrity: round3(provenanceIntegrity),
                freshness: round3(freshness),
                gamingPenalty: round3(gamingPenalty),
                staleResultPenalty: round3(staleResultPenalty),
                unverifiableClaimPenalty: round3(unverifiableClaimPenalty),
                verdict,
                reasons: Object.freeze(reasons),
            };
        },
    };
}
|
|
403
|
+
// ─── Helpers ───────────────────────────────────────────────────────────────
|
|
404
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param value Any value.
 * @returns `true` for objects other than `null` and arrays, `false` otherwise.
 */
function isObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
407
|
+
// ─── Backward-compat: re-export as EvidenceTrustScore for integration ──────
|
|
408
|
+
export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore };
|
package/dist/evidence/index.d.ts
CHANGED
|
@@ -15,5 +15,7 @@ export type { DecisionTraceStore } from "./decision-trace.js";
|
|
|
15
15
|
export { createDecisionTraceStore } from "./decision-trace.js";
|
|
16
16
|
export type { ProofTrustMvpEngine, ProofTrustResult } from "./proof-trust.js";
|
|
17
17
|
export { createProofTrustMvpEngine } from "./proof-trust.js";
|
|
18
|
+
export type { EtsClaim, EtsClaimCategory, EtsTaskType, EtsRiskTier, RequiredEvidenceItem, RunArtifactMeta, CollectedEvidence, EvidenceVerificationResult, EtsV2Result, EtsV2Engine, EtsV2Params, EtsV2EngineOptions, } from "./evidence-trust-score.js";
|
|
19
|
+
export { extractClaims, requiredEvidenceForClaim, collectEvidenceFromRunDir, verifyEvidence, createEvidenceTrustScoreV2Engine, createEvidenceTrustScore, } from "./evidence-trust-score.js";
|
|
18
20
|
export type { AlgorithmSpec, ReleaseCandidate, RegressionProofMatrixResult, RegressionProofMatrixEngine, RegressionProofMatrixOptions, } from "./regression-proof-matrix.js";
|
|
19
21
|
export { createRegressionProofMatrixEngine } from "./regression-proof-matrix.js";
|
package/dist/evidence/index.js
CHANGED
|
@@ -6,4 +6,5 @@ export { createRunTraceStore } from "./run-trace.js";
|
|
|
6
6
|
export { decideRepair } from "../orchestration/repair-policy.js";
|
|
7
7
|
export { createDecisionTraceStore } from "./decision-trace.js";
|
|
8
8
|
export { createProofTrustMvpEngine } from "./proof-trust.js";
|
|
9
|
+
export { extractClaims, requiredEvidenceForClaim, collectEvidenceFromRunDir, verifyEvidence, createEvidenceTrustScoreV2Engine, createEvidenceTrustScore, } from "./evidence-trust-score.js";
|
|
9
10
|
export { createRegressionProofMatrixEngine } from "./regression-proof-matrix.js";
|
|
Binary file
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Merge Arbiter — patch scoring + conflict detection + winner selection.
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* CollectCandidatePatches → NormalizeDiffs → RunEvidenceSuite → ScorePatch
|
|
6
|
+
* → DetectConflicts → SelectWinnerOrHybrid → ProduceMergeRationale
|
|
7
|
+
*/
|
|
8
|
+
/**
 * One candidate patch as handled by the merge arbiter. The same shape is
 * accepted and returned by `runEvidenceSuite`, `scorePatch` and
 * `detectConflicts`, all declared in this file.
 *
 * NOTE(review): only the declarations are visible here; field notes marked
 * "presumably" are inferred from names and should be confirmed against the
 * implementation.
 */
export interface CandidatePatch {
    id: string;
    name: string;
    path: string;
    /** Diff text of the candidate. */
    diff: string;
    /** Presumably the result of `normalizeDiff(diff)` — confirm. */
    normalizedDiff: string;
    /** Presumably the result of `extractFileScopes(diff)` — confirm. */
    fileScopes: string[];
    diffLines: number;
    canApply: boolean;
    /** Presumably ids of other candidates this one conflicts with — confirm. */
    conflictsWith: string[];
    evidence: PatchEvidence;
    scores: PatchScores;
    compositeScore: number;
}
|
|
22
|
+
/**
 * Evidence attached to a candidate patch (`CandidatePatch.evidence`): three
 * pass/fail flags, an optional reviewer score with its reason, and an
 * evidence trust score.
 */
export interface PatchEvidence {
    testsPassed: boolean;
    lintPassed: boolean;
    typecheckPassed: boolean;
    /** Optional; value range is not visible from this declaration. */
    reviewerScore?: number;
    reviewerReason?: string;
    evidenceTrustScore: number;
}
|
|
30
|
+
/**
 * Per-criterion numeric scores of a candidate patch
 * (`CandidatePatch.scores`).
 *
 * NOTE(review): value ranges and how these combine into
 * `CandidatePatch.compositeScore` are not visible from this declaration.
 */
export interface PatchScores {
    testPassScore: number;
    evidenceTrustScore: number;
    minimalityScore: number;
    lintTypecheckScore: number;
    conflictFreeScore: number;
    reviewerAgreementScore: number;
}
|
|
38
|
+
/**
 * Result that `runMergeArbiter` resolves to: the selected candidate (or
 * `null` when none was selected), whether a human must approve, and the
 * accompanying rationale and trace.
 */
export interface MergeArbiterResult {
    winner: CandidatePatch | null;
    requiresHumanApproval: boolean;
    rationale: MergeRationale;
    trace: MergeTrace;
}
|
|
44
|
+
/**
 * Explanation of a merge decision, returned by `produceMergeRationale` and
 * carried in `MergeArbiterResult.rationale`.
 */
export interface MergeRationale {
    summary: string;
    /** `null` when no candidate was selected. */
    winnerId: string | null;
    /** Named numeric scores; the key set is not visible from this declaration. */
    scoreBreakdown: Record<string, number>;
    conflicts: string[];
    /** Presumably the auto-approval threshold that was applied — confirm. */
    threshold: number;
    humanApprovalReason?: string;
}
|
|
52
|
+
/**
 * Recorded steps of an arbiter run plus a timestamp string, returned by
 * `produceMergeRationale` and carried in `MergeArbiterResult.trace`.
 */
export interface MergeTrace {
    steps: MergeTraceStep[];
    /** Timestamp as a string; the format is not visible from this declaration. */
    timestamp: string;
}
|
|
56
|
+
/**
 * One entry of `MergeTrace.steps`: a step label, the candidate it concerns,
 * a free-text detail, and an optional duration in milliseconds.
 */
export interface MergeTraceStep {
    step: string;
    candidateId: string;
    detail: string;
    durationMs?: number;
}
|
|
62
|
+
/**
 * Optional tuning knobs accepted by the arbiter functions declared in this
 * file. Every field is optional; the defaults applied when a field is omitted
 * are not visible from this declaration.
 */
export interface MergeArbiterOptions {
    /** Minimum composite score (0–1) for auto-approval. */
    threshold?: number;
    /** Max diff lines before minimality score hits zero. */
    maxDiffLines?: number;
    /** Timeout for test execution in worktrees (ms). */
    testTimeoutMs?: number;
    /** Timeout for git apply --check (ms). */
    applyCheckTimeoutMs?: number;
}
|
|
72
|
+
// NOTE(review): declarations only — the implementations are not part of this
// file. Each function corresponds by name to a step of the pipeline listed in
// the file header; descriptions beyond the signatures are deliberately omitted.

/** Pipeline step "CollectCandidatePatches": resolves to the candidate patches for `worktreesDir` / `currentBranch`. */
export declare function collectCandidatePatches(worktreesDir: string, currentBranch: string, options?: MergeArbiterOptions): Promise<CandidatePatch[]>;
/** Pipeline step "NormalizeDiffs": maps diff text to its normalized text form. */
export declare function normalizeDiff(diff: string): string;
/** Returns the file scopes found in a diff, as strings. */
export declare function extractFileScopes(diff: string): string[];
/** Pipeline step "RunEvidenceSuite": resolves to a candidate; presumably the input with `evidence` filled in — confirm. */
export declare function runEvidenceSuite(candidate: CandidatePatch, projectRoot: string, config: string, options?: MergeArbiterOptions): Promise<CandidatePatch>;
/** Pipeline step "ScorePatch": returns a candidate; presumably the input with `scores` and `compositeScore` set — confirm. */
export declare function scorePatch(candidate: CandidatePatch, options?: MergeArbiterOptions): CandidatePatch;
/** Pipeline step "DetectConflicts": returns candidates; presumably with `conflictsWith` populated — confirm. */
export declare function detectConflicts(candidates: CandidatePatch[]): CandidatePatch[];
/**
 * Pipeline step "SelectWinnerOrHybrid": picks a winner (or `null`) among the
 * candidates and reports whether human approval is required, with an
 * optional reason.
 */
export declare function selectWinnerOrHybrid(candidates: CandidatePatch[], options?: MergeArbiterOptions): {
    winner: CandidatePatch | null;
    requiresHumanApproval: boolean;
    reason?: string;
};
/**
 * Pipeline step "ProduceMergeRationale": builds the rationale and trace for
 * a selection shaped like the return value of `selectWinnerOrHybrid`.
 */
export declare function produceMergeRationale(candidates: CandidatePatch[], selection: {
    winner: CandidatePatch | null;
    requiresHumanApproval: boolean;
    reason?: string;
}, options?: MergeArbiterOptions): {
    rationale: MergeRationale;
    trace: MergeTrace;
};
/** Entry point: resolves to the complete `MergeArbiterResult`; presumably runs the pipeline steps above in order — confirm. */
export declare function runMergeArbiter(worktreesDir: string, currentBranch: string, projectRoot: string, config: string, options?: MergeArbiterOptions): Promise<MergeArbiterResult>;
|