open-multi-agent-kit 0.78.1 → 0.78.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/MATURITY.md +4 -0
- package/README.md +70 -1
- package/dist/benchmark/contracts.d.ts +116 -0
- package/dist/benchmark/contracts.js +6 -0
- package/dist/benchmark/fixtures.d.ts +11 -0
- package/dist/benchmark/fixtures.js +121 -0
- package/dist/benchmark/harness.d.ts +13 -0
- package/dist/benchmark/harness.js +191 -0
- package/dist/benchmark/shadow-mode.d.ts +17 -0
- package/dist/benchmark/shadow-mode.js +96 -0
- package/dist/cli/register-spec-agent-goal-commands.js +45 -0
- package/dist/cli/release-promotion-gate.d.ts +14 -0
- package/dist/cli/release-promotion-gate.js +71 -0
- package/dist/cli/v2/release-commands.d.ts +29 -0
- package/dist/cli/v2/release-commands.js +95 -0
- package/dist/commands/chat/native-root-loop.js +14 -1
- package/dist/commands/chat/slash/commands/session.js +19 -1
- package/dist/commands/goal-interview.d.ts +18 -0
- package/dist/commands/goal-interview.js +396 -0
- package/dist/commands/merge.js +102 -56
- package/dist/contracts/interview.d.ts +106 -0
- package/dist/contracts/interview.js +9 -0
- package/dist/contracts/provider-health.d.ts +37 -0
- package/dist/contracts/provider-health.js +49 -1
- package/dist/evidence/evidence-trust-score.d.ts +101 -0
- package/dist/evidence/evidence-trust-score.js +408 -0
- package/dist/evidence/index.d.ts +6 -0
- package/dist/evidence/index.js +3 -0
- package/dist/evidence/proof-trust-cli.d.ts +8 -0
- package/dist/evidence/proof-trust-cli.js +27 -0
- package/dist/evidence/proof-trust.d.ts +14 -0
- package/dist/evidence/proof-trust.js +381 -0
- package/dist/evidence/regression-proof-matrix.d.ts +42 -0
- package/dist/evidence/regression-proof-matrix.js +72 -0
- package/dist/goal/intent-frame.d.ts +6 -0
- package/dist/goal/intent-frame.js +21 -9
- package/dist/goal/interview-assimilation.d.ts +13 -0
- package/dist/goal/interview-assimilation.js +383 -0
- package/dist/goal/interview-question-bank.d.ts +11 -0
- package/dist/goal/interview-question-bank.js +225 -0
- package/dist/goal/interview-scoring.d.ts +31 -0
- package/dist/goal/interview-scoring.js +187 -0
- package/dist/goal/interview-session.d.ts +25 -0
- package/dist/goal/interview-session.js +116 -0
- package/dist/input/input-envelope.d.ts +22 -0
- package/dist/input/input-envelope.js +1 -0
- package/dist/orchestration/merge-arbiter.d.ts +91 -0
- package/dist/orchestration/merge-arbiter.js +376 -0
- package/dist/providers/health.d.ts +3 -0
- package/dist/providers/health.js +46 -0
- package/dist/providers/index.d.ts +1 -0
- package/dist/providers/index.js +1 -0
- package/dist/providers/provider-health.d.ts +8 -1
- package/dist/providers/provider-health.js +39 -0
- package/dist/providers/provider-task-runner.js +31 -0
- package/dist/providers/provider.d.ts +2 -0
- package/dist/providers/router.js +87 -3
- package/dist/providers/types.d.ts +4 -0
- package/dist/runtime/advanced-control-loop.d.ts +60 -0
- package/dist/runtime/advanced-control-loop.js +136 -0
- package/dist/runtime/agent-runtime.d.ts +10 -0
- package/dist/runtime/blast-radius.d.ts +10 -0
- package/dist/runtime/blast-radius.js +14 -0
- package/dist/runtime/contracts/evidence.d.ts +87 -0
- package/dist/runtime/contracts/evidence.js +7 -0
- package/dist/runtime/contracts/router-v2.d.ts +44 -0
- package/dist/runtime/contracts/router-v2.js +4 -0
- package/dist/runtime/contracts/weakness-remediation.d.ts +67 -0
- package/dist/runtime/contracts/weakness-remediation.js +36 -0
- package/dist/runtime/kimi-api-runtime.js +59 -1
- package/dist/runtime/proof-bundle-trust.d.ts +74 -0
- package/dist/runtime/proof-bundle-trust.js +100 -0
- package/dist/runtime/provider-maturity-gate.d.ts +43 -0
- package/dist/runtime/provider-maturity-gate.js +129 -0
- package/dist/runtime/public-surface.d.ts +93 -0
- package/dist/runtime/public-surface.js +146 -0
- package/dist/runtime/router-v2-scoring.d.ts +11 -0
- package/dist/runtime/router-v2-scoring.js +151 -0
- package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
- package/dist/runtime/tool-dispatch-contracts.js +42 -2
- package/dist/runtime/weakness-remediation-index.d.ts +27 -0
- package/dist/runtime/weakness-remediation-index.js +37 -0
- package/dist/safety/enforcement-engine.d.ts +89 -0
- package/dist/safety/enforcement-engine.js +279 -0
- package/dist/safety/tool-authority-gate.d.ts +40 -0
- package/dist/safety/tool-authority-gate.js +92 -0
- package/dist/schema/evidence.schema.d.ts +2 -2
- package/dist/schema/proof-bundle.schema.d.ts +28 -28
- package/dist/util/clipboard-image.d.ts +49 -0
- package/dist/util/clipboard-image.js +263 -0
- package/docs/2026-06-09/critical-issues.md +20 -0
- package/docs/2026-06-09/improvements.md +14 -0
- package/docs/2026-06-09/init-checklist.md +25 -0
- package/docs/2026-06-09/plan.md +20 -0
- package/docs/benchmark-design.md +122 -0
- package/docs/github-organic-promotion.md +127 -0
- package/docs/native-root-runtime-algorithms.md +301 -0
- package/package.json +8 -4
- package/readmeasset/ASSET_INDEX.md +1 -0
- package/templates/skills/agents/omk-agent-reach-websearch/SKILL.md +55 -0
- package/templates/skills/kimi/omk-agent-reach-websearch/SKILL.md +55 -0
|
@@ -4,9 +4,47 @@
|
|
|
4
4
|
*
|
|
5
5
|
* Calls https://api.moonshot.cn/v1/chat/completions directly.
|
|
6
6
|
*/
|
|
7
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
8
|
+
import { resolve } from "node:path";
|
|
7
9
|
import { capsuleToTask } from "./context-broker-converter.js";
|
|
8
10
|
import { buildProviderToolPayload } from "./provider-tool-contracts.js";
|
|
9
11
|
import { repairToolCalls } from "./tool-call-repair.js";
|
|
12
|
+
/**
 * Identify an image format from the leading bytes of a file.
 *
 * Full signatures are checked (not just the first two bytes) so that other
 * RIFF containers such as WAV/AVI are not mistaken for WebP.
 *
 * @param buf File contents as a Buffer.
 * @returns The MIME type for PNG, JPEG, GIF or WebP content, or `undefined`
 *          when the bytes match none of those signatures.
 */
function sniffImageMimeType(buf) {
    if (buf.length >= 8 &&
        buf[0] === 0x89 && buf[1] === 0x50 && buf[2] === 0x4e && buf[3] === 0x47 &&
        buf[4] === 0x0d && buf[5] === 0x0a && buf[6] === 0x1a && buf[7] === 0x0a) {
        return "image/png";
    }
    if (buf.length >= 3 && buf[0] === 0xff && buf[1] === 0xd8 && buf[2] === 0xff) {
        return "image/jpeg";
    }
    // latin1 maps bytes 1:1 to chars, so ASCII tags can be compared as text.
    const head = buf.toString("latin1", 0, 12);
    if (head.startsWith("GIF87a") || head.startsWith("GIF89a")) {
        return "image/gif";
    }
    if (head.startsWith("RIFF") && head.slice(8, 12) === "WEBP") {
        return "image/webp";
    }
    return undefined;
}
/**
 * Detect "Image file: <path>" patterns in the prompt text (inserted by /paste
 * or Ctrl+V clipboard image) and load the referenced images as base64 data URIs
 * for multimodal API calls.
 *
 * Paths are resolved against the current working directory. A reference is
 * skipped (never thrown) when the file cannot be read, is empty, is larger
 * than 20 MiB, or does not start with a PNG/JPEG/GIF/WebP signature. A path
 * that appears several times in the prompt is embedded only once.
 *
 * @param prompt Prompt text to scan; non-string values are stringified.
 * @returns Array of `{ dataUri }` objects in order of first appearance.
 */
function extractInlineImageParts(prompt) {
    const MAX_IMAGE_BYTES = 20 * 1024 * 1024;
    const results = [];
    const seenPaths = new Set();
    // Match "Image file: .omk/screenshots/.../screenshot-xxx.png" lines
    const pattern = /^Image file:\s+(.+\.(?:png|jpg|jpeg|webp|gif))\s*$/gim;
    for (const match of String(prompt).matchAll(pattern)) {
        const absPath = resolve(match[1].trim());
        if (seenPaths.has(absPath))
            continue;
        seenPaths.add(absPath);
        let buf;
        try {
            // A missing or unreadable file throws here, so no separate
            // existence check is needed (and none can race with the read).
            buf = readFileSync(absPath);
        }
        catch {
            // Skip unreadable files
            continue;
        }
        if (buf.length === 0 || buf.length > MAX_IMAGE_BYTES)
            continue;
        const mimeType = sniffImageMimeType(buf);
        if (mimeType === undefined)
            continue; // not an image we can label correctly
        results.push({ dataUri: `data:${mimeType};base64,${buf.toString("base64")}` });
    }
    return results;
}
|
|
10
48
|
function mapToolCalls(apiToolCalls, context) {
|
|
11
49
|
const repaired = repairToolCalls({
|
|
12
50
|
declaredCalls: (apiToolCalls ?? []).map((tc) => ({
|
|
@@ -155,7 +193,27 @@ export class KimiApiRuntime {
|
|
|
155
193
|
if (task.context.system) {
|
|
156
194
|
messages.push({ role: "system", content: task.context.system });
|
|
157
195
|
}
|
|
158
|
-
|
|
196
|
+
// Build multimodal content when attachments are present or when
|
|
197
|
+
// the prompt contains "Image file: <path>" references (from /paste or
|
|
198
|
+
// Ctrl+V clipboard image). This makes clipboard-pasted images send as
|
|
199
|
+
// image_url content parts to OpenAI-compatible multimodal endpoints.
|
|
200
|
+
const attachments = task.attachments ?? [];
|
|
201
|
+
const inlineImages = extractInlineImageParts(task.prompt);
|
|
202
|
+
if (attachments.length > 0 || inlineImages.length > 0) {
|
|
203
|
+
const parts = [{ type: "text", text: task.prompt }];
|
|
204
|
+
for (const attachment of attachments) {
|
|
205
|
+
if (attachment.dataUri) {
|
|
206
|
+
parts.push({ type: "image_url", image_url: { url: attachment.dataUri } });
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
for (const image of inlineImages) {
|
|
210
|
+
parts.push({ type: "image_url", image_url: { url: image.dataUri } });
|
|
211
|
+
}
|
|
212
|
+
messages.push({ role: "user", content: parts });
|
|
213
|
+
}
|
|
214
|
+
else {
|
|
215
|
+
messages.push({ role: "user", content: task.prompt });
|
|
216
|
+
}
|
|
159
217
|
const providerTools = task.capabilities.toolCalling
|
|
160
218
|
? buildProviderToolPayload(task.tools.available)
|
|
161
219
|
: buildProviderToolPayload([]);
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Proof Bundle Trust Score — Phase 2 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates a curated proof bundle across 8 dimensions and produces
|
|
5
|
+
* a trust score T_b, a permission level, and a pass/fail verdict
|
|
6
|
+
* against τ_proof.
|
|
7
|
+
*/
|
|
8
|
+
import type { ClaimPermissionLevel, EvidenceVerdict } from "./contracts/evidence.js";
|
|
9
|
+
/**
 * The 8 scored dimensions of a proof bundle.
 *
 * Every field is a score on the [0, 1] scale. The engine's `evaluate`
 * clamps each value into that range before applying its weight.
 */
export interface ProofBundleScores {
    /** Schema conformance of evidence items [0, 1]. Weight 0.15. */
    readonly schema: number;
    /** Hash integrity / tamper evidence [0, 1]. Weight 0.15. */
    readonly hashes: number;
    /** Command trace coverage and correctness [0, 1]. Weight 0.15. */
    readonly commands: number;
    /** Stdout / stderr capture completeness [0, 1]. Weight 0.10. */
    readonly stdout: number;
    /** Decision record quality and count [0, 1]. Weight 0.15. */
    readonly decisions: number;
    /** Evidence item confidence and verdict strength [0, 1]. Weight 0.15. */
    readonly evidence: number;
    /** Acknowledged limitations documented [0, 1]. Weight 0.05. */
    readonly limitations: number;
    /** Replay reproducibility score [0, 1]. Weight 0.10. */
    readonly replay: number;
}
|
|
28
|
+
/** Result of evaluating a proof bundle. */
export interface TrustScoreResult {
    /** Computed trust score T_b ∈ [0, 1] (weighted sum of the clamped dimensions). */
    readonly score: number;
    /**
     * Permission level derived from score thresholds:
     * ≥ 0.90 strong public claim, ≥ 0.75 qualified public claim,
     * ≥ 0.60 internal claim only, otherwise no claim.
     */
    readonly permissionLevel: ClaimPermissionLevel;
    /** Whether score meets τ_proof (`score >= TAU_PROOF`). */
    readonly passed: boolean;
    /** Individual dimension values after clamping to [0, 1] (not multiplied by their weights). */
    readonly breakdown: ProofBundleScores;
}
|
|
39
|
+
/** Engine that evaluates proof bundle trust. */
export interface ProofBundleTrustEngine {
    /**
     * Evaluate a proof bundle from its 8 dimension scores.
     *
     * Each dimension is clamped to [0, 1] before weighting.
     *
     * Formula:
     *   T_b = 0.15·schema + 0.15·hashes + 0.15·commands + 0.10·stdout
     *       + 0.15·decisions + 0.15·evidence + 0.05·limitations + 0.10·replay
     *
     * @param scores Dimension scores to evaluate.
     * @returns The trust score, permission level, pass/fail flag and clamped breakdown.
     */
    evaluate(scores: ProofBundleScores): TrustScoreResult;
    /**
     * Derive dimension scores from a raw evidence verdict and coverage.
     *
     * @param verdict         Verdict mapped to a base score
     *                        (pass 1.0, partial 0.65, pending 0.35, anything else 0).
     * @param coveragePercent Coverage on a 0–100 scale; divided by 100 and clamped to [0, 1].
     * @param options         Per-dimension overrides that replace the inferred defaults.
     * @returns Dimension scores suitable for passing to `evaluate`.
     */
    deriveScores(verdict: EvidenceVerdict, coveragePercent: number, options?: DeriveScoresOptions): ProofBundleScores;
}
|
|
52
|
+
/**
 * Options for automatic score derivation.
 *
 * Every field is optional; a supplied value replaces the inferred default
 * and is clamped to [0, 1] like the defaults are.
 */
export interface DeriveScoresOptions {
    /** Override schema conformance (default inferred from verdict). */
    readonly schema?: number;
    /** Override hash integrity (default 1.0). */
    readonly hashes?: number;
    /** Override command trace score (default inferred from coverage). */
    readonly commands?: number;
    /** Override stdout completeness (default inferred from coverage). */
    readonly stdout?: number;
    /** Override decision record score (default inferred from verdict). */
    readonly decisions?: number;
    /** Override evidence strength (default inferred from verdict). */
    readonly evidence?: number;
    /** Override limitations documentation (default 0.5). */
    readonly limitations?: number;
    /** Override replay score (default inferred from verdict). */
    readonly replay?: number;
}
|
|
71
|
+
/**
 * Create a ProofBundleTrustEngine with default weights and thresholds.
 *
 * @returns An engine exposing `evaluate` and `deriveScores`.
 */
export declare function createProofBundleTrustEngine(): ProofBundleTrustEngine;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Proof Bundle Trust Score — Phase 2 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates a curated proof bundle across 8 dimensions and produces
|
|
5
|
+
* a trust score T_b, a permission level, and a pass/fail verdict
|
|
6
|
+
* against τ_proof.
|
|
7
|
+
*/
|
|
8
|
+
import { TAU_PROOF } from "./contracts/weakness-remediation.js";
|
|
9
|
+
// ── Constants ───────────────────────────────────────────────────
// Per-dimension weights used by evaluate(); the eight weights sum to 1.00.
const WEIGHT_SCHEMA = 0.15;
const WEIGHT_HASHES = 0.15;
const WEIGHT_COMMANDS = 0.15;
const WEIGHT_STDOUT = 0.10;
const WEIGHT_DECISIONS = 0.15;
const WEIGHT_EVIDENCE = 0.15;
const WEIGHT_LIMITATIONS = 0.05;
const WEIGHT_REPLAY = 0.10;
// Minimum trust score (inclusive) for each permission level; a score below
// INTERNAL_CLAIM_THRESHOLD maps to "no-claim".
const STRONG_PUBLIC_THRESHOLD = 0.90;
const QUALIFIED_PUBLIC_THRESHOLD = 0.75;
const INTERNAL_CLAIM_THRESHOLD = 0.60;
|
|
21
|
+
// ── Helpers ─────────────────────────────────────────────────────
|
|
22
|
+
/**
 * Clamp a value into the closed interval [0, 1].
 *
 * NaN — including the NaN produced when `n` is `undefined` — is mapped to 0
 * so that a missing dimension lowers the score instead of turning the whole
 * weighted sum into NaN.
 *
 * @param n Value to clamp.
 * @returns A number in [0, 1].
 */
function clamp01(n) {
    const bounded = Math.max(0, Math.min(1, n));
    return Number.isNaN(bounded) ? 0 : bounded;
}
|
|
25
|
+
/**
 * Map a trust score to its claim permission level.
 *
 * Levels are tried from the most to the least permissive threshold and the
 * first one the score reaches wins.
 *
 * @param score Trust score to classify.
 * @returns "strong-public-claim", "qualified-public-claim",
 *          "internal-claim-only", or "no-claim" when no threshold is met.
 */
function permissionLevelFromScore(score) {
    const ladder = [
        [STRONG_PUBLIC_THRESHOLD, "strong-public-claim"],
        [QUALIFIED_PUBLIC_THRESHOLD, "qualified-public-claim"],
        [INTERNAL_CLAIM_THRESHOLD, "internal-claim-only"],
    ];
    for (const [minimum, level] of ladder) {
        if (score >= minimum) {
            return level;
        }
    }
    return "no-claim";
}
|
|
37
|
+
/**
 * Translate an evidence verdict into the base score used as the default
 * for verdict-derived dimensions.
 *
 * @param verdict Evidence verdict string.
 * @returns 1.0 for "pass", 0.65 for "partial", 0.35 for "pending", and 0.0
 *          for "fail" or any unrecognized value.
 */
function verdictToBaseScore(verdict) {
    if (verdict === "pass") {
        return 1.0;
    }
    if (verdict === "partial") {
        return 0.65;
    }
    if (verdict === "pending") {
        return 0.35;
    }
    // "fail" and every unknown verdict earn no credit.
    return 0.0;
}
|
|
51
|
+
// ── Engine Factory ──────────────────────────────────────────────
|
|
52
|
+
/**
 * Create a ProofBundleTrustEngine with default weights and thresholds.
 *
 * The returned object holds no state of its own; both methods are pure
 * functions of their arguments and return frozen objects.
 */
export function createProofBundleTrustEngine() {
    return {
        /**
         * Compute the trust score T_b for a set of dimension scores.
         *
         * @param scores The 8 dimension scores; each is clamped to [0, 1].
         * @returns Frozen result with the score, permission level, pass flag
         *          and the (also frozen) clamped breakdown.
         */
        evaluate(scores) {
            // Frozen so the nested breakdown is as immutable as the result
            // that carries it.
            const breakdown = Object.freeze({
                schema: clamp01(scores.schema),
                hashes: clamp01(scores.hashes),
                commands: clamp01(scores.commands),
                stdout: clamp01(scores.stdout),
                decisions: clamp01(scores.decisions),
                evidence: clamp01(scores.evidence),
                limitations: clamp01(scores.limitations),
                replay: clamp01(scores.replay),
            });
            const weightedSum = WEIGHT_SCHEMA * breakdown.schema +
                WEIGHT_HASHES * breakdown.hashes +
                WEIGHT_COMMANDS * breakdown.commands +
                WEIGHT_STDOUT * breakdown.stdout +
                WEIGHT_DECISIONS * breakdown.decisions +
                WEIGHT_EVIDENCE * breakdown.evidence +
                WEIGHT_LIMITATIONS * breakdown.limitations +
                WEIGHT_REPLAY * breakdown.replay;
            // Re-clamp: floating-point rounding can push the sum just past 1.
            const finalScore = clamp01(weightedSum);
            return Object.freeze({
                score: finalScore,
                permissionLevel: permissionLevelFromScore(finalScore),
                passed: finalScore >= TAU_PROOF,
                breakdown,
            });
        },
        /**
         * Build dimension scores from a verdict and a coverage percentage.
         *
         * Verdict-driven dimensions (schema, decisions, evidence, replay)
         * default to the verdict's base score; commands and stdout default
         * to coverage; hashes defaults to 1.0 and limitations to 0.5.
         *
         * @param verdict         Evidence verdict.
         * @param coveragePercent Coverage on a 0–100 scale.
         * @param options         Optional per-dimension overrides; `null` is
         *                        treated like an omitted argument.
         * @returns Frozen dimension scores, each clamped to [0, 1].
         */
        deriveScores(verdict, coveragePercent, options = {}) {
            const overrides = options ?? {};
            const base = verdictToBaseScore(verdict);
            const cov = clamp01(coveragePercent / 100);
            return Object.freeze({
                schema: clamp01(overrides.schema ?? base),
                hashes: clamp01(overrides.hashes ?? 1.0),
                commands: clamp01(overrides.commands ?? cov),
                stdout: clamp01(overrides.stdout ?? cov),
                decisions: clamp01(overrides.decisions ?? base),
                evidence: clamp01(overrides.evidence ?? base),
                limitations: clamp01(overrides.limitations ?? 0.5),
                replay: clamp01(overrides.replay ?? base),
            });
        },
    };
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider Maturity Gate — Phase 3 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates a provider/runtime across 8 adapter test dimensions and
|
|
5
|
+
* produces a maturity score M_p, an authority class, and a pass/fail
|
|
6
|
+
* verdict.
|
|
7
|
+
*/
|
|
8
|
+
import type { AdapterTestKind, AdapterTestResult, ProviderAuthorityClass } from "./contracts/evidence.js";
|
|
9
|
+
import type { ProviderHealthVector } from "../contracts/provider-health.js";
|
|
10
|
+
/** Maturity evaluation result for a single provider. */
export interface MaturityResult {
    /** Computed maturity score M_p ∈ [0, 1] (weighted sum of the sub-scores). */
    readonly score: number;
    /** Authority class derived from score and sub-score constraints. */
    readonly authorityClass: ProviderAuthorityClass;
    /** Whether the provider meets minimum viability, i.e. its authority class is not "disabled". */
    readonly passed: boolean;
    /** Sub-scores keyed by adapter test kind, each clamped to [0, 1]; kinds with no result are 0. */
    readonly subScores: Readonly<Record<AdapterTestKind, number>>;
}
|
|
21
|
+
/** Engine that evaluates provider maturity. */
export interface ProviderMaturityGate {
    /**
     * Evaluate provider maturity from adapter test results.
     *
     * Formula:
     *   M_p = 0.10·s_auth + 0.10·s_read + 0.15·s_write + 0.10·s_shell
     *       + 0.15·s_mcp + 0.15·s_merge + 0.15·s_evidence + 0.10·s_fallback
     *
     * @param results Adapter test results; a kind with no entry contributes 0.
     * @returns The maturity score, authority class, pass flag and sub-scores.
     */
    evaluate(results: readonly AdapterTestResult[]): MaturityResult;
    /**
     * Look up a single sub-score by test kind (defaults to 0).
     *
     * @param results Adapter test results to search.
     * @param kind    Test kind to look up.
     * @returns The matching result's score clamped to [0, 1], or 0 when absent.
     */
    getSubScore(results: readonly AdapterTestResult[], kind: AdapterTestKind): number;
}
|
|
34
|
+
/**
 * Registry of maturity results keyed by provider id.
 */
export interface ProviderMaturityTable {
    /** Return the result registered for `providerId`, or `undefined` when there is none. */
    lookup(providerId: string): MaturityResult | undefined;
    /** Store `result` under `providerId`, replacing any earlier entry for that id. */
    register(providerId: string, result: MaturityResult): void;
}
/** Create an empty ProviderMaturityTable. */
export declare function createProviderMaturityTable(): ProviderMaturityTable;
/**
 * Create a ProviderMaturityGate with default weights and thresholds.
 */
export declare function createProviderMaturityGate(): ProviderMaturityGate;
/**
 * Convert a provider health vector into adapter test results and evaluate
 * them with `gate`.
 */
export declare function evaluateProviderFromVector(gate: ProviderMaturityGate, vector: ProviderHealthVector): MaturityResult;
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider Maturity Gate — Phase 3 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Evaluates a provider/runtime across 8 adapter test dimensions and
|
|
5
|
+
* produces a maturity score M_p, an authority class, and a pass/fail
|
|
6
|
+
* verdict.
|
|
7
|
+
*/
|
|
8
|
+
import { PROVIDER_CAPABILITY_ORDINAL } from "../contracts/provider-health.js";
|
|
9
|
+
// ── Constants ───────────────────────────────────────────────────
// Per-kind weights used by evaluate(); the eight weights sum to 1.00.
const WEIGHT_AUTH = 0.10;
const WEIGHT_READ = 0.10;
const WEIGHT_WRITE = 0.15;
const WEIGHT_SHELL = 0.10;
const WEIGHT_MCP = 0.15;
const WEIGHT_MERGE = 0.15;
const WEIGHT_EVIDENCE = 0.15;
const WEIGHT_FALLBACK = 0.10;
// "merge-authority": overall score plus minimum merge and evidence sub-scores.
const MERGE_AUTHORITY_THRESHOLD = 0.90;
const MERGE_SUBSCORE_THRESHOLD = 0.90;
const EVIDENCE_SUBSCORE_THRESHOLD_FOR_MERGE = 0.85;
// "write-authority": overall score plus minimum write sub-score.
const WRITE_AUTHORITY_THRESHOLD = 0.80;
const WRITE_SUBSCORE_THRESHOLD = 0.85;
// "review-authority": overall score plus minimum read sub-score.
const REVIEW_AUTHORITY_THRESHOLD = 0.70;
const READ_SUBSCORE_THRESHOLD = 0.90;
// "read-only-advisory": overall score only; anything lower is "disabled".
const READ_ONLY_ADVISORY_THRESHOLD = 0.55;
|
|
26
|
+
// ── Helpers ─────────────────────────────────────────────────────
|
|
27
|
+
/**
 * Restrict a value to the closed interval [0, 1].
 *
 * A NaN result (from a NaN or `undefined` input) becomes 0, so an adapter
 * result with a missing score counts as a failed dimension rather than
 * making the whole maturity score NaN.
 *
 * @param n Value to restrict.
 * @returns A number in [0, 1].
 */
function clamp01(n) {
    const limited = Math.min(1, n);
    if (Number.isNaN(limited)) {
        return 0;
    }
    return Math.max(0, limited);
}
|
|
30
|
+
/**
 * Pick the highest authority class whose requirements are all satisfied.
 *
 * Tiers are tried from most to least privileged. Each tier needs the overall
 * score to reach its threshold and, for the top three tiers, specific
 * sub-scores to reach theirs.
 *
 * @param score     Overall maturity score.
 * @param subScores Sub-scores keyed by adapter test kind.
 * @returns "merge-authority", "write-authority", "review-authority",
 *          "read-only-advisory", or "disabled" when no tier qualifies.
 */
function computeAuthorityClass(score, subScores) {
    // Sub-score checks are wrapped in functions so they are only evaluated
    // once the overall-score requirement of that tier has been met.
    const tiers = [
        [
            "merge-authority",
            MERGE_AUTHORITY_THRESHOLD,
            () => subScores.merge >= MERGE_SUBSCORE_THRESHOLD &&
                subScores.evidence >= EVIDENCE_SUBSCORE_THRESHOLD_FOR_MERGE,
        ],
        [
            "write-authority",
            WRITE_AUTHORITY_THRESHOLD,
            () => subScores.write >= WRITE_SUBSCORE_THRESHOLD,
        ],
        [
            "review-authority",
            REVIEW_AUTHORITY_THRESHOLD,
            () => subScores.read >= READ_SUBSCORE_THRESHOLD,
        ],
        ["read-only-advisory", READ_ONLY_ADVISORY_THRESHOLD, () => true],
    ];
    for (const [authorityClass, minimumScore, subScoresSatisfied] of tiers) {
        if (score >= minimumScore && subScoresSatisfied()) {
            return authorityClass;
        }
    }
    return "disabled";
}
|
|
49
|
+
/**
 * Collapse a list of adapter test results into one sub-score per test kind.
 *
 * Every known kind starts at 0, so a kind with no result counts as failed.
 * When a kind appears more than once the last result wins. Results whose
 * `kind` is not one of the eight known kinds are ignored instead of adding
 * stray keys to the returned record.
 *
 * @param results Adapter test results.
 * @returns A new object with exactly the eight kinds as keys and clamped scores as values.
 */
function buildSubScoreMap(results) {
    const map = {
        auth: 0,
        read: 0,
        write: 0,
        shell: 0,
        mcp: 0,
        merge: 0,
        evidence: 0,
        fallback: 0,
    };
    for (const r of results) {
        // Own-property check (not `in`) so inherited names such as
        // "toString" or "__proto__" are rejected too.
        if (Object.prototype.hasOwnProperty.call(map, r.kind)) {
            map[r.kind] = clamp01(r.score);
        }
    }
    return map;
}
|
|
65
|
+
/**
 * Create an empty in-memory registry of maturity results.
 *
 * The backing Map is private to the returned object; entries can only be
 * read and written through `lookup` and `register`.
 *
 * @returns An object with `lookup(providerId)` and `register(providerId, result)`.
 */
export function createProviderMaturityTable() {
    const resultsByProvider = new Map();
    // Returns the stored result, or undefined for an unknown provider id.
    const lookup = (providerId) => resultsByProvider.get(providerId);
    // Stores the result, overwriting any previous entry for the same id.
    const register = (providerId, result) => {
        resultsByProvider.set(providerId, result);
    };
    return { lookup, register };
}
|
|
76
|
+
/**
 * Translate a provider health vector into the eight adapter test results
 * consumed by the maturity gate.
 *
 * - auth / mcp: progress of the vector's `auth` / `binary` capability level
 *   towards "auth_valid" / "tool_contract_verified" (1.0 once reached,
 *   otherwise the ordinal ratio).
 * - read / write / shell: 1.0 when the matching `supports*` flag is truthy, else 0.0.
 * - merge / evidence: both taken from `evidencePassRate7d`.
 * - fallback: `1 - failureEwma`.
 *
 * Every score is confined to [0, 1]; an unknown capability level or a
 * non-numeric rate yields 0 rather than NaN. `passed` is `score >= 0.5`.
 *
 * @param vector Provider health vector.
 * @returns Array of `{ kind, passed, score }` entries, one per kind.
 */
function vectorToAdapterResults(vector) {
    // Confine a value to [0, 1]; NaN/undefined count as 0.
    const unit = (value) => {
        const bounded = Math.max(0, Math.min(1, value));
        return Number.isNaN(bounded) ? 0.0 : bounded;
    };
    // Fraction of the way from ordinal 0 to the target capability level.
    const progressTowards = (level, targetLevel) => {
        const ordinal = PROVIDER_CAPABILITY_ORDINAL[level];
        const target = PROVIDER_CAPABILITY_ORDINAL[targetLevel];
        return ordinal >= target ? 1.0 : unit(ordinal / target);
    };
    const evidenceRate = unit(vector.evidencePassRate7d);
    const scoresByKind = [
        ["auth", progressTowards(vector.auth, "auth_valid")],
        ["read", vector.supportsRead ? 1.0 : 0.0],
        ["write", vector.supportsWrite ? 1.0 : 0.0],
        ["shell", vector.supportsShell ? 1.0 : 0.0],
        ["mcp", progressTowards(vector.binary, "tool_contract_verified")],
        ["merge", evidenceRate],
        ["evidence", evidenceRate],
        ["fallback", unit(1.0 - vector.failureEwma)],
    ];
    return scoresByKind.map(([kind, score]) => ({ kind, passed: score >= 0.5, score }));
}
|
|
100
|
+
/**
 * Create a ProviderMaturityGate with default weights and thresholds.
 *
 * The returned object holds no state; both methods depend only on their arguments.
 */
export function createProviderMaturityGate() {
    return {
        /**
         * Compute the maturity score M_p and authority class.
         *
         * @param results Adapter test results; kinds without a result score 0,
         *                and for repeated kinds the last result is used.
         * @returns Frozen result; `passed` is true unless the class is "disabled".
         */
        evaluate(results) {
            const subScores = Object.freeze(buildSubScoreMap(results));
            const score = WEIGHT_AUTH * subScores.auth +
                WEIGHT_READ * subScores.read +
                WEIGHT_WRITE * subScores.write +
                WEIGHT_SHELL * subScores.shell +
                WEIGHT_MCP * subScores.mcp +
                WEIGHT_MERGE * subScores.merge +
                WEIGHT_EVIDENCE * subScores.evidence +
                WEIGHT_FALLBACK * subScores.fallback;
            // Re-clamp: floating-point rounding can push the sum just past 1.
            const finalScore = clamp01(score);
            const authorityClass = computeAuthorityClass(finalScore, subScores);
            return Object.freeze({
                score: finalScore,
                authorityClass,
                passed: authorityClass !== "disabled",
                subScores,
            });
        },
        /**
         * Look up the clamped sub-score for one test kind.
         *
         * Scans from the end so that, when a kind is repeated, the same
         * (last) result is used here as in `evaluate`.
         *
         * @param results Adapter test results.
         * @param kind    Test kind to look up.
         * @returns The clamped score, or 0 when no result has that kind.
         */
        getSubScore(results, kind) {
            for (let i = results.length - 1; i >= 0; i--) {
                if (results[i].kind === kind) {
                    return clamp01(results[i].score);
                }
            }
            return 0;
        },
    };
}
|
|
127
|
+
/**
 * Evaluate a provider directly from its health vector.
 *
 * @param gate   Maturity gate whose `evaluate` performs the scoring.
 * @param vector Provider health vector to convert into adapter test results.
 * @returns Whatever `gate.evaluate` returns for the converted results.
 */
export function evaluateProviderFromVector(gate, vector) {
    const adapterResults = vectorToAdapterResults(vector);
    return gate.evaluate(adapterResults);
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public Surface Compression — Phase 1 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Takes a candidate set of runtime surfaces, scores each item,
|
|
5
|
+
* enforces mandatory anchors, applies budget K, and returns
|
|
6
|
+
* public surface S and hidden set H.
|
|
7
|
+
*
|
|
8
|
+
* Also enforces the 5-step flow invariant:
|
|
9
|
+
* goal → dag → route → verify → replay
|
|
10
|
+
*/
|
|
11
|
+
/** A candidate surface item (e.g., a tool, MCP server, skill, or runtime). */
export interface SurfaceItem {
    /** Identifier; mandatory anchors are recognized by matching this against the anchor names. */
    readonly id: string;
    /** Display name. */
    readonly name: string;
    /** Kind of surface this item represents. */
    readonly category: "tool" | "mcp" | "skill" | "runtime" | "hook";
    /**
     * How often this surface is invoked per 100 turns.
     *
     * NOTE(review): the default score multiplies this value by 0.30 and clamps
     * the total to [0, 1], so unnormalized counts saturate the score — confirm
     * whether callers are expected to pass a [0, 1] ratio.
     */
    readonly usage: number;
    /** Contribution score from verified runs [0, 1]. */
    readonly verifiedRunContribution: number;
    /** Contribution score from evidence items [0, 1]. */
    readonly evidenceContribution: number;
    /** Onboarding difficulty/cost [0, 1]. Lowers the score. */
    readonly onboardingCost: number;
    /** Explainability burden [0, 1]. Lowers the score. */
    readonly explainabilityCost: number;
    /** Risk of lineage drift [0, 1]. Lowers the score. */
    readonly lineageRisk: number;
}
|
|
29
|
+
/** Scored surface item with computed score. */
export interface ScoredSurfaceItem extends SurfaceItem {
    /** Score assigned by the compressor's scoring function. */
    readonly score: number;
}
|
|
33
|
+
/** Mandatory anchor identifiers, i.e. the five steps of the goal → dag → route → verify → replay flow. */
export type MandatoryAnchor = "goal" | "dag" | "route" | "verify" | "replay";
|
|
35
|
+
/** Compression result: public surface S and hidden set H. */
export interface CompressionResult {
    /** Public surface S: the items kept visible. */
    readonly publicSurface: readonly ScoredSurfaceItem[];
    /** Hidden set H: the candidates left out of the public surface. */
    readonly hiddenSet: readonly ScoredSurfaceItem[];
    /** The mandatory anchor identifiers. */
    readonly mandatoryAnchors: readonly MandatoryAnchor[];
    /** Budget K that was applied. */
    readonly budget: number;
    /** Whether the 5-step flow invariant held (presumably for `publicSurface` — confirm against `compress`). */
    readonly invariantPassed: boolean;
    /** Human-readable descriptions of invariant violations; presumably empty when `invariantPassed` is true. */
    readonly invariantViolations: readonly string[];
}
|
|
44
|
+
/**
 * Compute surface score from item metrics.
 *
 * Formula:
 *   0.30 * usage
 * + 0.30 * verifiedRunContribution
 * + 0.20 * evidenceContribution
 * - 0.10 * onboardingCost
 * - 0.05 * explainabilityCost
 * - 0.05 * lineageRisk
 *
 * The result is clamped to [0, 1].
 *
 * @param item Surface item whose metrics are scored.
 * @returns The clamped score.
 */
export declare function computeSurfaceScore(item: SurfaceItem): number;
|
|
56
|
+
/**
 * Validate the 5-step flow invariant against the public surface.
 *
 * Invariant: The public surface must contain all mandatory anchors
 * in order: goal → dag → route → verify → replay.
 *
 * Anchors are matched by item `id`. Ordering is only checked when no anchor
 * is missing.
 *
 * Returns violations as human-readable strings.
 *
 * @param publicSurface Items of the public surface, in their surface order.
 * @returns `passed` is true exactly when `violations` is empty.
 */
export declare function enforceFlowInvariant(publicSurface: readonly ScoredSurfaceItem[]): {
    readonly passed: boolean;
    readonly violations: readonly string[];
};
|
|
68
|
+
/** Construction options for PublicSurfaceCompressor. */
export interface PublicSurfaceCompressorOptions {
    /**
     * Maximum number of items in the public surface (default 5).
     *
     * Values below the number of mandatory anchors (5) are raised to 5.
     */
    readonly budget?: number;
    /** Optional custom scoring function; defaults to `computeSurfaceScore`. */
    readonly scoreFn?: (item: SurfaceItem) => number;
}
|
|
74
|
+
/**
 * Compresses a candidate surface set into public (S) and hidden (H) subsets.
 *
 * Rules:
 * 1. Mandatory anchors A = {goal, dag, route, verify, replay} are always in S.
 * 2. Remaining slots are filled by highest score until budget K is reached.
 * 3. The 5-step flow invariant is enforced and reported.
 */
export declare class PublicSurfaceCompressor {
    /** Effective budget K; never smaller than the number of mandatory anchors. */
    private readonly budget;
    /** Scoring function applied to every candidate. */
    private readonly scoreFn;
    /**
     * @param options Optional budget and scoring-function overrides.
     */
    constructor(options?: PublicSurfaceCompressorOptions);
    /**
     * Compress candidates into public surface S and hidden set H.
     *
     * @param candidates All candidate surface items.
     * @returns CompressionResult with S, H, and invariant status.
     */
    compress(candidates: readonly SurfaceItem[]): CompressionResult;
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public Surface Compression — Phase 1 of OMK Weakness Remediation.
|
|
3
|
+
*
|
|
4
|
+
* Takes a candidate set of runtime surfaces, scores each item,
|
|
5
|
+
* enforces mandatory anchors, applies budget K, and returns
|
|
6
|
+
* public surface S and hidden set H.
|
|
7
|
+
*
|
|
8
|
+
* Also enforces the 5-step flow invariant:
|
|
9
|
+
* goal → dag → route → verify → replay
|
|
10
|
+
*/
|
|
11
|
+
// ── Constants ───────────────────────────────────────────────────
// Ids of the five flow steps, listed in the order the invariant requires.
const MANDATORY_ANCHORS = [
    "goal",
    "dag",
    "route",
    "verify",
    "replay",
];
// Budget K used when the caller does not supply one.
const DEFAULT_BUDGET = 5;
|
|
20
|
+
// ── Scoring ─────────────────────────────────────────────────────
|
|
21
|
+
/**
 * Compute surface score from item metrics.
 *
 * Formula:
 *   0.30 * usage
 * + 0.30 * verifiedRunContribution
 * + 0.20 * evidenceContribution
 * - 0.10 * onboardingCost
 * - 0.05 * explainabilityCost
 * - 0.05 * lineageRisk
 *
 * The result is clamped to [0, 1]. If any metric is missing or NaN the raw
 * sum is NaN; that case scores 0 so that a malformed item ranks last instead
 * of poisoning score comparisons with NaN.
 *
 * NOTE(review): `usage` is multiplied directly by 0.30, so values above 1
 * quickly saturate the score at 1 — confirm callers pass a normalized value.
 *
 * @param item Surface item whose metrics are scored.
 * @returns A score in [0, 1].
 */
export function computeSurfaceScore(item) {
    const raw = 0.30 * item.usage +
        0.30 * item.verifiedRunContribution +
        0.20 * item.evidenceContribution -
        0.10 * item.onboardingCost -
        0.05 * item.explainabilityCost -
        0.05 * item.lineageRisk;
    if (Number.isNaN(raw)) {
        return 0;
    }
    // Clamp to [0, 1]
    return Math.max(0, Math.min(1, raw));
}
|
|
42
|
+
// ── Invariant Enforcement ───────────────────────────────────────
|
|
43
|
+
/**
 * Check that a public surface satisfies the 5-step flow invariant.
 *
 * The surface must contain an item for every mandatory anchor (matched by
 * `id`), and the anchors' first occurrences must appear in the order
 * goal → dag → route → verify → replay. The order is only examined when no
 * anchor is missing.
 *
 * @param publicSurface Items of the public surface, in surface order.
 * @returns `{ passed, violations }` where `violations` is a frozen list of
 *          human-readable messages and `passed` is true when it is empty.
 */
export function enforceFlowInvariant(publicSurface) {
    // Remember where each id first appears in the surface.
    const firstPosition = new Map();
    publicSurface.forEach((entry, position) => {
        if (!firstPosition.has(entry.id)) {
            firstPosition.set(entry.id, position);
        }
    });
    const violations = MANDATORY_ANCHORS
        .filter((anchor) => !firstPosition.has(anchor))
        .map((anchor) => `Missing mandatory anchor: ${anchor}`);
    const allAnchorsPresent = violations.length === 0;
    if (allAnchorsPresent) {
        // Compare each anchor with its predecessor in the required sequence.
        MANDATORY_ANCHORS.slice(1).forEach((anchor, offset) => {
            const predecessor = MANDATORY_ANCHORS[offset];
            if (firstPosition.get(anchor) < firstPosition.get(predecessor)) {
                violations.push(`Flow order violation: ${predecessor} must precede ${anchor}`);
            }
        });
    }
    const passed = violations.length === 0;
    return {
        passed,
        violations: Object.freeze(violations),
    };
}
|
|
72
|
+
/**
 * Splits a set of candidate surface items into a public surface (S) and a
 * hidden set (H).
 *
 * Rules:
 *   1. The mandatory anchors {goal, dag, route, verify, replay} always occupy
 *      the first slots of S, in canonical order.
 *   2. Any slots left in the budget K go to the highest-scoring other items.
 *   3. The 5-step flow invariant is evaluated on S and reported in the result.
 */
export class PublicSurfaceCompressor {
    budget;
    scoreFn;
    /**
     * @param options Optional settings: `budget` (falls back to the default
     *   budget, and is raised to at least the number of mandatory anchors)
     *   and `scoreFn` (falls back to `computeSurfaceScore`).
     */
    constructor(options = {}) {
        const requestedBudget = options.budget ?? DEFAULT_BUDGET;
        this.budget = Math.max(MANDATORY_ANCHORS.length, requestedBudget);
        this.scoreFn = options.scoreFn ?? computeSurfaceScore;
    }
    /**
     * Compress candidates into the public surface S and the hidden set H.
     *
     * Every candidate is shallow-copied with an added `score` property; the
     * input items themselves are not modified.
     *
     * @param candidates All candidate surface items.
     * @returns Frozen result holding `publicSurface`, `hiddenSet`,
     *   `mandatoryAnchors`, `budget`, `invariantPassed` and
     *   `invariantViolations`.
     */
    compress(candidates) {
        const withScores = candidates.map((candidate) => ({
            ...candidate,
            score: this.scoreFn(candidate),
        }));
        const isAnchor = (entry) => MANDATORY_ANCHORS.includes(entry.id);
        const anchorCandidates = withScores.filter(isAnchor);
        const electives = withScores.filter((entry) => !isAnchor(entry));
        // Stand-in for an anchor no candidate provided: zeroed metrics and a
        // fixed score of 0 (it is not passed through scoreFn).
        const placeholderFor = (anchor) => ({
            id: anchor,
            name: anchor,
            category: "runtime",
            usage: 0,
            verifiedRunContribution: 0,
            evidenceContribution: 0,
            onboardingCost: 0,
            explainabilityCost: 0,
            lineageRisk: 0,
            score: 0,
        });
        // One entry per anchor, in canonical order.
        // NOTE(review): when several candidates share an anchor id only the
        // first is used; the others end up in neither S nor H — confirm that
        // this is intended.
        const anchorsInOrder = MANDATORY_ANCHORS.map((anchor) => anchorCandidates.find((entry) => entry.id === anchor) ?? placeholderFor(anchor));
        // Best-scoring electives first.
        const ranked = [...electives].sort((left, right) => right.score - left.score);
        const freeSlots = Math.max(0, this.budget - anchorsInOrder.length);
        const publicSurface = Object.freeze(anchorsInOrder.concat(ranked.slice(0, freeSlots)));
        const hiddenSet = Object.freeze(ranked.slice(freeSlots));
        const { passed, violations } = enforceFlowInvariant(publicSurface);
        return Object.freeze({
            publicSurface,
            hiddenSet,
            mandatoryAnchors: MANDATORY_ANCHORS,
            budget: this.budget,
            invariantPassed: passed,
            invariantViolations: violations,
        });
    }
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Router V2 Scoring Engine — Bayesian-smoothed evidence calibration (Algorithm 6).
|
|
3
|
+
*
|
|
4
|
+
* Composite formula:
|
|
5
|
+
* 0.25*E + 0.15*conf + 0.20*cap + 0.15*mat + 0.10*lat + 0.10*cost
|
|
6
|
+
* - 0.15*pen - 0.10*blast
|
|
7
|
+
*/
|
|
8
|
+
import type { AgentRuntime } from "./agent-runtime.js";
|
|
9
|
+
import type { EvidenceHistoryEntry, NodeIntent, RuntimeScoreV2, RouterV2Options, RouterV2ScoringEngine, BlastRadiusParams } from "./contracts/router-v2.js";
|
|
10
|
+
/**
 * Creates a Router V2 scoring engine.
 *
 * @param options Optional engine configuration; may be omitted.
 * @param blastRadiusFn Optional callback that turns blast-radius parameters
 *   into a number. Presumably supplies the `blast` term of the composite
 *   formula — TODO confirm against the implementation.
 * @returns The scoring engine instance.
 */
export declare function createRouterV2ScoringEngine(options?: RouterV2Options, blastRadiusFn?: (params: BlastRadiusParams) => number): RouterV2ScoringEngine;
|
|
11
|
+
/**
 * Scores candidate runtimes for a node intent using evidence history.
 *
 * @param candidates Runtimes to be scored.
 * @param intent The node intent the runtimes are scored against.
 * @param history Evidence history entries made available to the scoring.
 * @param options Optional scoring configuration; may be omitted.
 * @returns An array of runtime scores. NOTE(review): presumably one entry
 *   per candidate — ordering and length are not visible from this
 *   declaration; confirm against the implementation.
 */
export declare function scoreRuntimes(candidates: AgentRuntime[], intent: NodeIntent, history: EvidenceHistoryEntry[], options?: RouterV2Options): RuntimeScoreV2[];
|