open-multi-agent-kit 0.78.1 → 0.78.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +37 -0
- package/MATURITY.md +4 -0
- package/README.md +70 -1
- package/dist/benchmark/contracts.d.ts +116 -0
- package/dist/benchmark/contracts.js +6 -0
- package/dist/benchmark/fixtures.d.ts +11 -0
- package/dist/benchmark/fixtures.js +121 -0
- package/dist/benchmark/harness.d.ts +13 -0
- package/dist/benchmark/harness.js +191 -0
- package/dist/benchmark/shadow-mode.d.ts +17 -0
- package/dist/benchmark/shadow-mode.js +96 -0
- package/dist/cli/register-spec-agent-goal-commands.js +45 -0
- package/dist/cli/release-promotion-gate.d.ts +14 -0
- package/dist/cli/release-promotion-gate.js +71 -0
- package/dist/cli/v2/release-commands.d.ts +29 -0
- package/dist/cli/v2/release-commands.js +95 -0
- package/dist/commands/chat/native-root-loop.js +14 -1
- package/dist/commands/chat/slash/commands/session.js +19 -1
- package/dist/commands/goal-interview.d.ts +18 -0
- package/dist/commands/goal-interview.js +396 -0
- package/dist/commands/merge.js +102 -56
- package/dist/contracts/interview.d.ts +106 -0
- package/dist/contracts/interview.js +9 -0
- package/dist/contracts/provider-health.d.ts +37 -0
- package/dist/contracts/provider-health.js +49 -1
- package/dist/evidence/evidence-trust-score.d.ts +101 -0
- package/dist/evidence/evidence-trust-score.js +408 -0
- package/dist/evidence/index.d.ts +6 -0
- package/dist/evidence/index.js +3 -0
- package/dist/evidence/proof-trust-cli.d.ts +8 -0
- package/dist/evidence/proof-trust-cli.js +27 -0
- package/dist/evidence/proof-trust.d.ts +14 -0
- package/dist/evidence/proof-trust.js +381 -0
- package/dist/evidence/regression-proof-matrix.d.ts +42 -0
- package/dist/evidence/regression-proof-matrix.js +72 -0
- package/dist/goal/intent-frame.d.ts +6 -0
- package/dist/goal/intent-frame.js +21 -9
- package/dist/goal/interview-assimilation.d.ts +13 -0
- package/dist/goal/interview-assimilation.js +383 -0
- package/dist/goal/interview-question-bank.d.ts +11 -0
- package/dist/goal/interview-question-bank.js +225 -0
- package/dist/goal/interview-scoring.d.ts +31 -0
- package/dist/goal/interview-scoring.js +187 -0
- package/dist/goal/interview-session.d.ts +25 -0
- package/dist/goal/interview-session.js +116 -0
- package/dist/input/input-envelope.d.ts +22 -0
- package/dist/input/input-envelope.js +1 -0
- package/dist/orchestration/merge-arbiter.d.ts +91 -0
- package/dist/orchestration/merge-arbiter.js +376 -0
- package/dist/providers/health.d.ts +3 -0
- package/dist/providers/health.js +46 -0
- package/dist/providers/index.d.ts +1 -0
- package/dist/providers/index.js +1 -0
- package/dist/providers/provider-health.d.ts +8 -1
- package/dist/providers/provider-health.js +39 -0
- package/dist/providers/provider-task-runner.js +31 -0
- package/dist/providers/provider.d.ts +2 -0
- package/dist/providers/router.js +87 -3
- package/dist/providers/types.d.ts +4 -0
- package/dist/runtime/advanced-control-loop.d.ts +60 -0
- package/dist/runtime/advanced-control-loop.js +136 -0
- package/dist/runtime/agent-runtime.d.ts +10 -0
- package/dist/runtime/blast-radius.d.ts +10 -0
- package/dist/runtime/blast-radius.js +14 -0
- package/dist/runtime/contracts/evidence.d.ts +87 -0
- package/dist/runtime/contracts/evidence.js +7 -0
- package/dist/runtime/contracts/router-v2.d.ts +44 -0
- package/dist/runtime/contracts/router-v2.js +4 -0
- package/dist/runtime/contracts/weakness-remediation.d.ts +67 -0
- package/dist/runtime/contracts/weakness-remediation.js +36 -0
- package/dist/runtime/kimi-api-runtime.js +59 -1
- package/dist/runtime/proof-bundle-trust.d.ts +74 -0
- package/dist/runtime/proof-bundle-trust.js +100 -0
- package/dist/runtime/provider-maturity-gate.d.ts +43 -0
- package/dist/runtime/provider-maturity-gate.js +129 -0
- package/dist/runtime/public-surface.d.ts +93 -0
- package/dist/runtime/public-surface.js +146 -0
- package/dist/runtime/router-v2-scoring.d.ts +11 -0
- package/dist/runtime/router-v2-scoring.js +151 -0
- package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
- package/dist/runtime/tool-dispatch-contracts.js +42 -2
- package/dist/runtime/weakness-remediation-index.d.ts +27 -0
- package/dist/runtime/weakness-remediation-index.js +37 -0
- package/dist/safety/enforcement-engine.d.ts +89 -0
- package/dist/safety/enforcement-engine.js +279 -0
- package/dist/safety/tool-authority-gate.d.ts +40 -0
- package/dist/safety/tool-authority-gate.js +92 -0
- package/dist/schema/evidence.schema.d.ts +2 -2
- package/dist/schema/proof-bundle.schema.d.ts +28 -28
- package/dist/util/clipboard-image.d.ts +49 -0
- package/dist/util/clipboard-image.js +263 -0
- package/docs/2026-06-09/critical-issues.md +20 -0
- package/docs/2026-06-09/improvements.md +14 -0
- package/docs/2026-06-09/init-checklist.md +25 -0
- package/docs/2026-06-09/plan.md +20 -0
- package/docs/benchmark-design.md +122 -0
- package/docs/github-organic-promotion.md +127 -0
- package/docs/native-root-runtime-algorithms.md +301 -0
- package/package.json +8 -4
- package/readmeasset/ASSET_INDEX.md +1 -0
- package/templates/skills/agents/omk-agent-reach-websearch/SKILL.md +55 -0
- package/templates/skills/kimi/omk-agent-reach-websearch/SKILL.md +55 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Contract: src/contracts/interview.ts
|
|
2
|
+
// Owner: Contract Worker (Deep Interview phase 0)
|
|
3
|
+
// OMK Deep Interview — uncertainty reducer for goal-driven agent runs.
|
|
4
|
+
//
|
|
5
|
+
// Read-only for downstream interview modules. These types are the shared
|
|
6
|
+
// language for question banking, scoring, assimilation, session building,
|
|
7
|
+
// and the `omk goal interview` / `omk goal refine` CLI commands.
|
|
8
|
+
// Version tag string for the interview schema (presumably stamped on interview payloads — confirm against consumers).
export const INTERVIEW_SCHEMA_VERSION = "omk.interview.v1";
|
|
9
|
+
// Version tag string for the interview-delta schema (presumably stamped on delta payloads — confirm against consumers).
export const INTERVIEW_DELTA_SCHEMA_VERSION = "omk.interview-delta.v1";
|
|
@@ -10,6 +10,43 @@
|
|
|
10
10
|
export type ProviderFailureKind = "none" | "runtime" | "auth" | "model" | "quota" | "policy" | "transient" | "unknown";
|
|
11
11
|
/** Authority level a provider holds for a given capability lane. */
|
|
12
12
|
export type ProviderAuthorityLevel = "none" | "advisory" | "direct" | "full";
|
|
13
|
+
/** Capability-vector state machine for a single provider dimension. */
|
|
14
|
+
export type ProviderCapabilityState = "missing" | "installed" | "auth_present" | "auth_valid" | "model_available" | "quota_available" | "sandbox_supported" | "tool_contract_verified" | "ready";
|
|
15
|
+
/** Ordinal ordering for capability states (higher = more mature). */
|
|
16
|
+
export declare const PROVIDER_CAPABILITY_ORDINAL: Readonly<Record<ProviderCapabilityState, number>>;
|
|
17
|
+
/** Provider health as a capability vector (Profiler v2). */
|
|
18
|
+
export interface ProviderHealthVector {
|
|
19
|
+
/** Provider id (e.g. "kimi", "deepseek", "codex"). */
|
|
20
|
+
provider: string;
|
|
21
|
+
/** Binary/runtime installation state. */
|
|
22
|
+
binary: ProviderCapabilityState;
|
|
23
|
+
/** Authentication state. */
|
|
24
|
+
auth: ProviderCapabilityState;
|
|
25
|
+
/** Model resolution state. */
|
|
26
|
+
model: ProviderCapabilityState;
|
|
27
|
+
/** Quota/balance state. */
|
|
28
|
+
quota: ProviderCapabilityState;
|
|
29
|
+
/** P50 latency in milliseconds (0 = unknown). */
|
|
30
|
+
latencyP50Ms: number;
|
|
31
|
+
/** P95 latency in milliseconds (0 = unknown). */
|
|
32
|
+
latencyP95Ms: number;
|
|
33
|
+
/** Whether the provider supports read operations. */
|
|
34
|
+
supportsRead: boolean;
|
|
35
|
+
/** Whether the provider supports write operations. */
|
|
36
|
+
supportsWrite: boolean;
|
|
37
|
+
/** Whether the provider supports shell execution. */
|
|
38
|
+
supportsShell: boolean;
|
|
39
|
+
/** Whether the provider supports sandboxed execution. */
|
|
40
|
+
supportsSandbox: boolean;
|
|
41
|
+
/** 7-day evidence pass rate [0, 1] (default 0.5 = no data). */
|
|
42
|
+
evidencePassRate7d: number;
|
|
43
|
+
/** Exponentially-weighted moving average of failures [0, 1] (0 = healthy). */
|
|
44
|
+
failureEwma: number;
|
|
45
|
+
}
|
|
46
|
+
/** Derive a backward-compatible `healthy` boolean from a capability vector. */
|
|
47
|
+
export declare function isHealthy(vector: ProviderHealthVector): boolean;
|
|
48
|
+
/** Convert the legacy {@link ProviderHealth} contract into a v2 capability vector. */
|
|
49
|
+
export declare function providerHealthToVector(health: ProviderHealth): ProviderHealthVector;
|
|
13
50
|
/**
|
|
14
51
|
* Normalized provider health snapshot.
|
|
15
52
|
*
|
|
@@ -6,4 +6,52 @@
|
|
|
6
6
|
* renaming any pre-existing keys. It never carries secret values — only
|
|
7
7
|
* boolean signals (e.g. `authOk`) and non-sensitive remediation hints.
|
|
8
8
|
*/
|
|
9
|
-
|
|
9
|
+
/** Ordinal ordering for capability states (higher = more mature). */
|
|
10
|
+
// Frozen so importers cannot mutate the shared lookup table at runtime; the
// matching type declaration already exposes it as `Readonly`.
export const PROVIDER_CAPABILITY_ORDINAL = Object.freeze({
    missing: 0,
    installed: 1,
    auth_present: 2,
    auth_valid: 3,
    model_available: 4,
    quota_available: 5,
    sandbox_supported: 6,
    tool_contract_verified: 7,
    ready: 8,
});
|
|
21
|
+
/** Derive a backward-compatible `healthy` boolean from a capability vector. */
|
|
22
|
+
export function isHealthy(vector) {
    // Healthy only when every one of the four capability dimensions reports "ready".
    const dimensions = ["binary", "auth", "model", "quota"];
    for (const dimension of dimensions) {
        if (vector[dimension] !== "ready") {
            return false;
        }
    }
    return true;
}
|
|
28
|
+
/** Convert the legacy {@link ProviderHealth} contract into a v2 capability vector. */
|
|
29
|
+
export function providerHealthToVector(health) {
    // Authority levels that grant a capability. Allow-lists are used instead of
    // `!== "none"` so that a missing or unrecognised authority value fails
    // closed (capability not granted) rather than open.
    const writeGrantingLevels = ["direct", "full"];
    const shellGrantingLevels = ["advisory", "direct", "full"];
    const binary = health.runtimeOk ? "ready" : "missing";
    // A failed auth check whose failure kind is "auth" means credentials exist
    // but were rejected; any other failure leaves the dimension "missing".
    let auth = "missing";
    if (health.authOk) {
        auth = "ready";
    }
    else if (health.failureKind === "auth") {
        auth = "auth_present";
    }
    const model = health.modelOk ? "ready" : "missing";
    // A quota failure maps to "auth_valid", the state this file uses for
    // "quota exhausted"; any other failure leaves the dimension "missing".
    let quota = "missing";
    if (health.quotaOk) {
        quota = "ready";
    }
    else if (health.failureKind === "quota") {
        quota = "auth_valid";
    }
    const noFailure = health.failureKind === "none";
    const shellGranted = shellGrantingLevels.includes(health.shellAuthority);
    return {
        provider: health.provider,
        binary,
        auth,
        model,
        quota,
        // The legacy contract carries no latency data; 0 means "unknown".
        latencyP50Ms: 0,
        latencyP95Ms: 0,
        supportsRead: true,
        supportsWrite: writeGrantingLevels.includes(health.writeAuthority),
        supportsShell: shellGranted,
        // Sandbox support is derived from the same shell authority signal.
        supportsSandbox: shellGranted,
        evidencePassRate7d: noFailure ? 1.0 : 0.5,
        failureEwma: noFailure ? 0 : 0.5,
    };
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evidence Trust Score (ETS) v2 — Algorithm 10
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
|
|
6
|
+
* → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
|
|
7
|
+
* → EvidenceTrustScore() → Pass | Warn | Fail
|
|
8
|
+
*
|
|
9
|
+
* Formula:
|
|
10
|
+
* ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
|
|
11
|
+
* + 0.15*provenance_integrity + 0.10*freshness
|
|
12
|
+
* - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
|
|
13
|
+
*/
|
|
14
|
+
import type { EvidenceItem, EvidenceKind } from "../runtime/contracts/evidence.js";
|
|
15
|
+
/** A claim extracted from agent output. */
|
|
16
|
+
export interface EtsClaim {
|
|
17
|
+
/** Identifier; `extractClaims` builds it as `claim-<category>-<index>`. */
readonly claimId: string;
|
|
18
|
+
/** The text matched in the output that produced this claim. */
readonly text: string;
|
|
19
|
+
/** Category of the pattern that matched. */
readonly category: EtsClaimCategory;
|
|
20
|
+
/** Confidence in the claim; `extractClaims` assigns a fixed 0.8. */
readonly confidence: number;
|
|
21
|
+
}
|
|
22
|
+
export type EtsClaimCategory = "test" | "build" | "typecheck" | "lint" | "behavioral" | "security" | "performance" | "docs";
|
|
23
|
+
/** Task type that produced the output. */
|
|
24
|
+
export type EtsTaskType = "feature" | "bugfix" | "refactor" | "docs" | "test" | "review" | "security" | "release";
|
|
25
|
+
/** Risk tier for the task. */
|
|
26
|
+
export type EtsRiskTier = "low" | "medium" | "high" | "critical";
|
|
27
|
+
/** Required evidence for a claim. */
|
|
28
|
+
export interface RequiredEvidenceItem {
|
|
29
|
+
readonly evidenceId: string;
|
|
30
|
+
readonly kind: EvidenceKind;
|
|
31
|
+
readonly description: string;
|
|
32
|
+
readonly minConfidence: number;
|
|
33
|
+
}
|
|
34
|
+
/** Metadata about a run artifact. */
|
|
35
|
+
export interface RunArtifactMeta {
|
|
36
|
+
/** Run identifier; counted by the provenance-integrity sub-score when non-empty. */
readonly runId: string;
|
|
37
|
+
/** Node identifier; counted by the provenance-integrity sub-score when non-empty. */
readonly nodeId?: string;
|
|
38
|
+
/** Provider name; counted by the provenance-integrity sub-score when non-empty. */
readonly provider?: string;
|
|
39
|
+
/** Model name; counted by the provenance-integrity sub-score when non-empty. */
readonly model?: string;
|
|
40
|
+
/** Working directory; counted by the provenance-integrity sub-score when non-empty. */
readonly cwd?: string;
|
|
41
|
+
/** Tree hash before the run; contributes 0.3 to reproducibility and counts towards provenance. */
readonly treeHashBefore?: string;
|
|
42
|
+
/** Tree hash after the run; contributes 0.3 to reproducibility and counts towards provenance. */
readonly treeHashAfter?: string;
|
|
43
|
+
/** Hash of the executed command; contributes 0.4 to reproducibility and counts towards provenance. */
readonly commandHash?: string;
|
|
44
|
+
/** Timestamp string for the run (presumably ISO-8601 — confirm against producers). */
readonly timestamp: string;
|
|
45
|
+
/** The command text (presumably the command that `commandHash` was derived from — confirm). */
readonly command?: string;
|
|
46
|
+
}
|
|
47
|
+
/** Collected evidence with provenance. */
|
|
48
|
+
export interface CollectedEvidence {
|
|
49
|
+
readonly items: readonly EvidenceItem[];
|
|
50
|
+
readonly meta: RunArtifactMeta;
|
|
51
|
+
}
|
|
52
|
+
/** Result of verifying required vs collected evidence. */
|
|
53
|
+
export interface EvidenceVerificationResult {
|
|
54
|
+
/** Required-evidence IDs matched by at least one collected item with verdict "pass". */
readonly satisfied: readonly string[];
|
|
55
|
+
/** Required-evidence IDs with no collected item of the same kind, sufficient confidence and a "pass"/"partial" verdict. */
readonly missing: readonly string[];
|
|
56
|
+
/** Required-evidence IDs matched only by collected items with verdict "partial". */
readonly partial: readonly string[];
|
|
57
|
+
}
|
|
58
|
+
/** ETS v2 result. */
|
|
59
|
+
export interface EtsV2Result {
|
|
60
|
+
/** Final trust score: weighted sub-scores minus penalties, rounded to 3 decimals and clamped to [0, 1]. */
readonly score: number;
|
|
61
|
+
/** Reproducibility sub-score (default weight 0.30), rounded to 3 decimals. */
readonly reproducibility: number;
|
|
62
|
+
/** Independence sub-score (default weight 0.25), rounded to 3 decimals. */
readonly independence: number;
|
|
63
|
+
/** Coverage-relevance sub-score (default weight 0.20). */
readonly coverageRelevance: number;
|
|
64
|
+
/** Provenance-integrity sub-score (default weight 0.15). */
readonly provenanceIntegrity: number;
|
|
65
|
+
/** Freshness sub-score (default weight 0.10). */
readonly freshness: number;
|
|
66
|
+
/** Penalty subtracted for self-reported or unverified evidence; computed with a cap of 0.3. */
readonly gamingPenalty: number;
|
|
67
|
+
/** Penalty subtracted for stale evidence items; computed as 0.05 per item with a cap of 0.2. */
readonly staleResultPenalty: number;
|
|
68
|
+
/** Penalty subtracted for missing required evidence; computed as 0.05 per item with a cap of 0.3. */
readonly unverifiableClaimPenalty: number;
|
|
69
|
+
/** Derived from `score`: "pass" at >= 0.75, "warn" at >= 0.50, otherwise "fail". */
readonly verdict: "pass" | "warn" | "fail";
|
|
70
|
+
/** Human-readable notes on low sub-scores, applied penalties and missing evidence. */
readonly reasons: readonly string[];
|
|
71
|
+
}
|
|
72
|
+
/** ETS v2 engine. */
|
|
73
|
+
export interface EtsV2Engine {
|
|
74
|
+
evaluate(params: EtsV2Params): Promise<EtsV2Result>;
|
|
75
|
+
}
|
|
76
|
+
/** Input parameters for ETS v2 evaluation. */
|
|
77
|
+
export interface EtsV2Params {
|
|
78
|
+
/** Agent output text that claims are extracted from. */
readonly output: string;
|
|
79
|
+
/** Task type; forwarded to `requiredEvidenceForClaim` for every extracted claim. */
readonly taskType: EtsTaskType;
|
|
80
|
+
/** Risk tier; drives the minimum evidence confidence and the staleness threshold. */
readonly risk: EtsRiskTier;
|
|
81
|
+
/** Collected evidence items plus run metadata to score. */
readonly runArtifacts: CollectedEvidence;
|
|
82
|
+
/** Optional file names; an evidence item without linked paths still counts as relevant when its description contains one of them. */
readonly dependencyGraphFiles?: readonly string[];
|
|
83
|
+
/** Reference time for freshness/staleness; takes precedence over the engine-level `now`. */
readonly now?: string;
|
|
84
|
+
}
|
|
85
|
+
declare const WEIGHTS: {
|
|
86
|
+
readonly reproducibility: 0.3;
|
|
87
|
+
readonly independence: 0.25;
|
|
88
|
+
readonly coverageRelevance: 0.2;
|
|
89
|
+
readonly provenanceIntegrity: 0.15;
|
|
90
|
+
readonly freshness: 0.1;
|
|
91
|
+
};
|
|
92
|
+
export declare function extractClaims(output: string): readonly EtsClaim[];
|
|
93
|
+
export declare function requiredEvidenceForClaim(claim: EtsClaim, taskType: EtsTaskType, risk: EtsRiskTier): readonly RequiredEvidenceItem[];
|
|
94
|
+
export declare function collectEvidenceFromRunDir(runDir: string, meta: RunArtifactMeta): Promise<CollectedEvidence>;
|
|
95
|
+
export declare function verifyEvidence(required: readonly RequiredEvidenceItem[], collected: CollectedEvidence): EvidenceVerificationResult;
|
|
96
|
+
export interface EtsV2EngineOptions {
|
|
97
|
+
/** Per-sub-score weight overrides, merged over the default weights. */
readonly customWeights?: Partial<typeof WEIGHTS>;
|
|
98
|
+
/** Reference time for the engine; when omitted the current time is captured once at engine creation, not per evaluation. */
readonly now?: string;
|
|
99
|
+
}
|
|
100
|
+
export declare function createEvidenceTrustScoreV2Engine(options?: EtsV2EngineOptions): EtsV2Engine;
|
|
101
|
+
export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore };
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evidence Trust Score (ETS) v2 — Algorithm 10
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
|
|
6
|
+
* → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
|
|
7
|
+
* → EvidenceTrustScore() → Pass | Warn | Fail
|
|
8
|
+
*
|
|
9
|
+
* Formula:
|
|
10
|
+
* ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
|
|
11
|
+
* + 0.15*provenance_integrity + 0.10*freshness
|
|
12
|
+
* - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
|
|
13
|
+
*/
|
|
14
|
+
import { readFile } from "node:fs/promises";
|
|
15
|
+
import { existsSync } from "node:fs";
|
|
16
|
+
import { join } from "node:path";
|
|
17
|
+
// ─── Constants ─────────────────────────────────────────────────────────────
|
|
18
|
+
// Default sub-score weights of the ETS formula. Frozen because callers
// customise them by spreading into a new object, never by mutating this one.
const WEIGHTS = Object.freeze({
    reproducibility: 0.30,
    independence: 0.25,
    coverageRelevance: 0.20,
    provenanceIntegrity: 0.15,
    freshness: 0.10,
});
// Age in hours beyond which an evidence item counts as stale, keyed by risk
// tier. Frozen so the shared thresholds cannot drift at runtime.
const STALE_HOURS_BY_RISK = Object.freeze({
    low: 72,
    medium: 48,
    high: 24,
    critical: 6,
});
|
|
31
|
+
const CLAIM_PATTERNS = [
|
|
32
|
+
{ category: "test", regex: /\b(tests?\s+pass(?:ed|es|ing)|test\s+coverage|all\s+tests?\s+(?:ok|green)|\bnpm\s+test|\bnode\s+--test)/i },
|
|
33
|
+
{ category: "build", regex: /\b(build\s+(?:ok|success|succeeded|pass(?:ed|es|ing))|npm\s+run\s+build|tsc\s+.*(?:no\s+error|success)|esbuild|vite\s+build)/i },
|
|
34
|
+
{ category: "typecheck", regex: /\b(typecheck\s+(?:ok|pass(?:ed|es|ing)|clean)|tsc\s+--noEmit|no\s+type\s+errors?)/i },
|
|
35
|
+
{ category: "lint", regex: /\b(lint\s+(?:ok|pass(?:ed|es|ing)|clean)|eslint.*(?:no\s+error|0\s+(?:problem|warning))|prettier.*check)/i },
|
|
36
|
+
{ category: "security", regex: /\b(secur(?:ity|e)\s+(?:ok|pass(?:ed|es|ing)|scan\s+(?:clean|passed))|secret.*scan|audit.*pass|vulnerability.*0)/i },
|
|
37
|
+
{ category: "performance", regex: /\b(performance\s+(?:ok|pass(?:ed|es|ing)|improved)|latency.*\d+ms|throughput)/i },
|
|
38
|
+
{ category: "docs", regex: /\b(docs?\s+(?:ok|pass(?:ed|es|ing)|updated)|readme.*updated|changelog.*updated)/i },
|
|
39
|
+
{ category: "behavioral", regex: /\b(fix(?:ed|es)\s+(?:bug|issue)|feature\s+(?:works?|implemented)|behavior\s+(?:correct|as\s+expected))/i },
|
|
40
|
+
];
|
|
41
|
+
// ─── Claim Extractor ───────────────────────────────────────────────────────
|
|
42
|
+
export function extractClaims(output) {
|
|
43
|
+
const claims = [];
|
|
44
|
+
const seen = new Set();
|
|
45
|
+
let claimIndex = 0;
|
|
46
|
+
for (const { category, regex } of CLAIM_PATTERNS) {
|
|
47
|
+
const matches = output.match(regex);
|
|
48
|
+
if (matches) {
|
|
49
|
+
for (const match of matches) {
|
|
50
|
+
const key = `${category}:${match.toLowerCase()}`;
|
|
51
|
+
if (seen.has(key))
|
|
52
|
+
continue;
|
|
53
|
+
seen.add(key);
|
|
54
|
+
claims.push({
|
|
55
|
+
claimId: `claim-${category}-${claimIndex++}`,
|
|
56
|
+
text: match,
|
|
57
|
+
category,
|
|
58
|
+
confidence: 0.8,
|
|
59
|
+
});
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
return Object.freeze(claims);
|
|
64
|
+
}
|
|
65
|
+
// ─── Required Evidence ─────────────────────────────────────────────────────
|
|
66
|
+
/**
 * List the evidence a claim needs before it can be considered verified.
 *
 * Every claim requires "command" and "trace" evidence plus the kinds mapped to
 * its category. Kinds are de-duplicated so that an overlap between the base
 * kinds and the category kinds (e.g. "trace" for performance claims) does not
 * create two identical requirements that would be double-counted downstream.
 * High and critical risk add an explicit audit-trail requirement; critical
 * risk additionally adds a review requirement.
 *
 * @param {{claimId: string, category: string}} claim - Claim to derive requirements for.
 * @param {string} taskType - Task type; accepted for interface compatibility but
 *   not consulted by the current rules.
 * @param {string} risk - Risk tier; selects the minimum confidence
 *   (critical 0.95, high 0.85, medium 0.75, anything else 0.6).
 * @returns {ReadonlyArray<{evidenceId: string, kind: string, description: string, minConfidence: number}>}
 *   Frozen list of required evidence items.
 */
export function requiredEvidenceForClaim(claim, taskType, risk) {
    const baseKinds = ["command", "trace"];
    const categoryKindMap = {
        test: ["test", "metric"],
        build: ["metric"],
        typecheck: ["metric"],
        lint: ["metric", "audit"],
        security: ["audit", "screenshot"],
        performance: ["metric", "trace"],
        docs: ["diff", "screenshot"],
        behavioral: ["diff", "test"],
    };
    // A Set keeps first-seen order while dropping repeated kinds.
    const kinds = [...new Set([...baseKinds, ...(categoryKindMap[claim.category] ?? [])])];
    // Loop-invariant: the threshold depends on the risk tier only.
    const minConfidence = risk === "critical" ? 0.95 : risk === "high" ? 0.85 : risk === "medium" ? 0.75 : 0.6;
    const required = kinds.map((kind, index) => ({
        evidenceId: `${claim.claimId}-req-${index}`,
        kind,
        description: `Required ${kind} evidence for ${claim.category} claim`,
        minConfidence,
    }));
    // High/critical risk adds extra audit trail
    if (risk === "high" || risk === "critical") {
        required.push({
            evidenceId: `${claim.claimId}-req-audit`,
            kind: "audit",
            description: `Audit trail for ${risk} risk task`,
            minConfidence: 0.9,
        });
    }
    // Critical tasks additionally require review evidence
    if (risk === "critical") {
        required.push({
            evidenceId: `${claim.claimId}-req-review`,
            kind: "review",
            description: `Review evidence for critical risk task`,
            minConfidence: 0.9,
        });
    }
    return Object.freeze(required);
}
|
|
108
|
+
// ─── Evidence Collector ────────────────────────────────────────────────────
|
|
109
|
+
/**
 * Load evidence items from `<runDir>/evidence.jsonl` on a best-effort basis.
 *
 * A missing file, a failed read, and any line that cannot be parsed or
 * converted are all skipped silently; whatever could be loaded is returned.
 *
 * @param {string} runDir - Directory expected to contain `evidence.jsonl`.
 * @param {object} meta - Run metadata, passed through unchanged.
 * @returns {Promise<{items: ReadonlyArray<object>, meta: object}>} Frozen item
 *   list (possibly empty) together with `meta`.
 */
export async function collectEvidenceFromRunDir(runDir, meta) {
    const loaded = [];
    const finish = () => ({ items: Object.freeze(loaded), meta });
    const jsonlPath = join(runDir, "evidence.jsonl");
    if (!existsSync(jsonlPath)) {
        return finish();
    }
    let raw;
    try {
        raw = await readFile(jsonlPath, "utf8");
    }
    catch {
        // Read failures are tolerated: the caller gets an empty collection.
        return finish();
    }
    for (const rawLine of raw.split(/\r?\n/)) {
        if (rawLine.trim().length === 0) {
            continue;
        }
        try {
            const record = JSON.parse(rawLine);
            if (!isObject(record)) {
                continue;
            }
            const converted = evidenceItemFromRecord(record);
            if (converted) {
                loaded.push(converted);
            }
        }
        catch {
            // Malformed lines are skipped; the remaining lines are still processed.
        }
    }
    return finish();
}
|
|
132
|
+
/**
 * Convert one parsed evidence record into an evidence item.
 *
 * Field fallbacks: id ← `evidenceId` | `id` | ""; source ← `source` | `nodeId`
 * | "unknown"; description ← `message` | `description` | ""; timestamp ←
 * `observedAt` | `timestamp` | current time; confidence ← numeric `confidence`
 * or 0.8; linked paths ← `linkedFilePaths` array, else `[path]`, else `[]`.
 *
 * @param {object} record - Parsed record.
 * @returns {object|null} The evidence item, or null when the record's `kind`
 *   or `status` is not recognised.
 */
function evidenceItemFromRecord(record) {
    const kind = parseEvidenceKind(record.kind);
    const verdict = parseEvidenceVerdict(record.status);
    if (!(kind && verdict)) {
        return null;
    }
    let linkedFilePaths = [];
    if (Array.isArray(record.linkedFilePaths)) {
        linkedFilePaths = record.linkedFilePaths;
    }
    else if (record.path) {
        linkedFilePaths = [String(record.path)];
    }
    const hasNumericConfidence = typeof record.confidence === "number";
    const hasMetadataObject = record.metadata && isObject(record.metadata);
    const item = {
        id: String(record.evidenceId ?? record.id ?? ""),
        kind,
        source: String(record.source ?? record.nodeId ?? "unknown"),
        description: String(record.message ?? record.description ?? ""),
        verdict,
        timestamp: String(record.observedAt ?? record.timestamp ?? new Date().toISOString()),
        confidence: hasNumericConfidence ? record.confidence : 0.8,
        linkedTraceId: record.linkedTraceId ? String(record.linkedTraceId) : undefined,
        linkedFilePaths,
        metadata: hasMetadataObject ? record.metadata : undefined,
    };
    return item;
}
|
|
154
|
+
/**
 * Validate an evidence kind.
 *
 * @param {unknown} value - Candidate kind.
 * @returns {string|null} `value` when it is one of the known kinds, else null.
 */
function parseEvidenceKind(value) {
    switch (value) {
        case "test":
        case "diff":
        case "command":
        case "screenshot":
        case "trace":
        case "metric":
        case "audit":
        case "review":
            return value;
        default:
            return null;
    }
}
|
|
158
|
+
/**
 * Normalise an evidence status into a verdict.
 *
 * Accepts the verdicts themselves ("pass", "fail", "partial", "pending") and
 * the status spellings "passed", "failed", "missing", "skipped" and "blocked".
 *
 * @param {unknown} value - Candidate status.
 * @returns {"pass"|"fail"|"partial"|"pending"|null} The verdict, or null when
 *   the value is not recognised.
 */
function parseEvidenceVerdict(value) {
    switch (value) {
        case "passed":
        case "pass":
            return "pass";
        case "failed":
        case "fail":
            return "fail";
        case "partial":
            return "partial";
        case "missing":
        case "skipped":
        case "blocked":
        case "pending":
            return "pending";
        default:
            return null;
    }
}
|
|
169
|
+
// ─── Evidence Verifier ─────────────────────────────────────────────────────
|
|
170
|
+
/**
 * Sort each required evidence item into satisfied / partial / missing.
 *
 * A collected item counts towards a requirement when it has the same kind and
 * a confidence at or above the requirement's minimum. One "pass" item makes
 * the requirement satisfied; only "partial" items make it partial; otherwise
 * it is missing.
 *
 * @param {ReadonlyArray<{evidenceId: string, kind: string, minConfidence: number}>} required
 * @param {{items: ReadonlyArray<{kind: string, confidence: number, verdict: string}>}} collected
 * @returns {{satisfied: ReadonlyArray<string>, missing: ReadonlyArray<string>, partial: ReadonlyArray<string>}}
 *   Frozen lists of requirement ids.
 */
export function verifyEvidence(required, collected) {
    const buckets = { satisfied: [], missing: [], partial: [] };
    for (const requirement of required) {
        let sawPass = false;
        let sawPartial = false;
        for (const candidate of collected.items) {
            if (candidate.kind !== requirement.kind) {
                continue;
            }
            // Written as a negated >= so a NaN confidence is rejected.
            if (!(candidate.confidence >= requirement.minConfidence)) {
                continue;
            }
            if (candidate.verdict === "pass") {
                sawPass = true;
            }
            else if (candidate.verdict === "partial") {
                sawPartial = true;
            }
        }
        const bucket = sawPass ? "satisfied" : sawPartial ? "partial" : "missing";
        buckets[bucket].push(requirement.evidenceId);
    }
    return {
        satisfied: Object.freeze(buckets.satisfied),
        missing: Object.freeze(buckets.missing),
        partial: Object.freeze(buckets.partial),
    };
}
|
|
190
|
+
// ─── Sub-score Computers ───────────────────────────────────────────────────
|
|
191
|
+
/**
 * Reproducibility sub-score: the weighted share of reproducibility fields that
 * are present in the run metadata.
 *
 * `commandHash` weighs 0.4; `treeHashBefore` and `treeHashAfter` weigh 0.3 each.
 *
 * @param {object} meta - Run metadata.
 * @returns {number} Earned weight divided by total weight.
 */
function computeReproducibility(meta) {
    const weightedFields = [
        ["commandHash", 0.4],
        ["treeHashBefore", 0.3],
        ["treeHashAfter", 0.3],
    ];
    let earned = 0;
    let possible = 0;
    for (const [field, weight] of weightedFields) {
        const value = meta[field];
        if (value && value.length > 0) {
            earned += weight;
        }
        possible += weight;
    }
    return possible > 0 ? earned / possible : 0;
}
|
|
211
|
+
/**
 * Independence sub-score: the fraction of evidence items that come from an
 * independent source ("runner", "command", "shell", "test", "ci", compared
 * case-insensitively) or whose kind is "test", "command" or "metric".
 *
 * @param {{items: ReadonlyArray<{source: string, kind: string}>}} collected
 * @returns {number} Fraction in [0, 1]; 0 when there are no items.
 */
function computeIndependence(collected) {
    const total = collected.items.length;
    if (total === 0) {
        return 0;
    }
    const trustedSources = new Set(["runner", "command", "shell", "test", "ci"]);
    const trustedKinds = new Set(["test", "command", "metric"]);
    const independent = collected.items.filter((item) => {
        // Normalise the source first, for every item, before any other check.
        const normalizedSource = item.source.toLowerCase();
        return trustedSources.has(normalizedSource) || trustedKinds.has(item.kind);
    });
    return independent.length / total;
}
|
|
227
|
+
/**
 * Coverage-relevance sub-score: the fraction of evidence items that are tied
 * to files.
 *
 * An item counts when it has at least one linked file path or, failing that,
 * when its description contains one of the supplied dependency-graph files.
 *
 * @param {{items: ReadonlyArray<{linkedFilePaths: ReadonlyArray<string>, description: string}>}} collected
 * @param {ReadonlyArray<string>} [dependencyGraphFiles] - Optional file names to look for.
 * @returns {number} Fraction in [0, 1]; 0 when there are no items.
 */
function computeCoverageRelevance(collected, dependencyGraphFiles) {
    const total = collected.items.length;
    if (total === 0) {
        return 0;
    }
    const hasGraph = Boolean(dependencyGraphFiles && dependencyGraphFiles.length > 0);
    let linked = 0;
    for (const item of collected.items) {
        if (item.linkedFilePaths.length > 0) {
            linked += 1;
            continue;
        }
        if (hasGraph && dependencyGraphFiles.some((file) => item.description.includes(file))) {
            linked += 1;
        }
    }
    return linked / total;
}
|
|
241
|
+
/**
 * Provenance-integrity sub-score: the fraction of provenance fields that are
 * present in the run metadata as non-empty strings.
 *
 * Eight fields are checked: runId, provider, model, cwd, treeHashBefore,
 * treeHashAfter, commandHash and nodeId.
 *
 * @param {object} meta - Run metadata.
 * @returns {number} Fraction in [0, 1].
 */
function computeProvenanceIntegrity(meta) {
    const trackedFields = [
        "runId",
        "provider",
        "model",
        "cwd",
        "treeHashBefore",
        "treeHashAfter",
        "commandHash",
        "nodeId",
    ];
    const populated = trackedFields.reduce((count, field) => {
        const value = meta[field];
        const isFilled = typeof value === "string" && value.length > 0;
        return isFilled ? count + 1 : count;
    }, 0);
    return populated / trackedFields.length;
}
|
|
262
|
+
/**
 * Freshness sub-score: the mean per-item freshness of the collected evidence.
 *
 * Per item:
 *  - age cannot be computed (unparseable item timestamp or reference time) → 0,
 *    so malformed timestamps cannot earn a full freshness score;
 *  - timestamp in the future, or age within the risk tier's stale threshold → 1;
 *  - otherwise a linear decay from 1 to 0 over the following two thresholds.
 *
 * @param {{items: ReadonlyArray<{timestamp: string}>}} collected - Evidence to score.
 * @param {string} risk - Risk tier; selects the stale threshold from `STALE_HOURS_BY_RISK`.
 * @param {string} nowIso - Reference time as a date string.
 * @returns {number} Score in [0, 1]; 0 when there are no items.
 */
function computeFreshness(collected, risk, nowIso) {
    if (collected.items.length === 0)
        return 0;
    const now = new Date(nowIso).getTime();
    const staleThresholdMs = STALE_HOURS_BY_RISK[risk] * 60 * 60 * 1000;
    let totalScore = 0;
    for (const item of collected.items) {
        const ageMs = now - new Date(item.timestamp).getTime();
        if (Number.isNaN(ageMs)) {
            // Unknown age is not evidence of freshness: contribute nothing.
            continue;
        }
        if (ageMs <= staleThresholdMs) {
            // Covers future timestamps (negative age) as well as recent items.
            totalScore += 1.0;
            continue;
        }
        // Linear decay over next 2x threshold
        const decayWindow = staleThresholdMs * 2;
        totalScore += Math.max(0, 1 - (ageMs - staleThresholdMs) / decayWindow);
    }
    return totalScore / collected.items.length;
}
|
|
287
|
+
// ─── Penalty Computers ─────────────────────────────────────────────────────
|
|
288
|
+
/**
 * Gaming penalty: deducted when claims are not backed by independent evidence.
 *
 * Three additive components, capped at 0.3 in total:
 *  - 0.15 when there are claims but no independently sourced evidence item;
 *  - 0.10 when fewer than half as many requirements are satisfied as there are claims;
 *  - 0.10 when evidence exists and every item is self-reported.
 *
 * An item is self-reported when its source is "agent", "self" or "unknown".
 * Sources are compared case-insensitively (as `computeIndependence` does), and
 * a missing source is treated as "unknown", so that "Agent" or an absent
 * source cannot escape the penalty.
 *
 * @param {ReadonlyArray<object>} claims - Extracted claims.
 * @param {{items: ReadonlyArray<{source: string}>}} collected - Collected evidence.
 * @param {{satisfied: ReadonlyArray<string>}} verification - Verification result.
 * @returns {number} Penalty in [0, 0.3].
 */
function computeGamingPenalty(claims, collected, verification) {
    const selfReportedSources = new Set(["agent", "self", "unknown"]);
    const isSelfReported = (item) => selfReportedSources.has(String(item.source ?? "unknown").trim().toLowerCase());
    const independentCount = collected.items.filter((item) => !isSelfReported(item)).length;
    let penalty = 0;
    // Penalty if there are claims but no independently-sourced evidence at all
    if (claims.length > 0 && independentCount === 0) {
        penalty += 0.15;
    }
    // Penalty if many claims but few verified
    const claimToVerifiedRatio = claims.length > 0 ? verification.satisfied.length / claims.length : 1;
    if (claimToVerifiedRatio < 0.5) {
        penalty += 0.1;
    }
    // Penalty if all evidence is self-reported (agent-sourced)
    if (collected.items.length > 0 && independentCount === 0) {
        penalty += 0.1;
    }
    return Math.min(penalty, 0.3);
}
|
|
312
|
+
/**
 * Stale-result penalty: 0.05 per evidence item older than the risk tier's
 * stale threshold, capped at 0.2.
 *
 * Items whose age cannot be computed are not counted as stale.
 *
 * @param {{items: ReadonlyArray<{timestamp: string}>}} collected - Evidence to inspect.
 * @param {string} risk - Risk tier; selects the threshold from `STALE_HOURS_BY_RISK`.
 * @param {string} nowIso - Reference time as a date string.
 * @returns {number} Penalty in [0, 0.2].
 */
function computeStaleResultPenalty(collected, risk, nowIso) {
    const referenceMs = new Date(nowIso).getTime();
    const thresholdMs = STALE_HOURS_BY_RISK[risk] * 60 * 60 * 1000;
    const staleItems = collected.items.filter((item) => {
        const observedMs = new Date(item.timestamp).getTime();
        return referenceMs - observedMs > thresholdMs;
    });
    return Math.min(staleItems.length * 0.05, 0.2);
}
|
|
325
|
+
/**
 * Unverifiable-claim penalty: 0.05 per missing required evidence item, capped
 * at 0.3. No penalty applies when there are no claims.
 *
 * @param {ReadonlyArray<object>} claims - Extracted claims.
 * @param {{missing: ReadonlyArray<string>}} verification - Verification result.
 * @returns {number} Penalty in [0, 0.3].
 */
function computeUnverifiableClaimPenalty(claims, verification) {
    const hasClaims = claims.length !== 0;
    return hasClaims ? Math.min(verification.missing.length * 0.05, 0.3) : 0;
}
|
|
331
|
+
// ─── Verdict ───────────────────────────────────────────────────────────────
|
|
332
|
+
/**
 * Map a trust score to a verdict.
 *
 * @param {number} score - Trust score.
 * @returns {"pass"|"warn"|"fail"} "pass" at >= 0.75, "warn" at >= 0.50, else "fail".
 */
function computeVerdict(score) {
    const tiers = [
        [0.75, "pass"],
        [0.50, "warn"],
    ];
    for (const [floor, label] of tiers) {
        if (score >= floor) {
            return label;
        }
    }
    return "fail";
}
|
|
339
|
+
/**
 * Creates the Evidence Trust Score v2 engine.
 *
 * The returned engine exposes a single async `evaluate(params)` method that:
 *   1. extracts claims from `params.output`,
 *   2. collects the evidence required for every claim,
 *   3. verifies that evidence against `params.runArtifacts`,
 *   4. computes five component scores and three penalties, and
 *   5. combines them into a weighted score clamped to [0, 1] and rounded to
 *      three decimals, together with a verdict and a frozen list of reasons.
 *
 * @param {object} [options] - Engine options.
 * @param {object} [options.customWeights] - Weight overrides spread over the
 *   module-level WEIGHTS (later keys win).
 * @param {string} [options.now] - Fixed reference time used when an
 *   evaluation does not supply `params.now`; expected in the same form as the
 *   ISO-8601 default.
 * @returns {{ evaluate: (params: object) => Promise<object> }} The engine.
 */
export function createEvidenceTrustScoreV2Engine(options) {
    const weights = { ...WEIGHTS, ...options?.customWeights };
    // All reported numbers are rounded to three decimal places.
    const roundToThousandths = (value) => Math.round(value * 1000) / 1000;
    return {
        async evaluate(params) {
            // Resolve the reference time per evaluation. Capturing the wall clock
            // once at engine creation would make a long-lived engine judge
            // freshness and staleness against an increasingly outdated "now".
            const now = params.now ?? options?.now ?? new Date().toISOString();
            const claims = extractClaims(params.output);
            const allRequired = [];
            for (const claim of claims) {
                allRequired.push(...requiredEvidenceForClaim(claim, params.taskType, params.risk));
            }
            const verification = verifyEvidence(allRequired, params.runArtifacts);
            const reproducibility = computeReproducibility(params.runArtifacts.meta);
            const independence = computeIndependence(params.runArtifacts);
            const coverageRelevance = computeCoverageRelevance(params.runArtifacts, params.dependencyGraphFiles);
            const provenanceIntegrity = computeProvenanceIntegrity(params.runArtifacts.meta);
            const freshness = computeFreshness(params.runArtifacts, params.risk, now);
            const gamingPenalty = computeGamingPenalty(claims, params.runArtifacts, verification);
            const staleResultPenalty = computeStaleResultPenalty(params.runArtifacts, params.risk, now);
            const unverifiableClaimPenalty = computeUnverifiableClaimPenalty(claims, verification);
            // Weighted sum of the components minus the (unweighted) penalties.
            const rawScore = weights.reproducibility * reproducibility +
                weights.independence * independence +
                weights.coverageRelevance * coverageRelevance +
                weights.provenanceIntegrity * provenanceIntegrity +
                weights.freshness * freshness -
                gamingPenalty -
                staleResultPenalty -
                unverifiableClaimPenalty;
            const score = Math.max(0, Math.min(1, roundToThousandths(rawScore)));
            const reasons = [];
            // Components are flagged when they fall below 0.5.
            const componentChecks = [
                ["reproducibility", reproducibility],
                ["independence", independence],
                ["coverage_relevance", coverageRelevance],
                ["provenance_integrity", provenanceIntegrity],
                ["freshness", freshness],
            ];
            for (const [label, value] of componentChecks) {
                if (value < 0.5) {
                    reasons.push(`${label} below 0.5`);
                }
            }
            // Penalties are reported whenever they are non-zero.
            const penaltyChecks = [
                ["gaming_penalty", gamingPenalty],
                ["stale_result_penalty", staleResultPenalty],
                ["unverifiable_claim_penalty", unverifiableClaimPenalty],
            ];
            for (const [label, value] of penaltyChecks) {
                if (value > 0) {
                    reasons.push(`${label}=${value.toFixed(3)}`);
                }
            }
            if (verification.missing.length > 0) {
                reasons.push(`missing evidence: ${verification.missing.length} items`);
            }
            const verdict = computeVerdict(score);
            return {
                score,
                reproducibility: roundToThousandths(reproducibility),
                independence: roundToThousandths(independence),
                coverageRelevance: roundToThousandths(coverageRelevance),
                provenanceIntegrity: roundToThousandths(provenanceIntegrity),
                freshness: roundToThousandths(freshness),
                gamingPenalty: roundToThousandths(gamingPenalty),
                staleResultPenalty: roundToThousandths(staleResultPenalty),
                unverifiableClaimPenalty: roundToThousandths(unverifiableClaimPenalty),
                verdict,
                reasons: Object.freeze(reasons),
            };
        },
    };
}
|
|
403
|
+
// ─── Helpers ───────────────────────────────────────────────────────────────
|
|
404
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {*} value - Any value.
 * @returns {boolean} True when `typeof value` is "object" and the value is
 *   neither null nor an array.
 */
function isObject(value) {
    if (value === null) {
        return false;
    }
    if (typeof value !== "object") {
        return false;
    }
    return !Array.isArray(value);
}
|
|
407
|
+
// ─── Backward-compat: re-export as EvidenceTrustScore for integration ──────
|
|
408
|
+
export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore };
|
package/dist/evidence/index.d.ts
CHANGED
|
@@ -13,3 +13,9 @@ export { decideRepair } from "../orchestration/repair-policy.js";
|
|
|
13
13
|
export type { RepairContext } from "../orchestration/repair-policy.js";
|
|
14
14
|
export type { DecisionTraceStore } from "./decision-trace.js";
|
|
15
15
|
export { createDecisionTraceStore } from "./decision-trace.js";
|
|
16
|
+
export type { ProofTrustMvpEngine, ProofTrustResult } from "./proof-trust.js";
|
|
17
|
+
export { createProofTrustMvpEngine } from "./proof-trust.js";
|
|
18
|
+
export type { EtsClaim, EtsClaimCategory, EtsTaskType, EtsRiskTier, RequiredEvidenceItem, RunArtifactMeta, CollectedEvidence, EvidenceVerificationResult, EtsV2Result, EtsV2Engine, EtsV2Params, EtsV2EngineOptions, } from "./evidence-trust-score.js";
|
|
19
|
+
export { extractClaims, requiredEvidenceForClaim, collectEvidenceFromRunDir, verifyEvidence, createEvidenceTrustScoreV2Engine, createEvidenceTrustScore, } from "./evidence-trust-score.js";
|
|
20
|
+
export type { AlgorithmSpec, ReleaseCandidate, RegressionProofMatrixResult, RegressionProofMatrixEngine, RegressionProofMatrixOptions, } from "./regression-proof-matrix.js";
|
|
21
|
+
export { createRegressionProofMatrixEngine } from "./regression-proof-matrix.js";
|
package/dist/evidence/index.js
CHANGED
|
@@ -5,3 +5,6 @@ export { createDiagnosisEngine } from "./diagnosis.js";
|
|
|
5
5
|
export { createRunTraceStore } from "./run-trace.js";
|
|
6
6
|
export { decideRepair } from "../orchestration/repair-policy.js";
|
|
7
7
|
export { createDecisionTraceStore } from "./decision-trace.js";
|
|
8
|
+
export { createProofTrustMvpEngine } from "./proof-trust.js";
|
|
9
|
+
export { extractClaims, requiredEvidenceForClaim, collectEvidenceFromRunDir, verifyEvidence, createEvidenceTrustScoreV2Engine, createEvidenceTrustScore, } from "./evidence-trust-score.js";
|
|
10
|
+
export { createRegressionProofMatrixEngine } from "./regression-proof-matrix.js";
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Proof Trust CLI — thin wrapper around ProofTrustMvpEngine.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* node dist/evidence/proof-trust-cli.js <runDir> <bundlePath>
|
|
7
|
+
*/
|
|
8
|
+
import { readFile } from "node:fs/promises";
|
|
9
|
+
import { createProofTrustMvpEngine } from "./proof-trust.js";
|
|
10
|
+
/**
 * CLI entry point: evaluates a proof bundle and prints the result as JSON.
 *
 * Reads `<runDir>` and `<bundlePath>` from the command line, parses the
 * bundle file as JSON, passes both to the proof-trust engine's `evaluate`,
 * and writes the pretty-printed result to stdout.
 *
 * The exit status is 1 when `<bundlePath>` is missing or when the result
 * lists any missing fields, otherwise 0. It is set through
 * `process.exitCode` rather than `process.exit()` so that pending stdout and
 * stderr writes are not cut off when the process ends.
 *
 * @returns {Promise<void>} Resolves once the result has been handed to stdout.
 */
async function main() {
    // The runDir default only applies when no arguments are given, in which
    // case the missing bundlePath triggers the usage error below anyway.
    const [runDir = ".omk/runs", bundlePath] = process.argv.slice(2);
    if (!bundlePath) {
        console.error("Usage: proof-trust-cli <runDir> <bundlePath>");
        process.exitCode = 1;
        return;
    }
    const bundleText = await readFile(bundlePath, "utf8");
    const bundle = JSON.parse(bundleText);
    const engine = createProofTrustMvpEngine();
    const result = await engine.evaluate(runDir, bundle);
    console.log(JSON.stringify(result, null, 2));
    // Non-zero status signals that the bundle is incomplete.
    process.exitCode = result.missingFields.length > 0 ? 1 : 0;
}
|
|
24
|
+
/**
 * Last-resort handler: prints the failure to stderr and exits with status 1.
 *
 * @param {unknown} failure - The value main() rejected with.
 */
function reportFatalAndExit(failure) {
    console.error(failure);
    process.exit(1);
}
// Start the CLI; any rejection from main() ends the process with a failure status.
main().catch(reportFatalAndExit);
|