npm - open-multi-agent-kit - Versions diffs - 0.78.2 → 0.78.3 - Mend

open-multi-agent-kit 0.78.2 → 0.78.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/CHANGELOG.md +27 -2
package/dist/benchmark/contracts.d.ts +116 -0
package/dist/benchmark/contracts.js +6 -0
package/dist/benchmark/fixtures.d.ts +11 -0
package/dist/benchmark/fixtures.js +121 -0
package/dist/benchmark/harness.d.ts +13 -0
package/dist/benchmark/harness.js +191 -0
package/dist/benchmark/shadow-mode.d.ts +17 -0
package/dist/benchmark/shadow-mode.js +96 -0
package/dist/commands/merge.js +102 -56
package/dist/contracts/provider-health.d.ts +37 -0
package/dist/contracts/provider-health.js +49 -1
package/dist/evidence/evidence-trust-score.d.ts +101 -0
package/dist/evidence/evidence-trust-score.js +408 -0
package/dist/evidence/index.d.ts +2 -0
package/dist/evidence/index.js +1 -0
package/dist/orchestration/merge-arbiter.d.ts +91 -0
package/dist/orchestration/merge-arbiter.js +376 -0
package/dist/providers/health.d.ts +3 -0
package/dist/providers/health.js +46 -0
package/dist/providers/index.d.ts +1 -0
package/dist/providers/index.js +1 -0
package/dist/providers/provider-health.d.ts +8 -1
package/dist/providers/provider-health.js +39 -0
package/dist/providers/provider-task-runner.js +31 -0
package/dist/providers/provider.d.ts +2 -0
package/dist/providers/router.js +87 -3
package/dist/providers/types.d.ts +4 -0
package/dist/runtime/provider-maturity-gate.d.ts +2 -0
package/dist/runtime/provider-maturity-gate.js +28 -0
package/dist/runtime/tool-dispatch-contracts.d.ts +24 -3
package/dist/runtime/tool-dispatch-contracts.js +42 -2
package/dist/runtime/weakness-remediation-index.d.ts +1 -1
package/dist/runtime/weakness-remediation-index.js +1 -1
package/dist/safety/enforcement-engine.d.ts +89 -0
package/dist/safety/enforcement-engine.js +279 -0
package/dist/safety/tool-authority-gate.d.ts +40 -0
package/dist/safety/tool-authority-gate.js +92 -0
package/dist/schema/evidence.schema.d.ts +2 -2
package/dist/schema/proof-bundle.schema.d.ts +2 -2
package/docs/benchmark-design.md +122 -0
package/package.json +5 -2

package/dist/commands/merge.js CHANGED Viewed

@@ -11,6 +11,7 @@ import { getOmkResourceSettings } from "../util/resource-profile.js";
 import { defaultScopedRoleAgentFile, writeScopedAgentFile } from "../util/scoped-agent-file.js";
 import { createOmkJsonEnvelope } from "../util/json-envelope.js";
 import { emitJson } from "../util/cli-contract.js";
+import { runMergeArbiter, } from "../orchestration/merge-arbiter.js";
 /**
  * JSON path for `omk merge --json`.
  * Read-only preview: resolves the run, collects worktree diffs (git diff +
@@ -138,65 +139,110 @@ export async function mergeCommand(options) {
     if (dryRun)
         console.log(style.orange("🟡 DRY RUN — no changes will be applied"));
     console.log("");
-    // ── 1. Collect diffs from all worktrees ──
-    const workers = [];
-    for (const name of workerNames) {
-        const wtPath = join(worktreesDir, name);
-        const diffResult = await runShell("git", ["-C", wtPath, "diff", currentBranch], { timeout: 15000 });
-        if (diffResult.failed || !diffResult.stdout.trim()) {
-            console.log(style.gray(`  ${name}: no changes`));
-            continue;
-        }
-        const diff = diffResult.stdout;
-        const diffLines = diff.split("\n").length;
-        // Check apply-ability
-        const applyCheck = await runShell("git", ["apply", "--check"], {
-            cwd: root,
-            input: diff,
-            timeout: 15000,
+    let report;
+    let winner = null;
+    if (strategy === "arbiter") {
+        // ── Arbiter path ──
+        console.log(style.purple("Running merge arbiter..."));
+        const config = await readTextFile(join(root, ".omk", "config.toml"), "");
+        const arbiterResult = await runMergeArbiter(worktreesDir, currentBranch, root, config, {
+            threshold: 0.6,
+            testTimeoutMs: 120_000,
         });
-        const canApply = !applyCheck.failed;
-        workers.push({ name, path: wtPath, diff, diffLines, canApply });
-        console.log(`  ${style.purpleBold(name)} ${canApply ? style.mint("(clean)") : style.pink("(conflicts)")} ${style.gray(`${diffLines} lines`)}`);
-    }
-    if (workers.length === 0) {
-        console.log(status.warn("No worker changes to merge."));
-        return;
-    }
-    // ── 2. Reviewer scoring ──
-    console.log("");
-    console.log(style.purple("Scoring diffs with reviewer..."));
-    for (const w of workers) {
-        const score = await scoreDiff(w.diff, w.name);
-        w.reviewScore = score.score;
-        w.reviewReason = score.reason;
-        const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
-        console.log(`  ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
-    }
-    // ── 3. Test verification in worktrees ──
-    console.log("");
-    console.log(style.purple("Running tests in worktrees..."));
-    for (const w of workers) {
-        const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
-            cwd: w.path,
-            timeout: 120_000,
+        // Map arbiter candidates back to WorkerDiff for reporting
+        const arbiterWorkers = arbiterResult.trace.steps
+            .filter((s) => s.step === "evidence-suite" || s.step === "score")
+            .map((s) => {
+            return {
+                name: s.candidateId.replace("candidate-", ""),
+                path: "",
+                diff: "",
+                diffLines: 0,
+                canApply: s.detail.includes("apply=true"),
+                reviewScore: 50,
+                reviewReason: s.detail,
+                testsPassed: s.detail.includes("tests=true"),
+            };
         });
-        w.testsPassed = !testResult.failed;
-        console.log(`  ${w.name}: ${w.testsPassed ? style.mint("tests passed") : style.pink("tests failed")}`);
+        // De-duplicate by name
+        const workerMap = new Map();
+        for (const w of arbiterWorkers)
+            workerMap.set(w.name, w);
+        report = {
+            winner: arbiterResult.winner?.name ?? null,
+            reason: arbiterResult.rationale.summary,
+            conflicts: arbiterResult.rationale.conflicts,
+            filesApplied: 0,
+            dryRun,
+            workers: [...workerMap.values()],
+        };
+        if (arbiterResult.requiresHumanApproval) {
+            console.log(status.error(arbiterResult.rationale.humanApprovalReason ?? "No candidate meets threshold — human approval required."));
+            printReport(report);
+            process.exit(1);
+        }
+        winner = arbiterResult.winner ? { name: arbiterResult.winner.name, path: arbiterResult.winner.path, diff: arbiterResult.winner.diff, diffLines: arbiterResult.winner.diffLines, canApply: arbiterResult.winner.canApply, reviewScore: arbiterResult.winner.evidence.reviewerScore, reviewReason: arbiterResult.winner.evidence.reviewerReason, testsPassed: arbiterResult.winner.evidence.testsPassed } : null;
+    }
+    else {
+        // ── 1. Collect diffs from all worktrees ──
+        const workers = [];
+        for (const name of workerNames) {
+            const wtPath = join(worktreesDir, name);
+            const diffResult = await runShell("git", ["-C", wtPath, "diff", currentBranch], { timeout: 15000 });
+            if (diffResult.failed || !diffResult.stdout.trim()) {
+                console.log(style.gray(`  ${name}: no changes`));
+                continue;
+            }
+            const diff = diffResult.stdout;
+            const diffLines = diff.split("\n").length;
+            // Check apply-ability
+            const applyCheck = await runShell("git", ["apply", "--check"], {
+                cwd: root,
+                input: diff,
+                timeout: 15000,
+            });
+            const canApply = !applyCheck.failed;
+            workers.push({ name, path: wtPath, diff, diffLines, canApply });
+            console.log(`  ${style.purpleBold(name)} ${canApply ? style.mint("(clean)") : style.pink("(conflicts)")} ${style.gray(`${diffLines} lines`)}`);
+        }
+        if (workers.length === 0) {
+            console.log(status.warn("No worker changes to merge."));
+            return;
+        }
+        // ── 2. Reviewer scoring ──
+        console.log("");
+        console.log(style.purple("Scoring diffs with reviewer..."));
+        for (const w of workers) {
+            const score = await scoreDiff(w.diff, w.name);
+            w.reviewScore = score.score;
+            w.reviewReason = score.reason;
+            const color = score.score >= 80 ? style.mint : score.score >= 50 ? style.orange : style.pink;
+            console.log(`  ${w.name}: ${color(`${score.score}/100`)} ${style.gray(score.reason)}`);
+        }
+        // ── 3. Test verification in worktrees ──
+        console.log("");
+        console.log(style.purple("Running tests in worktrees..."));
+        for (const w of workers) {
+            const testResult = await runShell("sh", ["-c", "npm test 2>/dev/null || pnpm test 2>/dev/null || yarn test 2>/dev/null || true"], {
+                cwd: w.path,
+                timeout: 120_000,
+            });
+            w.testsPassed = !testResult.failed;
+            console.log(`  ${w.name}: ${w.testsPassed ? style.mint("tests passed") : style.pink("tests failed")}`);
+        }
+        // ── 4. Select winner ──
+        console.log("");
+        console.log(style.purple("Selecting winner..."));
+        winner = selectWinner(workers, strategy);
+        report = {
+            winner: winner?.name ?? null,
+            reason: winner?.reviewReason ?? "No suitable candidate",
+            conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
+            filesApplied: 0,
+            dryRun,
+            workers,
+        };
     }
-    // ── 4. Select winner ──
-    console.log("");
-    console.log(style.purple("Selecting winner..."));
-    const winner = selectWinner(workers, strategy);
-    // ── 5. Apply or preview ──
-    const report = {
-        winner: winner?.name ?? null,
-        reason: winner?.reviewReason ?? "No suitable candidate",
-        conflicts: workers.filter((w) => !w.canApply).map((w) => w.name),
-        filesApplied: 0,
-        dryRun,
-        workers,
-    };
     if (!winner) {
         console.log(status.error("No worker diff can be applied cleanly."));
         printReport(report);

package/dist/contracts/provider-health.d.ts CHANGED Viewed

@@ -10,6 +10,43 @@
 export type ProviderFailureKind = "none" | "runtime" | "auth" | "model" | "quota" | "policy" | "transient" | "unknown";
 /** Authority level a provider holds for a given capability lane. */
 export type ProviderAuthorityLevel = "none" | "advisory" | "direct" | "full";
+/**
+ * Capability-vector state machine for a single provider dimension.
+ *
+ * Members are listed in ascending maturity, matching the ranks that
+ * `PROVIDER_CAPABILITY_ORDINAL` assigns (`"missing"` lowest, `"ready"` highest).
+ */
+export type ProviderCapabilityState = "missing" | "installed" | "auth_present" | "auth_valid" | "model_available" | "quota_available" | "sandbox_supported" | "tool_contract_verified" | "ready";
+/**
+ * Ordinal ordering for capability states (higher = more mature).
+ *
+ * Typed as a record keyed by every {@link ProviderCapabilityState} member, so
+ * each state has a numeric rank. `Readonly` is a compile-time constraint only.
+ */
+export declare const PROVIDER_CAPABILITY_ORDINAL: Readonly<Record<ProviderCapabilityState, number>>;
+/**
+ * Provider health as a capability vector (Profiler v2).
+ *
+ * Four state dimensions (`binary`, `auth`, `model`, `quota`) share the
+ * {@link ProviderCapabilityState} type; the remaining fields are two latency
+ * figures, four capability flags and two evidence/failure metrics.
+ */
+export interface ProviderHealthVector {
+    /** Provider id (e.g. "kimi", "deepseek", "codex"). */
+    provider: string;
+    /** Binary/runtime installation state. */
+    binary: ProviderCapabilityState;
+    /** Authentication state. */
+    auth: ProviderCapabilityState;
+    /** Model resolution state. */
+    model: ProviderCapabilityState;
+    /** Quota/balance state. */
+    quota: ProviderCapabilityState;
+    /** P50 latency in milliseconds (0 = unknown). */
+    latencyP50Ms: number;
+    /** P95 latency in milliseconds (0 = unknown). */
+    latencyP95Ms: number;
+    /** Whether the provider supports read operations. */
+    supportsRead: boolean;
+    /** Whether the provider supports write operations. */
+    supportsWrite: boolean;
+    /** Whether the provider supports shell execution. */
+    supportsShell: boolean;
+    /** Whether the provider supports sandboxed execution. */
+    supportsSandbox: boolean;
+    /** 7-day evidence pass rate [0, 1] (default 0.5 = no data). */
+    evidencePassRate7d: number;
+    /** Exponentially-weighted moving average of failures [0, 1] (0 = healthy). */
+    failureEwma: number;
+}
+/**
+ * Derive a backward-compatible `healthy` boolean from a capability vector.
+ *
+ * @param vector - Capability vector to evaluate.
+ * @returns `true` only when the `binary`, `auth`, `model` and `quota`
+ *   dimensions are all `"ready"` (per the shipped implementation).
+ */
+export declare function isHealthy(vector: ProviderHealthVector): boolean;
+/**
+ * Convert the legacy {@link ProviderHealth} contract into a v2 capability vector.
+ *
+ * @param health - Legacy health snapshot to translate.
+ * @returns A fully populated {@link ProviderHealthVector}. The shipped
+ *   implementation sets both latency fields to `0`, which the interface
+ *   documents as "unknown".
+ */
+export declare function providerHealthToVector(health: ProviderHealth): ProviderHealthVector;
 /**
  * Normalized provider health snapshot.
  *

package/dist/contracts/provider-health.js CHANGED Viewed

@@ -6,4 +6,52 @@
  * renaming any pre-existing keys. It never carries secret values — only
  * boolean signals (e.g. `authOk`) and non-sensitive remediation hints.
  */
-export {};
+/**
+ * Ordinal ordering for capability states (higher = more mature).
+ *
+ * Ranks are contiguous integers from 0 (`missing`) to 8 (`ready`).
+ * NOTE(review): `isHealthy` and `providerHealthToVector` in this module compare
+ * state strings directly and never read this table.
+ */
+export const PROVIDER_CAPABILITY_ORDINAL = {
+    missing: 0,
+    installed: 1,
+    auth_present: 2,
+    auth_valid: 3,
+    model_available: 4,
+    quota_available: 5,
+    sandbox_supported: 6,
+    tool_contract_verified: 7,
+    ready: 8,
+};
+/**
+ * Derive a backward-compatible `healthy` boolean from a capability vector.
+ *
+ * Only the four state dimensions are consulted; latency, the `supports*`
+ * flags, `evidencePassRate7d` and `failureEwma` do not affect the result.
+ *
+ * @param vector - Capability vector to evaluate.
+ * @returns `true` only when `binary`, `auth`, `model` and `quota` are all
+ *   exactly `"ready"`; any other state on any of them yields `false`.
+ */
+export function isHealthy(vector) {
+    return (vector.binary === "ready" &&
+        vector.auth === "ready" &&
+        vector.model === "ready" &&
+        vector.quota === "ready");
+}
+/**
+ * Convert the legacy {@link ProviderHealth} contract into a v2 capability vector.
+ *
+ * State mapping:
+ * - `binary`: `"ready"` when `runtimeOk` is truthy, otherwise `"missing"`.
+ * - `auth`: `"ready"` when `authOk` is truthy; otherwise `"auth_present"` if
+ *   `failureKind` is `"auth"`, else `"missing"`.
+ * - `model`: `"ready"` when `modelOk` is truthy, otherwise `"missing"`.
+ * - `quota`: `"ready"` when `quotaOk` is truthy; otherwise `"auth_valid"` if
+ *   `failureKind` is `"quota"`, else `"missing"`.
+ *
+ * Both latency fields are set to 0 and `supportsRead` is always `true`.
+ *
+ * @param health - Legacy snapshot; the fields read are `provider`, `runtimeOk`,
+ *   `authOk`, `modelOk`, `quotaOk`, `failureKind`, `writeAuthority` and
+ *   `shellAuthority`.
+ * @returns A newly built vector object; `health` itself is not modified.
+ */
+export function providerHealthToVector(health) {
+    const binary = health.runtimeOk ? "ready" : "missing";
+    const auth = health.authOk
+        ? "ready"
+        : health.failureKind === "auth"
+            ? "auth_present"
+            : "missing";
+    const model = health.modelOk ? "ready" : "missing";
+    const quota = health.quotaOk
+        ? "ready"
+        : health.failureKind === "quota"
+            ? "auth_valid" // NOTE(review): an auth-named state on the quota dimension — confirm this is the intended marker for a quota failure
+            : "missing";
+    return {
+        provider: health.provider,
+        binary,
+        auth,
+        model,
+        quota,
+        latencyP50Ms: 0,
+        latencyP95Ms: 0,
+        supportsRead: true,
+        supportsWrite: health.writeAuthority !== "none" && health.writeAuthority !== "advisory", // advisory-only authority does not count as write support
+        supportsShell: health.shellAuthority !== "none", // unlike write, advisory shell authority counts here
+        supportsSandbox: health.shellAuthority !== "none", // NOTE(review): identical test to supportsShell — confirm sandbox support should follow shell authority
+        evidencePassRate7d: health.failureKind === "none" ? 1.0 : 0.5, // 0.5 is the interface's documented "no data" default
+        failureEwma: health.failureKind === "none" ? 0 : 0.5, // 0.5 for every failureKind other than "none"
+    };
+}

package/dist/evidence/evidence-trust-score.d.ts ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * Evidence Trust Score (ETS) v2 — Algorithm 10
+ *
+ * Pipeline:
+ *   ClaimExtractor(output) → RequiredEvidence(claim, taskType, risk)
+ *   → EvidenceCollector(runArtifacts) → EvidenceVerifier(required, collected)
+ *   → EvidenceTrustScore() → Pass | Warn | Fail
+ *
+ * Formula:
+ *   ETS = 0.30*reproducibility + 0.25*independence + 0.20*coverage_relevance
+ *       + 0.15*provenance_integrity + 0.10*freshness
+ *       - gaming_penalty - stale_result_penalty - unverifiable_claim_penalty
+ */
+import type { EvidenceItem, EvidenceKind } from "../runtime/contracts/evidence.js";
+/**
+ * A claim extracted from agent output.
+ *
+ * `extractClaims` returns a read-only array of these. NOTE(review): the range
+ * of `confidence` is not stated in this declaration — presumably [0, 1]; confirm.
+ */
+export interface EtsClaim {
+    readonly claimId: string;
+    readonly text: string;
+    readonly category: EtsClaimCategory;
+    readonly confidence: number;
+}
+export type EtsClaimCategory = "test" | "build" | "typecheck" | "lint" | "behavioral" | "security" | "performance" | "docs"; // closed set of labels for EtsClaim.category
+/** Task type that produced the output; accepted by `requiredEvidenceForClaim` and carried on `EtsV2Params.taskType`. */
+export type EtsTaskType = "feature" | "bugfix" | "refactor" | "docs" | "test" | "review" | "security" | "release";
+/** Risk tier for the task; accepted by `requiredEvidenceForClaim` and carried on `EtsV2Params.risk`. */
+export type EtsRiskTier = "low" | "medium" | "high" | "critical";
+/**
+ * Required evidence for a claim.
+ *
+ * Produced by `requiredEvidenceForClaim` and consumed by `verifyEvidence`.
+ * NOTE(review): `minConfidence` has no declared range here — presumably on the
+ * same scale as `EtsClaim.confidence`; confirm.
+ */
+export interface RequiredEvidenceItem {
+    readonly evidenceId: string;
+    readonly kind: EvidenceKind;
+    readonly description: string;
+    readonly minConfidence: number;
+}
+/**
+ * Metadata about a run artifact.
+ *
+ * Only `runId` and `timestamp` are required; every other field is optional.
+ * NOTE(review): the `timestamp` format is not declared — presumably an
+ * ISO-8601 string; confirm against the producer.
+ */
+export interface RunArtifactMeta {
+    readonly runId: string;
+    readonly nodeId?: string;
+    readonly provider?: string;
+    readonly model?: string;
+    readonly cwd?: string;
+    readonly treeHashBefore?: string;
+    readonly treeHashAfter?: string;
+    readonly commandHash?: string;
+    readonly timestamp: string;
+    readonly command?: string;
+}
+/**
+ * Collected evidence with provenance.
+ *
+ * Pairs a read-only list of evidence items with the {@link RunArtifactMeta}
+ * describing the run; this is the shape `collectEvidenceFromRunDir` resolves to.
+ */
+export interface CollectedEvidence {
+    readonly items: readonly EvidenceItem[];
+    readonly meta: RunArtifactMeta;
+}
+/**
+ * Result of verifying required vs collected evidence.
+ *
+ * Three string lists returned by `verifyEvidence`. NOTE(review): the strings
+ * are presumably `RequiredEvidenceItem.evidenceId` values, each landing in
+ * exactly one list — not confirmable from this declaration.
+ */
+export interface EvidenceVerificationResult {
+    readonly satisfied: readonly string[];
+    readonly missing: readonly string[];
+    readonly partial: readonly string[];
+}
+/**
+ * ETS v2 result.
+ *
+ * `score` presumably holds the ETS value from the formula in this file's
+ * header comment. The five component fields (`reproducibility` through
+ * `freshness`) and the three `*Penalty` fields are named after that formula's
+ * weighted and subtracted terms. NOTE(review): value ranges and the score
+ * thresholds behind `verdict` are not visible in this declaration file.
+ */
+export interface EtsV2Result {
+    readonly score: number;
+    readonly reproducibility: number;
+    readonly independence: number;
+    readonly coverageRelevance: number;
+    readonly provenanceIntegrity: number;
+    readonly freshness: number;
+    readonly gamingPenalty: number;
+    readonly staleResultPenalty: number;
+    readonly unverifiableClaimPenalty: number;
+    readonly verdict: "pass" | "warn" | "fail";
+    readonly reasons: readonly string[];
+}
+/**
+ * ETS v2 engine.
+ *
+ * Exposes a single asynchronous `evaluate` method that takes
+ * {@link EtsV2Params} and resolves to an {@link EtsV2Result}. Instances are
+ * obtained from `createEvidenceTrustScoreV2Engine`.
+ */
+export interface EtsV2Engine {
+    evaluate(params: EtsV2Params): Promise<EtsV2Result>;
+}
+/**
+ * Input parameters for ETS v2 evaluation.
+ *
+ * `dependencyGraphFiles` and `now` are optional. NOTE(review): `now` is a
+ * string that also appears on `EtsV2EngineOptions` — presumably a timestamp
+ * override for freshness checks; precedence between the two is not visible here.
+ */
+export interface EtsV2Params {
+    readonly output: string;
+    readonly taskType: EtsTaskType;
+    readonly risk: EtsRiskTier;
+    readonly runArtifacts: CollectedEvidence;
+    readonly dependencyGraphFiles?: readonly string[];
+    readonly now?: string;
+}
+declare const WEIGHTS: { // not exported; only referenced via `typeof WEIGHTS` in EtsV2EngineOptions
+    readonly reproducibility: 0.3;
+    readonly independence: 0.25;
+    readonly coverageRelevance: 0.2;
+    readonly provenanceIntegrity: 0.15;
+    readonly freshness: 0.1;
+}; // the five literals sum to 1.0 and match the coefficients in the header formula
+export declare function extractClaims(output: string): readonly EtsClaim[]; // appears to be the ClaimExtractor(output) stage of the header pipeline
+export declare function requiredEvidenceForClaim(claim: EtsClaim, taskType: EtsTaskType, risk: EtsRiskTier): readonly RequiredEvidenceItem[]; // appears to be the RequiredEvidence(claim, taskType, risk) stage
+export declare function collectEvidenceFromRunDir(runDir: string, meta: RunArtifactMeta): Promise<CollectedEvidence>; // the only asynchronous helper of the four; appears to be the EvidenceCollector stage
+export declare function verifyEvidence(required: readonly RequiredEvidenceItem[], collected: CollectedEvidence): EvidenceVerificationResult; // appears to be the EvidenceVerifier(required, collected) stage
+export interface EtsV2EngineOptions { // options for createEvidenceTrustScoreV2Engine; both fields are optional
+    readonly customWeights?: Partial<typeof WEIGHTS>; // NOTE(review): `typeof WEIGHTS` uses literal number types, so as declared each key only accepts its default value — confirm whether `number` was intended
+    readonly now?: string;
+}
+export declare function createEvidenceTrustScoreV2Engine(options?: EtsV2EngineOptions): EtsV2Engine; // factory for the engine; `options` may be omitted
+export { createEvidenceTrustScoreV2Engine as createEvidenceTrustScore }; // same function re-exported under a shorter alias