cool-workflow 0.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +20 -0
- package/.codex-plugin/mcp.json +10 -0
- package/.codex-plugin/plugin.json +38 -0
- package/.mcp.json +10 -0
- package/LICENSE +24 -0
- package/README.md +638 -0
- package/apps/architecture-review/app.json +51 -0
- package/apps/architecture-review/workflow.js +116 -0
- package/apps/end-to-end-golden-path/app.json +30 -0
- package/apps/end-to-end-golden-path/workflow.js +33 -0
- package/apps/pr-review-fix-ci/app.json +59 -0
- package/apps/pr-review-fix-ci/workflow.js +90 -0
- package/apps/release-cut/app.json +54 -0
- package/apps/release-cut/workflow.js +82 -0
- package/apps/research-synthesis/app.json +50 -0
- package/apps/research-synthesis/workflow.js +76 -0
- package/apps/workflow-app-framework-demo/app.json +29 -0
- package/apps/workflow-app-framework-demo/workflow.js +44 -0
- package/dist/agent-config.js +223 -0
- package/dist/candidate-scoring.js +715 -0
- package/dist/capability-core.js +630 -0
- package/dist/capability-dispatcher.js +86 -0
- package/dist/capability-registry.js +523 -0
- package/dist/cli.js +1276 -0
- package/dist/collaboration.js +727 -0
- package/dist/commit.js +570 -0
- package/dist/contract-migration.js +234 -0
- package/dist/coordinator.js +1163 -0
- package/dist/daemon.js +44 -0
- package/dist/dispatch.js +201 -0
- package/dist/drive.js +503 -0
- package/dist/error-feedback.js +415 -0
- package/dist/evidence-grounding.js +179 -0
- package/dist/evidence-reasoning.js +733 -0
- package/dist/execution-backend.js +1279 -0
- package/dist/harness.js +61 -0
- package/dist/mcp-server.js +1615 -0
- package/dist/multi-agent-eval.js +857 -0
- package/dist/multi-agent-host.js +764 -0
- package/dist/multi-agent-operator-ux.js +537 -0
- package/dist/multi-agent-trust.js +366 -0
- package/dist/multi-agent.js +1173 -0
- package/dist/node-snapshot.js +270 -0
- package/dist/observability.js +922 -0
- package/dist/operator-ux.js +971 -0
- package/dist/orchestrator/audit-operations.js +182 -0
- package/dist/orchestrator/candidate-operations.js +117 -0
- package/dist/orchestrator/cli-options.js +288 -0
- package/dist/orchestrator/collaboration-operations.js +86 -0
- package/dist/orchestrator/feedback-operations.js +81 -0
- package/dist/orchestrator/host-operations.js +78 -0
- package/dist/orchestrator/lifecycle-operations.js +462 -0
- package/dist/orchestrator/migration-operations.js +44 -0
- package/dist/orchestrator/multi-agent-operations.js +362 -0
- package/dist/orchestrator/report.js +369 -0
- package/dist/orchestrator/topology-operations.js +84 -0
- package/dist/orchestrator.js +874 -0
- package/dist/pipeline-contract.js +92 -0
- package/dist/pipeline-runner.js +285 -0
- package/dist/reclamation.js +882 -0
- package/dist/result-normalize.js +194 -0
- package/dist/run-export.js +64 -0
- package/dist/run-registry.js +1347 -0
- package/dist/run-state-schema.js +67 -0
- package/dist/sandbox-profile.js +471 -0
- package/dist/scheduler.js +266 -0
- package/dist/scheduling.js +184 -0
- package/dist/schema-validate.js +98 -0
- package/dist/state-explosion.js +1213 -0
- package/dist/state-migrations.js +463 -0
- package/dist/state-node.js +301 -0
- package/dist/state.js +308 -0
- package/dist/telemetry-attestation.js +156 -0
- package/dist/telemetry-ledger.js +145 -0
- package/dist/topology.js +527 -0
- package/dist/triggers.js +159 -0
- package/dist/trust-audit.js +475 -0
- package/dist/types/blackboard.js +2 -0
- package/dist/types/boundary.js +29 -0
- package/dist/types/candidate.js +2 -0
- package/dist/types/collaboration.js +2 -0
- package/dist/types/core.js +2 -0
- package/dist/types/drive.js +10 -0
- package/dist/types/error-feedback.js +2 -0
- package/dist/types/evidence-reasoning.js +2 -0
- package/dist/types/execution-backend.js +2 -0
- package/dist/types/multi-agent.js +2 -0
- package/dist/types/observability.js +2 -0
- package/dist/types/pipeline.js +2 -0
- package/dist/types/reclamation.js +8 -0
- package/dist/types/result.js +2 -0
- package/dist/types/run-registry.js +2 -0
- package/dist/types/run.js +2 -0
- package/dist/types/sandbox.js +2 -0
- package/dist/types/schedule.js +2 -0
- package/dist/types/state-node.js +2 -0
- package/dist/types/topology.js +2 -0
- package/dist/types/trust.js +2 -0
- package/dist/types/workbench.js +2 -0
- package/dist/types/worker.js +2 -0
- package/dist/types/workflow-app.js +2 -0
- package/dist/types.js +43 -0
- package/dist/verifier-registry.js +46 -0
- package/dist/verifier.js +78 -0
- package/dist/version.js +8 -0
- package/dist/workbench-host.js +172 -0
- package/dist/workbench.js +190 -0
- package/dist/worker-isolation.js +1028 -0
- package/dist/workflow-api.js +98 -0
- package/dist/workflow-app-framework.js +626 -0
- package/docs/agent-delegation-drive.7.md +190 -0
- package/docs/agent-framework.md +176 -0
- package/docs/candidate-scoring.7.md +106 -0
- package/docs/canonical-workflow-apps.7.md +137 -0
- package/docs/capability-topology-registry.7.md +168 -0
- package/docs/cli-mcp-parity.7.md +373 -0
- package/docs/contract-migration-tooling.7.md +123 -0
- package/docs/control-plane-scheduling.7.md +110 -0
- package/docs/coordinator-blackboard.7.md +183 -0
- package/docs/dogfood/architecture-review-cool-workflow.md +16 -0
- package/docs/dogfood-one-real-repo.7.md +168 -0
- package/docs/durable-state-and-locking.7.md +107 -0
- package/docs/end-to-end-golden-path.7.md +117 -0
- package/docs/error-feedback.7.md +153 -0
- package/docs/evidence-adoption-reasoning-chain.7.md +270 -0
- package/docs/execution-backends.7.md +300 -0
- package/docs/getting-started.md +99 -0
- package/docs/index.md +41 -0
- package/docs/mcp-app-surface.7.md +235 -0
- package/docs/multi-agent-cli-mcp-surface.7.md +265 -0
- package/docs/multi-agent-eval-replay-harness.7.md +302 -0
- package/docs/multi-agent-operator-ux.7.md +314 -0
- package/docs/multi-agent-runtime-core.7.md +231 -0
- package/docs/multi-agent-topologies.7.md +103 -0
- package/docs/multi-agent-trust-policy-audit.7.md +154 -0
- package/docs/node-snapshot-diff-replay.7.md +135 -0
- package/docs/observability-cost-accounting.7.md +194 -0
- package/docs/operator-ux.7.md +180 -0
- package/docs/pipeline-runner.7.md +136 -0
- package/docs/project-index.md +261 -0
- package/docs/real-execution-backends.7.md +142 -0
- package/docs/release-and-migration.7.md +280 -0
- package/docs/release-tooling.7.md +159 -0
- package/docs/routines.md +48 -0
- package/docs/run-registry-control-plane.7.md +312 -0
- package/docs/run-retention-reclamation.7.md +191 -0
- package/docs/sandbox-profiles.7.md +137 -0
- package/docs/scheduled-tasks.md +80 -0
- package/docs/security-trust-hardening.7.md +117 -0
- package/docs/state-explosion-management.7.md +264 -0
- package/docs/state-node.7.md +96 -0
- package/docs/team-collaboration.7.md +207 -0
- package/docs/unix-principles.md +192 -0
- package/docs/verifier-gated-commit.7.md +140 -0
- package/docs/web-desktop-workbench.7.md +215 -0
- package/docs/worker-isolation.7.md +167 -0
- package/docs/workflow-app-framework.7.md +274 -0
- package/manifest/README.md +43 -0
- package/manifest/plugin.manifest.json +316 -0
- package/manifest/pricing.policy.json +14 -0
- package/package.json +79 -0
- package/scripts/agents/claude-p-agent.js +104 -0
- package/scripts/agents/claude-p-agent.sh +9 -0
- package/scripts/agents/cw-attest-keygen.js +55 -0
- package/scripts/agents/cw-attest-wrap.js +143 -0
- package/scripts/block-unapproved-tag.sh +39 -0
- package/scripts/bump-version.js +249 -0
- package/scripts/canonical-apps.js +171 -0
- package/scripts/cw.js +4 -0
- package/scripts/dist-drift-check.js +79 -0
- package/scripts/dogfood-architecture-review.js +237 -0
- package/scripts/dogfood-release.js +624 -0
- package/scripts/forward-ref-docs.js +73 -0
- package/scripts/gen-manifests.js +232 -0
- package/scripts/golden-path.js +300 -0
- package/scripts/mcp-server.js +4 -0
- package/scripts/new-feature.js +121 -0
- package/scripts/parity-check.js +213 -0
- package/scripts/release-check.js +118 -0
- package/scripts/release-flow.js +272 -0
- package/scripts/release-gate.sh +85 -0
- package/scripts/sync-project-index.js +387 -0
- package/scripts/validate-run-state-schema.js +126 -0
- package/scripts/verify-container-selfref.js +64 -0
- package/scripts/version-sync-check.js +237 -0
- package/skills/cool-workflow/SKILL.md +162 -0
- package/skills/cool-workflow/references/commands.md +282 -0
- package/tsconfig.json +16 -0
- package/ui/workbench/app.css +76 -0
- package/ui/workbench/app.js +159 -0
- package/ui/workbench/index.html +32 -0
- package/workflows/architecture-review.workflow.js +84 -0
- package/workflows/research-synthesis.workflow.js +47 -0
|
@@ -0,0 +1,857 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper. Reuses an `__importDefault` already
// present on `this` when there is one; otherwise defines it here.
// A module flagged `__esModule` is handed back untouched; any other value is
// wrapped so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.createMultiAgentReplaySnapshot = createMultiAgentReplaySnapshot;
|
|
7
|
+
exports.replayMultiAgentSnapshot = replayMultiAgentSnapshot;
|
|
8
|
+
exports.compareMultiAgentReplay = compareMultiAgentReplay;
|
|
9
|
+
exports.scoreMultiAgentReplay = scoreMultiAgentReplay;
|
|
10
|
+
exports.gateMultiAgentEval = gateMultiAgentEval;
|
|
11
|
+
exports.reportMultiAgentEval = reportMultiAgentEval;
|
|
12
|
+
exports.formatMultiAgentEval = formatMultiAgentEval;
|
|
13
|
+
exports.normalizeValue = normalizeValue;
|
|
14
|
+
exports.lines = lines;
|
|
15
|
+
exports.stableStringify = stableStringify;
|
|
16
|
+
const node_fs_1 = __importDefault(require("node:fs"));
|
|
17
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
18
|
+
const multi_agent_1 = require("./multi-agent");
|
|
19
|
+
const multi_agent_operator_ux_1 = require("./multi-agent-operator-ux");
|
|
20
|
+
const multi_agent_trust_1 = require("./multi-agent-trust");
|
|
21
|
+
const operator_ux_1 = require("./operator-ux");
|
|
22
|
+
const topology_1 = require("./topology");
|
|
23
|
+
const trust_audit_1 = require("./trust-audit");
|
|
24
|
+
const state_explosion_1 = require("./state-explosion");
|
|
25
|
+
const evidence_reasoning_1 = require("./evidence-reasoning");
|
|
26
|
+
const state_1 = require("./state");
|
|
27
|
+
// Core parity metrics. Each row is [metric id, normalized section key, title];
// rows are expanded into { metric, section, title } records in that key order.
const METRIC_SECTIONS = [
    ["replay_completed", "workflow", "Replay completed"],
    ["graph_parity", "topologyShape", "Topology graph parity"],
    ["role_parity", "roles", "Role parity"],
    ["group_parity", "groups", "Group parity"],
    ["membership_parity", "memberships", "Membership parity"],
    ["fanout_parity", "fanouts", "Fanout parity"],
    ["fanin_parity", "fanins", "Fanin parity"],
    ["dependency_parity", "dependencyEdges", "Dependency parity"],
    ["failure_parity", "failures", "Failure row parity"],
    ["blackboard_record_parity", "blackboardRecords", "Blackboard record parity"],
    ["evidence_adoption_parity", "evidenceAdoption", "Evidence adoption parity"],
    ["trust_audit_parity", "blackboardWriteAudit", "Trust/audit parity"],
    ["role_policy_parity", "rolePolicies", "Role policy parity"],
    ["permission_decision_parity", "permissionDecisions", "Permission decision parity"],
    ["policy_violation_parity", "policyViolations", "Policy violation parity"],
    ["blackboard_provenance_parity", "messageProvenance", "Blackboard provenance parity"],
    ["judge_rationale_parity", "judgeRationales", "Judge rationale parity"],
    ["panel_decision_parity", "panelDecisions", "Panel decision parity"],
    ["candidate_score_parity", "candidateScores", "Candidate score parity"],
    ["selection_parity", "selectedCandidates", "Selection parity"],
    ["verifier_commit_gate_parity", "verifierCommitGate", "Verifier commit gate parity"],
    ["report_parity", "reportSections", "Report parity"]
].map(([metric, section, title]) => ({ metric, section, title }));
// v0.1.25 State Explosion Management metrics. Held in their own list rather
// than in METRIC_SECTIONS so that assertNormalizedShape (which requires every
// METRIC_SECTIONS array) remains backward compatible with pre-0.1.25 snapshots
// that lack these sections.
const SUMMARY_METRIC_SECTIONS = [
    ["summary_freshness", "summaryFreshness", "Summary freshness"],
    ["compact_graph_parity", "compactGraphShape", "Compact graph parity"],
    ["blackboard_digest_parity", "blackboardDigest", "Blackboard digest parity"],
    ["critical_path_parity", "criticalPath", "Critical path parity"],
    ["evidence_digest_parity", "evidenceDigest", "Evidence digest parity"],
    ["expansion_ref_integrity", "expansionRefs", "Expansion ref integrity"]
].map(([metric, section, title]) => ({ metric, section, title }));
// v0.1.26 Evidence Adoption Reasoning Chain metrics. Also held separately
// (like the v0.1.25 summary metrics) so assertNormalizedShape remains backward
// compatible with pre-0.1.26 snapshots that lack these sections.
const REASONING_METRIC_SECTIONS = [
    ["reasoning_freshness", "reasoningFreshness", "Reasoning chain freshness"],
    ["reasoning_chain_parity", "reasoningChains", "Reasoning chain parity"],
    ["reasoning_unexplained_parity", "reasoningUnexplained", "Fail-closed unexplained parity"]
].map(([metric, section, title]) => ({ metric, section, title }));
// Every metric that compare/score iterate over: core first, then summary,
// then reasoning.
const ALL_METRIC_SECTIONS = [
    ...METRIC_SECTIONS,
    ...SUMMARY_METRIC_SECTIONS,
    ...REASONING_METRIC_SECTIONS
];
|
|
71
|
+
/**
 * Build a replay snapshot for `run`, write it to `<suiteDir>/snapshot.json`,
 * and write a one-case eval suite that points at it.
 *
 * @param {object} run - Run record; `id`, `cwd`, `inputs`, `workflow` and
 *   `paths.state` / `paths.report` are read here.
 * @param {object} [options] - `options.id`, then `options.snapshot`, name the
 *   snapshot; when both are falsy the id falls back to `${run.id}-snapshot`.
 * @returns {object} The snapshot object that was written.
 */
function createMultiAgentReplaySnapshot(run, options = {}) {
    const requestedId = options.id || options.snapshot || `${run.id}-snapshot`;
    const id = (0, state_1.safeFileName)(String(requestedId));
    const suiteDir = evalSuiteDir(run.cwd, id);
    const snapshotPath = node_path_1.default.join(suiteDir, "snapshot.json");
    const createdAt = now();
    const workflow = {
        id: run.workflow.id,
        appId: run.workflow.app?.id,
        appVersion: run.workflow.app?.version,
        title: run.workflow.title
    };
    const inputs = normalizeValue(run.inputs);
    const paths = {
        suiteDir,
        snapshotPath,
        baselineStatePath: run.paths.state,
        reportPath: run.paths.report
    };
    const capture = captureRun(run);
    const normalized = normalizeRun(run);
    const snapshot = {
        schemaVersion: 1,
        kind: "multi-agent-replay-snapshot",
        id,
        createdAt,
        runId: run.id,
        workflow,
        inputs,
        paths,
        capture,
        normalized
    };
    (0, state_1.writeJson)(snapshotPath, snapshot);
    // The suite starts with a single case whose baseline is this snapshot.
    const initialCase = {
        id: `${id}-case`,
        snapshotId: id,
        baselinePath: snapshotPath,
        expectedVerdict: "pass"
    };
    writeSuite({
        schemaVersion: 1,
        id,
        title: `Multi-Agent Eval Suite ${id}`,
        createdAt: snapshot.createdAt,
        cases: [initialCase],
        paths: { suiteDir, snapshotPath }
    });
    return snapshot;
}
|
|
113
|
+
/**
 * Record a replay run for a stored snapshot and link it into the suite.
 *
 * The replay payload is the snapshot's own `normalized` data; the record is
 * written to `<suiteDir>/replay-run.json` with status "completed" and an empty
 * `errors` list, and `<suiteDir>/replay` is created as the isolated workspace.
 *
 * @param {*} target - Passed to `loadSnapshot` to locate the snapshot.
 * @param {object} [options] - `options.id`, then `options.replay`, name the
 *   replay; when both are falsy the id falls back to `${snapshot.id}-replay`.
 * @returns {object} The replay-run record that was written.
 */
function replayMultiAgentSnapshot(target, options = {}) {
    const snapshot = loadSnapshot(target);
    const requestedId = options.id || options.replay || `${snapshot.id}-replay`;
    const replayId = (0, state_1.safeFileName)(String(requestedId));
    const { suiteDir } = snapshot.paths;
    const replayDir = node_path_1.default.join(suiteDir, "replay");
    const replayRunPath = node_path_1.default.join(suiteDir, "replay-run.json");
    node_fs_1.default.mkdirSync(replayDir, { recursive: true });
    const replay = {
        schemaVersion: 1,
        kind: "multi-agent-replay-run",
        id: replayId,
        snapshotId: snapshot.id,
        baselineRunId: snapshot.runId,
        replayedAt: now(),
        status: "completed",
        isolatedWorkspace: replayDir,
        paths: {
            suiteDir,
            replayDir,
            replayRunPath,
            snapshotPath: snapshot.paths.snapshotPath
        },
        replay: snapshot.normalized,
        errors: []
    };
    (0, state_1.writeJson)(replayRunPath, replay);
    // Point the suite, and every case built on this snapshot, at the new replay.
    const suite = loadSuiteFromDir(suiteDir);
    suite.paths.replayRunPath = replayRunPath;
    suite.cases = suite.cases.map((entry) => {
        if (entry.snapshotId !== snapshot.id) {
            return entry;
        }
        return { ...entry, replayRunId: replayId, replayPath: replayRunPath };
    });
    writeSuite(suite);
    return replay;
}
|
|
145
|
+
/**
 * Compare a baseline against a replay, one metric section at a time.
 *
 * Two values count as equal when their `stableStringify` output matches. Each
 * mismatch yields an "error" finding. The result is written to
 * `comparison.json`, the findings to `findings.json`, and both paths are
 * recorded on the suite.
 *
 * @param {*} baselineTarget - Passed to `loadBaselineNormalized`.
 * @param {*} replayTarget - Passed to `loadReplay`.
 * @returns {object} The comparison record that was written.
 */
function compareMultiAgentReplay(baselineTarget, replayTarget) {
    const baseline = loadBaselineNormalized(baselineTarget);
    const replay = loadReplay(replayTarget);
    const { suiteDir } = replay.paths;
    const comparisonPath = node_path_1.default.join(suiteDir, "comparison.json");
    const findingsPath = node_path_1.default.join(suiteDir, "findings.json");
    const sections = {};
    const findings = [];
    ALL_METRIC_SECTIONS.forEach((spec) => {
        const values = comparisonValues(spec.metric, spec.section, baseline.normalized, replay);
        const matches = stableStringify(values.baselineValue) === stableStringify(values.replayValue);
        const id = String(spec.section);
        const baselineRef = `${baseline.path}#/normalized/${id}`;
        const replayRef = `${replay.paths.replayRunPath}#/replay/${id}`;
        sections[id] = {
            id,
            status: matches ? "pass" : "fail",
            baselineRef,
            replayRef,
            reason: matches ? `${spec.title} matches.` : `${spec.title} changed.`
        };
        if (matches) {
            return;
        }
        findings.push({
            id: `regression-${id}`,
            severity: "error",
            category: id,
            reason: `${spec.title} changed between baseline and replay.`,
            baselineRef,
            replayRef
        });
    });
    const hasNoErrors = findings.every((entry) => entry.severity !== "error");
    const comparison = {
        schemaVersion: 1,
        baselineId: baseline.id,
        replayId: replay.id,
        comparedAt: now(),
        status: hasNoErrors ? "pass" : "fail",
        paths: {
            suiteDir,
            baselinePath: baseline.path,
            replayPath: replay.paths.replayRunPath,
            comparisonPath,
            findingsPath
        },
        sections,
        findings
    };
    (0, state_1.writeJson)(comparisonPath, comparison);
    (0, state_1.writeJson)(findingsPath, findings);
    const suite = loadSuiteFromDir(suiteDir);
    suite.paths.comparisonPath = comparisonPath;
    suite.paths.findingsPath = findingsPath;
    writeSuite(suite);
    return comparison;
}
|
|
199
|
+
/**
 * Pick the pair of values to compare for one metric.
 *
 * For "replay_completed" the baseline side is a fixed expectation (status
 * "completed", zero errors, the baseline workflow) and the replay side is
 * built from the replay record's status, error count and workflow. Every other
 * metric compares the named section on both sides, using an empty array where
 * the section is null or undefined.
 *
 * @param {string} metric - Metric id.
 * @param {string} section - Key into the normalized data.
 * @param {object} baseline - Normalized baseline data.
 * @param {object} replay - Replay record; `status`, `errors`, `replay` are read.
 * @returns {{baselineValue: *, replayValue: *}}
 */
function comparisonValues(metric, section, baseline, replay) {
    if (metric !== "replay_completed") {
        const baselineValue = baseline[section] ?? [];
        const replayValue = replay.replay[section] ?? [];
        return { baselineValue, replayValue };
    }
    const expected = {
        status: "completed",
        errorCount: 0,
        workflow: baseline.workflow
    };
    const observed = {
        status: replay.status,
        errorCount: replay.errors.length,
        workflow: replay.replay.workflow
    };
    return { baselineValue: expected, replayValue: observed };
}
|
|
219
|
+
/**
 * Turn a comparison into a per-metric score and write it to `score.json`.
 *
 * Each metric scores 1 when its comparison section has status "pass" and 0
 * otherwise, including when the section is missing from the comparison. The
 * overall status is "pass" only if no metric failed. Findings are copied from
 * the comparison unchanged.
 *
 * @param {*} target - Passed to `loadOrCompareForTarget` to get the comparison.
 * @returns {object} The score record that was written.
 */
function scoreMultiAgentReplay(target) {
    const comparison = loadOrCompareForTarget(target);
    const { suiteDir } = comparison.paths;
    const scorePath = node_path_1.default.join(suiteDir, "score.json");
    const metrics = ALL_METRIC_SECTIONS.map((spec) => {
        const section = comparison.sections[String(spec.section)];
        const passed = section?.status === "pass";
        const baselineRef = section?.baselineRef;
        const replayRef = section?.replayRef;
        return {
            id: spec.metric,
            status: passed ? "pass" : "fail",
            score: passed ? 1 : 0,
            maxScore: 1,
            reason: section?.reason || `${spec.title} missing.`,
            evidenceRefs: [baselineRef, replayRef].filter(Boolean),
            baselineRefs: baselineRef ? [baselineRef] : [],
            replayRefs: replayRef ? [replayRef] : []
        };
    });
    let earned = 0;
    let available = 0;
    for (const entry of metrics) {
        earned += entry.score;
        available += entry.maxScore;
    }
    const anyFailed = metrics.some((entry) => entry.status === "fail");
    const score = {
        schemaVersion: 1,
        replayId: comparison.replayId,
        scoredAt: now(),
        status: anyFailed ? "fail" : "pass",
        score: earned,
        maxScore: available,
        metrics,
        findings: comparison.findings,
        paths: {
            suiteDir,
            comparisonPath: comparison.paths.comparisonPath,
            scorePath
        }
    };
    (0, state_1.writeJson)(scorePath, score);
    const suite = loadSuiteFromDir(suiteDir);
    suite.paths.scorePath = scorePath;
    writeSuite(suite);
    return score;
}
|
|
257
|
+
/**
 * Run the release gate over an eval suite's artifacts and write `gate.json`.
 *
 * The gate passes ("pass" / "ship") only when the score status is "pass" and
 * the score carries no "error" findings; otherwise it is "fail" / "hold".
 * `status`, `verdict` and `nextAction` all derive from that one decision, so a
 * failing score with no findings can no longer report "gate passed" as its
 * next action.
 *
 * @param {*} target - Passed to `resolveSuiteDir` to locate the suite directory.
 * @returns {object} The gate record that was written.
 * @throws {Error} When snapshot.json, replay-run.json, comparison.json or
 *   score.json is missing, when the comparison's baseline path is not this
 *   suite's snapshot, or when the score does not belong to the comparison.
 */
function gateMultiAgentEval(target) {
    const suiteDir = resolveSuiteDir(target);
    const snapshotPath = node_path_1.default.join(suiteDir, "snapshot.json");
    const replayRunPath = node_path_1.default.join(suiteDir, "replay-run.json");
    const comparisonPath = node_path_1.default.join(suiteDir, "comparison.json");
    const scorePath = node_path_1.default.join(suiteDir, "score.json");
    const missing = [snapshotPath, replayRunPath, comparisonPath, scorePath].filter((file) => !node_fs_1.default.existsSync(file));
    if (missing.length)
        throw new Error(`Eval gate missing required artifact(s): ${missing.join(", ")}`);
    const comparison = (0, state_1.readJson)(comparisonPath);
    const score = (0, state_1.readJson)(scorePath);
    // Refuse artifacts left over from a different baseline or replay.
    if (comparison.paths.baselinePath !== snapshotPath) {
        throw new Error(`Eval gate found stale comparison artifact for ${comparison.paths.baselinePath}; rerun eval compare ${snapshotPath} ${comparison.paths.replayPath}`);
    }
    if (score.replayId !== comparison.replayId || score.paths.comparisonPath !== comparisonPath) {
        throw new Error(`Eval gate found stale score artifact for ${score.replayId}; rerun eval score ${comparison.paths.replayPath}`);
    }
    const report = reportMultiAgentEval(comparison.paths.replayPath);
    const failed = score.findings.filter((entry) => entry.severity === "error");
    // Single source of truth for status, verdict and nextAction.
    const passed = score.status === "pass" && failed.length === 0;
    let nextAction;
    if (passed) {
        nextAction = "Eval replay gate passed; include artifacts in release evidence.";
    }
    else if (failed.length) {
        nextAction = "Review regression findings, update replay rationale if the change is intentional, then rerun eval gate.";
    }
    else {
        // The score failed with no error finding attached (for example a
        // metric missing from the comparison), so there is nothing to review.
        nextAction = "Score failed without regression findings; rerun eval compare and eval score to refresh the artifacts, then rerun eval gate.";
    }
    const gate = {
        schemaVersion: 1,
        suiteId: node_path_1.default.basename(suiteDir),
        checkedAt: now(),
        status: passed ? "pass" : "fail",
        verdict: passed ? "ship" : "hold",
        score: score.score,
        maxScore: score.maxScore,
        requiredArtifacts: [snapshotPath, comparison.paths.replayPath, comparisonPath, scorePath, report.reportPath],
        findings: score.findings,
        paths: {
            suiteDir,
            snapshotPath,
            replayRunPath: comparison.paths.replayPath,
            comparisonPath,
            scorePath,
            reportPath: report.reportPath
        },
        nextAction
    };
    (0, state_1.writeJson)(node_path_1.default.join(suiteDir, "gate.json"), gate);
    return gate;
}
|
|
299
|
+
/**
 * Render the score for an eval suite as `<suiteDir>/report.md` and record the
 * report path on the suite.
 *
 * The score comes from `loadScoreForTarget`. The report lists the suite and
 * replay ids, status and score, one line per metric grouped under fixed
 * headings, the regression findings (or "- none"), a PASS/FAIL verdict and a
 * next action.
 *
 * @param {*} target - Passed to `resolveSuiteDir` and `loadScoreForTarget`.
 * @returns {object} Summary with `replayId`, `status`, `reportPath`, `score`,
 *   `maxScore` and `findings`.
 */
function reportMultiAgentEval(target) {
    const suiteDir = resolveSuiteDir(target);
    const scorePath = node_path_1.default.join(suiteDir, "score.json");
    const score = loadScoreForTarget(target, scorePath);
    const reportPath = node_path_1.default.join(suiteDir, "report.md");
    // Heading plus the metric ids printed beneath it, in report order.
    const metricGroups = [
        ["## Graph Comparison", [
                "replay_completed",
                "graph_parity",
                "role_parity",
                "group_parity",
                "membership_parity",
                "fanout_parity",
                "fanin_parity",
                "dependency_parity",
                "failure_parity"
            ]],
        ["## Evidence Comparison", [
                "blackboard_record_parity",
                "evidence_adoption_parity",
                "blackboard_provenance_parity"
            ]],
        ["## Trust / Policy / Audit Comparison", [
                "trust_audit_parity",
                "role_policy_parity",
                "permission_decision_parity",
                "policy_violation_parity",
                "judge_rationale_parity",
                "panel_decision_parity"
            ]],
        ["## Candidate Score Comparison", ["candidate_score_parity"]],
        ["## Selection / Commit Gate", [
                "selection_parity",
                "verifier_commit_gate_parity"
            ]],
        ["## State Explosion Summaries", [
                "summary_freshness",
                "compact_graph_parity",
                "blackboard_digest_parity",
                "critical_path_parity",
                "evidence_digest_parity",
                "expansion_ref_integrity"
            ]],
        ["## Evidence Adoption Reasoning Chain", [
                "reasoning_freshness",
                "reasoning_chain_parity",
                "reasoning_unexplained_parity"
            ]]
    ];
    const passed = score.status === "pass";
    const reportLines = [
        "# Multi-Agent Eval Replay Report",
        "",
        "## Eval Suite",
        `- Suite: ${node_path_1.default.basename(suiteDir)}`,
        `- Replay: ${score.replayId}`,
        "",
        "## Replay Status",
        `- Status: ${score.status}`,
        `- Score: ${score.score}/${score.maxScore}`,
        "",
        // Each group renders as heading, its metric lines, then a blank line.
        ...metricGroups.flatMap(([heading, metricIds]) => [
            heading,
            ...metricIds.map((metricId) => metricLine(score, metricId)),
            ""
        ]),
        "## Regression Findings",
        ...(score.findings.length
            ? score.findings.map((entry) => `- ${entry.severity.toUpperCase()} ${entry.category}: ${entry.reason}`)
            : ["- none"]),
        "",
        "## Final Verdict",
        passed ? "PASS" : "FAIL",
        "",
        "## Next Action",
        passed ? "Use this replay as release-gate evidence." : "Fix or explicitly classify the changed behavior before release."
    ];
    node_fs_1.default.writeFileSync(reportPath, `${reportLines.join("\n")}\n`, "utf8");
    const suite = loadSuiteFromDir(suiteDir);
    suite.paths.reportPath = reportPath;
    writeSuite(suite);
    return {
        schemaVersion: 1,
        replayId: score.replayId,
        status: score.status,
        reportPath,
        score: score.score,
        maxScore: score.maxScore,
        findings: score.findings
    };
}
|
|
382
|
+
/**
 * Return the stored score for `target` when it is still valid, otherwise
 * compute a new one with `scoreMultiAgentReplay`.
 *
 * The stored score is reused only if `scorePath` exists, the comparison file
 * it references exists, that comparison has the same `replayId`, and the
 * comparison's replay path equals the one resolved from `target`.
 *
 * @param {*} target - Passed to `resolveReplayPath` and, on a miss, to
 *   `scoreMultiAgentReplay`.
 * @param {string} scorePath - Location of the suite's score.json.
 * @returns {object} The reused or newly computed score.
 */
function loadScoreForTarget(target, scorePath) {
    const replayPath = resolveReplayPath(target);
    if (!node_fs_1.default.existsSync(scorePath)) {
        return scoreMultiAgentReplay(target);
    }
    const stored = (0, state_1.readJson)(scorePath);
    if (!node_fs_1.default.existsSync(stored.paths.comparisonPath)) {
        return scoreMultiAgentReplay(target);
    }
    const comparison = (0, state_1.readJson)(stored.paths.comparisonPath);
    const sameReplay = comparison.replayId === stored.replayId;
    const samePath = comparison.paths.replayPath === replayPath;
    return sameReplay && samePath ? stored : scoreMultiAgentReplay(target);
}
|
|
394
|
+
/**
 * Renders an eval artifact as a plain-text summary.
 *
 * The artifact kind is detected in this order: gate, score, comparison,
 * replay run, snapshot, report. Anything else is returned as pretty-printed
 * JSON.
 *
 * @param {*} value - Eval artifact (or any other value).
 * @returns {string} Newline-joined summary text.
 */
function formatMultiAgentEval(value) {
    const asText = (rows) => rows.join("\n");
    // One row per finding, or a single " none" row when there are no findings.
    const findingRows = () => (value.findings.length
        ? value.findings.map((entry) => ` ${entry.severity} ${entry.category}: ${entry.reason}`)
        : [" none"]);
    const suiteRow = () => ` ${node_path_1.default.basename(value.paths.suiteDir)}`;
    const metricRow = (...ids) => ` ${ids.map((id) => metricStatus(value, id)).join("; ")}`;
    const sectionRow = (...ids) => ` ${ids.map((id) => sectionStatus(value, id)).join("; ")}`;
    if (isGate(value)) {
        return asText([
            "Eval Suite",
            ` ${value.suiteId}`,
            "",
            "Replay Status",
            ` ${value.status} (${value.score}/${value.maxScore})`,
            "",
            "Regression Findings",
            ...findingRows(),
            "",
            "Final Verdict",
            ` ${value.verdict}`,
            "",
            "Next Action",
            ` ${value.nextAction}`
        ]);
    }
    if (isScore(value)) {
        const nextAction = value.status === "pass"
            ? "Run eval gate or include report path as evidence."
            : "Review findings before release.";
        return asText([
            "Eval Suite",
            suiteRow(),
            "",
            "Replay Status",
            ` ${value.status} (${value.score}/${value.maxScore})`,
            "",
            "Graph Comparison",
            metricRow("replay_completed", "graph_parity", "role_parity", "group_parity", "membership_parity", "fanout_parity", "fanin_parity", "dependency_parity", "failure_parity"),
            "",
            "Evidence Comparison",
            metricRow("blackboard_record_parity", "evidence_adoption_parity", "blackboard_provenance_parity"),
            "",
            "Trust / Policy / Audit Comparison",
            metricRow("trust_audit_parity", "role_policy_parity", "permission_decision_parity", "policy_violation_parity", "judge_rationale_parity", "panel_decision_parity"),
            "",
            "Candidate Score Comparison",
            metricRow("candidate_score_parity"),
            "",
            "Selection / Commit Gate",
            metricRow("selection_parity", "verifier_commit_gate_parity"),
            "",
            "State Explosion Summaries",
            metricRow("summary_freshness", "compact_graph_parity", "blackboard_digest_parity", "critical_path_parity", "evidence_digest_parity", "expansion_ref_integrity"),
            "",
            "Regression Findings",
            ...findingRows(),
            "",
            "Final Verdict",
            ` ${value.status}`,
            "",
            "Next Action",
            ` ${nextAction}`
        ]);
    }
    if (isComparison(value)) {
        return asText([
            "Eval Suite",
            suiteRow(),
            "",
            "Replay Status",
            ` ${value.status}`,
            "",
            "Graph Comparison",
            sectionRow("workflow", "topologyShape", "roles", "groups", "memberships", "fanouts", "fanins", "dependencyEdges", "failures"),
            "",
            "Evidence Comparison",
            sectionRow("blackboardRecords", "evidenceAdoption", "messageProvenance"),
            "",
            "Trust / Policy / Audit Comparison",
            sectionRow("blackboardWriteAudit", "rolePolicies", "permissionDecisions", "policyViolations", "judgeRationales", "panelDecisions"),
            "",
            "Candidate Score Comparison",
            sectionRow("candidateScores"),
            "",
            "Selection / Commit Gate",
            sectionRow("selectedCandidates", "verifierCommitGate"),
            "",
            "Regression Findings",
            ...findingRows(),
            "",
            "Final Verdict",
            ` ${value.status}`,
            "",
            "Next Action",
            " Score the replay or run the eval gate."
        ]);
    }
    if (isReplay(value)) {
        return asText([
            "Eval Suite",
            suiteRow(),
            "",
            "Replay Status",
            ` ${value.status}`,
            ` replay=${value.paths.replayRunPath}`,
            "",
            "Next Action",
            ` node scripts/cw.js eval compare ${value.paths.snapshotPath} ${value.paths.replayRunPath}`
        ]);
    }
    if (isSnapshot(value)) {
        const recorded = value.normalized;
        return asText([
            "Eval Suite",
            ` ${value.id}`,
            "",
            "Replay Status",
            " snapshot captured",
            ` snapshot=${value.paths.snapshotPath}`,
            "",
            "Graph Comparison",
            ` topology records=${recorded.topologyShape.length}`,
            "",
            "Evidence Comparison",
            ` evidence records=${recorded.evidenceAdoption.length}`,
            "",
            "Trust / Policy / Audit Comparison",
            ` audit records=${recorded.blackboardWriteAudit.length + recorded.messageProvenance.length}`,
            "",
            "Candidate Score Comparison",
            ` score records=${recorded.candidateScores.length}`,
            "",
            "Selection / Commit Gate",
            ` selected=${recorded.selectedCandidates.length}; commit gates=${recorded.verifierCommitGate.length}`,
            "",
            "Regression Findings",
            " none",
            "",
            "Final Verdict",
            " snapshot-ready",
            "",
            "Next Action",
            ` node scripts/cw.js eval replay ${value.paths.snapshotPath}`
        ]);
    }
    if (isReport(value)) {
        return asText([
            "Eval Suite",
            ` ${node_path_1.default.dirname(value.reportPath)}`,
            "",
            "Replay Status",
            ` ${value.status} (${value.score}/${value.maxScore})`,
            "",
            "Final Verdict",
            ` report written: ${value.reportPath}`,
            "",
            "Next Action",
            " Run eval gate if this is release evidence."
        ]);
    }
    return JSON.stringify(value, null, 2);
}
|
|
546
|
+
/**
 * Bundles the raw sections of a run into one plain object.
 *
 * Missing sections are replaced with empty defaults. `workerOutputs` keeps
 * only truthy `output` values. Candidate scores and the trust / operator
 * summaries are obtained from their respective helpers.
 *
 * @param {object} run - Run state.
 * @returns {object} Captured run sections.
 */
function captureRun(run) {
    const workerList = run.workers || [];
    const outputs = [];
    for (const worker of run.workers || []) {
        if (worker.output) {
            outputs.push(worker.output);
        }
    }
    const emptyTopology = { schemaVersion: 1, runs: [] };
    const emptyMultiAgent = { schemaVersion: 1, runs: [], roles: [], groups: [], memberships: [], fanouts: [], fanins: [] };
    const emptyBlackboard = { schemaVersion: 1, boards: [], topics: [], messages: [], contexts: [], artifacts: [], snapshots: [], decisions: [] };
    return {
        topology: run.topologies ? run.topologies : emptyTopology,
        multiAgent: run.multiAgent ? run.multiAgent : emptyMultiAgent,
        blackboard: run.blackboard ? run.blackboard : emptyBlackboard,
        workers: workerList,
        workerOutputs: outputs,
        candidates: run.candidates || [],
        candidateSelections: run.candidateSelections || [],
        candidateScoreInputs: collectCandidateScores(run),
        commits: run.commits || [],
        trustAudit: (0, trust_audit_1.summarizeTrustAudit)(run),
        multiAgentTrust: (0, multi_agent_trust_1.summarizeMultiAgentTrust)(run),
        operator: (0, operator_ux_1.summarizeOperatorRun)(run)
    };
}
|
|
562
|
+
/**
 * Reduces a run to the normalized, comparable sections used by the eval
 * snapshot / replay comparison.
 *
 * `workflow` is a normalized object; every other section produced here is the
 * sorted string list returned by `lines(...)`. The state-explosion and
 * evidence-reasoning sections are merged in last from their own normalizers.
 *
 * A selection without an `evidence` array is counted as having zero evidence
 * (matching how commits are handled) instead of raising a TypeError.
 *
 * @param {object} run - Run state; `run.workflow` and `run.tasks` must be present.
 * @returns {object} Normalized sections keyed by section name.
 */
function normalizeRun(run) {
    const operator = (0, multi_agent_operator_ux_1.summarizeMultiAgentOperator)(run);
    const trust = (0, multi_agent_trust_1.summarizeMultiAgentTrust)(run);
    const blackboard = run.blackboard || { boards: [], topics: [], messages: [], contexts: [], artifacts: [], snapshots: [], decisions: [] };
    const topologies = (0, topology_1.summarizeTopologies)(run);
    const multiAgent = (0, multi_agent_1.summarizeMultiAgent)(run);
    return {
        workflow: normalizeValue({
            id: run.workflow.id,
            appId: run.workflow.app?.id,
            appVersion: run.workflow.app?.version,
            taskCount: run.tasks.length
        }),
        // NOTE(review): `lines` receives an array of arrays here (and for
        // blackboardRecords), so each inner array becomes one string entry.
        topologyShape: lines([
            topologies.active.map((entry) => ({
                topologyId: entry.topologyId,
                status: entry.status,
                roleCount: entry.roles.length,
                groupCount: entry.groups.length,
                fanoutCount: entry.fanouts.length,
                faninCount: entry.fanins.length
            })),
            multiAgent.groupsDetail
        ]),
        roles: lines(run.multiAgent?.roles || []),
        groups: lines(run.multiAgent?.groups || []),
        memberships: lines(run.multiAgent?.memberships || []),
        fanouts: lines(run.multiAgent?.fanouts || []),
        fanins: lines(run.multiAgent?.fanins || []),
        dependencyEdges: lines(operator.dependencies.map((entry) => ({ from: entry.from, to: entry.to, label: entry.label, status: entry.status }))),
        failures: lines(operator.failures.map((entry) => ({ kind: entry.kind, status: entry.status, owner: entry.owner, reason: entry.reason }))),
        blackboardRecords: lines([blackboard.boards, blackboard.topics, blackboard.messages, blackboard.contexts, blackboard.artifacts, blackboard.snapshots, blackboard.decisions]),
        messageProvenance: lines(trust.messageProvenance || []),
        rolePolicies: lines(trust.rolePolicies || []),
        permissionDecisions: lines(trust.permissionDecisions || []),
        blackboardWriteAudit: lines(trust.blackboardWrites || []),
        judgeRationales: lines(trust.judgeRationales || []),
        panelDecisions: lines(trust.panelDecisions || []),
        policyViolations: lines(trust.policyViolations || []),
        evidenceAdoption: lines(operator.evidence.map((entry) => ({
            ref: entry.ref || entry.id,
            status: entry.status,
            adoptedBy: entry.adoptedBy,
            candidateIds: entry.candidateIds,
            selectionIds: entry.selectionIds,
            commitIds: entry.commitIds
        }))),
        candidateScores: lines(collectCandidateScores(run)),
        selectedCandidates: lines((run.candidateSelections || []).map((entry) => ({
            candidateId: entry.candidateId,
            scoreId: entry.scoreId,
            verifierNodeId: entry.verifierNodeId,
            reason: entry.reason,
            // Guarded like the commit gate below: evidence may be absent.
            evidenceCount: (entry.evidence || []).length
        }))),
        verifierCommitGate: lines((run.commits || []).map((entry) => ({
            verifierGated: Boolean(entry.verifierGated),
            checkpoint: Boolean(entry.checkpoint),
            candidateId: entry.candidateId,
            selectionId: entry.selectionId,
            verifierNodeId: entry.verifierNodeId,
            evidenceCount: (entry.evidence || []).length
        }))),
        reportSections: reportSections(run),
        ...(0, state_explosion_1.normalizeStateExplosionForEval)(run),
        ...(0, evidence_reasoning_1.normalizeEvidenceReasoningForEval)(run)
    };
}
|
|
630
|
+
/**
 * Gathers one record per (candidate, scoreId) pair listed on the run.
 *
 * Each score is looked up as `<candidate>.<score>.score.json` (both parts
 * passed through `safeFileName`) inside `run.paths.candidatesDir`, falling
 * back to `<runDir>/candidates`. Pairs whose file does not exist yield a
 * `{ candidateId, scoreId, missing: true }` placeholder.
 *
 * @param {object} run - Run state with optional `candidates` and `paths`.
 * @returns {object[]} Score records in candidate / score order.
 */
function collectCandidateScores(run) {
    const collected = [];
    for (const candidate of run.candidates || []) {
        for (const scoreId of candidate.scores || []) {
            const candidatesDir = run.paths.candidatesDir || node_path_1.default.join(run.paths.runDir, "candidates");
            const fileName = `${(0, state_1.safeFileName)(candidate.id)}.${(0, state_1.safeFileName)(scoreId)}.score.json`;
            const scoreFile = node_path_1.default.join(candidatesDir, fileName);
            if (!node_fs_1.default.existsSync(scoreFile)) {
                collected.push({ candidateId: candidate.id, scoreId, missing: true });
                continue;
            }
            const stored = (0, state_1.readJson)(scoreFile);
            const evidenceCount = Array.isArray(stored.evidence) ? stored.evidence.length : 0;
            collected.push({
                candidateId: candidate.id,
                scoreId,
                criteria: stored.criteria,
                total: stored.total,
                maxTotal: stored.maxTotal,
                normalized: stored.normalized,
                verdict: stored.verdict,
                evidenceCount,
                notes: stored.notes
            });
        }
    }
    return collected;
}
|
|
656
|
+
/**
 * Lists the markdown heading titles found in the run's report file.
 *
 * A heading is any line starting with one or more `#` followed by whitespace;
 * the marker is stripped and the title trimmed. Returns `[]` when the report
 * file does not exist.
 *
 * @param {object} run - Run state; reads `run.paths.report`.
 * @returns {string[]} Heading titles, sorted.
 */
function reportSections(run) {
    const reportFile = run.paths.report;
    if (!node_fs_1.default.existsSync(reportFile)) {
        return [];
    }
    const headingMarker = /^#+\s+/;
    const titles = [];
    for (const row of node_fs_1.default.readFileSync(reportFile, "utf8").split("\n")) {
        if (headingMarker.test(row)) {
            titles.push(row.replace(headingMarker, "").trim());
        }
    }
    return titles.sort();
}
|
|
662
|
+
/**
 * Reads and validates the replay snapshot that `target` resolves to.
 *
 * @param {string} target - Eval target passed to `resolveSnapshotPath`.
 * @returns {object} The parsed snapshot.
 * @throws {Error} When the file's `kind` is not "multi-agent-replay-snapshot",
 *   or when `assertSnapshotShape` rejects it.
 */
function loadSnapshot(target) {
    const snapshotFile = resolveSnapshotPath(target);
    const loaded = (0, state_1.readJson)(snapshotFile);
    const isSnapshotKind = loaded.kind === "multi-agent-replay-snapshot";
    if (!isSnapshotKind) {
        throw new Error(`Not a replay snapshot: ${snapshotFile}`);
    }
    assertSnapshotShape(loaded, snapshotFile);
    return loaded;
}
|
|
670
|
+
/**
 * Reads and validates the replay run that `target` resolves to.
 *
 * @param {string} target - Eval target passed to `resolveReplayPath`.
 * @returns {object} The parsed replay run.
 * @throws {Error} When the file's `kind` is not "multi-agent-replay-run",
 *   or when `assertReplayShape` rejects it.
 */
function loadReplay(target) {
    const replayFile = resolveReplayPath(target);
    const loaded = (0, state_1.readJson)(replayFile);
    const isReplayKind = loaded.kind === "multi-agent-replay-run";
    if (!isReplayKind) {
        throw new Error(`Not a replay run: ${replayFile}`);
    }
    assertReplayShape(loaded, replayFile);
    return loaded;
}
|
|
678
|
+
/**
 * Loads the baseline snapshot for `target` and returns only what a
 * comparison needs from it.
 *
 * @param {string} target - Eval target passed to `resolveSnapshotPath`.
 * @returns {{id: *, path: string, normalized: *}} Snapshot id, the resolved
 *   snapshot file path, and the snapshot's `normalized` section.
 * @throws {Error} When the file's `kind` is not "multi-agent-replay-snapshot",
 *   or when `assertSnapshotShape` rejects it.
 */
function loadBaselineNormalized(target) {
    const baselineFile = resolveSnapshotPath(target);
    const baseline = (0, state_1.readJson)(baselineFile);
    const isSnapshotKind = baseline.kind === "multi-agent-replay-snapshot";
    if (!isSnapshotKind) {
        throw new Error(`Not a replay snapshot: ${baselineFile}`);
    }
    assertSnapshotShape(baseline, baselineFile);
    const { id, normalized } = baseline;
    return { id, path: baselineFile, normalized };
}
|
|
686
|
+
/**
 * Validates the required fields of a replay snapshot.
 *
 * Checks, in order: `id`, `runId`, `paths.suiteDir` / `paths.snapshotPath`,
 * then the `normalized` section via `assertNormalizedShape`.
 *
 * @param {object} snapshot - Parsed snapshot.
 * @param {string} file - File the snapshot came from (used in error messages).
 * @throws {Error} On the first missing or malformed field.
 */
function assertSnapshotShape(snapshot, file) {
    if (!snapshot.id) {
        throw new Error(`Replay snapshot missing id: ${file}`);
    }
    if (!snapshot.runId) {
        throw new Error(`Replay snapshot missing runId: ${file}`);
    }
    const locations = snapshot.paths;
    const hasLocations = Boolean(locations && locations.suiteDir && locations.snapshotPath);
    if (!hasLocations) {
        throw new Error(`Replay snapshot missing paths.suiteDir or paths.snapshotPath: ${file}`);
    }
    assertNormalizedShape(snapshot.normalized, `Replay snapshot missing normalized section: ${file}`);
}
|
|
696
|
+
/**
 * Validates the required fields of a replay run.
 *
 * Checks, in order: `id`, `snapshotId`, `status` ("completed" or "failed"),
 * the three required `paths` entries, that `errors` is an array, then the
 * `replay` section via `assertNormalizedShape`.
 *
 * @param {object} replay - Parsed replay run.
 * @param {string} file - File the replay came from (used in error messages).
 * @throws {Error} On the first missing or malformed field.
 */
function assertReplayShape(replay, file) {
    if (!replay.id) {
        throw new Error(`Replay run missing id: ${file}`);
    }
    if (!replay.snapshotId) {
        throw new Error(`Replay run missing snapshotId: ${file}`);
    }
    const supportedStatuses = ["completed", "failed"];
    if (!supportedStatuses.includes(replay.status)) {
        throw new Error(`Replay run has unsupported status ${String(replay.status)}: ${file}`);
    }
    const locations = replay.paths;
    const hasLocations = Boolean(locations && locations.suiteDir && locations.replayRunPath && locations.snapshotPath);
    if (!hasLocations) {
        throw new Error(`Replay run missing paths.suiteDir, paths.replayRunPath, or paths.snapshotPath: ${file}`);
    }
    if (!Array.isArray(replay.errors)) {
        throw new Error(`Replay run errors must be an array: ${file}`);
    }
    assertNormalizedShape(replay.replay, `Replay run missing replay section: ${file}`);
}
|
|
711
|
+
/**
 * Validates a normalized run section map against `METRIC_SECTIONS`.
 *
 * `value` must be a non-array object. For every section named in
 * `METRIC_SECTIONS`, `workflow` must be a non-array object and every other
 * section must be an array.
 *
 * @param {*} value - Candidate normalized section map.
 * @param {string} message - Base error message; section-specific detail is appended after "; ".
 * @throws {Error} On the first violation.
 */
function assertNormalizedShape(value, message) {
    const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    if (!isRecord(value)) {
        throw new Error(message);
    }
    const sectionNames = METRIC_SECTIONS.map((entry) => entry.section);
    for (const section of sectionNames) {
        if (section !== "workflow") {
            if (!Array.isArray(value[section])) {
                throw new Error(`${message}; ${String(section)} must be an array`);
            }
            continue;
        }
        if (!isRecord(value.workflow)) {
            throw new Error(`${message}; workflow must be an object`);
        }
    }
}
|
|
724
|
+
/**
 * Returns the cached comparison for `target` when it was produced from the
 * same replay file; otherwise runs a fresh comparison.
 *
 * The cache is `<suiteDir>/comparison.json`. It is reused only when its
 * `paths.replayPath` equals the replay path that `target` resolves to. A
 * cached comparison without a `paths` section is treated as stale and
 * recomputed instead of raising a TypeError.
 *
 * @param {string} target - Eval target (resolved to a suite dir and a replay path).
 * @returns {object} The cached comparison, or the result of
 *   `compareMultiAgentReplay(<suiteDir>/snapshot.json, replayPath)`.
 */
function loadOrCompareForTarget(target) {
    const suiteDir = resolveSuiteDir(target);
    const comparisonPath = node_path_1.default.join(suiteDir, "comparison.json");
    const replayPath = resolveReplayPath(target);
    if (node_fs_1.default.existsSync(comparisonPath)) {
        const comparison = (0, state_1.readJson)(comparisonPath);
        // Malformed cache files may lack `paths`; fall through to recompute.
        if (comparison?.paths?.replayPath === replayPath)
            return comparison;
    }
    return compareMultiAgentReplay(node_path_1.default.join(suiteDir, "snapshot.json"), replayPath);
}
|
|
735
|
+
/**
 * Resolves an eval target to a snapshot file path.
 *
 * - existing directory: `<dir>/snapshot.json`
 * - existing file: the file itself
 * - otherwise: `<cwd>/.cw/evals/<safeFileName(target)>/snapshot.json`
 *
 * @param {string} target - Path or suite id.
 * @returns {string} Snapshot file path (not checked for existence).
 */
function resolveSnapshotPath(target) {
    const candidate = resolveTargetPath(target);
    if (!node_fs_1.default.existsSync(candidate)) {
        const suiteFolder = (0, state_1.safeFileName)(target);
        return node_path_1.default.join(process.cwd(), ".cw", "evals", suiteFolder, "snapshot.json");
    }
    return node_fs_1.default.statSync(candidate).isDirectory()
        ? node_path_1.default.join(candidate, "snapshot.json")
        : candidate;
}
|
|
743
|
+
/**
 * Resolves an eval target to a replay-run file path.
 *
 * - existing directory: `<dir>/replay-run.json`
 * - existing file: the file itself
 * - otherwise: `<cwd>/.cw/evals/<safeFileName(target)>/replay-run.json`
 *
 * @param {string} target - Path or suite id.
 * @returns {string} Replay-run file path (not checked for existence).
 */
function resolveReplayPath(target) {
    const candidate = resolveTargetPath(target);
    if (!node_fs_1.default.existsSync(candidate)) {
        const suiteFolder = (0, state_1.safeFileName)(target);
        return node_path_1.default.join(process.cwd(), ".cw", "evals", suiteFolder, "replay-run.json");
    }
    return node_fs_1.default.statSync(candidate).isDirectory()
        ? node_path_1.default.join(candidate, "replay-run.json")
        : candidate;
}
|
|
751
|
+
/**
 * Resolves an eval target to its suite directory.
 *
 * - existing directory: the directory itself
 * - existing file: the `paths.suiteDir` recorded in that JSON file, or the
 *   file's parent directory when none is recorded
 * - otherwise: `<cwd>/.cw/evals/<safeFileName(target)>`
 *
 * @param {string} target - Path or suite id.
 * @returns {string} Suite directory path.
 */
function resolveSuiteDir(target) {
    const candidate = resolveTargetPath(target);
    if (!node_fs_1.default.existsSync(candidate)) {
        return node_path_1.default.join(process.cwd(), ".cw", "evals", (0, state_1.safeFileName)(target));
    }
    if (node_fs_1.default.statSync(candidate).isDirectory()) {
        return candidate;
    }
    const artifact = (0, state_1.readJson)(candidate);
    const recordedDir = artifact.paths?.suiteDir;
    return recordedDir ? recordedDir : node_path_1.default.dirname(candidate);
}
|
|
763
|
+
/**
 * Turns an eval target into an absolute path.
 *
 * Absolute targets are returned unchanged; relative ones are resolved with
 * `path.resolve`.
 *
 * @param {string} target - Path or suite id.
 * @returns {string} Absolute path.
 * @throws {Error} "Missing eval target" when `target` is falsy.
 */
function resolveTargetPath(target) {
    if (!target) {
        throw new Error("Missing eval target");
    }
    if (node_path_1.default.isAbsolute(target)) {
        return target;
    }
    return node_path_1.default.resolve(target);
}
|
|
768
|
+
/**
 * Builds the directory of an eval suite: `<cwd>/.cw/evals/<safeFileName(suiteId)>`.
 *
 * @param {string} cwd - Base working directory.
 * @param {string} suiteId - Suite identifier.
 * @returns {string} Suite directory path.
 */
function evalSuiteDir(cwd, suiteId) {
    const suiteFolder = (0, state_1.safeFileName)(suiteId);
    return node_path_1.default.join(cwd, ".cw", "evals", suiteFolder);
}
|
|
771
|
+
/**
 * Persists a suite as `suite.json` inside its own `paths.suiteDir`.
 *
 * @param {object} suite - Suite record; must carry `paths.suiteDir`.
 */
function writeSuite(suite) {
    const suiteFile = node_path_1.default.join(suite.paths.suiteDir, "suite.json");
    (0, state_1.writeJson)(suiteFile, suite);
}
|
|
774
|
+
/**
 * Loads `<suiteDir>/suite.json`, or builds a default suite record when that
 * file does not exist.
 *
 * The default record is named after the directory's basename, has no cases,
 * is stamped with `now()`, and points at `<suiteDir>/snapshot.json`. It is
 * not written to disk here.
 *
 * @param {string} suiteDir - Suite directory.
 * @returns {object} Stored or default suite record.
 */
function loadSuiteFromDir(suiteDir) {
    const suiteFile = node_path_1.default.join(suiteDir, "suite.json");
    if (node_fs_1.default.existsSync(suiteFile)) {
        return (0, state_1.readJson)(suiteFile);
    }
    const suiteName = node_path_1.default.basename(suiteDir);
    const snapshotPath = node_path_1.default.join(suiteDir, "snapshot.json");
    return {
        schemaVersion: 1,
        id: suiteName,
        title: `Multi-Agent Eval Suite ${suiteName}`,
        createdAt: now(),
        cases: [],
        paths: { suiteDir, snapshotPath }
    };
}
|
|
787
|
+
/**
 * Recursively rewrites a value into a form with sorted keys and scrubbed
 * strings.
 *
 * - arrays: each element normalized, order kept
 * - strings: passed through `normalizeString`
 * - other primitives / null: returned unchanged
 * - objects: keys sorted; the timestamp keys listed below are dropped;
 *   location-like keys (`path`, `cwd`, `runDir`, `*Path`, `*Dir`) are
 *   stringified and scrubbed; everything else is normalized recursively
 *
 * @param {*} value - Any JSON-like value.
 * @returns {*} Normalized copy.
 */
function normalizeValue(value) {
    if (Array.isArray(value)) {
        return value.map((item) => normalizeValue(item));
    }
    if (typeof value === "string") {
        return normalizeString(value);
    }
    if (value === null || typeof value !== "object") {
        return value;
    }
    const droppedKeys = new Set(["createdAt", "updatedAt", "recordedAt", "selectedAt", "replayedAt", "generatedAt"]);
    const isLocationKey = (key) => key === "path" || key === "cwd" || key === "runDir" || key.endsWith("Path") || key.endsWith("Dir");
    return Object.keys(value).sort().reduce((result, key) => {
        if (!droppedKeys.has(key)) {
            // Location values are coerced with String() before scrubbing.
            result[key] = isLocationKey(key)
                ? normalizeString(String(value[key]))
                : normalizeValue(value[key]);
        }
        return result;
    }, {});
}
|
|
809
|
+
/**
 * Replaces machine- and time-specific fragments of a string with fixed
 * placeholders.
 *
 * Applied in order: compact timestamps and ISO timestamps become
 * `<timestamp>`; paths ending in `.cw/runs/<name>` become `<run-dir>`; paths
 * ending in `.cw/evals/<name>` become `<eval-dir>`; paths under
 * `/var/folders/`, `/tmp/` or `/private/tmp/` become `<tmp>`.
 *
 * @param {string} value - Text to scrub.
 * @returns {string} Scrubbed text.
 */
function normalizeString(value) {
    const substitutions = [
        [/[0-9]{8}T[0-9]{6}Z/g, "<timestamp>"],
        [/[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:.]+Z/g, "<timestamp>"],
        [/\/[^"\s]+\/\.cw\/runs\/[^"\s/]+/g, "<run-dir>"],
        [/\/[^"\s]+\/\.cw\/evals\/[^"\s/]+/g, "<eval-dir>"],
        [/\/var\/folders\/[^"\s]+|\/tmp\/[^"\s]+|\/private\/tmp\/[^"\s]+/g, "<tmp>"]
    ];
    let scrubbed = value;
    for (const [pattern, placeholder] of substitutions) {
        scrubbed = scrubbed.replace(pattern, placeholder);
    }
    return scrubbed;
}
|
|
817
|
+
/**
 * Converts a value into a sorted list of stable JSON strings.
 *
 * The value is normalized first. An array yields one string per element; any
 * other value yields a single-element list.
 *
 * @param {*} value - Value or array of values.
 * @returns {string[]} Sorted JSON strings.
 */
function lines(value) {
    const normalized = normalizeValue(value);
    const entries = Array.isArray(normalized) ? normalized : [normalized];
    return entries.map((entry) => stableStringify(entry)).sort();
}
|
|
823
|
+
/**
 * Serializes a value to JSON after running it through `normalizeValue`.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Result of `JSON.stringify` on the normalized value.
 */
function stableStringify(value) {
    const canonical = normalizeValue(value);
    return JSON.stringify(canonical);
}
|
|
826
|
+
/**
 * Current time as an ISO 8601 string.
 *
 * @returns {string} `Date#toISOString()` of the current instant.
 */
function now() {
    const current = new Date();
    return current.toISOString();
}
|
|
829
|
+
/**
 * Formats one metric as a markdown bullet: `- <id>: <status> - <reason>`.
 *
 * When the metric is absent, or its status / reason is empty, the text
 * "missing" / "metric missing" is used instead.
 *
 * @param {object} score - Score object with a `metrics` array.
 * @param {string} id - Metric id to look up.
 * @returns {string} Bullet line.
 */
function metricLine(score, id) {
    const found = score.metrics.find((candidate) => candidate.id === id);
    let status = "missing";
    let reason = "metric missing";
    if (found) {
        status = found.status || status;
        reason = found.reason || reason;
    }
    return `- ${id}: ${status} - ${reason}`;
}
|
|
833
|
+
/**
 * Formats one metric as `<id>=<status>`, using "missing" when the metric is
 * absent or has an empty status.
 *
 * @param {object} score - Score object with a `metrics` array.
 * @param {string} id - Metric id to look up.
 * @returns {string} Status pair.
 */
function metricStatus(score, id) {
    const found = score.metrics.find((candidate) => candidate.id === id);
    const status = found && found.status ? found.status : "missing";
    return `${id}=${status}`;
}
|
|
837
|
+
/**
 * Formats one comparison section as `<id>=<status>`, using "missing" when
 * the section is absent or has an empty status.
 *
 * @param {object} comparison - Comparison object with a `sections` map.
 * @param {string} id - Section key.
 * @returns {string} Status pair.
 */
function sectionStatus(comparison, id) {
    const section = comparison.sections[id];
    const status = section?.status || "missing";
    return `${id}=${status}`;
}
|
|
840
|
+
/**
 * Tells whether `value` is an object whose `kind` is
 * "multi-agent-replay-snapshot".
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isSnapshot(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return value.kind === "multi-agent-replay-snapshot";
}
|
|
843
|
+
/**
 * Tells whether `value` is an object whose `kind` is
 * "multi-agent-replay-run".
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isReplay(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return value.kind === "multi-agent-replay-run";
}
|
|
846
|
+
/**
 * Tells whether `value` is an object carrying both a `sections` and a
 * `findings` property.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isComparison(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return ["sections", "findings"].every((key) => key in value);
}
|
|
849
|
+
/**
 * Tells whether `value` is an object carrying both a `metrics` and a `score`
 * property.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isScore(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return ["metrics", "score"].every((key) => key in value);
}
|
|
852
|
+
/**
 * Tells whether `value` is an object carrying both a `verdict` and a
 * `requiredArtifacts` property.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isGate(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return ["verdict", "requiredArtifacts"].every((key) => key in value);
}
|
|
855
|
+
/**
 * Tells whether `value` is an object that has a `reportPath` property and no
 * `verdict` property.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean}
 */
function isReport(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    if ("verdict" in value) {
        return false;
    }
    return "reportPath" in value;
}
|