@h9-foundry/agentforge-cli 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/bin.js +49 -1
- package/dist/bin.js.map +1 -1
- package/dist/index.d.ts +32 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +716 -6
- package/dist/index.js.map +1 -1
- package/package.json +8 -8
package/dist/index.js
CHANGED
|
@@ -2,11 +2,11 @@ import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, writeFileSy
|
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
import { execFileSync } from "node:child_process";
|
|
4
4
|
import yaml from "js-yaml";
|
|
5
|
-
import { renderAuditBundleMarkdown } from "@h9-foundry/agentforge-audit";
|
|
5
|
+
import { buildAuditBundle, createAuditEntry, renderAuditBundleMarkdown } from "@h9-foundry/agentforge-audit";
|
|
6
6
|
import { createWorkflowState, findWorkspaceRoot } from "@h9-foundry/agentforge-context-engine";
|
|
7
7
|
import { createPolicyEngine, loadPolicyDocument, resolvePolicy } from "@h9-foundry/agentforge-policy-engine";
|
|
8
8
|
import { runWorkflow } from "@h9-foundry/agentforge-runtime";
|
|
9
|
-
import { agentforgeConfigSchema, auditBundleSchema, designArtifactSchema, designRequestSchema, implementationRequestSchema, incidentRequestSchema, maintenanceRequestSchema, planningArtifactSchema, planningRequestSchema, qaRequestSchema, releaseRequestSchema, securityRequestSchema, workflowDefinitionSchema } from "@h9-foundry/agentforge-schemas";
|
|
9
|
+
import { agentforgeConfigSchema, auditBundleSchema, benchmarkArtifactSchema, designArtifactSchema, designRequestSchema, evalArtifactSchema, evalFixtureCorpusSchema, implementationRequestSchema, incidentRequestSchema, maintenanceRequestSchema, planningArtifactSchema, planningRequestSchema, qaRequestSchema, releaseRequestSchema, schemaFixtures, securityRequestSchema, workflowDefinitionSchema } from "@h9-foundry/agentforge-schemas";
|
|
10
10
|
import { createBuiltinAdapters } from "./internal/builtin-adapters.js";
|
|
11
11
|
import { createBuiltinAgentRegistry } from "./internal/builtin-agents.js";
|
|
12
12
|
import { LocalPluginRegistry } from "./internal/local-plugin-registry.js";
|
|
@@ -856,6 +856,15 @@ function readLatestCompleteRunBundle(runsRoot) {
|
|
|
856
856
|
if (typeof value !== "string" || value.length === 0) {
|
|
857
857
|
return undefined;
|
|
858
858
|
}
|
|
859
|
+
const compactDateTimeMatch = value.match(/^(\d{4})-(\d{2})-(\d{2})-(\d{2})(\d{2})(\d{2})$/);
|
|
860
|
+
if (compactDateTimeMatch) {
|
|
861
|
+
const [, year, month, day, hour, minute, second] = compactDateTimeMatch;
|
|
862
|
+
const isoCandidate = `${year}-${month}-${day}T${hour}:${minute}:${second}Z`;
|
|
863
|
+
const parsedCompactDateTime = Date.parse(isoCandidate);
|
|
864
|
+
if (!Number.isNaN(parsedCompactDateTime)) {
|
|
865
|
+
return parsedCompactDateTime;
|
|
866
|
+
}
|
|
867
|
+
}
|
|
859
868
|
const parsedDate = Date.parse(value);
|
|
860
869
|
if (!Number.isNaN(parsedDate)) {
|
|
861
870
|
return parsedDate;
|
|
@@ -872,20 +881,24 @@ function readLatestCompleteRunBundle(runsRoot) {
|
|
|
872
881
|
const stats = statSync(bundlePath);
|
|
873
882
|
const bundle = JSON.parse(readFileSync(bundlePath, "utf8"));
|
|
874
883
|
const bundleRunId = typeof bundle.runId === "string" ? bundle.runId : entry;
|
|
875
|
-
const
|
|
884
|
+
const parsedCompletedAtMs = parseRunTimestampMs(bundle.finishedAt) ??
|
|
876
885
|
parseRunTimestampMs(bundle.startedAt) ??
|
|
877
886
|
parseRunTimestampMs(bundleRunId) ??
|
|
878
|
-
parseRunTimestampMs(entry)
|
|
879
|
-
|
|
887
|
+
parseRunTimestampMs(entry);
|
|
888
|
+
const completedAtMs = parsedCompletedAtMs ?? stats.mtimeMs;
|
|
880
889
|
return {
|
|
881
890
|
runDir: entry,
|
|
882
891
|
bundle,
|
|
883
892
|
bundleRunId,
|
|
884
|
-
completedAtMs
|
|
893
|
+
completedAtMs,
|
|
894
|
+
hasExplicitTimestamp: typeof parsedCompletedAtMs === "number"
|
|
885
895
|
};
|
|
886
896
|
})
|
|
887
897
|
.filter((candidate) => Boolean(candidate))
|
|
888
898
|
.sort((left, right) => {
|
|
899
|
+
if (left.hasExplicitTimestamp !== right.hasExplicitTimestamp) {
|
|
900
|
+
return left.hasExplicitTimestamp ? -1 : 1;
|
|
901
|
+
}
|
|
889
902
|
if (left.completedAtMs !== right.completedAtMs) {
|
|
890
903
|
return right.completedAtMs - left.completedAtMs;
|
|
891
904
|
}
|
|
@@ -893,6 +906,29 @@ function readLatestCompleteRunBundle(runsRoot) {
|
|
|
893
906
|
});
|
|
894
907
|
return candidates[0] ? { runDir: candidates[0].runDir, bundle: candidates[0].bundle } : undefined;
|
|
895
908
|
}
|
|
909
|
+
/**
 * Locate a run's audit bundle from a run reference, load it and validate it.
 *
 * A reference that ends in ".json" or contains "/" is treated as a file path:
 * it is used as-is when it starts with "/", otherwise it is joined onto `root`.
 * Any other reference is treated as a directory name under the configured
 * runs path, and its `bundle.json` is used.
 *
 * @param {string} root - Workspace root used to load the AgentForge config.
 * @param {string} runRef - Run directory name or path to a bundle JSON file.
 * @returns {{ runId: string, bundlePath: string, bundle: object }} The parsed
 *   bundle, the path it was read from, and its run id (falling back to
 *   `runRef` when the bundle carries no string `runId`).
 * @throws {Error} When nothing exists at the resolved bundle path.
 */
function readRunBundleByRef(root, runRef) {
    const { runtime } = loadAgentForgeConfig(root);
    const runsRoot = join(root, runtime.runsPath);
    const refIsPath = runRef.endsWith(".json") || runRef.includes("/");

    let bundlePath;
    if (!refIsPath) {
        bundlePath = join(runsRoot, runRef, "bundle.json");
    }
    else if (runRef.startsWith("/")) {
        bundlePath = runRef;
    }
    else {
        bundlePath = join(root, runRef);
    }

    if (!existsSync(bundlePath)) {
        throw new Error(`Run bundle not found: ${runRef}`);
    }

    const rawBundle = JSON.parse(readFileSync(bundlePath, "utf8"));
    const bundle = auditBundleSchema.parse(rawBundle);
    const runId = typeof bundle.runId === "string" ? bundle.runId : runRef;
    return { runId, bundlePath, bundle };
}
|
|
925
|
+
/**
 * Return the first lifecycle artifact of kind "eval-result" from a bundle,
 * passed through `evalArtifactSchema.parse`.
 *
 * @param {{ lifecycleArtifacts: Array<object> }} bundle - Loaded audit bundle.
 * @param {string} runRef - Run reference, used only in the error message.
 * @returns {object} The parsed eval-result artifact.
 * @throws {Error} When the bundle has no eval-result artifact.
 */
function extractEvalArtifact(bundle, runRef) {
    for (const candidate of bundle.lifecycleArtifacts) {
        if (candidate.artifactKind === "eval-result") {
            return evalArtifactSchema.parse(candidate);
        }
    }
    throw new Error(`Run ${runRef} does not contain an eval-result artifact.`);
}
|
|
896
932
|
function loadAgentForgeConfig(root) {
|
|
897
933
|
const configPath = join(root, ".agentops", "agentops.yaml");
|
|
898
934
|
if (!existsSync(configPath)) {
|
|
@@ -921,6 +957,425 @@ function loadAgentForgeConfig(root) {
|
|
|
921
957
|
/**
 * Create `pathValue`, including any missing parent directories.
 *
 * @param {string} pathValue - Directory path to create.
 */
function ensureDirectory(pathValue) {
    const options = { recursive: true };
    mkdirSync(pathValue, options);
}
|
|
960
|
+
/**
 * Serialize `value` with `yaml.dump` and write it to `filePath` as UTF-8.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} value - Value to serialize.
 */
function writeYamlFile(filePath, value) {
    const serialized = yaml.dump(value);
    writeFileSync(filePath, serialized, "utf8");
}
|
|
963
|
+
/**
 * Return the eval fixture corpus shipped in `schemaFixtures`, passed through
 * `evalFixtureCorpusSchema.parse`.
 *
 * @returns {object} The parsed fixture corpus.
 */
function loadEvalFixtureCorpus() {
    const { evalFixtureCorpus } = schemaFixtures;
    return evalFixtureCorpusSchema.parse(evalFixtureCorpus);
}
|
|
966
|
+
/**
 * Look up an eval spec by id in the fixture corpus.
 *
 * @param {string} specId - Identifier of the spec to find.
 * @returns {object} The first spec whose `id` equals `specId`.
 * @throws {Error} When no spec has that id.
 */
function getEvalSpec(specId) {
    const { specs } = loadEvalFixtureCorpus();
    const match = specs.find((candidate) => candidate.id === specId);
    if (match) {
        return match;
    }
    throw new Error(`Unknown eval spec: ${specId}`);
}
|
|
974
|
+
/**
 * Build the workspace-relative reference to a run's `bundle.json`.
 *
 * @param {{ runId: string }} run - Run whose id names the run directory.
 * @returns {string} `.agentops/runs/<runId>/bundle.json`
 */
function toBundleRef(run) {
    const { runId } = run;
    return `.agentops/runs/${runId}/bundle.json`;
}
|
|
977
|
+
/**
 * Build the workspace-relative reference to a run's `summary.md`.
 *
 * @param {{ runId: string }} run - Run whose id names the run directory.
 * @returns {string} `.agentops/runs/<runId>/summary.md`
 */
function toSummaryRef(run) {
    const { runId } = run;
    return `.agentops/runs/${runId}/summary.md`;
}
|
|
980
|
+
/**
 * Describe a prerequisite workflow run as a setup-run record.
 *
 * @param {string} workflow - Name of the workflow that produced the run.
 * @param {{ runId: string }} run - The completed run.
 * @returns {{ workflow: string, runId: string, bundlePath: string }} Record
 *   whose `bundlePath` is the workspace-relative bundle reference for the run.
 */
function toSetupRun(workflow, run) {
    const { runId } = run;
    const bundlePath = toBundleRef(run);
    return { workflow, runId, bundlePath };
}
|
|
987
|
+
/**
 * Create a throwaway fixture repository for an eval run and initialise it as
 * an AgentForge project.
 *
 * The workspace lives at `<root>/.agentops/evals/<specId>/<evalRunId>/workspace`.
 * It gets a git repository with one commit containing a minimal package.json,
 * a pnpm lockfile stub, `src.ts` and two evidence files. After the commit,
 * `src.ts` is rewritten, leaving one uncommitted modification in the working
 * tree before `initProject` runs.
 *
 * @param {string} root - Root under which the eval workspace is created.
 * @param {string} evalRunId - Identifier of this eval run (path segment).
 * @param {string} specId - Identifier of the eval spec (path segment).
 * @returns {string} Absolute or root-relative path of the created workspace,
 *   exactly as produced by joining onto `root`.
 */
function createBlankEvalWorkspace(root, evalRunId, specId) {
    const workspaceRoot = join(root, ".agentops", "evals", specId, evalRunId, "workspace");
    ensureDirectory(workspaceRoot);
    const evidenceRoot = join(workspaceRoot, ".agentops", "evidence");
    ensureDirectory(evidenceRoot);

    // Every git invocation runs silently inside the fixture workspace.
    const git = (...args) => execFileSync("git", args, { cwd: workspaceRoot, stdio: "ignore" });
    const writeText = (directory, fileName, content) => writeFileSync(join(directory, fileName), content, "utf8");

    git("init");
    git("config", "user.email", "eval@example.com");
    git("config", "user.name", "AgentForge Eval");

    const packageManifest = {
        name: "fixture",
        repository: {
            type: "git",
            url: "https://github.com/H9-Foundry/fixture.git"
        },
        scripts: {
            test: "echo test",
            lint: "echo lint",
            typecheck: "echo typecheck",
            build: "echo build"
        }
    };
    writeText(workspaceRoot, "package.json", JSON.stringify(packageManifest, null, 2));
    writeText(workspaceRoot, "pnpm-lock.yaml", "lockfileVersion: '9.0'\n");
    writeText(workspaceRoot, "src.ts", "export const value = 1;\n");

    const dependencyAlerts = {
        alerts: [
            {
                package: "example-dependency",
                severity: "moderate",
                summary: "Upgrade pending review for deterministic eval coverage."
            }
        ]
    };
    writeText(evidenceRoot, "dependency-alerts.json", JSON.stringify(dependencyAlerts, null, 2));
    writeText(evidenceRoot, "docs-task.md", "# Docs follow-up\n\n- Align workflow documentation after maintenance triage.\n");

    git("add", ".");
    // Signing is disabled for this commit regardless of the user's git config.
    git("-c", "commit.gpgsign=false", "commit", "-m", "init");

    // Written after the commit, so the working tree differs from HEAD.
    writeText(workspaceRoot, "src.ts", "export const value = 2;\n");
    initProject(workspaceRoot);
    return workspaceRoot;
}
|
|
1026
|
+
/**
 * List the redaction category names recorded on eval and benchmark bundles.
 *
 * @returns {string[]} A new array on every call, so callers may mutate it.
 */
function evalRedactionCategories() {
    const categories = [];
    categories.push("github-token");
    categories.push("api-key");
    categories.push("aws-key");
    categories.push("bearer-token");
    categories.push("password");
    categories.push("private-key");
    return categories;
}
|
|
1029
|
+
/**
 * Assemble the eval-result artifact and audit bundle for one eval run and
 * write them to `<runsPath>/<runId>/bundle.json` and `summary.md`.
 *
 * The eval passes when no deterministic check has status "failed".
 *
 * @param {string} root - Workspace root holding `.agentops` config and policy.
 * @param {object} spec - Eval spec; `id`, `name`, `workflow` and `repoFixture` are read.
 * @param {object|undefined} evaluatedRun - Run that was evaluated, if one was produced.
 * @param {string} workspacePath - Path of the eval workspace, recorded in the payload.
 * @param {Array<object>} setupRuns - Prerequisite runs; each `bundlePath` becomes an input ref.
 * @param {Array<object>} deterministicChecks - Check results carrying a `status` field.
 * @param {Array<object>} modelDependentChecks - Recorded in the payload unchanged.
 * @returns {{ bundle: object, jsonPath: string, markdownPath: string, outputDir: string }}
 */
function createEvalBundle(root, spec, evaluatedRun, workspacePath, setupRuns, deterministicChecks, modelDependentChecks) {
    const config = loadAgentForgeConfig(root);
    const policyDocument = loadPolicyDocument(join(root, ".agentops", "policy.yaml"));
    const policy = resolvePolicy(policyDocument, process.env.CI ? "ci" : "local");
    const state = createWorkflowState({
        cwd: root,
        workflow: `eval:${spec.id}`,
        mode: "inspect",
        policy
    });

    const runsRoot = join(root, config.runtime.runsPath);
    const outputDir = join(runsRoot, state.runId);
    ensureDirectory(outputDir);
    const jsonPath = join(outputDir, "bundle.json");
    const markdownPath = join(outputDir, "summary.md");

    let failureCount = 0;
    for (const check of deterministicChecks) {
        if (check.status === "failed") {
            failureCount += 1;
        }
    }
    const passed = failureCount === 0;
    const startedAt = new Date().toISOString();

    // Factories rather than shared objects: the artifact and the bundle each
    // receive their own provenance/redaction instance.
    const describeProvenance = () => ({
        generatedBy: "agentforge-runtime",
        schemaVersion: state.version,
        executionEnvironment: state.context.ciExecution ? "ci" : "local",
        repoRoot: state.repo.root
    });
    const describeRedaction = () => ({
        applied: true,
        strategyVersion: "1.0.0",
        categories: evalRedactionCategories()
    });

    const evalEntryId = `${state.runId}-eval-runner`;
    const inputRefs = setupRuns.map((setup) => setup.bundlePath);
    if (evaluatedRun?.jsonPath) {
        // The evaluated run's bundle is listed ahead of the setup bundles.
        inputRefs.unshift(evaluatedRun.jsonPath);
    }

    let summary;
    if (passed) {
        summary = `Eval result for ${spec.id} passed ${deterministicChecks.length} deterministic check(s).`;
    }
    else {
        summary = `Eval result for ${spec.id} failed ${failureCount} deterministic check(s).`;
    }

    const evalArtifact = evalArtifactSchema.parse({
        schemaVersion: state.version,
        artifactKind: "eval-result",
        lifecycleDomain: "evaluate",
        workflow: {
            name: state.workflow,
            displayName: "Eval Runner"
        },
        source: {
            sourceType: "workflow-run",
            runId: state.runId,
            inputRefs,
            issueRefs: ["#165"],
            githubRefs: []
        },
        status: passed ? "complete" : "draft",
        generatedAt: startedAt,
        repo: {
            root: state.repo.root,
            name: state.repo.name,
            branch: state.repo.branch
        },
        provenance: describeProvenance(),
        redaction: describeRedaction(),
        auditLink: {
            bundlePath: jsonPath,
            entryIds: [evalEntryId],
            findingIds: [],
            proposedActionIds: []
        },
        summary,
        payload: {
            specId: spec.id,
            specName: spec.name,
            workflow: spec.workflow,
            repoFixture: spec.repoFixture,
            workspacePath,
            evaluatedRunId: evaluatedRun?.runId,
            evaluatedBundlePath: evaluatedRun ? toBundleRef(evaluatedRun) : undefined,
            setupRuns,
            deterministicChecks,
            modelDependentChecks,
            passed,
            failureCount,
            warningCount: 0
        }
    });
    state.lifecycleArtifacts = [evalArtifact];

    const evalRunnerEntry = createAuditEntry({
        id: evalEntryId,
        nodeId: "eval-runner",
        nodeName: "eval-runner",
        kind: "deterministic",
        startedAt,
        completedAt: new Date().toISOString(),
        status: passed ? "success" : "failed",
        summary: evalArtifact.summary,
        toolsRequested: [],
        toolsExecuted: [],
        blockedActions: [],
        validationPassed: passed
    });
    const reportEntry = createAuditEntry({
        id: `${state.runId}-report`,
        nodeId: "report",
        nodeName: "final-report",
        kind: "report",
        startedAt,
        completedAt: new Date().toISOString(),
        status: "success",
        summary: "Generated eval result artifacts.",
        toolsRequested: [],
        toolsExecuted: [],
        blockedActions: [],
        validationPassed: true
    });
    state.auditTrail = [evalRunnerEntry, reportEntry];

    const bundle = buildAuditBundle(state, {
        startedAt,
        finishedAt: new Date().toISOString(),
        status: passed ? "success" : "partial",
        jsonPath,
        markdownPath,
        provenance: describeProvenance(),
        redaction: describeRedaction(),
        components: []
    });

    writeFileSync(jsonPath, JSON.stringify(bundle, null, 2), "utf8");
    writeFileSync(markdownPath, renderAuditBundleMarkdown(bundle), "utf8");
    return { bundle, jsonPath, markdownPath, outputDir };
}
|
|
1161
|
+
/**
 * Classify per-check status changes between a baseline and a candidate eval.
 *
 * Checks are matched by `name` and visited in sorted name order:
 * - same status on both sides counts as unchanged;
 * - passed -> failed is a regression, failed -> passed an improvement;
 * - a check present on only one side, any change involving "not_applicable",
 *   and any other status transition are reported as non-comparable findings.
 *
 * @param {Array<{ name: string, status: string, details?: string }>} baselineChecks
 * @param {Array<{ name: string, status: string, details?: string }>} candidateChecks
 * @returns {{ regressions: object[], improvements: object[], unchangedCount: number, nonComparableFindings: string[] }}
 */
function compareDeterministicChecks(baselineChecks, candidateChecks) {
    const indexByName = (checks) => new Map(checks.map((check) => [check.name, check]));
    const baselineByName = indexByName(baselineChecks);
    const candidateByName = indexByName(candidateChecks);
    const allNames = new Set([...baselineByName.keys(), ...candidateByName.keys()]);

    const outcome = {
        regressions: [],
        improvements: [],
        unchangedCount: 0,
        nonComparableFindings: []
    };

    // Builds one regression/improvement record; candidate details win when present.
    const describeChange = (name, classification, before, after) => ({
        name,
        classification,
        baselineStatus: before.status,
        candidateStatus: after.status,
        details: after.details ?? before.details
    });

    for (const name of [...allNames].sort()) {
        const before = baselineByName.get(name);
        const after = candidateByName.get(name);
        if (!before || !after) {
            outcome.nonComparableFindings.push(`Deterministic check \`${name}\` is missing from one of the eval results.`);
            continue;
        }

        const baselineStatus = before.status;
        const candidateStatus = after.status;
        if (baselineStatus === candidateStatus) {
            outcome.unchangedCount += 1;
        }
        else if (baselineStatus === "not_applicable" || candidateStatus === "not_applicable") {
            outcome.nonComparableFindings.push(`Deterministic check \`${name}\` changed between comparable and not_applicable states (${baselineStatus} -> ${candidateStatus}).`);
        }
        else if (baselineStatus === "passed" && candidateStatus === "failed") {
            outcome.regressions.push(describeChange(name, "regression", before, after));
        }
        else if (baselineStatus === "failed" && candidateStatus === "passed") {
            outcome.improvements.push(describeChange(name, "improvement", before, after));
        }
        else {
            outcome.nonComparableFindings.push(`Deterministic check \`${name}\` changed in an unsupported way (${baselineStatus} -> ${candidateStatus}).`);
        }
    }
    return outcome;
}
|
|
1208
|
+
/**
 * Compare a candidate eval artifact against a baseline eval artifact.
 *
 * When the two artifacts disagree on spec id or on workflow, the candidate is
 * reported as not comparable with a single explanatory finding and no check
 * comparison is attempted. Otherwise the deterministic checks are compared and
 * the candidate is comparable only if that comparison produced no
 * non-comparable findings.
 *
 * @param {string} baselineRunId - Baseline run id (not used in the result).
 * @param {string} baselineBundlePath - Baseline bundle path (not used in the result).
 * @param {{ payload: object }} baselineArtifact - Baseline eval-result artifact.
 * @param {string} candidateRunId - Candidate run id, reported as `runId`.
 * @param {string} candidateBundlePath - Candidate bundle path, reported as `bundlePath`.
 * @param {{ payload: object }} candidateArtifact - Candidate eval-result artifact.
 * @returns {object} Comparison record describing the candidate run.
 */
function compareEvalArtifacts(baselineRunId, baselineBundlePath, baselineArtifact, candidateRunId, candidateBundlePath, candidateArtifact) {
    const baseline = baselineArtifact.payload;
    const candidate = candidateArtifact.payload;

    // Single place that lays out the result record, so every exit path
    // produces the same fields in the same order.
    const describeCandidate = (comparable, regressions, improvements, unchangedCount, nonComparableFindings) => ({
        runId: candidateRunId,
        bundlePath: candidateBundlePath,
        specId: candidate.specId,
        workflow: candidate.workflow,
        comparable,
        passed: candidate.passed,
        failureCount: candidate.failureCount,
        deterministicCheckCount: candidate.deterministicChecks.length,
        regressions,
        improvements,
        unchangedCount,
        nonComparableFindings
    });
    const notComparable = (reason) => describeCandidate(false, [], [], 0, [reason]);

    if (baseline.specId !== candidate.specId) {
        return notComparable(`Spec mismatch: baseline ${baseline.specId} vs candidate ${candidate.specId}.`);
    }
    if (baseline.workflow !== candidate.workflow) {
        return notComparable(`Workflow mismatch: baseline ${baseline.workflow} vs candidate ${candidate.workflow}.`);
    }

    const comparison = compareDeterministicChecks(baseline.deterministicChecks, candidate.deterministicChecks);
    return describeCandidate(
        comparison.nonComparableFindings.length === 0,
        comparison.regressions,
        comparison.improvements,
        comparison.unchangedCount,
        comparison.nonComparableFindings
    );
}
|
|
1261
|
+
/**
 * Assemble the benchmark-summary artifact and audit bundle for an eval
 * comparison and write them to `<runsPath>/<runId>/bundle.json` and `summary.md`.
 *
 * Totals are summed across all compared runs. The summary sentence reports, in
 * priority order: regressions, then improvements, then non-comparable
 * differences, then a clean result. The audit entry is marked failed only when
 * regressions exist; the bundle is "partial" when there are regressions or
 * non-comparable findings.
 *
 * A single start timestamp is captured up front and reused for the audit entry
 * and the bundle, so the recorded start never falls after the recorded
 * completion times (this mirrors how the eval bundle records its timestamps).
 *
 * @param {string} root - Workspace root holding `.agentops` config and policy.
 * @param {string} baselineRunId - Run id of the baseline eval.
 * @param {string} baselineBundlePath - Bundle path of the baseline eval.
 * @param {{ payload: object }} baselineArtifact - Baseline eval-result artifact;
 *   `payload.specId` and `payload.workflow` are recorded.
 * @param {Array<object>} comparedRuns - Comparison records; each supplies
 *   `bundlePath`, `regressions`, `improvements`, `unchangedCount` and
 *   `nonComparableFindings`.
 * @returns {{ bundle: object, jsonPath: string, markdownPath: string, outputDir: string }}
 */
function createBenchmarkBundle(root, baselineRunId, baselineBundlePath, baselineArtifact, comparedRuns) {
    // Captured before any work so every later timestamp is at or after it.
    const startedAt = new Date().toISOString();
    const config = loadAgentForgeConfig(root);
    const policy = resolvePolicy(loadPolicyDocument(join(root, ".agentops", "policy.yaml")), process.env.CI ? "ci" : "local");
    const state = createWorkflowState({
        cwd: root,
        workflow: "eval:compare",
        mode: "inspect",
        policy
    });
    const runsRoot = join(root, config.runtime.runsPath);
    const outputDir = join(runsRoot, state.runId);
    ensureDirectory(outputDir);
    const jsonPath = join(outputDir, "bundle.json");
    const markdownPath = join(outputDir, "summary.md");

    const regressionCount = comparedRuns.reduce((total, candidate) => total + candidate.regressions.length, 0);
    const improvementCount = comparedRuns.reduce((total, candidate) => total + candidate.improvements.length, 0);
    const unchangedCount = comparedRuns.reduce((total, candidate) => total + candidate.unchangedCount, 0);
    const nonComparableCount = comparedRuns.reduce((total, candidate) => total + candidate.nonComparableFindings.length, 0);

    let summaryConclusion;
    if (regressionCount > 0) {
        summaryConclusion = `Detected ${regressionCount} deterministic regression(s) across compared eval results.`;
    }
    else if (improvementCount > 0) {
        summaryConclusion = `Detected ${improvementCount} deterministic improvement(s) with no regressions.`;
    }
    else if (nonComparableCount > 0) {
        summaryConclusion = `Compared eval results contain ${nonComparableCount} non-comparable difference(s) and no deterministic regressions.`;
    }
    else {
        summaryConclusion = "No deterministic regressions detected across compared eval results.";
    }

    const benchmarkArtifact = benchmarkArtifactSchema.parse({
        schemaVersion: state.version,
        artifactKind: "benchmark-summary",
        lifecycleDomain: "evaluate",
        workflow: {
            name: state.workflow,
            displayName: "Eval Benchmark Compare"
        },
        source: {
            sourceType: "workflow-run",
            runId: state.runId,
            inputRefs: [baselineBundlePath, ...comparedRuns.map((candidate) => candidate.bundlePath)],
            issueRefs: ["#166"],
            githubRefs: []
        },
        status: "complete",
        generatedAt: new Date().toISOString(),
        repo: {
            root: state.repo.root,
            name: state.repo.name,
            branch: state.repo.branch
        },
        provenance: {
            generatedBy: "agentforge-runtime",
            schemaVersion: state.version,
            executionEnvironment: state.context.ciExecution ? "ci" : "local",
            repoRoot: state.repo.root
        },
        redaction: {
            applied: true,
            strategyVersion: "1.0.0",
            categories: evalRedactionCategories()
        },
        auditLink: {
            bundlePath: jsonPath,
            entryIds: [`${state.runId}-benchmark-compare`],
            findingIds: [],
            proposedActionIds: []
        },
        summary: summaryConclusion,
        payload: {
            baselineRunId,
            baselineBundlePath,
            baselineSpecId: baselineArtifact.payload.specId,
            baselineWorkflow: baselineArtifact.payload.workflow,
            comparedRuns,
            regressionCount,
            improvementCount,
            unchangedCount,
            nonComparableCount,
            summaryConclusion
        }
    });
    state.lifecycleArtifacts = [benchmarkArtifact];
    state.auditTrail = [
        createAuditEntry({
            id: `${state.runId}-benchmark-compare`,
            nodeId: "benchmark-compare",
            nodeName: "benchmark-compare",
            kind: "deterministic",
            startedAt,
            completedAt: new Date().toISOString(),
            status: regressionCount > 0 ? "failed" : "success",
            summary: benchmarkArtifact.summary,
            toolsRequested: [],
            toolsExecuted: [],
            blockedActions: [],
            validationPassed: regressionCount === 0
        })
    ];
    const bundle = buildAuditBundle(state, {
        startedAt,
        finishedAt: new Date().toISOString(),
        status: regressionCount > 0 || nonComparableCount > 0 ? "partial" : "success",
        jsonPath,
        markdownPath,
        provenance: {
            generatedBy: "agentforge-runtime",
            schemaVersion: state.version,
            executionEnvironment: state.context.ciExecution ? "ci" : "local",
            repoRoot: state.repo.root
        },
        redaction: {
            applied: true,
            strategyVersion: "1.0.0",
            categories: evalRedactionCategories()
        },
        components: []
    });
    writeFileSync(jsonPath, JSON.stringify(bundle, null, 2), "utf8");
    writeFileSync(markdownPath, renderAuditBundleMarkdown(bundle), "utf8");
    return { bundle, jsonPath, markdownPath, outputDir };
}
|
|
924
1379
|
function ensureInitFiles(root) {
|
|
925
1380
|
const created = [];
|
|
926
1381
|
const configDir = join(root, ".agentops");
|
|
@@ -1121,6 +1576,261 @@ export async function runLocalWorkflow(workflowName, cwd = process.cwd()) {
|
|
|
1121
1576
|
artifactKinds: bundle.lifecycleArtifacts.map((artifact) => artifact.artifactKind)
|
|
1122
1577
|
};
|
|
1123
1578
|
}
|
|
1579
|
+
/**
 * Build one deterministic check record.
 *
 * @param {string} status - Outcome of the check (e.g. "passed", "failed").
 * @param {string} name - Identifier of the check.
 * @param {unknown} expected - Value the check expected.
 * @param {unknown} actual - Value that was observed.
 * @param {string} [details] - Optional explanation; the `details` key is
 *   omitted entirely when this is falsy.
 * @returns {{ name: string, status: string, expected: unknown, actual: unknown, details?: string }}
 */
function checkResult(status, name, expected, actual, details) {
    const record = { name, status, expected, actual };
    if (details) {
        record.details = details;
    }
    return record;
}
|
|
1588
|
+
/**
 * Evaluate a workflow bundle against an eval spec and produce check results.
 *
 * With no bundle, a single failed "workflow-execution" check is returned.
 * Otherwise checks are emitted, in order, for: run status, redaction flag,
 * each expected redaction category, policy execution mode, read-only policy
 * (only when the spec asks for it), each side-effect class (always recorded as
 * "not_applicable"), and each expected artifact (kind, required payload
 * fields, required summary terms). When the spec expects no artifacts, the
 * bundle must contain none.
 *
 * @param {object} spec - Eval spec with expected status, redaction, policy and artifact expectations.
 * @param {object|undefined} bundle - Audit bundle of the evaluated run, if one was produced.
 * @param {string|undefined} executionError - Failure description used when there is no bundle.
 * @returns {{ deterministicChecks: object[], modelDependentChecks: object[] }}
 */
function compareEvalSpec(spec, bundle, executionError) {
    const checks = [];
    const verdict = (ok) => (ok ? "passed" : "failed");
    const record = (...args) => {
        checks.push(checkResult(...args));
    };
    // Model-dependent scoring is never run here; it is reported as a fixed placeholder.
    const finish = () => ({
        deterministicChecks: checks,
        modelDependentChecks: [
            {
                name: "rubric-scoring",
                status: "not_executed",
                details: "Provider-dependent scoring is out of scope for the first local eval runner slice."
            }
        ]
    });

    if (!bundle) {
        record("failed", "workflow-execution", "successful workflow execution", executionError ?? "unknown failure", "The eval runner could not produce an evaluated workflow bundle.");
        return finish();
    }

    record(verdict(bundle.status === spec.expectedStatus), "run-status", spec.expectedStatus, bundle.status, "The evaluated workflow status should match the deterministic eval spec.");
    record(verdict(bundle.redaction.applied === spec.redactionExpectations.applied), "redaction-applied", String(spec.redactionExpectations.applied), String(bundle.redaction.applied));
    for (const category of spec.redactionExpectations.expectedCategories) {
        record(verdict(bundle.redaction.categories.includes(category)), `redaction-category:${category}`, category, bundle.redaction.categories.join(", "));
    }

    record(verdict(bundle.policy.defaults.executionMode === spec.policyExpectations.executionMode), "policy-execution-mode", spec.policyExpectations.executionMode, bundle.policy.defaults.executionMode);
    if (spec.policyExpectations.readOnly) {
        record(verdict(bundle.policy.defaults.writes !== "allow"), "policy-read-only", "writes not equal allow", bundle.policy.defaults.writes);
    }
    for (const sideEffectClass of spec.policyExpectations.sideEffectClasses) {
        record("not_applicable", `side-effect-class:${sideEffectClass}`, sideEffectClass, undefined, "The first eval runner records policy posture and workflow outputs but does not inspect adapter-level side-effect execution traces.");
    }

    for (const expectedArtifact of spec.artifactExpectations) {
        const kind = expectedArtifact.artifactKind;
        const actualArtifact = bundle.lifecycleArtifacts.find((artifact) => artifact.artifactKind === kind);
        record(verdict(Boolean(actualArtifact)), `artifact-kind:${kind}`, kind, actualArtifact?.artifactKind);

        // Payload and summary checks need an artifact with an object payload.
        const hasObjectPayload = Boolean(actualArtifact) && typeof actualArtifact.payload === "object" && actualArtifact.payload !== null;
        if (!hasObjectPayload) {
            continue;
        }

        const payload = actualArtifact.payload;
        for (const field of expectedArtifact.requiredPayloadFields) {
            record(verdict(field in payload), `payload-field:${kind}:${field}`, field, Object.keys(payload).join(", "));
        }
        for (const term of expectedArtifact.requiredSummaryTerms) {
            // Case-insensitive containment; the original summary is reported as the actual value.
            const loweredSummary = actualArtifact.summary.toLowerCase();
            record(verdict(loweredSummary.includes(term.toLowerCase())), `summary-term:${kind}:${term}`, term, actualArtifact.summary);
        }
    }

    if (spec.artifactExpectations.length === 0) {
        record(verdict(bundle.lifecycleArtifacts.length === 0), "artifact-count", "0", String(bundle.lifecycleArtifacts.length));
    }
    return finish();
}
|
|
1644
|
+
/**
 * Runs the workflow named by an eval spec, first running every prerequisite
 * workflow whose output the target workflow's request refers to.
 *
 * Request files are written as YAML under `<workspaceRoot>/.agentops/requests`.
 * Prerequisite ("setup") stages are fed from `schemaFixtures`; only the
 * evaluated stage uses `spec.request`. In both cases the reference fields
 * (bundle/summary refs of the preceding run) are spread last, so they override
 * any same-named field in the request.
 *
 * @param {{ workflow: string, request?: object }} spec - Eval spec; only
 *   `workflow` and `request` are read here.
 * @param {string} workspaceRoot - Workspace in which the workflows are run.
 * @returns {Promise<{ evaluatedRun: object, setupRuns: object[] }>} The result
 *   of the evaluated workflow plus one `toSetupRun` record per prerequisite
 *   run, in execution order.
 * @throws {Error} If `spec.workflow` is not one of the supported eval
 *   workflows, or if the QA setup run is missing when the release request is
 *   assembled.
 */
async function executeEvalWorkflow(spec, workspaceRoot) {
    const setupRuns = [];
    const requestsRoot = join(workspaceRoot, ".agentops", "requests");
    ensureDirectory(requestsRoot);

    const writeRequest = (fileName, request) => writeYamlFile(join(requestsRoot, fileName), request);

    // Runs a prerequisite stage and records it before the dependent request is written.
    const runSetup = async (workflow, stage) => {
        const run = await stage();
        setupRuns.push(toSetupRun(workflow, run));
        return run;
    };

    // Each stage takes the request to use, so the same closure serves both as a
    // setup step (fixture request) and as the evaluated step (spec.request).
    const runPlanning = async (request) => {
        writeRequest("planning.yaml", request);
        return runLocalWorkflow("planning-discovery", workspaceRoot);
    };
    const runDesign = async (request) => {
        const planningRun = await runSetup("planning-discovery", () => runPlanning(schemaFixtures.planningRequest));
        writeRequest("design.yaml", {
            ...request,
            planningBriefRef: toBundleRef(planningRun)
        });
        return runLocalWorkflow("architecture-design-review", workspaceRoot);
    };
    const runImplementation = async (request) => {
        const designRun = await runSetup("architecture-design-review", () => runDesign(schemaFixtures.designRequest));
        writeRequest("implementation.yaml", {
            ...request,
            designRecordRef: toBundleRef(designRun)
        });
        return runLocalWorkflow("implementation-proposal", workspaceRoot);
    };
    const runQa = async (request) => {
        const implementationRun = await runSetup("implementation-proposal", () => runImplementation(schemaFixtures.implementationRequest));
        writeRequest("qa.yaml", {
            ...request,
            targetRef: toBundleRef(implementationRun),
            evidenceSources: [toSummaryRef(implementationRun)]
        });
        return runLocalWorkflow("qa-review", workspaceRoot);
    };
    const runSecurity = async (request) => {
        const qaRun = await runSetup("qa-review", () => runQa(schemaFixtures.qaRequest));
        writeRequest("security.yaml", {
            ...request,
            targetRef: toBundleRef(qaRun),
            evidenceSources: [toSummaryRef(qaRun)]
        });
        return runLocalWorkflow("security-review", workspaceRoot);
    };
    const runRelease = async (request) => {
        const securityRun = await runSetup("security-review", () => runSecurity(schemaFixtures.securityRequest));
        // The QA run result is out of scope here; recover its recorded setup entry instead.
        const qaSetupRun = setupRuns.find((run) => run.workflow === "qa-review");
        if (!qaSetupRun) {
            throw new Error("QA setup run was not recorded before release eval execution.");
        }
        writeRequest("release.yaml", {
            ...request,
            qaReportRefs: [qaSetupRun.bundlePath],
            securityReportRefs: [toBundleRef(securityRun)],
            evidenceSources: [toSummaryRef(securityRun)]
        });
        return runLocalWorkflow("release-readiness", workspaceRoot);
    };
    const runMaintenance = async (request) => {
        const releaseRun = await runSetup("release-readiness", () => runRelease(schemaFixtures.releaseRequest));
        writeRequest("maintenance.yaml", {
            ...request,
            releaseReportRefs: [toBundleRef(releaseRun)]
        });
        return runLocalWorkflow("maintenance-triage", workspaceRoot);
    };

    let evaluatedRun;
    switch (spec.workflow) {
        case "pr-review":
            // pr-review takes no request file and has no prerequisites.
            evaluatedRun = await runLocalWorkflow("pr-review", workspaceRoot);
            break;
        case "planning-discovery":
            evaluatedRun = await runPlanning(spec.request);
            break;
        case "architecture-design-review":
            evaluatedRun = await runDesign(spec.request);
            break;
        case "implementation-proposal":
            evaluatedRun = await runImplementation(spec.request);
            break;
        case "qa-review":
            evaluatedRun = await runQa(spec.request);
            break;
        case "security-review":
            evaluatedRun = await runSecurity(spec.request);
            break;
        case "maintenance-triage":
            evaluatedRun = await runMaintenance(spec.request);
            break;
        default:
            // Fail loudly instead of resolving to undefined, which surfaced to the
            // caller as an opaque "Cannot read properties of undefined" TypeError.
            throw new Error(`Unsupported eval workflow: ${String(spec.workflow)}`);
    }
    return { evaluatedRun, setupRuns };
}
|
|
1760
|
+
/**
 * Executes one eval spec locally and writes an eval bundle describing the result.
 *
 * A failure while executing the evaluated workflow is not rethrown: its message
 * is captured and handed to `compareEvalSpec`, so the eval bundle is still
 * produced for a run that could not complete.
 *
 * @param {string} specId - Identifier passed to `getEvalSpec`.
 * @param {string} [cwd=process.cwd()] - Directory from which the workspace root is located.
 * @returns {Promise<object>} Summary of the eval run: ids, output paths, status,
 *   setup-run count, deterministic check counts and lifecycle artifact kinds.
 */
export async function runLocalEval(specId, cwd = process.cwd()) {
    const repoRoot = findWorkspaceRoot(cwd);
    ensureInitFiles(repoRoot);
    const evalSpec = getEvalSpec(specId);

    // Control state for the eval itself (distinct from the evaluated workflow's own run).
    const policyDocument = loadPolicyDocument(join(repoRoot, ".agentops", "policy.yaml"));
    const policyEnvironment = process.env.CI ? "ci" : "local";
    const evalPolicy = resolvePolicy(policyDocument, policyEnvironment);
    const evalState = createWorkflowState({
        cwd: repoRoot,
        workflow: `eval:${evalSpec.id}`,
        mode: evalPolicy.defaults.executionMode,
        policy: evalPolicy
    });

    // The monorepo fixture is evaluated in place; any other fixture gets a blank workspace.
    let workspaceRoot = repoRoot;
    if (evalSpec.repoFixture !== "agentforge-monorepo") {
        workspaceRoot = createBlankEvalWorkspace(repoRoot, evalState.runId, evalSpec.id);
    }

    let evaluatedRun;
    let setupRuns = [];
    let executionError;
    try {
        ({ evaluatedRun, setupRuns } = await executeEvalWorkflow(evalSpec, workspaceRoot));
    }
    catch (failure) {
        executionError = failure instanceof Error ? failure.message : String(failure);
    }

    // Load the evaluated run's bundle only when the run exists and its JSON file was written.
    let evaluatedBundle;
    if (evaluatedRun && existsSync(evaluatedRun.jsonPath)) {
        const rawBundle = readFileSync(evaluatedRun.jsonPath, "utf8");
        evaluatedBundle = auditBundleSchema.parse(JSON.parse(rawBundle));
    }

    const comparison = compareEvalSpec(evalSpec, evaluatedBundle, executionError);
    const { deterministicChecks, modelDependentChecks } = comparison;
    const evalOutput = createEvalBundle(repoRoot, evalSpec, evaluatedRun, workspaceRoot, setupRuns, deterministicChecks, modelDependentChecks);
    const { bundle, jsonPath, markdownPath, outputDir } = evalOutput;

    const failedCheckCount = deterministicChecks.reduce((count, check) => (check.status === "failed" ? count + 1 : count), 0);
    const artifactKinds = bundle.lifecycleArtifacts.map(({ artifactKind }) => artifactKind);

    return {
        runId: bundle.runId,
        specId: evalSpec.id,
        workflow: evalSpec.workflow,
        outputDir,
        jsonPath,
        markdownPath,
        status: bundle.status,
        evaluatedRunId: evaluatedRun?.runId,
        evaluatedBundlePath: evaluatedRun ? toBundleRef(evaluatedRun) : undefined,
        setupRunCount: setupRuns.length,
        deterministicCheckCount: deterministicChecks.length,
        deterministicFailures: failedCheckCount,
        artifactKinds
    };
}
|
|
1804
|
+
/**
 * Compares one or more candidate eval runs against a baseline eval run and
 * writes a benchmark bundle with the results.
 *
 * @param {string} baselineRunRef - Reference to the baseline run, resolved by `readRunBundleByRef`.
 * @param {string[]} candidateRunRefs - References to the runs compared against the baseline.
 * @param {string} [cwd=process.cwd()] - Directory from which the workspace root is located.
 * @returns {object} Summary of the benchmark: run id, output paths, status,
 *   baseline/compared run ids, aggregate comparison counts and artifact kinds.
 * @throws {Error} If `candidateRunRefs` is empty.
 */
export function compareLocalEvalRuns(baselineRunRef, candidateRunRefs, cwd = process.cwd()) {
    if (candidateRunRefs.length === 0) {
        throw new Error("Provide at least one candidate eval run to compare against the baseline.");
    }
    const repoRoot = findWorkspaceRoot(cwd);
    ensureInitFiles(repoRoot);

    const baseline = readRunBundleByRef(repoRoot, baselineRunRef);
    const baselineArtifact = extractEvalArtifact(baseline.bundle, baselineRunRef);

    const compareAgainstBaseline = (runRef) => {
        const candidate = readRunBundleByRef(repoRoot, runRef);
        const artifact = extractEvalArtifact(candidate.bundle, runRef);
        return compareEvalArtifacts(baseline.runId, baseline.bundlePath, baselineArtifact, candidate.runId, candidate.bundlePath, artifact);
    };
    const comparedRuns = candidateRunRefs.map(compareAgainstBaseline);

    const benchmark = createBenchmarkBundle(repoRoot, baseline.runId, baseline.bundlePath, baselineArtifact, comparedRuns);
    const { bundle, jsonPath, markdownPath, outputDir } = benchmark;

    // Tally every aggregate in a single pass over the comparison results.
    const comparedRunIds = [];
    let comparableRunCount = 0;
    let regressionCount = 0;
    let improvementCount = 0;
    let unchangedCount = 0;
    let nonComparableCount = 0;
    for (const compared of comparedRuns) {
        comparedRunIds.push(compared.runId);
        if (compared.comparable) {
            comparableRunCount += 1;
        }
        regressionCount += compared.regressions.length;
        improvementCount += compared.improvements.length;
        unchangedCount += compared.unchangedCount;
        nonComparableCount += compared.nonComparableFindings.length;
    }

    return {
        runId: bundle.runId,
        outputDir,
        jsonPath,
        markdownPath,
        status: bundle.status,
        baselineRunId: baseline.runId,
        comparedRunIds,
        comparableRunCount,
        regressionCount,
        improvementCount,
        unchangedCount,
        nonComparableCount,
        artifactKinds: bundle.lifecycleArtifacts.map(({ artifactKind }) => artifactKind)
    };
}
|
|
1124
1834
|
export function explainLastRun(cwd = process.cwd()) {
|
|
1125
1835
|
const root = findWorkspaceRoot(cwd);
|
|
1126
1836
|
const config = loadAgentForgeConfig(root);
|