@h9-foundry/agentforge-cli 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,11 +1,12 @@
1
1
  import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, writeFileSync } from "node:fs";
2
2
  import { join } from "node:path";
3
+ import { execFileSync } from "node:child_process";
3
4
  import yaml from "js-yaml";
4
- import { renderAuditBundleMarkdown } from "@h9-foundry/agentforge-audit";
5
+ import { buildAuditBundle, createAuditEntry, renderAuditBundleMarkdown } from "@h9-foundry/agentforge-audit";
5
6
  import { createWorkflowState, findWorkspaceRoot } from "@h9-foundry/agentforge-context-engine";
6
7
  import { createPolicyEngine, loadPolicyDocument, resolvePolicy } from "@h9-foundry/agentforge-policy-engine";
7
8
  import { runWorkflow } from "@h9-foundry/agentforge-runtime";
8
- import { agentforgeConfigSchema, auditBundleSchema, designArtifactSchema, designRequestSchema, implementationRequestSchema, planningArtifactSchema, planningRequestSchema, qaRequestSchema, securityRequestSchema, workflowDefinitionSchema } from "@h9-foundry/agentforge-schemas";
9
+ import { agentforgeConfigSchema, auditBundleSchema, benchmarkArtifactSchema, designArtifactSchema, designRequestSchema, evalArtifactSchema, evalFixtureCorpusSchema, implementationRequestSchema, incidentRequestSchema, maintenanceRequestSchema, planningArtifactSchema, planningRequestSchema, qaRequestSchema, releaseRequestSchema, schemaFixtures, securityRequestSchema, workflowDefinitionSchema } from "@h9-foundry/agentforge-schemas";
9
10
  import { createBuiltinAdapters } from "./internal/builtin-adapters.js";
10
11
  import { createBuiltinAgentRegistry } from "./internal/builtin-agents.js";
11
12
  import { LocalPluginRegistry } from "./internal/local-plugin-registry.js";
@@ -231,6 +232,84 @@ nodes:
231
232
  kind: report
232
233
  outputs_to: reports.final
233
234
  `;
235
// Template-literal text of the `release-readiness` workflow definition: version 1,
// manual trigger, catalog domain `release`, and four nodes (intake, evidence, release, report).
// NOTE(review): the nesting indentation of this YAML is not visible in this view — confirm against the shipped file.
+ const releaseWorkflowTemplate = `version: 1
236
+ name: release-readiness
237
+ description: Validate a bounded release-readiness request while keeping trusted publish automation separate.
238
+ trigger: manual
239
+ catalog:
240
+ domain: release
241
+ supportLevel: partial
242
+ maturity: mvp
243
+ trustScope: official-core-only
244
+ nodes:
245
+ - id: intake
246
+ kind: deterministic
247
+ agent: release-intake
248
+ outputs_to: agentResults.intake
249
+ - id: evidence
250
+ kind: deterministic
251
+ agent: release-evidence-normalizer
252
+ outputs_to: agentResults.evidence
253
+ - id: release
254
+ kind: reasoning
255
+ agent: release-analyst
256
+ outputs_to: agentResults.release
257
+ - id: report
258
+ kind: report
259
+ outputs_to: reports.final
260
+ `;
261
// Template-literal text of the `incident-handoff` workflow definition: version 1,
// manual trigger, catalog domain `operate`, and four nodes (intake, evidence, incident, report).
// NOTE(review): the nesting indentation of this YAML is not visible in this view — confirm against the shipped file.
+ const incidentWorkflowTemplate = `version: 1
262
+ name: incident-handoff
263
+ description: Validate staged incident evidence while keeping the default path local, read-only, and explicit.
264
+ trigger: manual
265
+ catalog:
266
+ domain: operate
267
+ supportLevel: partial
268
+ maturity: mvp
269
+ trustScope: official-core-only
270
+ nodes:
271
+ - id: intake
272
+ kind: deterministic
273
+ agent: incident-intake
274
+ outputs_to: agentResults.intake
275
+ - id: evidence
276
+ kind: deterministic
277
+ agent: incident-evidence-normalizer
278
+ outputs_to: agentResults.evidence
279
+ - id: incident
280
+ kind: reasoning
281
+ agent: incident-analyst
282
+ outputs_to: agentResults.incident
283
+ - id: report
284
+ kind: report
285
+ outputs_to: reports.final
286
+ `;
287
// Template-literal text of the `maintenance-triage` workflow definition: version 1,
// manual trigger, catalog domain `maintain`, and four nodes (intake, evidence, maintenance, report).
// NOTE(review): the nesting indentation of this YAML is not visible in this view — confirm against the shipped file.
+ const maintenanceWorkflowTemplate = `version: 1
288
+ name: maintenance-triage
289
+ description: Validate a bounded maintenance request while keeping the default path local, read-only, and routing-oriented.
290
+ trigger: manual
291
+ catalog:
292
+ domain: maintain
293
+ supportLevel: partial
294
+ maturity: mvp
295
+ trustScope: official-core-only
296
+ nodes:
297
+ - id: intake
298
+ kind: deterministic
299
+ agent: maintenance-intake
300
+ outputs_to: agentResults.intake
301
+ - id: evidence
302
+ kind: deterministic
303
+ agent: maintenance-evidence-normalizer
304
+ outputs_to: agentResults.evidence
305
+ - id: maintenance
306
+ kind: reasoning
307
+ agent: maintenance-analyst
308
+ outputs_to: agentResults.maintenance
309
+ - id: report
310
+ kind: report
311
+ outputs_to: reports.final
312
+ `;
234
313
  function loadYaml(filePath) {
235
314
  return yaml.load(readFileSync(filePath, "utf8"));
236
315
  }
@@ -240,6 +319,136 @@ function isRecord(value) {
240
319
  function asArray(value) {
241
320
  return Array.isArray(value) ? value : [];
242
321
  }
322
/**
 * Run `git` with the given arguments inside `root` and return its trimmed stdout.
 *
 * This is a best-effort probe: stdin and stderr are ignored, and any failure
 * (git missing, non-zero exit, unusable cwd) results in an empty string
 * rather than an exception.
 *
 * @param {string} root - Working directory for the git process.
 * @param {string[]} args - Arguments passed to git.
 * @returns {string} Trimmed stdout, or "" when the command could not be run successfully.
 */
function runGit(root, args) {
    const spawnOptions = {
        cwd: root,
        encoding: "utf8",
        stdio: ["ignore", "pipe", "ignore"]
    };
    try {
        const stdout = execFileSync("git", args, spawnOptions);
        return stdout.trim();
    }
    catch {
        return "";
    }
}
330
/**
 * Parse a repository locator into `{ host, owner, repo }`.
 *
 * Accepted forms:
 *  - scp-style SSH:      `git@host:owner/repo(.git)`
 *  - npm shorthand:      `github:owner/repo`
 *  - URL style:          `https://`, `http://`, `git://`, `ssh://`, each optionally
 *                        prefixed with `git+` (the form npm writes into package.json)
 *
 * Any `user@` / `user:token@` userinfo in URL-style input is dropped so that
 * credentials never end up in the returned `host`. The host is lower-cased;
 * owner and repo keep their original casing. A trailing `.git` and a single
 * trailing slash are stripped.
 *
 * @param {string} value - Repository locator to parse.
 * @returns {{host: string, owner: string, repo: string} | undefined}
 *   Parsed context, or `undefined` when the value matches none of the supported forms.
 */
function parseGitHubRepositoryUrl(value) {
    const trimmed = value.trim();
    const toContext = (host, owner, repo) => ({
        host: host.toLowerCase(),
        owner,
        repo
    });
    const scpMatch = trimmed.match(/^git@([^:]+):([^/]+)\/([^/]+?)(?:\.git)?$/i);
    if (scpMatch) {
        return toContext(scpMatch[1], scpMatch[2], scpMatch[3]);
    }
    // npm's package.json shorthand for GitHub-hosted repositories.
    const shorthandMatch = trimmed.match(/^github:([^/\s]+)\/([^/\s#]+?)(?:\.git)?$/i);
    if (shorthandMatch) {
        return toContext("github.com", shorthandMatch[1], shorthandMatch[2]);
    }
    // `git+` prefix and ssh/git schemes are common in package.json `repository.url`;
    // the optional `(?:[^/@]+@)?` group discards userinfo before the host.
    const urlMatch = trimmed.match(/^(?:git\+)?(?:https?|git|ssh):\/\/(?:[^/@]+@)?([^/]+)\/([^/]+)\/([^/]+?)(?:\.git)?\/?$/i);
    if (!urlMatch) {
        return undefined;
    }
    return toContext(urlMatch[1], urlMatch[2], urlMatch[3]);
}
350
/**
 * Best-effort inference of the repository context (`host`, `owner`, `repo`) for a workspace.
 *
 * Lookup order:
 *  1. the `repository` field of `<root>/package.json` (string form or `{ url }` form),
 *  2. the `remote.origin.url` git config value.
 *
 * A missing, unreadable, or malformed `package.json` is not fatal: inference
 * simply continues with the git remote, because this helper only enriches
 * references and should not abort the surrounding workflow.
 *
 * @param {string} root - Workspace root directory.
 * @returns {{host: string, owner: string, repo: string} | undefined}
 *   The parsed repository context, or `undefined` when nothing usable was found.
 */
function inferGitHubRepoContext(root) {
    const packageJsonPath = join(root, "package.json");
    if (existsSync(packageJsonPath)) {
        let manifest;
        try {
            manifest = JSON.parse(readFileSync(packageJsonPath, "utf8"));
        }
        catch {
            // Deliberate fallback: a broken manifest must not prevent the git-remote lookup below.
            manifest = undefined;
        }
        if (manifest !== undefined && isRecord(manifest)) {
            const repository = manifest.repository;
            let repositoryUrl;
            if (typeof repository === "string") {
                repositoryUrl = repository;
            }
            else if (isRecord(repository) && typeof repository.url === "string") {
                repositoryUrl = repository.url;
            }
            const context = repositoryUrl === undefined ? undefined : parseGitHubRepositoryUrl(repositoryUrl);
            if (context) {
                return context;
            }
        }
    }
    const remoteUrl = runGit(root, ["config", "--get", "remote.origin.url"]);
    return remoteUrl ? parseGitHubRepositoryUrl(remoteUrl) : undefined;
}
373
/**
 * Normalize one free-form issue / pull-request reference into a structured GitHub reference.
 *
 * Recognized inputs (after trimming):
 *  - full URL:        `https://host/owner/repo/issues/N` or `.../pull/N`
 *                     (an optional trailing slash, `?query`, or `#fragment` is tolerated,
 *                     so links copied from a comment anchor still resolve)
 *  - qualified issue: `owner/repo#N`
 *  - qualified pull:  `owner/repo/pull/N`
 *  - short issue:     `#N`        (requires `repoContext`)
 *  - short pull:      `PR #N`     (requires `repoContext`)
 *
 * Qualified forms take their host from `repoContext` when available, else `github.com`.
 * The owner and repo segments may not contain `/`, so inputs such as `a/b/c#1`
 * are rejected rather than being read as repo `b/c`.
 *
 * @param {string} rawValue - Reference text as written by the user.
 * @param {{host: string, owner: string, repo: string} | undefined} repoContext
 *   Repository used to resolve short references.
 * @returns {object | undefined} Object with `platform`, `host`, `owner`, `repo`, `kind`
 *   (`"issue"` or `"pull_request"`), `number`, `canonical`, `url`, and `source`
 *   (the trimmed input); `undefined` when the input is empty or unrecognized.
 */
function normalizeGitHubReference(rawValue, repoContext) {
    const raw = rawValue.trim();
    if (!raw) {
        return undefined;
    }
    const fromParts = (context, kind, number) => ({
        platform: "github",
        host: context.host,
        owner: context.owner,
        repo: context.repo,
        kind,
        number,
        canonical: kind === "issue"
            ? `${context.owner}/${context.repo}#${number}`
            : `${context.owner}/${context.repo}/pull/${number}`,
        url: kind === "issue"
            ? `https://${context.host}/${context.owner}/${context.repo}/issues/${number}`
            : `https://${context.host}/${context.owner}/${context.repo}/pull/${number}`,
        source: raw
    });
    const toNumber = (digits) => Number.parseInt(digits, 10);
    const fallbackHost = repoContext?.host ?? "github.com";
    const urlMatch = raw.match(/^https?:\/\/([^/]+)\/([^/]+)\/([^/]+)\/(issues|pull)\/(\d+)\/?(?:[?#].*)?$/i);
    if (urlMatch) {
        const kind = urlMatch[4].toLowerCase() === "pull" ? "pull_request" : "issue";
        return fromParts({ host: urlMatch[1].toLowerCase(), owner: urlMatch[2], repo: urlMatch[3] }, kind, toNumber(urlMatch[5]));
    }
    // The repo segment excludes "/" so that only exactly two path segments qualify.
    const repoIssueMatch = raw.match(/^([^/\s]+)\/([^/#\s]+)#(\d+)$/);
    if (repoIssueMatch) {
        return fromParts({ host: fallbackHost, owner: repoIssueMatch[1], repo: repoIssueMatch[2] }, "issue", toNumber(repoIssueMatch[3]));
    }
    const repoPullMatch = raw.match(/^([^/\s]+)\/([^/\s]+)\/pull\/(\d+)$/i);
    if (repoPullMatch) {
        return fromParts({ host: fallbackHost, owner: repoPullMatch[1], repo: repoPullMatch[2] }, "pull_request", toNumber(repoPullMatch[3]));
    }
    const shortIssueMatch = raw.match(/^#(\d+)$/);
    if (shortIssueMatch && repoContext) {
        return fromParts(repoContext, "issue", toNumber(shortIssueMatch[1]));
    }
    const shortPullMatch = raw.match(/^(?:PR|pr)\s*#(\d+)$/);
    if (shortPullMatch && repoContext) {
        return fromParts(repoContext, "pull_request", toNumber(shortPullMatch[1]));
    }
    return undefined;
}
415
/**
 * Normalize a list of raw references and drop duplicates.
 *
 * Each value is passed through `normalizeGitHubReference`; values that do not
 * normalize are skipped. When several values share the same `canonical`
 * string, only the first one is kept. Result order follows input order.
 *
 * @param {Iterable<string>} rawValues - Raw reference strings.
 * @param {object | undefined} repoContext - Repository context forwarded to the normalizer.
 * @returns {object[]} De-duplicated normalized references.
 */
function normalizeGitHubReferences(rawValues, repoContext) {
    const firstByCanonical = new Map();
    for (const candidate of rawValues) {
        const reference = normalizeGitHubReference(candidate, repoContext);
        if (reference && !firstByCanonical.has(reference.canonical)) {
            firstByCanonical.set(reference.canonical, reference);
        }
    }
    return [...firstByCanonical.values()];
}
428
/**
 * Translate a local workflow run status into a GitHub handoff status.
 *
 * `"success"` maps to `"completed"`, `"partial"` maps to `"blocked"`, and every
 * other value maps to `"failed"`.
 *
 * @param {string} workflow - Workflow name, echoed back in the result.
 * @param {string} localRunStatus - Status of the local run, echoed back in the result.
 * @returns {{workflow: string, localRunStatus: string, githubStatus: string, reason: string}}
 *   The mapping result together with a human-readable reason.
 */
export function mapWorkflowRunStatusToGitHubStatus(workflow, localRunStatus) {
    let githubStatus;
    let reason;
    switch (localRunStatus) {
        case "success":
            githubStatus = "completed";
            reason = "Successful local workflow runs map to completed GitHub handoff status.";
            break;
        case "partial":
            githubStatus = "blocked";
            reason = "Partial local workflow runs map to blocked GitHub handoff status until follow-up work resolves them.";
            break;
        default:
            githubStatus = "failed";
            reason = "Failed local workflow runs map to failed GitHub handoff status.";
            break;
    }
    return { workflow, localRunStatus, githubStatus, reason };
}
243
452
  function ensureReadablePath(policyEngine, pathValue, purpose) {
244
453
  const decision = policyEngine.canReadPath(pathValue);
245
454
  if (!decision.allowed) {
@@ -263,9 +472,31 @@ function validatePlanningRequestCompleteness(request) {
263
472
  }
264
473
  return request;
265
474
  }
475
/**
 * Ensure an incident request carries at least one piece of supporting evidence.
 *
 * @param {{evidenceSources: unknown[], releaseReportRefs: unknown[]}} request - Parsed incident request.
 * @returns {object} The same request object, unchanged.
 * @throws {Error} When both `evidenceSources` and `releaseReportRefs` are empty.
 */
function validateIncidentRequestCompleteness(request) {
    const { evidenceSources, releaseReportRefs } = request;
    const signalTotal = evidenceSources.length + releaseReportRefs.length;
    if (signalTotal !== 0) {
        return request;
    }
    throw new Error("Incident request is underspecified. Add at least one of evidenceSources or releaseReportRefs.");
}
482
/**
 * Ensure a maintenance request references at least one supporting signal.
 *
 * The signals counted are `dependencyAlertRefs`, `docsTaskRefs`,
 * `releaseReportRefs`, and `issueRefs`.
 *
 * @param {object} request - Parsed maintenance request containing the four reference lists.
 * @returns {object} The same request object, unchanged.
 * @throws {Error} When all four lists are empty.
 */
function validateMaintenanceRequestCompleteness(request) {
    const signalLists = [
        request.dependencyAlertRefs,
        request.docsTaskRefs,
        request.releaseReportRefs,
        request.issueRefs
    ];
    const signalTotal = signalLists.reduce((count, list) => count + list.length, 0);
    if (signalTotal !== 0) {
        return request;
    }
    throw new Error("Maintenance request is underspecified. Add at least one of dependencyAlertRefs, docsTaskRefs, releaseReportRefs, or issueRefs.");
}
266
492
  function validateWorkflowLifecyclePosture(workflow, policyEngine) {
267
493
  const domain = workflow.catalog?.domain;
268
- if (domain !== "plan" && domain !== "design" && domain !== "build" && domain !== "security") {
494
+ if (domain !== "plan" &&
495
+ domain !== "design" &&
496
+ domain !== "build" &&
497
+ domain !== "security" &&
498
+ domain !== "release" &&
499
+ domain !== "operate") {
269
500
  return;
270
501
  }
271
502
  if (policyEngine.snapshot.defaults.network !== "deny") {
@@ -299,6 +530,19 @@ function loadDesignBundleArtifact(root, designRecordRef) {
299
530
  }
300
531
  return designArtifactSchema.parse(designArtifact);
301
532
  }
533
/**
 * Assert that a referenced bundle includes a lifecycle artifact of the given kind.
 *
 * @param {string} root - Workspace root used to resolve `bundleRef`.
 * @param {string} bundleRef - Bundle path relative to `root`.
 * @param {string} artifactKind - Artifact kind that must be present.
 * @param {string} purpose - Human-readable label for the reference, used in the error message.
 * @returns {void}
 * @throws {Error} When the bundle's artifact kinds do not include `artifactKind`
 *   (errors raised while loading the bundle propagate unchanged).
 */
function ensureBundleContainsArtifactKind(root, bundleRef, artifactKind, purpose) {
    const kindsInBundle = loadLifecycleArtifactKinds(root, bundleRef);
    const isPresent = kindsInBundle.includes(artifactKind);
    if (isPresent) {
        return;
    }
    throw new Error(`Referenced ${purpose} does not contain a ${artifactKind} artifact: ${bundleRef}`);
}
539
/**
 * Ensure a release request carries at least one evidence signal.
 *
 * The signals counted are `qaReportRefs`, `securityReportRefs`, and `evidenceSources`.
 *
 * @param {object} request - Parsed release request containing the three lists.
 * @returns {object} The same request object, unchanged.
 * @throws {Error} When all three lists are empty.
 */
function validateReleaseRequestCompleteness(request) {
    const { qaReportRefs, securityReportRefs, evidenceSources } = request;
    const signalTotal = qaReportRefs.length + securityReportRefs.length + evidenceSources.length;
    if (signalTotal !== 0) {
        return request;
    }
    throw new Error("Release request is underspecified. Add at least one of qaReportRefs, securityReportRefs, or evidenceSources.");
}
302
546
  function loadLifecycleArtifactKinds(root, bundleRef) {
303
547
  const bundlePath = join(root, bundleRef);
304
548
  if (!existsSync(bundlePath)) {
@@ -307,6 +551,31 @@ function loadLifecycleArtifactKinds(root, bundleRef) {
307
551
  const bundle = auditBundleSchema.parse(JSON.parse(readFileSync(bundlePath, "utf8")));
308
552
  return bundle.lifecycleArtifacts.map((artifact) => artifact.artifactKind);
309
553
  }
554
/**
 * Collect the issue references and GitHub references recorded on every
 * lifecycle artifact of a bundle.
 *
 * The bundle file is read from `join(root, bundleRef)` and validated with
 * `auditBundleSchema`. For each artifact, the raw `source.issueRefs` strings
 * are gathered (de-duplicated), and GitHub references are gathered from both
 * `source.githubRefs` (when present) and from normalizing `source.issueRefs`
 * against the inferred repository context. GitHub references are keyed by
 * their `canonical` string; a later entry with the same key replaces an earlier one.
 *
 * @param {string} root - Workspace root directory.
 * @param {string} bundleRef - Bundle path relative to `root`.
 * @returns {{issueRefs: string[], githubRefs: object[]}} Unique references found in the bundle.
 * @throws {Error} When the bundle file does not exist (parse/validation errors propagate).
 */
function loadLifecycleArtifactSourceReferences(root, bundleRef) {
    const resolvedBundlePath = join(root, bundleRef);
    if (!existsSync(resolvedBundlePath)) {
        throw new Error(`Referenced bundle not found: ${bundleRef}`);
    }
    const rawBundle = JSON.parse(readFileSync(resolvedBundlePath, "utf8"));
    const validatedBundle = auditBundleSchema.parse(rawBundle);
    const repoContext = inferGitHubRepoContext(root);
    const uniqueIssueRefs = new Set();
    const refsByCanonical = new Map();
    const rememberGithubRef = (reference) => {
        refsByCanonical.set(reference.canonical, reference);
    };
    for (const { source } of validatedBundle.lifecycleArtifacts) {
        for (const rawIssueRef of source.issueRefs) {
            uniqueIssueRefs.add(rawIssueRef);
        }
        // Explicit refs first, then refs derived from the raw issue strings (same order as before).
        for (const explicitRef of source.githubRefs ?? []) {
            rememberGithubRef(explicitRef);
        }
        for (const derivedRef of normalizeGitHubReferences(source.issueRefs, repoContext)) {
            rememberGithubRef(derivedRef);
        }
    }
    const issueRefs = Array.from(uniqueIssueRefs);
    const githubRefs = Array.from(refsByCanonical.values());
    return { issueRefs, githubRefs };
}
310
579
  function prepareWorkflowInputs(workflow, root, policyEngine) {
311
580
  const requestsDir = join(root, ".agentops", "requests");
312
581
  ensureDirectory(requestsDir);
@@ -314,8 +583,10 @@ function prepareWorkflowInputs(workflow, root, policyEngine) {
314
583
  const requestPath = ".agentops/requests/planning.yaml";
315
584
  ensureReadablePath(policyEngine, requestPath, "planning request");
316
585
  const planningRequest = validatePlanningRequestCompleteness(readYamlFile(join(root, requestPath), planningRequestSchema, "planning request"));
586
+ const planningGithubRefs = normalizeGitHubReferences(planningRequest.issueRefs, inferGitHubRepoContext(root));
317
587
  return {
318
588
  planningRequest,
589
+ planningGithubRefs,
319
590
  requestFile: requestPath
320
591
  };
321
592
  }
@@ -348,11 +619,19 @@ function prepareWorkflowInputs(workflow, root, policyEngine) {
348
619
  ensureReadablePath(policyEngine, requestPath, "QA request");
349
620
  const qaRequest = readYamlFile(join(root, requestPath), qaRequestSchema, "QA request");
350
621
  ensureReadablePath(policyEngine, qaRequest.targetRef, "QA target reference");
622
+ if (!existsSync(join(root, qaRequest.targetRef))) {
623
+ throw new Error(`QA target reference not found: ${qaRequest.targetRef}`);
624
+ }
351
625
  for (const evidenceSource of qaRequest.evidenceSources) {
352
626
  ensureReadablePath(policyEngine, evidenceSource, "QA evidence source");
353
627
  }
628
+ const referencedSourceRefs = qaRequest.targetRef.endsWith("bundle.json")
629
+ ? loadLifecycleArtifactSourceReferences(root, qaRequest.targetRef)
630
+ : { issueRefs: [], githubRefs: [] };
354
631
  return {
355
632
  qaRequest: qaRequest,
633
+ qaIssueRefs: referencedSourceRefs.issueRefs,
634
+ qaGithubRefs: referencedSourceRefs.githubRefs,
356
635
  requestFile: requestPath
357
636
  };
358
637
  }
@@ -374,9 +653,129 @@ function prepareWorkflowInputs(workflow, root, policyEngine) {
374
653
  if (securityRequest.targetRef.endsWith("bundle.json") && !referencedArtifactKinds.some((kind) => allowedSecurityTargets.has(kind))) {
375
654
  throw new Error(`Referenced security bundle does not contain a supported lifecycle artifact: ${securityRequest.targetRef}`);
376
655
  }
656
+ const referencedSourceRefs = securityRequest.targetRef.endsWith("bundle.json")
657
+ ? loadLifecycleArtifactSourceReferences(root, securityRequest.targetRef)
658
+ : { issueRefs: [], githubRefs: [] };
377
659
  return {
378
660
  securityRequest: securityRequest,
379
661
  securityTargetArtifactKinds: referencedArtifactKinds,
662
+ securityIssueRefs: referencedSourceRefs.issueRefs,
663
+ securityGithubRefs: referencedSourceRefs.githubRefs,
664
+ requestFile: requestPath
665
+ };
666
+ }
667
+ if (workflow.name === "release-readiness") {
668
+ const requestPath = ".agentops/requests/release.yaml";
669
+ ensureReadablePath(policyEngine, requestPath, "release request");
670
+ const releaseRequest = validateReleaseRequestCompleteness(readYamlFile(join(root, requestPath), releaseRequestSchema, "release request"));
671
+ const releaseIssueRefs = new Set();
672
+ const releaseGithubRefMap = new Map();
673
+ for (const qaReportRef of releaseRequest.qaReportRefs) {
674
+ ensureReadablePath(policyEngine, qaReportRef, "QA report reference");
675
+ ensureBundleContainsArtifactKind(root, qaReportRef, "qa-report", "QA report reference");
676
+ const refs = loadLifecycleArtifactSourceReferences(root, qaReportRef);
677
+ for (const issueRef of refs.issueRefs) {
678
+ releaseIssueRefs.add(issueRef);
679
+ }
680
+ for (const githubRef of refs.githubRefs) {
681
+ releaseGithubRefMap.set(githubRef.canonical, githubRef);
682
+ }
683
+ }
684
+ for (const securityReportRef of releaseRequest.securityReportRefs) {
685
+ ensureReadablePath(policyEngine, securityReportRef, "security report reference");
686
+ ensureBundleContainsArtifactKind(root, securityReportRef, "security-report", "security report reference");
687
+ const refs = loadLifecycleArtifactSourceReferences(root, securityReportRef);
688
+ for (const issueRef of refs.issueRefs) {
689
+ releaseIssueRefs.add(issueRef);
690
+ }
691
+ for (const githubRef of refs.githubRefs) {
692
+ releaseGithubRefMap.set(githubRef.canonical, githubRef);
693
+ }
694
+ }
695
+ for (const evidenceSource of releaseRequest.evidenceSources) {
696
+ ensureReadablePath(policyEngine, evidenceSource, "release evidence source");
697
+ if (!existsSync(join(root, evidenceSource))) {
698
+ throw new Error(`Release evidence source not found: ${evidenceSource}`);
699
+ }
700
+ }
701
+ return {
702
+ releaseRequest: releaseRequest,
703
+ releaseIssueRefs: [...releaseIssueRefs],
704
+ releaseGithubRefs: [...releaseGithubRefMap.values()],
705
+ requestFile: requestPath
706
+ };
707
+ }
708
+ if (workflow.name === "incident-handoff") {
709
+ const requestPath = ".agentops/requests/incident.yaml";
710
+ ensureReadablePath(policyEngine, requestPath, "incident request");
711
+ const incidentRequest = validateIncidentRequestCompleteness(readYamlFile(join(root, requestPath), incidentRequestSchema, "incident request"));
712
+ const repoContext = inferGitHubRepoContext(root);
713
+ const incidentIssueRefs = new Set(incidentRequest.issueRefs);
714
+ const incidentGithubRefMap = new Map();
715
+ for (const githubRef of normalizeGitHubReferences(incidentRequest.issueRefs, repoContext)) {
716
+ incidentGithubRefMap.set(githubRef.canonical, githubRef);
717
+ }
718
+ for (const releaseReportRef of incidentRequest.releaseReportRefs) {
719
+ ensureReadablePath(policyEngine, releaseReportRef, "release report reference");
720
+ ensureBundleContainsArtifactKind(root, releaseReportRef, "release-report", "release report reference");
721
+ const refs = loadLifecycleArtifactSourceReferences(root, releaseReportRef);
722
+ for (const issueRef of refs.issueRefs) {
723
+ incidentIssueRefs.add(issueRef);
724
+ }
725
+ for (const githubRef of refs.githubRefs) {
726
+ incidentGithubRefMap.set(githubRef.canonical, githubRef);
727
+ }
728
+ }
729
+ for (const evidenceSource of incidentRequest.evidenceSources) {
730
+ ensureReadablePath(policyEngine, evidenceSource, "incident evidence source");
731
+ if (!existsSync(join(root, evidenceSource))) {
732
+ throw new Error(`Incident evidence source not found: ${evidenceSource}`);
733
+ }
734
+ }
735
+ return {
736
+ incidentRequest: incidentRequest,
737
+ incidentIssueRefs: [...incidentIssueRefs],
738
+ incidentGithubRefs: [...incidentGithubRefMap.values()],
739
+ requestFile: requestPath
740
+ };
741
+ }
742
+ if (workflow.name === "maintenance-triage") {
743
+ const requestPath = ".agentops/requests/maintenance.yaml";
744
+ ensureReadablePath(policyEngine, requestPath, "maintenance request");
745
+ const maintenanceRequest = validateMaintenanceRequestCompleteness(readYamlFile(join(root, requestPath), maintenanceRequestSchema, "maintenance request"));
746
+ const repoContext = inferGitHubRepoContext(root);
747
+ const maintenanceIssueRefs = new Set(maintenanceRequest.issueRefs);
748
+ const maintenanceGithubRefMap = new Map();
749
+ for (const githubRef of normalizeGitHubReferences(maintenanceRequest.issueRefs, repoContext)) {
750
+ maintenanceGithubRefMap.set(githubRef.canonical, githubRef);
751
+ }
752
+ for (const releaseReportRef of maintenanceRequest.releaseReportRefs) {
753
+ ensureReadablePath(policyEngine, releaseReportRef, "release report reference");
754
+ ensureBundleContainsArtifactKind(root, releaseReportRef, "release-report", "release report reference");
755
+ const refs = loadLifecycleArtifactSourceReferences(root, releaseReportRef);
756
+ for (const issueRef of refs.issueRefs) {
757
+ maintenanceIssueRefs.add(issueRef);
758
+ }
759
+ for (const githubRef of refs.githubRefs) {
760
+ maintenanceGithubRefMap.set(githubRef.canonical, githubRef);
761
+ }
762
+ }
763
+ for (const dependencyAlertRef of maintenanceRequest.dependencyAlertRefs) {
764
+ ensureReadablePath(policyEngine, dependencyAlertRef, "dependency alert reference");
765
+ if (!existsSync(join(root, dependencyAlertRef))) {
766
+ throw new Error(`Dependency alert reference not found: ${dependencyAlertRef}`);
767
+ }
768
+ }
769
+ for (const docsTaskRef of maintenanceRequest.docsTaskRefs) {
770
+ ensureReadablePath(policyEngine, docsTaskRef, "docs task reference");
771
+ if (!existsSync(join(root, docsTaskRef))) {
772
+ throw new Error(`Docs task reference not found: ${docsTaskRef}`);
773
+ }
774
+ }
775
+ return {
776
+ maintenanceRequest: maintenanceRequest,
777
+ maintenanceIssueRefs: [...maintenanceIssueRefs],
778
+ maintenanceGithubRefs: [...maintenanceGithubRefMap.values()],
380
779
  requestFile: requestPath
381
780
  };
382
781
  }
@@ -453,6 +852,26 @@ function readLatestCompleteRunBundle(runsRoot) {
453
852
  if (!existsSync(runsRoot)) {
454
853
  return undefined;
455
854
  }
855
/**
 * Convert a run timestamp candidate into epoch milliseconds.
 *
 * Tried in order:
 *  1. compact run-id form `YYYY-MM-DD-HHMMSS`, interpreted as UTC;
 *  2. ISO-8601 shaped strings (`YYYY-MM-DD`, optionally followed by `THH:MM[:SS[.fff]]`
 *     and `Z` or a `+HH:MM` / `-HH:MM` offset);
 *  3. a leading integer before the first `-`, taken as the millisecond value itself.
 *
 * `Date.parse` is only consulted for ISO-shaped input: parsing of other
 * formats is implementation-defined, and lenient engines can turn arbitrary
 * identifiers (for example `qa-3`) into a date, which would wrongly mark a
 * run as having an explicit timestamp.
 *
 * @param {unknown} value - Candidate timestamp (bundle field, run id, or directory name).
 * @returns {number | undefined} Epoch milliseconds, or `undefined` when nothing could be parsed.
 */
const parseRunTimestampMs = (value) => {
    if (typeof value !== "string" || value.length === 0) {
        return undefined;
    }
    const compactDateTimeMatch = value.match(/^(\d{4})-(\d{2})-(\d{2})-(\d{2})(\d{2})(\d{2})$/);
    if (compactDateTimeMatch) {
        const [, year, month, day, hour, minute, second] = compactDateTimeMatch;
        const isoCandidate = `${year}-${month}-${day}T${hour}:${minute}:${second}Z`;
        const parsedCompactDateTime = Date.parse(isoCandidate);
        if (!Number.isNaN(parsedCompactDateTime)) {
            return parsedCompactDateTime;
        }
    }
    const isoShape = /^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:Z|[+-]\d{2}:\d{2})?)?$/;
    if (isoShape.test(value)) {
        const parsedDate = Date.parse(value);
        if (!Number.isNaN(parsedDate)) {
            return parsedDate;
        }
    }
    const timestampPrefix = Number.parseInt(value.split("-")[0] ?? "", 10);
    return Number.isNaN(timestampPrefix) ? undefined : timestampPrefix;
};
456
875
  const candidates = readdirSync(runsRoot)
457
876
  .map((entry) => {
458
877
  const bundlePath = join(runsRoot, entry, "bundle.json");
@@ -462,15 +881,24 @@ function readLatestCompleteRunBundle(runsRoot) {
462
881
  const stats = statSync(bundlePath);
463
882
  const bundle = JSON.parse(readFileSync(bundlePath, "utf8"));
464
883
  const bundleRunId = typeof bundle.runId === "string" ? bundle.runId : entry;
884
+ const parsedCompletedAtMs = parseRunTimestampMs(bundle.finishedAt) ??
885
+ parseRunTimestampMs(bundle.startedAt) ??
886
+ parseRunTimestampMs(bundleRunId) ??
887
+ parseRunTimestampMs(entry);
888
+ const completedAtMs = parsedCompletedAtMs ?? stats.mtimeMs;
465
889
  return {
466
890
  runDir: entry,
467
891
  bundle,
468
892
  bundleRunId,
469
- completedAtMs: stats.mtimeMs
893
+ completedAtMs,
894
+ hasExplicitTimestamp: typeof parsedCompletedAtMs === "number"
470
895
  };
471
896
  })
472
897
  .filter((candidate) => Boolean(candidate))
473
898
  .sort((left, right) => {
899
+ if (left.hasExplicitTimestamp !== right.hasExplicitTimestamp) {
900
+ return left.hasExplicitTimestamp ? -1 : 1;
901
+ }
474
902
  if (left.completedAtMs !== right.completedAtMs) {
475
903
  return right.completedAtMs - left.completedAtMs;
476
904
  }
@@ -478,6 +906,29 @@ function readLatestCompleteRunBundle(runsRoot) {
478
906
  });
479
907
  return candidates[0] ? { runDir: candidates[0].runDir, bundle: candidates[0].bundle } : undefined;
480
908
  }
909
/**
 * Load and validate a run bundle identified either by run id or by path.
 *
 * A `runRef` that ends in `.json` or contains `/` is treated as a path: used
 * as-is when it starts with `/`, otherwise resolved against `root`. Any other
 * value is treated as a run directory name under the configured runs path,
 * and `bundle.json` inside it is read.
 *
 * @param {string} root - Workspace root directory.
 * @param {string} runRef - Run id or bundle path.
 * @returns {{runId: string, bundlePath: string, bundle: object}} The bundle, the path it was
 *   read from, and its run id (falling back to `runRef` when the bundle has no string `runId`).
 * @throws {Error} When no file exists at the resolved bundle path
 *   (config, JSON, and schema errors propagate).
 */
function readRunBundleByRef(root, runRef) {
    const { runtime } = loadAgentForgeConfig(root);
    const runsRoot = join(root, runtime.runsPath);
    const looksLikePath = runRef.endsWith(".json") || runRef.includes("/");
    let bundlePath;
    if (!looksLikePath) {
        bundlePath = join(runsRoot, runRef, "bundle.json");
    }
    else if (runRef.startsWith("/")) {
        bundlePath = runRef;
    }
    else {
        bundlePath = join(root, runRef);
    }
    if (!existsSync(bundlePath)) {
        throw new Error(`Run bundle not found: ${runRef}`);
    }
    const rawBundle = JSON.parse(readFileSync(bundlePath, "utf8"));
    const bundle = auditBundleSchema.parse(rawBundle);
    const runId = typeof bundle.runId === "string" ? bundle.runId : runRef;
    return { runId, bundlePath, bundle };
}
925
/**
 * Return the first `eval-result` lifecycle artifact of a bundle, validated
 * with `evalArtifactSchema`.
 *
 * @param {{lifecycleArtifacts: object[]}} bundle - Parsed run bundle.
 * @param {string} runRef - Run reference, used only in the error message.
 * @returns {object} The result of `evalArtifactSchema.parse` for the matching artifact.
 * @throws {Error} When the bundle has no artifact whose `artifactKind` is `"eval-result"`.
 */
function extractEvalArtifact(bundle, runRef) {
    for (const candidate of bundle.lifecycleArtifacts) {
        if (candidate.artifactKind === "eval-result") {
            return evalArtifactSchema.parse(candidate);
        }
    }
    throw new Error(`Run ${runRef} does not contain an eval-result artifact.`);
}
481
932
  function loadAgentForgeConfig(root) {
482
933
  const configPath = join(root, ".agentops", "agentops.yaml");
483
934
  if (!existsSync(configPath)) {
@@ -506,6 +957,425 @@ function loadAgentForgeConfig(root) {
506
957
  function ensureDirectory(pathValue) {
507
958
  mkdirSync(pathValue, { recursive: true });
508
959
  }
960
/**
 * Serializes `value` with `yaml.dump` and writes it to `filePath` as UTF-8.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} value - Value to serialize.
 * @returns {void}
 */
function writeYamlFile(filePath, value) {
  const serialized = yaml.dump(value);
  writeFileSync(filePath, serialized, "utf8");
}
963
/**
 * Returns the eval fixture corpus from `schemaFixtures`, validated against
 * `evalFixtureCorpusSchema`.
 *
 * @returns {object} The schema-parsed fixture corpus.
 */
function loadEvalFixtureCorpus() {
  const { evalFixtureCorpus } = schemaFixtures;
  return evalFixtureCorpusSchema.parse(evalFixtureCorpus);
}
966
/**
 * Looks up an eval spec by id in the fixture corpus.
 *
 * @param {string} specId - Identifier of the spec to find.
 * @returns {object} The matching spec.
 * @throws {Error} When the corpus has no spec with that id.
 */
function getEvalSpec(specId) {
  const { specs } = loadEvalFixtureCorpus();
  const match = specs.find((candidate) => candidate.id === specId);
  if (match) {
    return match;
  }
  throw new Error(`Unknown eval spec: ${specId}`);
}
974
/**
 * Builds the workspace-relative reference to a run's `bundle.json`.
 *
 * @param {{ runId: string }} run - Run descriptor.
 * @returns {string} `.agentops/runs/<runId>/bundle.json`.
 */
function toBundleRef(run) {
  const { runId } = run;
  return `.agentops/runs/${runId}/bundle.json`;
}
977
/**
 * Builds the workspace-relative reference to a run's `summary.md`.
 *
 * @param {{ runId: string }} run - Run descriptor.
 * @returns {string} `.agentops/runs/<runId>/summary.md`.
 */
function toSummaryRef(run) {
  const { runId } = run;
  return `.agentops/runs/${runId}/summary.md`;
}
980
/**
 * Describes a prerequisite workflow run that was executed to set up an eval.
 *
 * @param {string} workflow - Name of the workflow that was run.
 * @param {{ runId: string }} run - Result of that workflow run.
 * @returns {{ workflow: string, runId: string, bundlePath: string }} Setup-run
 *   record whose `bundlePath` is the run's bundle reference.
 */
function toSetupRun(workflow, run) {
  const { runId } = run;
  const bundlePath = toBundleRef(run);
  return { workflow, runId, bundlePath };
}
987
/**
 * Creates a git-backed fixture workspace for an eval run.
 *
 * The workspace is placed at
 * `<root>/.agentops/evals/<specId>/<evalRunId>/workspace`. It receives a
 * package manifest, a lockfile, one source file and two evidence files, all of
 * which are committed. `src.ts` is then rewritten so the working tree holds an
 * uncommitted change, and `initProject` is invoked on the workspace.
 *
 * @param {string} root - Workspace root hosting the `.agentops` directory.
 * @param {string} evalRunId - Run identifier used as a path segment.
 * @param {string} specId - Eval spec identifier used as a path segment.
 * @returns {string} Path of the created workspace.
 */
function createBlankEvalWorkspace(root, evalRunId, specId) {
  const workspaceRoot = join(root, ".agentops", "evals", specId, evalRunId, "workspace");
  const evidenceRoot = join(workspaceRoot, ".agentops", "evidence");
  // Every git command runs silently inside the new workspace.
  const git = (...args) => execFileSync("git", args, { cwd: workspaceRoot, stdio: "ignore" });
  const writeText = (directory, fileName, contents) => writeFileSync(join(directory, fileName), contents, "utf8");
  const toPrettyJson = (value) => JSON.stringify(value, null, 2);

  ensureDirectory(workspaceRoot);
  ensureDirectory(evidenceRoot);

  git("init");
  git("config", "user.email", "eval@example.com");
  git("config", "user.name", "AgentForge Eval");

  const packageManifest = {
    name: "fixture",
    repository: {
      type: "git",
      url: "https://github.com/H9-Foundry/fixture.git"
    },
    scripts: {
      test: "echo test",
      lint: "echo lint",
      typecheck: "echo typecheck",
      build: "echo build"
    }
  };
  writeText(workspaceRoot, "package.json", toPrettyJson(packageManifest));
  writeText(workspaceRoot, "pnpm-lock.yaml", "lockfileVersion: '9.0'\n");
  writeText(workspaceRoot, "src.ts", "export const value = 1;\n");

  const dependencyAlerts = {
    alerts: [
      {
        package: "example-dependency",
        severity: "moderate",
        summary: "Upgrade pending review for deterministic eval coverage."
      }
    ]
  };
  writeText(evidenceRoot, "dependency-alerts.json", toPrettyJson(dependencyAlerts));
  writeText(evidenceRoot, "docs-task.md", "# Docs follow-up\n\n- Align workflow documentation after maintenance triage.\n");

  git("add", ".");
  // Signing is disabled for this commit only, via a per-command config override.
  git("-c", "commit.gpgsign=false", "commit", "-m", "init");

  // Modify the tracked source after the commit so the tree has a pending change.
  writeText(workspaceRoot, "src.ts", "export const value = 2;\n");
  initProject(workspaceRoot);
  return workspaceRoot;
}
1026
/**
 * Lists the redaction categories recorded on eval and benchmark artifacts.
 *
 * @returns {string[]} A new array on every call.
 */
function evalRedactionCategories() {
  const categories = [
    "github-token",
    "api-key",
    "aws-key",
    "bearer-token",
    "password",
    "private-key"
  ];
  return categories;
}
1029
/**
 * Builds the eval-result artifact and its audit bundle, then writes the bundle
 * as `bundle.json` and `summary.md` into a new run directory.
 *
 * The eval passes when no deterministic check has status `"failed"`.
 *
 * @param {string} root - Workspace root.
 * @param {object} spec - Eval spec (`id`, `name`, `workflow`, `repoFixture` are read).
 * @param {object | undefined} evaluatedRun - Result of the evaluated workflow run, if any.
 * @param {string} workspacePath - Workspace the evaluated workflow ran in.
 * @param {Array<{ bundlePath: string }>} setupRuns - Prerequisite runs.
 * @param {Array<{ status: string }>} deterministicChecks - Deterministic check results.
 * @param {Array<object>} modelDependentChecks - Model-dependent check records.
 * @returns {{ bundle: object, jsonPath: string, markdownPath: string, outputDir: string }}
 */
function createEvalBundle(root, spec, evaluatedRun, workspacePath, setupRuns, deterministicChecks, modelDependentChecks) {
  const config = loadAgentForgeConfig(root);
  const policyDocument = loadPolicyDocument(join(root, ".agentops", "policy.yaml"));
  const policy = resolvePolicy(policyDocument, process.env.CI ? "ci" : "local");
  const state = createWorkflowState({
    cwd: root,
    workflow: `eval:${spec.id}`,
    mode: "inspect",
    policy
  });

  const runsRoot = join(root, config.runtime.runsPath);
  const outputDir = join(runsRoot, state.runId);
  ensureDirectory(outputDir);
  const jsonPath = join(outputDir, "bundle.json");
  const markdownPath = join(outputDir, "summary.md");

  const failureCount = deterministicChecks.filter((check) => check.status === "failed").length;
  const passed = failureCount === 0;
  const startedAt = new Date().toISOString();
  const timestamp = () => new Date().toISOString();
  const runnerEntryId = `${state.runId}-eval-runner`;

  // Factories give the artifact and the bundle separate object instances.
  const buildProvenance = () => ({
    generatedBy: "agentforge-runtime",
    schemaVersion: state.version,
    executionEnvironment: state.context.ciExecution ? "ci" : "local",
    repoRoot: state.repo.root
  });
  const buildRedaction = () => ({
    applied: true,
    strategyVersion: "1.0.0",
    categories: evalRedactionCategories()
  });

  // The evaluated run's bundle (when present) is listed before the setup bundles.
  const inputRefs = setupRuns.map((setup) => setup.bundlePath);
  if (evaluatedRun?.jsonPath) {
    inputRefs.unshift(evaluatedRun.jsonPath);
  }

  let summary;
  if (passed) {
    summary = `Eval result for ${spec.id} passed ${deterministicChecks.length} deterministic check(s).`;
  } else {
    summary = `Eval result for ${spec.id} failed ${failureCount} deterministic check(s).`;
  }

  const evalArtifact = evalArtifactSchema.parse({
    schemaVersion: state.version,
    artifactKind: "eval-result",
    lifecycleDomain: "evaluate",
    workflow: {
      name: state.workflow,
      displayName: "Eval Runner"
    },
    source: {
      sourceType: "workflow-run",
      runId: state.runId,
      inputRefs,
      issueRefs: ["#165"],
      githubRefs: []
    },
    status: passed ? "complete" : "draft",
    generatedAt: startedAt,
    repo: {
      root: state.repo.root,
      name: state.repo.name,
      branch: state.repo.branch
    },
    provenance: buildProvenance(),
    redaction: buildRedaction(),
    auditLink: {
      bundlePath: jsonPath,
      entryIds: [runnerEntryId],
      findingIds: [],
      proposedActionIds: []
    },
    summary,
    payload: {
      specId: spec.id,
      specName: spec.name,
      workflow: spec.workflow,
      repoFixture: spec.repoFixture,
      workspacePath,
      evaluatedRunId: evaluatedRun?.runId,
      evaluatedBundlePath: evaluatedRun ? toBundleRef(evaluatedRun) : undefined,
      setupRuns,
      deterministicChecks,
      modelDependentChecks,
      passed,
      failureCount,
      warningCount: 0
    }
  });
  state.lifecycleArtifacts = [evalArtifact];

  const runnerEntry = createAuditEntry({
    id: runnerEntryId,
    nodeId: "eval-runner",
    nodeName: "eval-runner",
    kind: "deterministic",
    startedAt,
    completedAt: timestamp(),
    status: passed ? "success" : "failed",
    summary: evalArtifact.summary,
    toolsRequested: [],
    toolsExecuted: [],
    blockedActions: [],
    validationPassed: passed
  });
  const reportEntry = createAuditEntry({
    id: `${state.runId}-report`,
    nodeId: "report",
    nodeName: "final-report",
    kind: "report",
    startedAt,
    completedAt: timestamp(),
    status: "success",
    summary: "Generated eval result artifacts.",
    toolsRequested: [],
    toolsExecuted: [],
    blockedActions: [],
    validationPassed: true
  });
  state.auditTrail = [runnerEntry, reportEntry];

  const bundle = buildAuditBundle(state, {
    startedAt,
    finishedAt: timestamp(),
    status: passed ? "success" : "partial",
    jsonPath,
    markdownPath,
    provenance: buildProvenance(),
    redaction: buildRedaction(),
    components: []
  });
  writeFileSync(jsonPath, JSON.stringify(bundle, null, 2), "utf8");
  writeFileSync(markdownPath, renderAuditBundleMarkdown(bundle), "utf8");
  return { bundle, jsonPath, markdownPath, outputDir };
}
1161
/**
 * Compares two lists of deterministic check results by check name.
 *
 * Names are processed in sorted order. For each name:
 * - missing on either side: recorded as a non-comparable finding;
 * - same status on both sides: counted as unchanged;
 * - either side `not_applicable`: recorded as a non-comparable finding;
 * - `passed` -> `failed`: recorded as a regression;
 * - `failed` -> `passed`: recorded as an improvement;
 * - any other transition: recorded as a non-comparable finding.
 *
 * When a name occurs more than once in a list, the last occurrence is used.
 *
 * @param {Array<{ name: string, status: string, details?: string }>} baselineChecks
 * @param {Array<{ name: string, status: string, details?: string }>} candidateChecks
 * @returns {{ regressions: object[], improvements: object[], unchangedCount: number, nonComparableFindings: string[] }}
 */
function compareDeterministicChecks(baselineChecks, candidateChecks) {
  const indexByName = (checks) => new Map(checks.map((check) => [check.name, check]));
  const baselineIndex = indexByName(baselineChecks);
  const candidateIndex = indexByName(candidateChecks);
  const orderedNames = Array.from(new Set([...baselineIndex.keys(), ...candidateIndex.keys()])).sort();

  const outcome = {
    regressions: [],
    improvements: [],
    unchangedCount: 0,
    nonComparableFindings: []
  };
  // Candidate details take precedence; the baseline's are the fallback.
  const toDelta = (name, classification, before, after) => ({
    name,
    classification,
    baselineStatus: before.status,
    candidateStatus: after.status,
    details: after.details ?? before.details
  });

  orderedNames.forEach((name) => {
    const before = baselineIndex.get(name);
    const after = candidateIndex.get(name);
    if (!before || !after) {
      outcome.nonComparableFindings.push(`Deterministic check \`${name}\` is missing from one of the eval results.`);
      return;
    }
    const from = before.status;
    const to = after.status;
    if (from === to) {
      outcome.unchangedCount += 1;
    } else if (from === "not_applicable" || to === "not_applicable") {
      outcome.nonComparableFindings.push(`Deterministic check \`${name}\` changed between comparable and not_applicable states (${from} -> ${to}).`);
    } else if (from === "passed" && to === "failed") {
      outcome.regressions.push(toDelta(name, "regression", before, after));
    } else if (from === "failed" && to === "passed") {
      outcome.improvements.push(toDelta(name, "improvement", before, after));
    } else {
      outcome.nonComparableFindings.push(`Deterministic check \`${name}\` changed in an unsupported way (${from} -> ${to}).`);
    }
  });
  return outcome;
}
1208
/**
 * Compares a candidate eval artifact against a baseline eval artifact.
 *
 * When the spec ids or the workflows differ, the result is marked
 * non-comparable with a single explanatory finding and empty deltas.
 * Otherwise the deterministic checks are compared, and the result is
 * comparable only when that comparison produced no non-comparable findings.
 *
 * @param {string} baselineRunId - Baseline run id (not used in the result).
 * @param {string} baselineBundlePath - Baseline bundle path (not used in the result).
 * @param {{ payload: object }} baselineArtifact - Baseline eval artifact.
 * @param {string} candidateRunId - Candidate run id, reported as `runId`.
 * @param {string} candidateBundlePath - Candidate bundle path, reported as `bundlePath`.
 * @param {{ payload: object }} candidateArtifact - Candidate eval artifact.
 * @returns {object} Per-candidate comparison record.
 */
function compareEvalArtifacts(baselineRunId, baselineBundlePath, baselineArtifact, candidateRunId, candidateBundlePath, candidateArtifact) {
  const baselinePayload = baselineArtifact.payload;
  const candidatePayload = candidateArtifact.payload;

  // Every branch reports the same candidate facts; only the deltas differ.
  const buildResult = (comparable, deltas) => ({
    runId: candidateRunId,
    bundlePath: candidateBundlePath,
    specId: candidatePayload.specId,
    workflow: candidatePayload.workflow,
    comparable,
    passed: candidatePayload.passed,
    failureCount: candidatePayload.failureCount,
    deterministicCheckCount: candidatePayload.deterministicChecks.length,
    regressions: deltas.regressions,
    improvements: deltas.improvements,
    unchangedCount: deltas.unchangedCount,
    nonComparableFindings: deltas.nonComparableFindings
  });
  const notComparable = (reason) => buildResult(false, {
    regressions: [],
    improvements: [],
    unchangedCount: 0,
    nonComparableFindings: [reason]
  });

  if (baselinePayload.specId !== candidatePayload.specId) {
    return notComparable(`Spec mismatch: baseline ${baselinePayload.specId} vs candidate ${candidatePayload.specId}.`);
  }
  if (baselinePayload.workflow !== candidatePayload.workflow) {
    return notComparable(`Workflow mismatch: baseline ${baselinePayload.workflow} vs candidate ${candidatePayload.workflow}.`);
  }
  const comparison = compareDeterministicChecks(baselinePayload.deterministicChecks, candidatePayload.deterministicChecks);
  return buildResult(comparison.nonComparableFindings.length === 0, comparison);
}
1261
/**
 * Builds the benchmark-summary artifact for a baseline-vs-candidates
 * comparison, wraps it in an audit bundle, and writes the bundle as
 * `bundle.json` and `summary.md` into a new run directory.
 *
 * The start timestamp is captured once on entry and reused for the audit
 * entry and the bundle, so the recorded start always precedes the recorded
 * completion times.
 *
 * @param {string} root - Workspace root.
 * @param {string} baselineRunId - Run id of the baseline eval.
 * @param {string} baselineBundlePath - Path of the baseline bundle.
 * @param {{ payload: { specId: string, workflow: string } }} baselineArtifact - Baseline eval artifact.
 * @param {Array<object>} comparedRuns - Per-candidate comparison records
 *   (`regressions`, `improvements`, `unchangedCount`, `nonComparableFindings`
 *   and `bundlePath` are read).
 * @returns {{ bundle: object, jsonPath: string, markdownPath: string, outputDir: string }}
 */
function createBenchmarkBundle(root, baselineRunId, baselineBundlePath, baselineArtifact, comparedRuns) {
  // Captured before any work so it reflects when the comparison run began.
  const startedAt = new Date().toISOString();
  const config = loadAgentForgeConfig(root);
  const policy = resolvePolicy(loadPolicyDocument(join(root, ".agentops", "policy.yaml")), process.env.CI ? "ci" : "local");
  const state = createWorkflowState({
    cwd: root,
    workflow: "eval:compare",
    mode: "inspect",
    policy
  });
  const runsRoot = join(root, config.runtime.runsPath);
  const outputDir = join(runsRoot, state.runId);
  ensureDirectory(outputDir);
  const jsonPath = join(outputDir, "bundle.json");
  const markdownPath = join(outputDir, "summary.md");

  const sumOver = (measure) => comparedRuns.reduce((total, candidate) => total + measure(candidate), 0);
  const regressionCount = sumOver((candidate) => candidate.regressions.length);
  const improvementCount = sumOver((candidate) => candidate.improvements.length);
  const unchangedCount = sumOver((candidate) => candidate.unchangedCount);
  const nonComparableCount = sumOver((candidate) => candidate.nonComparableFindings.length);

  // Precedence: regressions, then improvements, then non-comparable differences.
  let summaryConclusion;
  if (regressionCount > 0) {
    summaryConclusion = `Detected ${regressionCount} deterministic regression(s) across compared eval results.`;
  } else if (improvementCount > 0) {
    summaryConclusion = `Detected ${improvementCount} deterministic improvement(s) with no regressions.`;
  } else if (nonComparableCount > 0) {
    summaryConclusion = `Compared eval results contain ${nonComparableCount} non-comparable difference(s) and no deterministic regressions.`;
  } else {
    summaryConclusion = "No deterministic regressions detected across compared eval results.";
  }

  const buildProvenance = () => ({
    generatedBy: "agentforge-runtime",
    schemaVersion: state.version,
    executionEnvironment: state.context.ciExecution ? "ci" : "local",
    repoRoot: state.repo.root
  });
  const buildRedaction = () => ({
    applied: true,
    strategyVersion: "1.0.0",
    categories: evalRedactionCategories()
  });
  const compareEntryId = `${state.runId}-benchmark-compare`;

  const benchmarkArtifact = benchmarkArtifactSchema.parse({
    schemaVersion: state.version,
    artifactKind: "benchmark-summary",
    lifecycleDomain: "evaluate",
    workflow: {
      name: state.workflow,
      displayName: "Eval Benchmark Compare"
    },
    source: {
      sourceType: "workflow-run",
      runId: state.runId,
      inputRefs: [baselineBundlePath, ...comparedRuns.map((candidate) => candidate.bundlePath)],
      issueRefs: ["#166"],
      githubRefs: []
    },
    status: "complete",
    generatedAt: new Date().toISOString(),
    repo: {
      root: state.repo.root,
      name: state.repo.name,
      branch: state.repo.branch
    },
    provenance: buildProvenance(),
    redaction: buildRedaction(),
    auditLink: {
      bundlePath: jsonPath,
      entryIds: [compareEntryId],
      findingIds: [],
      proposedActionIds: []
    },
    summary: summaryConclusion,
    payload: {
      baselineRunId,
      baselineBundlePath,
      baselineSpecId: baselineArtifact.payload.specId,
      baselineWorkflow: baselineArtifact.payload.workflow,
      comparedRuns,
      regressionCount,
      improvementCount,
      unchangedCount,
      nonComparableCount,
      summaryConclusion
    }
  });
  state.lifecycleArtifacts = [benchmarkArtifact];
  state.auditTrail = [
    createAuditEntry({
      id: compareEntryId,
      nodeId: "benchmark-compare",
      nodeName: "benchmark-compare",
      kind: "deterministic",
      startedAt,
      completedAt: new Date().toISOString(),
      status: regressionCount > 0 ? "failed" : "success",
      summary: benchmarkArtifact.summary,
      toolsRequested: [],
      toolsExecuted: [],
      blockedActions: [],
      validationPassed: regressionCount === 0
    })
  ];
  const bundle = buildAuditBundle(state, {
    startedAt,
    finishedAt: new Date().toISOString(),
    // Non-comparable differences downgrade the bundle even without regressions.
    status: regressionCount > 0 || nonComparableCount > 0 ? "partial" : "success",
    jsonPath,
    markdownPath,
    provenance: buildProvenance(),
    redaction: buildRedaction(),
    components: []
  });
  writeFileSync(jsonPath, JSON.stringify(bundle, null, 2), "utf8");
  writeFileSync(markdownPath, renderAuditBundleMarkdown(bundle), "utf8");
  return { bundle, jsonPath, markdownPath, outputDir };
}
509
1379
  function ensureInitFiles(root) {
510
1380
  const created = [];
511
1381
  const configDir = join(root, ".agentops");
@@ -545,6 +1415,18 @@ function ensureInitFiles(root) {
545
1415
  {
546
1416
  path: join(workflowsDir, "security-review.yaml"),
547
1417
  contents: securityWorkflowTemplate
1418
+ },
1419
+ {
1420
+ path: join(workflowsDir, "release-readiness.yaml"),
1421
+ contents: releaseWorkflowTemplate
1422
+ },
1423
+ {
1424
+ path: join(workflowsDir, "incident-handoff.yaml"),
1425
+ contents: incidentWorkflowTemplate
1426
+ },
1427
+ {
1428
+ path: join(workflowsDir, "maintenance-triage.yaml"),
1429
+ contents: maintenanceWorkflowTemplate
548
1430
  }
549
1431
  ];
550
1432
  for (const file of files) {
@@ -694,6 +1576,261 @@ export async function runLocalWorkflow(workflowName, cwd = process.cwd()) {
694
1576
  artifactKinds: bundle.lifecycleArtifacts.map((artifact) => artifact.artifactKind)
695
1577
  };
696
1578
  }
1579
/**
 * Builds a deterministic check record.
 *
 * @param {string} status - Check status (callers in this file pass
 *   `"passed"`, `"failed"` or `"not_applicable"`).
 * @param {string} name - Check name.
 * @param {unknown} expected - Expected value.
 * @param {unknown} actual - Observed value; the key is present even when undefined.
 * @param {string} [details] - Optional explanation; the key is omitted when falsy.
 * @returns {{ name: string, status: string, expected: unknown, actual: unknown, details?: string }}
 */
function checkResult(status, name, expected, actual, details) {
  const result = { name, status, expected, actual };
  if (details) {
    result.details = details;
  }
  return result;
}
1588
/**
 * Evaluates a workflow bundle against the deterministic expectations of an
 * eval spec.
 *
 * Without a bundle, a single failed `workflow-execution` check is returned.
 * With a bundle, checks are produced for run status, redaction, policy
 * posture, expected artifacts (kind, required payload fields, required
 * summary terms) and, when the spec expects no artifacts, the artifact count.
 * Side-effect classes are always reported as `not_applicable`.
 *
 * Required payload fields must be own properties of the payload, so inherited
 * names such as `toString` do not count as present. An artifact whose summary
 * is not a string fails its summary-term checks instead of throwing.
 *
 * @param {object} spec - Eval spec.
 * @param {object | undefined} bundle - Parsed bundle of the evaluated run.
 * @param {string | undefined} executionError - Error message recorded when no bundle was produced.
 * @returns {{ deterministicChecks: object[], modelDependentChecks: object[] }}
 */
function compareEvalSpec(spec, bundle, executionError) {
  const modelDependentChecks = [
    {
      name: "rubric-scoring",
      status: "not_executed",
      details: "Provider-dependent scoring is out of scope for the first local eval runner slice."
    }
  ];
  const checks = [];
  const record = (ok, name, expected, actual, details) => {
    checks.push(checkResult(ok ? "passed" : "failed", name, expected, actual, details));
  };

  if (!bundle) {
    record(false, "workflow-execution", "successful workflow execution", executionError ?? "unknown failure", "The eval runner could not produce an evaluated workflow bundle.");
    return { deterministicChecks: checks, modelDependentChecks };
  }

  const { redactionExpectations, policyExpectations, artifactExpectations } = spec;
  const policyDefaults = bundle.policy.defaults;

  record(bundle.status === spec.expectedStatus, "run-status", spec.expectedStatus, bundle.status, "The evaluated workflow status should match the deterministic eval spec.");
  record(bundle.redaction.applied === redactionExpectations.applied, "redaction-applied", String(redactionExpectations.applied), String(bundle.redaction.applied));
  for (const category of redactionExpectations.expectedCategories) {
    record(bundle.redaction.categories.includes(category), `redaction-category:${category}`, category, bundle.redaction.categories.join(", "));
  }
  record(policyDefaults.executionMode === policyExpectations.executionMode, "policy-execution-mode", policyExpectations.executionMode, policyDefaults.executionMode);
  if (policyExpectations.readOnly) {
    record(policyDefaults.writes !== "allow", "policy-read-only", "writes not equal allow", policyDefaults.writes);
  }
  for (const sideEffectClass of policyExpectations.sideEffectClasses) {
    checks.push(checkResult("not_applicable", `side-effect-class:${sideEffectClass}`, sideEffectClass, undefined, "The first eval runner records policy posture and workflow outputs but does not inspect adapter-level side-effect execution traces."));
  }

  for (const expectedArtifact of artifactExpectations) {
    const kind = expectedArtifact.artifactKind;
    const actualArtifact = bundle.lifecycleArtifacts.find((artifact) => artifact.artifactKind === kind);
    record(Boolean(actualArtifact), `artifact-kind:${kind}`, kind, actualArtifact?.artifactKind);
    if (!actualArtifact || typeof actualArtifact.payload !== "object" || actualArtifact.payload === null) {
      continue;
    }
    const payload = actualArtifact.payload;
    const presentFields = Object.keys(payload).join(", ");
    for (const field of expectedArtifact.requiredPayloadFields) {
      // Own-property test: `in` would also accept names inherited from Object.prototype.
      const isPresent = Object.prototype.hasOwnProperty.call(payload, field);
      record(isPresent, `payload-field:${kind}:${field}`, field, presentFields);
    }
    // Lower-cased once per artifact; a non-string summary matches no term.
    const normalizedSummary = typeof actualArtifact.summary === "string" ? actualArtifact.summary.toLowerCase() : "";
    for (const term of expectedArtifact.requiredSummaryTerms) {
      record(normalizedSummary.includes(term.toLowerCase()), `summary-term:${kind}:${term}`, term, actualArtifact.summary);
    }
  }
  if (artifactExpectations.length === 0) {
    record(bundle.lifecycleArtifacts.length === 0, "artifact-count", "0", String(bundle.lifecycleArtifacts.length));
  }
  return { deterministicChecks: checks, modelDependentChecks };
}
1644
/**
 * Runs the workflow named by an eval spec inside `workspaceRoot`, first
 * executing whatever upstream workflows it needs as setup.
 *
 * Setup stages chain in this order: planning-discovery ->
 * architecture-design-review -> implementation-proposal -> qa-review ->
 * security-review -> release-readiness. Setup stages use the request fixtures
 * from `schemaFixtures`; the evaluated stage uses `spec.request`, with
 * references rewritten to point at the bundles produced by its setup runs.
 *
 * An unknown `spec.workflow` is rejected before any file is written.
 *
 * @param {{ workflow: string, request?: object }} spec - Eval spec.
 * @param {string} workspaceRoot - Workspace in which the workflows run.
 * @returns {Promise<{ evaluatedRun: object, setupRuns: object[] }>} The
 *   evaluated run plus the setup runs recorded on the way.
 * @throws {Error} When `spec.workflow` has no eval handler.
 */
async function executeEvalWorkflow(spec, workspaceRoot) {
  const setupRuns = [];
  const requestsRoot = join(workspaceRoot, ".agentops", "requests");
  const writeRequest = (fileName, request) => writeYamlFile(join(requestsRoot, fileName), request);
  const recordSetup = (workflow, run) => {
    setupRuns.push(toSetupRun(workflow, run));
  };
  const evaluate = async (workflow) => ({
    evaluatedRun: await runLocalWorkflow(workflow, workspaceRoot),
    setupRuns
  });

  // Setup stages: each one runs its predecessor, records it, then runs itself
  // with the fixture request.
  const runPlanning = async () => {
    writeRequest("planning.yaml", schemaFixtures.planningRequest);
    return runLocalWorkflow("planning-discovery", workspaceRoot);
  };
  const runDesign = async () => {
    const planningRun = await runPlanning();
    recordSetup("planning-discovery", planningRun);
    writeRequest("design.yaml", {
      ...schemaFixtures.designRequest,
      planningBriefRef: toBundleRef(planningRun)
    });
    return runLocalWorkflow("architecture-design-review", workspaceRoot);
  };
  const runImplementation = async () => {
    const designRun = await runDesign();
    recordSetup("architecture-design-review", designRun);
    writeRequest("implementation.yaml", {
      ...schemaFixtures.implementationRequest,
      designRecordRef: toBundleRef(designRun)
    });
    return runLocalWorkflow("implementation-proposal", workspaceRoot);
  };
  const runQa = async () => {
    const implementationRun = await runImplementation();
    recordSetup("implementation-proposal", implementationRun);
    writeRequest("qa.yaml", {
      ...schemaFixtures.qaRequest,
      targetRef: toBundleRef(implementationRun),
      evidenceSources: [toSummaryRef(implementationRun)]
    });
    return runLocalWorkflow("qa-review", workspaceRoot);
  };
  const runSecurity = async () => {
    const qaRun = await runQa();
    recordSetup("qa-review", qaRun);
    writeRequest("security.yaml", {
      ...schemaFixtures.securityRequest,
      targetRef: toBundleRef(qaRun),
      evidenceSources: [toSummaryRef(qaRun)]
    });
    return runLocalWorkflow("security-review", workspaceRoot);
  };
  const runRelease = async () => {
    const securityRun = await runSecurity();
    recordSetup("security-review", securityRun);
    const qaRun = setupRuns.find((run) => run.workflow === "qa-review");
    if (!qaRun) {
      throw new Error("QA setup run was not recorded before release eval execution.");
    }
    writeRequest("release.yaml", {
      ...schemaFixtures.releaseRequest,
      qaReportRefs: [qaRun.bundlePath],
      securityReportRefs: [toBundleRef(securityRun)],
      evidenceSources: [toSummaryRef(securityRun)]
    });
    return runLocalWorkflow("release-readiness", workspaceRoot);
  };

  // Evaluated stages, keyed by workflow name.
  const handlers = new Map([
    ["pr-review", async () => evaluate("pr-review")],
    ["planning-discovery", async () => {
      writeRequest("planning.yaml", spec.request);
      return evaluate("planning-discovery");
    }],
    ["architecture-design-review", async () => {
      const planningRun = await runPlanning();
      recordSetup("planning-discovery", planningRun);
      writeRequest("design.yaml", {
        ...spec.request,
        planningBriefRef: toBundleRef(planningRun)
      });
      return evaluate("architecture-design-review");
    }],
    ["implementation-proposal", async () => {
      const designRun = await runDesign();
      recordSetup("architecture-design-review", designRun);
      writeRequest("implementation.yaml", {
        ...spec.request,
        designRecordRef: toBundleRef(designRun)
      });
      return evaluate("implementation-proposal");
    }],
    ["qa-review", async () => {
      const implementationRun = await runImplementation();
      recordSetup("implementation-proposal", implementationRun);
      writeRequest("qa.yaml", {
        ...spec.request,
        targetRef: toBundleRef(implementationRun),
        evidenceSources: [toSummaryRef(implementationRun)]
      });
      return evaluate("qa-review");
    }],
    ["security-review", async () => {
      const qaRun = await runQa();
      recordSetup("qa-review", qaRun);
      writeRequest("security.yaml", {
        ...spec.request,
        targetRef: toBundleRef(qaRun),
        evidenceSources: [toSummaryRef(qaRun)]
      });
      return evaluate("security-review");
    }],
    ["maintenance-triage", async () => {
      const releaseRun = await runRelease();
      recordSetup("release-readiness", releaseRun);
      writeRequest("maintenance.yaml", {
        ...spec.request,
        releaseReportRefs: [toBundleRef(releaseRun)]
      });
      return evaluate("maintenance-triage");
    }]
  ]);

  const handler = handlers.get(spec.workflow);
  if (!handler) {
    // Fail fast with a clear message instead of resolving to undefined.
    throw new Error(`Unsupported eval workflow: ${spec.workflow}`);
  }
  ensureDirectory(requestsRoot);
  return handler();
}
1760
/**
 * Runs a single eval spec locally and writes its eval-result bundle.
 *
 * The evaluated workflow runs in the workspace root when the spec's
 * `repoFixture` is `"agentforge-monorepo"`, otherwise in a freshly created
 * fixture workspace. Errors raised while executing the workflow are not
 * rethrown: they are recorded as a failed `workflow-execution` check. An
 * execution that yields no result, or a run whose bundle file is missing, is
 * recorded the same way with a descriptive message.
 *
 * @param {string} specId - Identifier of the eval spec to run.
 * @param {string} [cwd=process.cwd()] - Directory from which the workspace root is located.
 * @returns {Promise<object>} Summary of the eval run (ids, paths, status and check counts).
 * @throws {Error} When the spec id is unknown, or when the evaluated bundle
 *   exists but cannot be parsed.
 */
export async function runLocalEval(specId, cwd = process.cwd()) {
  const root = findWorkspaceRoot(cwd);
  ensureInitFiles(root);
  const spec = getEvalSpec(specId);
  const controlPolicy = resolvePolicy(loadPolicyDocument(join(root, ".agentops", "policy.yaml")), process.env.CI ? "ci" : "local");
  // NOTE(review): this state only supplies the run id that names the fixture
  // workspace; createEvalBundle creates its own workflow state, so the two run
  // ids may differ — confirm whether that is intended.
  const controlState = createWorkflowState({
    cwd: root,
    workflow: `eval:${spec.id}`,
    mode: controlPolicy.defaults.executionMode,
    policy: controlPolicy
  });
  const workspaceRoot = spec.repoFixture === "agentforge-monorepo"
    ? root
    : createBlankEvalWorkspace(root, controlState.runId, spec.id);

  let evaluatedRun;
  let setupRuns = [];
  let executionError;
  try {
    const result = await executeEvalWorkflow(spec, workspaceRoot);
    if (!result) {
      // No handler ran for this workflow; report that rather than a TypeError.
      throw new Error(`Eval workflow is not supported by the local eval runner: ${spec.workflow}`);
    }
    evaluatedRun = result.evaluatedRun;
    setupRuns = result.setupRuns;
  }
  catch (error) {
    executionError = error instanceof Error ? error.message : String(error);
  }

  let evaluatedBundle;
  if (evaluatedRun) {
    if (existsSync(evaluatedRun.jsonPath)) {
      evaluatedBundle = auditBundleSchema.parse(JSON.parse(readFileSync(evaluatedRun.jsonPath, "utf8")));
    } else {
      // The workflow reported a run but left no bundle on disk.
      executionError = `Evaluated workflow bundle not found: ${evaluatedRun.jsonPath}`;
    }
  }

  const { deterministicChecks, modelDependentChecks } = compareEvalSpec(spec, evaluatedBundle, executionError);
  const { bundle, jsonPath, markdownPath, outputDir } = createEvalBundle(root, spec, evaluatedRun, workspaceRoot, setupRuns, deterministicChecks, modelDependentChecks);
  return {
    runId: bundle.runId,
    specId: spec.id,
    workflow: spec.workflow,
    outputDir,
    jsonPath,
    markdownPath,
    status: bundle.status,
    evaluatedRunId: evaluatedRun?.runId,
    evaluatedBundlePath: evaluatedRun ? toBundleRef(evaluatedRun) : undefined,
    setupRunCount: setupRuns.length,
    deterministicCheckCount: deterministicChecks.length,
    deterministicFailures: deterministicChecks.filter((check) => check.status === "failed").length,
    artifactKinds: bundle.lifecycleArtifacts.map((artifact) => artifact.artifactKind)
  };
}
1804
/**
 * Compares one or more candidate eval runs against a baseline eval run and
 * writes a benchmark-summary bundle.
 *
 * @param {string} baselineRunRef - Run id or bundle path of the baseline.
 * @param {string[]} candidateRunRefs - Run ids or bundle paths of the candidates.
 * @param {string} [cwd=process.cwd()] - Directory from which the workspace root is located.
 * @returns {object} Summary of the comparison (ids, paths, status and totals).
 * @throws {Error} When no candidate is given, a bundle cannot be found, or a
 *   bundle has no eval-result artifact.
 */
export function compareLocalEvalRuns(baselineRunRef, candidateRunRefs, cwd = process.cwd()) {
  if (candidateRunRefs.length === 0) {
    throw new Error("Provide at least one candidate eval run to compare against the baseline.");
  }
  const root = findWorkspaceRoot(cwd);
  ensureInitFiles(root);

  const baseline = readRunBundleByRef(root, baselineRunRef);
  const baselineArtifact = extractEvalArtifact(baseline.bundle, baselineRunRef);

  const comparedRuns = [];
  for (const candidateRunRef of candidateRunRefs) {
    const candidate = readRunBundleByRef(root, candidateRunRef);
    const candidateArtifact = extractEvalArtifact(candidate.bundle, candidateRunRef);
    comparedRuns.push(compareEvalArtifacts(baseline.runId, baseline.bundlePath, baselineArtifact, candidate.runId, candidate.bundlePath, candidateArtifact));
  }

  const written = createBenchmarkBundle(root, baseline.runId, baseline.bundlePath, baselineArtifact, comparedRuns);
  const totalOf = (measure) => comparedRuns.reduce((total, candidate) => total + measure(candidate), 0);
  return {
    runId: written.bundle.runId,
    outputDir: written.outputDir,
    jsonPath: written.jsonPath,
    markdownPath: written.markdownPath,
    status: written.bundle.status,
    baselineRunId: baseline.runId,
    comparedRunIds: comparedRuns.map((candidate) => candidate.runId),
    comparableRunCount: comparedRuns.filter((candidate) => candidate.comparable).length,
    regressionCount: totalOf((candidate) => candidate.regressions.length),
    improvementCount: totalOf((candidate) => candidate.improvements.length),
    unchangedCount: totalOf((candidate) => candidate.unchangedCount),
    nonComparableCount: totalOf((candidate) => candidate.nonComparableFindings.length),
    artifactKinds: written.bundle.lifecycleArtifacts.map((artifact) => artifact.artifactKind)
  };
}
697
1834
  export function explainLastRun(cwd = process.cwd()) {
698
1835
  const root = findWorkspaceRoot(cwd);
699
1836
  const config = loadAgentForgeConfig(root);