npm - @tangle-network/agent-eval - Versions diffs - 0.72.3 → 0.73.0 - Mend

@tangle-network/agent-eval 0.72.3 → 0.73.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/campaign/index.d.ts +34 -1
package/dist/campaign/index.js +75 -0
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-JHA3ZGSO.js → chunk-XGNCBAVZ.js} +326 -1
package/dist/chunk-XGNCBAVZ.js.map +1 -0
package/dist/index.d.ts +1 -1
package/dist/index.js +3 -1
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/traces.d.ts +75 -2
package/dist/traces.js +3 -1
package/package.json +1 -1
package/dist/chunk-JHA3ZGSO.js.map +0 -1

package/dist/campaign/index.d.ts CHANGED Viewed

@@ -1059,6 +1059,39 @@ interface ScoreboardRow {
  * evidence behind the verdict.
  */
 declare function userStoryScoreboard(verdicts: readonly UserStoryVerdict[]): ScoreboardRow[];
+/**
+ * Launch-readiness headline counts rolled up from the per-requirement rows.
+ * This is the shape returned by `scoreboardSummary`.
+ */
+interface ScoreboardSummary {
+    /** Distinct user stories on the board (rows are grouped by `storyId`). */
+    stories: number;
+    /** Stories whose every requirement passed. */
+    storiesFullyComplete: number;
+    /** Total (story, requirement) rows. */
+    requirements: number;
+    /** Rows with status PASS. */
+    passed: number;
+    /** Rows that did not pass; computed as `requirements - passed`. */
+    failed: number;
+    /** passed / requirements, as a fraction (not a percentage); 0 when there are no rows. */
+    passRate: number;
+}
+/**
+ * Roll the per-requirement rows up into the launch headline counts.
+ *
+ * @param rows Scoreboard rows to summarise; an empty list yields all-zero counts.
+ * @returns Story and requirement totals plus the overall pass rate.
+ */
+declare function scoreboardSummary(rows: readonly ScoreboardRow[]): ScoreboardSummary;
+/** Optional presentation settings accepted by `renderScoreboardMarkdown`. */
+interface ScoreboardRenderOptions {
+    /** Document H1. Defaults to "Product-flow playback scoreboard". */
+    title?: string;
+    /**
+     * Key/value run metadata (e.g. runId, backend, model, date), rendered as a
+     * bullet list directly under the H1 and before the roll-up counts.
+     */
+    meta?: Record<string, string>;
+    /**
+     * Max chars of joined evidence shown per row; longer evidence is cut and
+     * ends with an ellipsis. Default 160.
+     */
+    maxEvidenceChars?: number;
+}
+/**
+ * Render the scoreboard as a launch-readiness Markdown document — the literal
+ * "tick off every user story" artifact: a headline roll-up, the open tickets
+ * (FAIL rows) up top as the launch blockers, then a per-story table of
+ * requirement → PASS/FAIL with the evidence behind each verdict. Pure: same
+ * rows in, same bytes out (no clock/random), so it is safe to snapshot.
+ *
+ * @param rows Scoreboard rows; stories are listed in order of first appearance.
+ * @param opts Optional title, run metadata, and evidence length limit.
+ * @returns The Markdown document as a single newline-joined string.
+ */
+declare function renderScoreboardMarkdown(rows: readonly ScoreboardRow[], opts?: ScoreboardRenderOptions): string;
 /**
  * @experimental
@@ -1239,4 +1272,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
  *  as a ref under the adapter's worktree dir. */
 declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
-export { type AcceptedEdit, type AceDriverOptions, type AnalystArtifact, type AnalystScenario, type ApplySkillPatchResult, type BuildAnalystSurfaceDispatchOptions, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DRIVER_GUIDE, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverGoal, type DriverGuideEntry, type DriverName, type DriverPairwise, type DriverRecommendation, type DriverScore, type DriverStrategy, type DriverSurface, type FailureModeRecallJudgeOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HaloDriverOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type MemoryCurationDriverOptions, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type PlaybackContext, type PlaybackDriver, type PlaybackStep, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, ProposeContext, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, RunImprovementLoopOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type ScoreboardRow, type SelectDriverCriteria, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type TraceAnalystDriverOptions, type UserStory, type UserStoryVerdict, type Worktree, type WorktreeAdapter, WorktreeAdapterError, aceDriver, applySkillPatch, buildAnalystSurfaceDispatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, failureModeRecallJudge, gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, 
haloDriver, heldoutSignificance, makePlaybackDispatch, memoryCurationDriver, pairHoldout, parseSkillPatchResponse, patchEditCount, resolveWorktreePath, runProfileMatrix, runSkillOpt, scoreUserStory, selectDriver, skillOptDriver, skillOptEntry, traceAnalystDriver, userStoryScoreboard };
+export { type AcceptedEdit, type AceDriverOptions, type AnalystArtifact, type AnalystScenario, type ApplySkillPatchResult, type BuildAnalystSurfaceDispatchOptions, type CampaignBreakdown, CampaignResult, CampaignStorage, CodeSurface, type CompareDriversOptions, DRIVER_GUIDE, type DimensionRegression, DispatchContext, type DriverComparison, type DriverEntry, type DriverGoal, type DriverGuideEntry, type DriverName, type DriverPairwise, type DriverRecommendation, type DriverScore, type DriverStrategy, type DriverSurface, type FailureModeRecallJudgeOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, type HaloDriverOptions, type HeldoutSignificance, type HeldoutSignificanceOptions, ImprovementDriver, JudgeConfig, JudgeScore, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type MemoryCurationDriverOptions, MutableSurface, type OptimizerEntryConfig, type PairedHoldout, type PlaybackContext, type PlaybackDriver, type PlaybackStep, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, ProposeContext, type ProposePatchesArgs, type RejectedEdit, RunCampaignOptions, RunImprovementLoopOptions, type RunProfileMatrixOptions, type RunProfileMatrixResult, type RunSkillOptOptions, type RunSkillOptResult, Scenario, type ScenarioRollup, type ScoreboardRenderOptions, type ScoreboardRow, type ScoreboardSummary, type SelectDriverCriteria, type SkillOptDriver, type SkillOptDriverOptions, type SkillOptEpochRecord, type SkillOptEvidence, type SkillPatch, type SkillPatchOp, SkillPatchParseError, type SkillPatchRejection, type TraceAnalystDriverOptions, type UserStory, type UserStoryVerdict, type Worktree, type WorktreeAdapter, WorktreeAdapterError, aceDriver, applySkillPatch, buildAnalystSurfaceDispatch, campaignBreakdown, campaignMeanComposite, compareDrivers, detectScale, dimensionRegressions, failureModeRecallJudge, 
gepaParetoEntry, gepaReflectionEntry, gitWorktreeAdapter, haloDriver, heldoutSignificance, makePlaybackDispatch, memoryCurationDriver, pairHoldout, parseSkillPatchResponse, patchEditCount, renderScoreboardMarkdown, resolveWorktreePath, runProfileMatrix, runSkillOpt, scoreUserStory, scoreboardSummary, selectDriver, skillOptDriver, skillOptEntry, traceAnalystDriver, userStoryScoreboard };

package/dist/campaign/index.js CHANGED Viewed

@@ -1396,6 +1396,79 @@ function userStoryScoreboard(verdicts) {
   }
   return rows;
 }
+/**
+ * Roll per-requirement scoreboard rows up into headline counts.
+ *
+ * Rows are grouped by `storyId`; a story counts as fully complete when every
+ * one of its rows has status "PASS". Any row whose status is not "PASS" is
+ * counted under `failed`.
+ *
+ * @param rows Scoreboard rows (each read for `storyId` and `status`).
+ * @returns Counts of stories, fully complete stories, requirements, passed,
+ *   failed, and the pass rate (0 when `rows` is empty).
+ */
+function scoreboardSummary(rows) {
+  // storyId -> running tally of rows seen and rows passed for that story.
+  const tallies = new Map();
+  let passCount = 0;
+  for (const row of rows) {
+    let tally = tallies.get(row.storyId);
+    if (tally === undefined) {
+      tally = { total: 0, passed: 0 };
+      tallies.set(row.storyId, tally);
+    }
+    tally.total += 1;
+    if (row.status === "PASS") {
+      tally.passed += 1;
+      passCount += 1;
+    }
+  }
+  const storiesFullyComplete = [...tallies.values()].filter(
+    (tally) => tally.total > 0 && tally.passed === tally.total
+  ).length;
+  const requirements = rows.length;
+  return {
+    stories: tallies.size,
+    storiesFullyComplete,
+    requirements,
+    passed: passCount,
+    failed: requirements - passCount,
+    passRate: requirements === 0 ? 0 : passCount / requirements
+  };
+}
+/**
+ * Make a string safe to place inside a single Markdown table cell.
+ *
+ * Pipes are backslash-escaped so they are not read as column separators, and
+ * every line ending (CRLF, lone LF, or lone CR) is replaced by one space so
+ * the table row stays on a single line.
+ *
+ * @param s Raw cell text.
+ * @returns The text with pipes escaped and line endings flattened to spaces.
+ */
+function escapeCell(s) {
+  // CRLF is listed first so it collapses to one space rather than two.
+  return s.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
+}
+/**
+ * Shorten a string to at most `max` UTF-16 code units, marking the cut with a
+ * trailing ellipsis (U+2026). Strings that already fit are returned unchanged.
+ *
+ * @param s Text to shorten.
+ * @param max Maximum length; when `max` is below 1 a too-long string becomes
+ *   just the ellipsis.
+ * @returns `s` itself, or a prefix of `s` followed by the ellipsis.
+ */
+function truncate2(s, max) {
+  if (s.length <= max) return s;
+  let cut = Math.max(0, max - 1);
+  if (cut > 0) {
+    // Do not end the prefix on a high surrogate: that would split a surrogate
+    // pair (e.g. an emoji) and leave a malformed character before the ellipsis.
+    const last = s.charCodeAt(cut - 1);
+    if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
+  }
+  return `${s.slice(0, cut)}\u2026`;
+}
+/**
+ * Render scoreboard rows as a Markdown launch-readiness document.
+ *
+ * Layout, top to bottom: an H1 title, an optional bullet list of run
+ * metadata, a one-line roll-up of story/requirement counts, an "Open tickets"
+ * table of FAIL rows (or a note that there are none), then one table per
+ * story listing each requirement with its status and evidence. Stories appear
+ * in the order their `storyId` first occurs in `rows`.
+ *
+ * @param rows Scoreboard rows (read for `storyId`, `storyTitle`, `reqTitle`,
+ *   `status`, and `evidence`).
+ * @param opts Optional `title`, `meta`, and `maxEvidenceChars` (default 160).
+ * @returns The document as a single string with lines joined by "\n".
+ */
+function renderScoreboardMarkdown(rows, opts = {}) {
+  const maxEv = opts.maxEvidenceChars ?? 160;
+  const sum = scoreboardSummary(rows);
+  const pct = (n) => `${Math.round(n * 100)}%`;
+  // Evidence items are joined, truncated, then escaped; empty evidence shows a dash.
+  const ev = (e) => escapeCell(truncate2(e.join("; "), maxEv)) || "\u2014";
+  const out = [`# ${opts.title ?? "Product-flow playback scoreboard"}`, ""];
+  if (opts.meta) {
+    for (const [k, v] of Object.entries(opts.meta)) out.push(`- **${k}:** ${v}`);
+    out.push("");
+  }
+  out.push(
+    `**${sum.storiesFullyComplete}/${sum.stories}** user stories fully shipped \xB7 **${sum.passed}/${sum.requirements}** requirements passing (${pct(sum.passRate)}) \xB7 **${sum.failed}** open`,
+    ""
+  );
+  // NOTE(review): the "open" count above is every non-PASS row, while this list
+  // only includes status === "FAIL"; they agree as long as status is PASS or FAIL.
+  const fails = rows.filter((r) => r.status === "FAIL");
+  if (fails.length > 0) {
+    out.push("## Open tickets", "", "| Story | Requirement | Evidence |", "| --- | --- | --- |");
+    for (const r of fails) {
+      out.push(`| ${escapeCell(r.storyTitle)} | ${escapeCell(r.reqTitle)} | ${ev(r.evidence)} |`);
+    }
+    out.push("");
+  } else {
+    out.push("_All requirements passing \u2014 no open tickets._", "");
+  }
+  out.push("## Per-story tick-off", "");
+  // Group rows by story in one pass. A Map iterates in insertion order, so
+  // stories keep the order in which they first appear in `rows`.
+  const rowsByStory = new Map();
+  for (const r of rows) {
+    const group = rowsByStory.get(r.storyId);
+    if (group) group.push(r);
+    else rowsByStory.set(r.storyId, [r]);
+  }
+  for (const storyRows of rowsByStory.values()) {
+    const passed = storyRows.filter((r) => r.status === "PASS").length;
+    const mark = passed === storyRows.length ? "\u2705" : "\u26A0\uFE0F";
+    out.push(
+      // The section heading uses the title carried by the story's first row.
+      `### ${escapeCell(storyRows[0].storyTitle)} \u2014 ${passed}/${storyRows.length} ${mark}`,
+      "",
+      "| Requirement | Status | Evidence |",
+      "| --- | --- | --- |"
+    );
+    for (const r of storyRows) {
+      out.push(
+        `| ${escapeCell(r.reqTitle)} | ${r.status === "PASS" ? "\u2705 PASS" : "\u274C FAIL"} | ${ev(r.evidence)} |`
+      );
+    }
+    out.push("");
+  }
+  return out.join("\n");
+}
 // src/campaign/presets/run-profile-matrix.ts
 import { createHash as createHash2 } from "crypto";
@@ -1725,6 +1798,7 @@ export {
   patchEditCount,
   provenanceRecordPath,
   provenanceSpansPath,
+  renderScoreboardMarkdown,
   resolveWorktreePath,
   runCampaign,
   runEval,
@@ -1733,6 +1807,7 @@ export {
   runProfileMatrix,
   runSkillOpt,
   scoreUserStory,
+  scoreboardSummary,
   selectDriver,
   skillOptDriver,
   skillOptEntry,