npm - valent-pipeline - Versions diffs - 0.5.3 → 0.5.5 - Mend

valent-pipeline 0.5.3 → 0.5.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/package.json +1 -1
package/pipeline/orchestrators/claude-code/plan.workflow.js +13 -5
package/pipeline/prompts/qa-a.md +1 -0
package/pipeline/scripts/query-kb.ts +7 -1
package/pipeline/steps/common/agent-protocol.md +2 -2
package/pipeline/steps/common/quality-standards.md +14 -0
package/pipeline/steps/critic/test-review.md +3 -0
package/pipeline/steps/orchestration/sprint-init.md +8 -3
package/pipeline/steps/orchestration/sprint-plan.md +11 -3
package/pipeline/steps/orchestration/sprint-review.md +5 -1
package/pipeline/steps/orchestration/sprint-size.md +3 -2
package/pipeline/steps/qa-a/write-spec.md +20 -0
package/pipeline/steps/retrospective/directives.md +7 -2
package/pipeline/templates/sprint-status.template.yaml +1 -0
package/src/lib/sprint.js +28 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "valent-pipeline",
-  "version": "0.5.3",
+  "version": "0.5.5",
   "description": "v3 multi-agent AI pipeline for software development lifecycle",
   "type": "module",
   "bin": {

package/pipeline/orchestrators/claude-code/plan.workflow.js CHANGED Viewed

@@ -101,6 +101,7 @@ const PACK_SCHEMA = {
     buffer_story_ids: { type: 'array', items: { type: 'string' } },
     points_planned: { type: 'integer' },
     remaining_capacity: { type: 'integer' },
+    over_budget: { type: 'boolean' },
   },
 }
@@ -291,10 +292,14 @@ const sized = await parallel(
           }),
           { label: `estimate:${est.toLowerCase()}:${g.storyId}`, phase: 'Size', schema: ESTIMATE_SCHEMA, model: modelFor(est) },
         )),
-    ).then((ests) => ({
-      ...g,
-      points: ests.filter(Boolean).reduce((sum, e) => sum + (e.points || 0), 0),
-    }))
+    ).then((ests) => {
+      // Each estimator sizes the WHOLE story from its surface's lens (BEND sees the full story,
+      // IAC sees the full story), so the points are the MAX single estimate — NOT the sum.
+      // Summing double-counts shared scaffolding: a 2-profile story is not twice the work, and
+      // summing systematically over-points multi-profile stories until they exceed velocity.
+      const vals = ests.filter(Boolean).map((e) => e.points || 0)
+      return { ...g, points: vals.length ? Math.max(...vals) : 0 }
+    })
   }),
 )
 const sizedStories = sized.filter(Boolean)
@@ -313,10 +318,13 @@ phase('Pack')
 // Deterministic greedy packing happens in code (src/lib/sprint.js), invoked via the CLI.
 const pack = await agent(
   `Run exactly: \`node .valent-pipeline/bin/cli.js sprint-pack --velocity ${velocity} --backlog ${backlogPath}\` ` +
-    `in the project root and return its stdout JSON verbatim (fields: sprint_stories, buffer_story_ids, points_planned, remaining_capacity).`,
+    `in the project root and return its stdout JSON verbatim (fields: sprint_stories, buffer_story_ids, points_planned, remaining_capacity, over_budget).`,
   { label: 'sprint-pack', phase: 'Pack', schema: PACK_SCHEMA, model: modelFor('PACK') },
 )
 log(`packed ${pack.sprint_stories.length} stories (${pack.points_planned} pts); buffer: ${pack.buffer_story_ids.length}`)
+if (pack.over_budget) {
+  log(`⚠ sprint ${sprintId} is OVER BUDGET: the highest-priority story exceeds velocity ${velocity} and was planned alone — consider splitting it (${pack.points_planned} pts vs ${velocity} velocity)`)
+}
 phase('Validate')
 // Write the human plan + machine status artifacts, tag the backlog, then cross-check in code.

package/pipeline/prompts/qa-a.md CHANGED Viewed

@@ -73,6 +73,7 @@ Before finalizing, verify:
 - No behavior is spec'd at multiple tiers
 - Every error test case has a specific error code and message pattern
 - Every P0 test case has DB state verification
+- Every P0/P1 AC on an interactive surface (`api`, `ui`) has a human-readable acceptance scenario realized as a Live Smoke Test (api) or UI Integration Smoke Test (ui) row for QA-B to execute against live infrastructure — never left to unit tests alone
 - Seed data uses factory patterns, not raw SQL
 - NFR requirements are tagged ([NFR-PERF], [NFR-SEC], [NFR-REL])
 - Visual validation checkpoints cover all 5 page states for each page (if UI story)

package/pipeline/scripts/query-kb.ts CHANGED Viewed

@@ -78,10 +78,16 @@ switch (mode) {
   case 'directives': {
     const agent = flags['agent'];
+    // ALL-DEV is a cross-cutting directive that applies to every developer agent. Include it when
+    // the queried agent is one of them, so a per-agent query still sees shared dev directives.
+    const DEV_AGENTS = ['BEND', 'FEND', 'DATA', 'MCP-DEV', 'LIBDEV', 'DOCGEN', 'IAC', 'MOBILE'];
     let rows;
     if (agent) {
+      const includeAllDev = DEV_AGENTS.includes(agent.toUpperCase());
       rows = db.prepare(
-        "SELECT id, directive, reason FROM correction_directives WHERE target_agent = ? AND status = 'active'"
+        includeAllDev
+          ? "SELECT id, target_agent, directive, reason FROM correction_directives WHERE (target_agent = ? OR target_agent = 'ALL-DEV') AND status = 'active'"
+          : "SELECT id, directive, reason FROM correction_directives WHERE target_agent = ? AND status = 'active'"
       ).all(agent) as { id: string; directive: string; reason: string }[];
     } else {
       rows = db.prepare(

package/pipeline/steps/common/agent-protocol.md CHANGED Viewed

@@ -42,7 +42,7 @@ When `signal_delivery` is `thread`: Lead steers you with the Design Council ques
 When you need information about project conventions, architectural patterns, existing code structure, or known pitfalls: self-serve from the knowledge base before exploring the codebase directly. Curated knowledge and correction directives answer in seconds what codebase exploration takes minutes to discover.
 **How to self-serve:**
-1. Read correction directives from `{correction_directives}` — filter for `status: active` entries targeting your agent role.
+1. Read correction directives from `{correction_directives}` — filter for `status: active` entries whose `target_agent` is your agent role (or `ALL-DEV` if you are a developer agent).
 2. Read curated knowledge files in `{curated_files_path}` — scan file names and section headers for entries relevant to your current task.
 3. If `{knowledge_mode}` is `sqlite`: query the database via CLI (see your read-inputs step for commands).
 4. If `{story_output_dir}/knowledge-context.md` exists: read it directly — this is a pre-compiled reference containing all relevant knowledge for the current story.
@@ -51,7 +51,7 @@ Reserve direct codebase exploration for when curated knowledge does not have the
 ## Correction Directives
-Read active correction directives from `{correction_directives}`. If the file does not exist or is empty, proceed without directives -- this is expected for new pipelines. Apply ALL directives targeting your agent role. If a directive conflicts with these instructions, the directive takes precedence. Log each applied directive in your YAML frontmatter under `correctionsApplied`.
+Read active correction directives from `{correction_directives}`. If the file does not exist or is empty, proceed without directives -- this is expected for new pipelines. Apply ALL directives whose `target_agent` matches your agent role **OR** is `ALL-DEV` when you are a developer agent (BEND, FEND, DATA, MCP-DEV, LIBDEV, DOCGEN, IAC, MOBILE) — `ALL-DEV` is a cross-cutting directive that applies to every developer agent, not one surface. If a directive conflicts with these instructions, the directive takes precedence. Log each applied directive in your YAML frontmatter under `correctionsApplied`.
 ## YAML Frontmatter

package/pipeline/steps/common/quality-standards.md CHANGED Viewed

@@ -10,8 +10,22 @@ These are non-negotiable. CRITIC and QA-B enforce them. Every developer agent (B
 - **<1.5 minutes per test** -- any test exceeding this is a design problem, not a timeout problem.
 - **Self-cleaning via fixture auto-teardown** -- tests must not leave state behind. Use framework teardown hooks, not manual cleanup.
 - **Explicit assertions in test bodies** -- never hide assertions in helpers. Every test body must contain at least one visible `expect`/`assert`.
+- **Falsifiable assertions** -- every assertion must be able to FAIL when the behavior is wrong. No unfalsifiable disjunctive gates like `expect(a === 0 || b === 0 || fallback).toBe(true)` — a disjunction with a catch-all that is almost always true asserts nothing. Assert the real invariant directly (e.g. "exactly one container exists AND `SELECT 1` succeeds AND no corruption log"), not a condition that passes regardless of outcome.
+- **Bind to the real artifact under test** -- resolve the actual artifact dynamically (built image id, running service/container name, the file the build emits) and assert against THAT. Never assert against a hardcoded tag/name or an artifact that merely happens to exist on your machine. The test: would it still pass on a clean/CI checkout with no leftover state? If it passes only because of an ambient/side-channel artifact, it is broken. (Prove the artifact exists — `expect(imageId).toBeTruthy()` — before inspecting it.)
+- **Statistical assertions match the spec** -- when the spec states a percentile or sample count (e.g. NFR-PERF "P95 < 200ms"), assert exactly that over real samples with warm-up where appropriate. A single-shot `elapsed < 200` is not a P95 and is flaky — it does not satisfy the spec.
 - **Parallel-safe** -- no shared mutable state between tests. Must run cleanly with `--workers=4`.
+## Pre-Handoff Self-Check — MANDATORY before declaring done
+CRITIC enforces every item below and will reject on any High finding, costing a full rework cycle. Run this checklist against your own tests BEFORE writing your handoff, and fix anything that fails first. (Each item has recurred across real stories — skipping this check guarantees a rework cycle.)
+- [ ] **Every P0 qa-test-spec case has a named, implemented test.** Count the spec's P0 cases; count your `it(...)`/`test(...)` blocks; a P0 case with no implementation is a High finding. Do not treat a spec note (e.g. "idempotency") as covered unless there is a named test for it.
+- [ ] **No `if`/`switch`/ternary in any test body** (same path every run).
+- [ ] **No unfalsifiable assertions** — no disjunctive/catch-all gates; each assert can fail.
+- [ ] **Tests bind to the real artifact** — no hardcoded tag/name; would pass on a clean/CI machine with no leftover state.
+- [ ] **Statistical/perf assertions match the spec's wording** (percentile + samples, not single-shot).
+- [ ] **No assertion weakening, no infra mocking on happy paths, no hard waits, assertions visible in test bodies.**
 ## Live Infrastructure Standards
 - **Live tests against running infrastructure** -- tests hit real systems. No mocking databases, APIs, pipelines, servers, or external services for happy-path verification.

package/pipeline/steps/critic/test-review.md CHANGED Viewed

@@ -21,6 +21,9 @@
 - **No test deletion** -- all qa-test-spec test cases must have corresponding tests. A missing test is a High finding.
 - **No hard waits** -- `sleep()`, `setTimeout()`, `page.waitForTimeout()` in tests is a High finding.
 - **No conditionals** -- `if` statements in test bodies are a High finding.
+- **Falsifiable assertions** -- a disjunctive/catch-all gate that is almost always true (e.g. `expect(a === 0 || b === 0 || transient).toBe(true)`) asserts nothing. A test that cannot fail when the behavior is wrong is a High finding. Require the real invariant.
+- **Artifact binding** -- a test that asserts against a hardcoded tag/name, or only passes because of an ambient/leftover artifact on the dev's machine (would fail on a clean/CI checkout), is a High finding. The artifact under test must be resolved dynamically and proven to exist before inspection.
+- **Statistical assertions match spec** -- when the spec states a percentile/sample count (NFR-PERF "P95 < Nms"), a single-shot threshold instead is a Med finding (the assertion does not match the spec).
 - **Explicit assertions** -- assertions hidden inside helper functions are a Med finding. Every test body must contain visible `expect`/`assert`.
 ## Output

package/pipeline/steps/orchestration/sprint-init.md CHANGED Viewed

@@ -19,9 +19,14 @@ node .valent-pipeline/bin/cli.js db query-velocity
 ```
 **Velocity rules:**
-- **Sprint 1:** Use `{sprint_initial_velocity}` from config (default: 60 points)
-- **Sprint 2-4:** Average points shipped across all completed sprints
-- **Sprint 5+:** Simple moving average of the last 5 sprints (older data ages out)
+Velocity is your *capacity* — how many points you can ship when capacity is the binding constraint. Only **capacity-constrained** sprints carry that signal. A **supply-constrained** sprint (one that shipped everything eligible with capacity to spare — it ran out of groomed/eligible work, not out of capacity) reflects how much work *existed*, not how much you *could do*, so counting it would falsely ratchet velocity down. Each sprint's status YAML summary records its `constraint:` (`capacity` or `supply`) — see sprint-review.md Step 2.
+- **Sprint 1:** Use `{sprint_initial_velocity}` from config (default: 60 points).
+- **Later sprints:** Moving average of `points_shipped` over **capacity-constrained sprints only** — exclude every `constraint: supply` sprint.
+  - 1-4 capacity-constrained sprints available: average their `points_shipped` (with exactly one, use its `points_shipped` as-is).
+  - 5+ available: SMA of the last 5 capacity-constrained sprints (older data ages out).
+- **If no capacity-constrained sprint has happened yet** (e.g. early sprints were all supply-constrained because the backlog is a dependency chain): keep `{sprint_initial_velocity}`. **Never lower velocity based on a supply-constrained sprint** — this is the failure that drives a healthy initial velocity down to a tiny number after one small starter story.
 Record: `current_velocity = {value}` points.

package/pipeline/steps/orchestration/sprint-plan.md CHANGED Viewed

@@ -12,9 +12,17 @@ node .valent-pipeline/bin/cli.js sprint-pack --velocity {current_velocity} --bac
 ```
 It emits JSON: `sprint_stories` (packed, in dependency-safe order), `buffer_story_ids`
-(groomed but not packed — the mid-sprint pull buffer, see Step 1b), `points_planned`, and
-`remaining_capacity`. Only `groomed` stories are eligible; lower `priority` number = higher
-priority; a groomed prerequisite is auto-included before its dependent when it fits the budget.
+(groomed but not packed — the mid-sprint pull buffer, see Step 1b), `points_planned`,
+`remaining_capacity`, and `over_budget`. Only `groomed` stories are eligible; lower `priority`
+number = higher priority; a groomed prerequisite is auto-included before its dependent when it
+fits the budget.
+**If `over_budget` is `true`:** no story fit the velocity budget, so the highest-priority groomed
+story (plus its groomed prerequisites) was planned ALONE and `points_planned` exceeds velocity
+(`remaining_capacity` is negative). This is the anti-stall path — the sprint still makes progress.
+Surface it to the user: this story is larger than a full sprint and is a **split candidate** — note
+it in the sprint plan and consider asking REQS to break it into smaller stories before a future run.
+Do NOT treat the negative `remaining_capacity` as room to groom more stories.
 Use this output directly for Steps 1b–8. If the backlog isn't the right input shape, pass an
 explicit story array with `--stories <path>` instead of `--backlog`.

package/pipeline/steps/orchestration/sprint-review.md CHANGED Viewed

@@ -20,7 +20,11 @@ Update `sprint-{n}-plan.md` Sprint Summary:
 - Points rolled over: sum of unexecuted story points
 - Total elapsed minutes: sum of execution time (not grooming)
 - Velocity this sprint: points_shipped (all shipped stories count toward velocity, including pulls)
-- Updated velocity (SMA-5): compute new moving average
+- **Sprint constraint:** classify this sprint so the next sprint-init calibrates velocity correctly:
+  - `capacity` — capacity was the binding constraint: stories were left in the groomed buffer or rolled over (more eligible work existed than fit), OR the plan was over-budget (a single oversized story planned alone). These sprints count toward velocity.
+  - `supply` — supply was the binding constraint: the sprint shipped all eligible/groomed work with capacity to spare (groomed buffer empty AND positive `capacity_remaining` at plan time). These sprints do NOT count toward velocity (they measure available work, not capacity).
+  Record as `constraint:` in the status YAML summary.
+- Updated velocity (SMA-5): recompute the moving average over **capacity-constrained sprints only** (see sprint-init.md Step 2). A `supply`-constrained sprint must not lower velocity.
 - Mid-sprint pulls: count and list (stories pulled from groomed buffer during execution)
 ## Step 3: Finalize Sprint Status YAML

package/pipeline/steps/orchestration/sprint-size.md CHANGED Viewed

@@ -34,8 +34,9 @@ For each story with status `groomed`:
    - `iac` in profiles → send to IAC
    Multiple profiles can be active (e.g., `[api, data-pipeline]` sends to both BEND and DATA).
 4. Agents write estimation files (`{agent}-estimation.md`)
-5. **Record points:** sum all agent estimates for the story.
-   `story_points = sum of all agent estimates received`
+5. **Record points:** take the **maximum** single estimate, NOT the sum.
+   `story_points = max of all agent estimates received`
+   Each estimator sizes the *whole* story from its surface's lens (BEND sizes the entire story, IAC sizes the entire story), so their estimates overlap on shared scaffolding. Summing double-counts that overlap and systematically over-points multi-profile stories until they exceed velocity. The max is the best-informed single read. (If you believe two surfaces carry genuinely independent, non-overlapping work, flag it for the Lead rather than silently summing.)
 6. Update story's `story_points` field in `{backlog_path}`
 ## Step 3: Update Sprint State

package/pipeline/steps/qa-a/write-spec.md CHANGED Viewed

@@ -31,6 +31,17 @@ Structure per AC:
 - **Edge cases** (P0: required, P1: key boundaries, P2: optional, P3: omit)
 - **Concurrency** (P0: required, P1: if applicable, P2-P3: omit)
+### The Given-When-Then cases are human-readable acceptance scenarios — QA-B executes them
+Your Given-When-Then cases are not just developer test stubs; they are the **human-readable acceptance contract** for the story, written in plain user/consumer language (what a person does and observes), independent of implementation. They describe behavior the way a real developer or QA would verify it by hand.
+For **interactive surfaces this is mandatory and first-class** — it is how the pipeline tests like a real-world human developer, not just by trusting unit tests:
+- **`api` profile →** realize each acceptance scenario as a row in the **Live Smoke Tests** table (see `api.md`): real HTTP method/URL/body → expected status + response, plus a verification request for every mutation. QA-B starts the real server and drives these with real requests.
+- **`ui` profile →** realize each user-facing scenario as a row in the **UI Integration Smoke Tests** table (see `ui.md`): user action → real API call → expected UI state → DB verification, backed by a real-browser E2E test. QA-B runs these against the live UI + API + DB (and PMCP validates the visual checkpoints).
+These acceptance scenarios are **owned and executed by QA-B against live infrastructure** — they are the real-world, black-box evidence JUDGE relies on, distinct from (and never replaced by) the developer's own unit tests. Every P0/P1 AC on an interactive surface MUST have one. The Step 9b quality bar below governs the *automated test code*; it complements these acceptance scenarios — it does not replace them.
 ## Step 4: Database State Verification
 Per-risk DB verification:
@@ -72,6 +83,15 @@ For each NFR-sensitive path: `[NFR-PERF]` response time + load patterns; `[NFR-S
 - If `ui` in `{testing_profiles}` → **MANDATORY.** Read `uxa-spec.md`. If `uxa-spec.md` is missing, send `[BLOCKER]` to Lead — do NOT proceed without it. For each page state define: Checkpoint ID (VV-{NNN}), Page/Route, State (Default/Loading/Empty/Error/Success or custom), AC Reference, Area labels in scope, Screenshot filename (`{story_id}_VV-{NNN}_{page}_{state}.png`), Expected visual elements, Setup instructions, Pass criteria. Write to `{story_output_dir}/visual-validation-checklist.md`.
 - If `ui` NOT in `{testing_profiles}` → skip, note "N/A — no UI profile."
+## Step 9b: Test Quality Bar — make every case implementable AND falsifiable
+The dev agents implement exactly what you specify, and CRITIC rejects tests that are weak, unfalsifiable, or bound to the wrong artifact. Spec the bar IN so it is built right the first time (each rule below traces to a real rework cycle):
+- **Every P0 case is a named, must-implement test** — never leave a P0 (or an idempotency/concurrency requirement for stateful resources) as a prose note. If it matters, give it its own case ID and assertion target so the dev agent implements it directly. For stateful resources (volumes, DBs), enumerate an explicit idempotency case (apply twice → assert no-recreate / no re-init / data intact).
+- **Falsifiable assertion target** — state the real invariant and the exact expected value (status code, row/column value, count, error code + message regex). Forbid disjunctive/catch-all expectations: never spec "passes if A or B or fallback." The assertion must be able to fail when the behavior is wrong.
+- **Bind to the real artifact** — when a case inspects a built artifact (image, container, file), specify that the test resolves it dynamically (the compose-built image id, the actual service/container name) and proves it exists before inspecting it — never a hardcoded tag. Note it must pass on a clean/CI machine with no leftover state.
+- **NFR-PERF wording is statistical** — specify the percentile AND sample count + warm-up (e.g. "P95 < 200ms over 20 sequential samples after one warm-up request"), not a single-shot threshold.
 ## Step 10: Write Final Outputs
 - Write to `{story_output_dir}/qa-test-spec.md` and `{story_output_dir}/visual-validation-checklist.md` (if applicable)

package/pipeline/steps/retrospective/directives.md CHANGED Viewed

@@ -12,13 +12,18 @@ If pattern touches an invariant, do NOT write a CD. Instead:
 For patterns NOT conflicting with invariants:
+**Scope the directive to the failure mode, not the agent that tripped it.** Before setting `target_agent`, ask: *could another agent hit this same pattern?* Many failure modes are cross-cutting — test quality (missing P0 cases, conditionals/unfalsifiable assertions in tests, artifact-binding, hard waits), handoff-format violations, infra-mocking — and apply to **every** developer agent, not just the one that happened to surface it this sprint. For those:
+- Set `target_agent: ALL-DEV` (BEND, FEND, DATA, MCP-DEV, LIBDEV, DOCGEN, IAC, MOBILE) so the lesson generalizes; do NOT pin a cross-cutting test-quality directive to the single agent that tripped it.
+- Prefer reinforcing the shared contract: if the pattern is a standing rule, the durable fix is `.valent-pipeline/steps/common/quality-standards.md` (the dev self-check) or the QA-A spec / CRITIC checklist — note that in the directive's `reason`. A per-agent directive that restates a shared standard is noise.
+- Only use a single `target_agent` when the pattern is genuinely specific to that agent's surface (e.g. an IAC-only compose idiom, a FEND-only component pattern).
 **ADD** new directives:
 ```yaml
 - id: CD-{batch_number}-{seq}
   status: active
-  target_agent: {agent}
+  target_agent: {agent | ALL-DEV}
   directive: "{what to do differently}"
-  reason: "{evidence from batch}"
+  reason: "{evidence from batch; if it restates a shared standard, name the file it belongs in}"
   impact_level: low | medium | high
   created_batch: {batch_number}
   last_reinforced_batch: {batch_number}

package/pipeline/templates/sprint-status.template.yaml CHANGED Viewed

@@ -54,3 +54,4 @@ summary:  # filled post-sprint
   total_elapsed_minutes: null
   velocity_this_sprint: null
   velocity_sma5: null
+  constraint: null  # capacity | supply — only `capacity` sprints count toward velocity (see sprint-review.md Step 2)

package/src/lib/sprint.js CHANGED Viewed

@@ -31,13 +31,19 @@ function depsOf(story) {
  * @param {Array} stories - candidate stories: { id, points|story_points, priority, depends_on, status }
  * @param {number} velocity - capacity in story points
  * @returns {{ sprint_stories: string[], buffer_story_ids: string[], points_planned: number,
- *             remaining_capacity: number, velocity: number }}
+ *             remaining_capacity: number, velocity: number, over_budget: boolean }}
  *
  * Only `groomed` stories are eligible (matches the source). Lower `priority` number = higher
  * priority; missing priority sorts last. A prerequisite is auto-included only when it is also
  * `groomed` and fits the remaining budget — deps already `shipped`/done are assumed satisfied
  * and silently skipped, exactly as the prose specifies.
  *
+ * Anti-stall: if no story fits the budget at all (the smallest groomed story is larger than the
+ * whole velocity), the highest-priority groomed story is planned ALONE — with its groomed
+ * prerequisites pulled in first — accepting an over-budget sprint (`over_budget: true`,
+ * `remaining_capacity` may go negative). This prevents an oversized story from stalling the
+ * project forever; the planner should surface it as a split candidate.
+ *
  * Preserved quirk: if a story's dependency chain does not fully fit, the deps already added
  * for it stay in the sprint (capacity is not rolled back) and the dependent is skipped. This
  * matches the source pseudocode; `validateSprint` is the safety net for ordering consistency.
@@ -85,6 +91,26 @@ export function packSprint(stories, velocity) {
     // else: skip this story, try smaller ones to fill capacity
   }
+  // Anti-stall: if NOTHING fit the budget but groomed work exists, the smallest eligible story is
+  // larger than the entire velocity. Returning an empty sprint would stall the project forever
+  // (that story can never be packed, so the loop never progresses). Plan the highest-priority
+  // groomed story alone — pulling in its groomed prerequisites first so order stays dependency-safe
+  // — accepting an over-budget sprint. `over_budget` is flagged so the planner can surface
+  // "this story exceeds velocity; planned alone — consider splitting it."
+  let overBudget = false;
+  if (sprintStories.length === 0 && byPriority.length > 0) {
+    const forceAdd = (story) => {
+      if (inSprint.has(story.id)) return;
+      for (const depId of depsOf(story)) {
+        const dep = byId.get(depId);
+        if (dep && dep.status === 'groomed' && !inSprint.has(depId)) forceAdd(dep);
+      }
+      add(story);
+    };
+    forceAdd(byPriority[0]);
+    overBudget = true;
+  }
   const buffer = groomed.filter((s) => !inSprint.has(s.id)).map((s) => s.id);
   const pointsPlanned = sprintStories.reduce((sum, id) => sum + pointsOf(byId.get(id)), 0);
@@ -94,6 +120,7 @@ export function packSprint(stories, velocity) {
     points_planned: pointsPlanned,
     remaining_capacity: remaining,
     velocity,
+    over_budget: overBudget,
   };
 }