npm - @tekyzinc/gsd-t - Versions diffs - 3.26.10 → 3.27.10 - Mend

@tekyzinc/gsd-t 3.26.10 → 3.27.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/CHANGELOG.md +64 -0
package/README.md +2 -0
package/bin/context-budget-audit.cjs +17 -2
package/bin/gsd-t-build-coverage.cjs +438 -0
package/bin/gsd-t-ci-parity.cjs +500 -0
package/bin/gsd-t-economics.cjs +37 -9
package/bin/gsd-t.js +21 -0
package/bin/model-windows.cjs +99 -0
package/bin/model-windows.test.cjs +75 -0
package/bin/runway-estimator.cjs +35 -5
package/bin/token-budget.cjs +12 -3
package/commands/gsd-t-help.md +14 -0
package/commands/gsd-t-milestone.md +21 -5
package/commands/gsd-t-promote-debt.md +1 -1
package/commands/gsd-t-scan.md +6 -6
package/commands/gsd-t-verify.md +46 -0
package/package.json +1 -1
package/scripts/context-meter/transcript-parser.js +12 -2
package/scripts/context-meter/transcript-parser.test.js +51 -4
package/scripts/gsd-t-calibration-hook.js +8 -1
package/scripts/gsd-t-context-meter.e2e.test.js +45 -6
package/scripts/gsd-t-context-meter.js +17 -3
package/scripts/gsd-t-context-meter.test.js +85 -0
package/templates/CLAUDE-global.md +30 -0
package/templates/stacks/design-to-code.md +1 -1

package/scripts/gsd-t-context-meter.e2e.test.js CHANGED Viewed

@@ -74,7 +74,7 @@ class Sandbox {
    * The charCount parameter controls how many characters of text content
    * are in the transcript, which determines the estimated token count.
    */
-  writeTranscript(filename = "transcript.jsonl", charCount = 100) {
+  writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
     const userText = "x".repeat(Math.floor(charCount / 2));
     const assistantText = "y".repeat(Math.ceil(charCount / 2));
     const lines = [
@@ -89,7 +89,7 @@ class Sandbox {
         message: {
           role: "assistant",
           content: [{ type: "text", text: assistantText }],
-          model: "claude-opus-4-6",
+          model,
         },
         uuid: "a1",
         sessionId: "sess-1",
@@ -231,7 +231,9 @@ afterEach(async () => {
 /* ──────────────────────────── tests ──────────────────────────── */
 test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
-  // 100 chars of text content → ~29 tokens (100/3.5) → 0.014% of 200K window
+  // 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
+  // EFFECTIVE window is the real 1M (model-aware sizing), not the config's
+  // legacy 200K — ~0.003% of 1M, well below threshold.
   sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
   const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
   assert.equal(state.version, 1);
   assert.ok(state.inputTokens > 0, "should have estimated some tokens");
   assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
-  assert.equal(state.modelWindowSize, 200000);
+  assert.equal(
+    state.modelWindowSize,
+    1_000_000,
+    "window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
+  );
   assert.ok(state.pct < 1, "pct should be well below threshold");
   assert.equal(state.threshold, "normal");
   assert.equal(state.checkCount, 1);
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
 });
 test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
-  // 600K chars → ~171K tokens → 85.7% of 200K window → warn band + additionalContext
+  // Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
+  // threshold band + additionalContext. (Model-aware sizing means we pin a
+  // 200K-window model here rather than relying on a stale config default.)
   sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
-  const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 600000);
+  const transcriptPath = sandbox.writeTranscript(
+    "transcript.jsonl",
+    600000,
+    "claude-haiku-4-5-20251001"
+  );
   const { stdout, code } = await sandbox.runHook({
     payload: { session_id: "test-above", transcript_path: transcriptPath },
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
   assert.equal(sandbox.tmpFileExists(), false);
 });
+test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
+  // The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
+  // hardcoded 200K window this read as ~85% → false headless handoff while
+  // ~83% of context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
+  // 171K is only ~17% → stdout {} → no premature handoff.
+  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
+  const transcriptPath = sandbox.writeTranscript(
+    "transcript.jsonl",
+    600000,
+    "claude-opus-4-7"
+  );
+  const { stdout, code } = await sandbox.runHook({
+    payload: { session_id: "test-regression", transcript_path: transcriptPath },
+  });
+  assert.equal(code, 0);
+  const parsed = JSON.parse(stdout || "{}");
+  assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
+  const state = sandbox.readState();
+  assert.equal(state.modelWindowSize, 1_000_000);
+  assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
+  assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
+  assert.equal(state.threshold, "normal");
+});
 test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
   sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });

package/scripts/gsd-t-context-meter.js CHANGED Viewed

@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
 const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
 const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
 const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
+const { windowForModel } = require("../bin/model-windows.cjs");
 const STATE_VERSION = 1;
@@ -208,6 +209,18 @@ async function runMeter(opts) {
       return {};
     }
+    // 5b. Resolve the EFFECTIVE context window from the model the orchestrator
+    // session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
+    // ship a 1M window; the config default (200k) is a legacy fallback that
+    // overcounts usage 5× and fires the headless handoff far too early. A
+    // reported model always wins, even over an explicit project config value;
+    // only a missing or empty model falls through to cfg.modelWindowSize.
+    const effectiveWindow =
+      typeof parsed.model === "string" && parsed.model.length > 0
+        ? windowForModel(parsed.model)
+        : cfg.modelWindowSize;
+    state.modelWindowSize = effectiveWindow;
     // 6. Estimate tokens locally (no API call, zero cost).
     let tokenResp;
     try {
@@ -237,7 +250,7 @@ async function runMeter(opts) {
     // 8. Success path — compute pct, band, possibly emit additionalContext.
     const pct = computePct({
       inputTokens: tokenResp.inputTokens,
-      modelWindowSize: cfg.modelWindowSize,
+      modelWindowSize: effectiveWindow,
     });
     const band = bandFor(pct, cfg.thresholdPct);
@@ -251,13 +264,14 @@ async function runMeter(opts) {
       logPath,
       "INFO",
       "measure",
-      `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}`,
+      `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
+        `window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
       clock
     );
     const additionalContext = buildAdditionalContext({
       pct,
-      modelWindowSize: cfg.modelWindowSize,
+      modelWindowSize: effectiveWindow,
       thresholdPct: cfg.thresholdPct,
     });
     if (additionalContext) {

package/scripts/gsd-t-context-meter.test.js CHANGED Viewed

@@ -384,3 +384,88 @@ test("12. clock injection — timestamp uses injected clock", async () => {
   const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
   assert.equal(state.timestamp, fixed.toISOString());
 });
+/* ── M-fix: model-aware context window (the reported regression) ───────── */
+test("13. Opus 4.7 @ ~36% of a 1M window stays 'normal' (regression repro)", async () => {
+  // The exact reported symptom: ~360k tokens used on an Opus 4.7 session.
+  // With the old hardcoded 200k window this computed 180% → premature
+  // headless handoff at ~64% of context REMAINING. With model-aware sizing
+  // the window is 1M, so 360k = 36% = normal, no handoff.
+  seedState(tmpRoot, { checkCount: 4 });
+  const out = await runMeter({
+    payload: makePayload(),
+    projectRoot: tmpRoot,
+    _loadConfig: () => makeConfig(), // config still says 200k — must be overridden
+    _parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7" }),
+    _estimateTokens: () => ({ inputTokens: 360000 }),
+  });
+  // No handoff marker — this is the whole point of the fix.
+  assert.deepEqual(out, {});
+  const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
+  assert.equal(state.modelWindowSize, 1_000_000, "window resolved from model, not config");
+  assert.equal(state.pct, 36, "360k / 1M = 36%");
+  assert.equal(state.threshold, "normal");
+});
+test("14. Opus 4.7 @ 80% of the true 1M window DOES hand off", async () => {
+  // The handoff must still fire at the real 75% threshold against the
+  // corrected window — we keep the guard, we just size it correctly.
+  seedState(tmpRoot, { checkCount: 4 });
+  const out = await runMeter({
+    payload: makePayload(),
+    projectRoot: tmpRoot,
+    _loadConfig: () => makeConfig(),
+    _parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7-20260115" }),
+    _estimateTokens: () => ({ inputTokens: 800000 }), // 80% of 1M > 75%
+  });
+  assert.equal(out.additionalContext, "next-spawn-headless:true");
+  const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
+  assert.equal(state.modelWindowSize, 1_000_000);
+  assert.equal(state.pct, 80);
+  assert.equal(state.threshold, "threshold");
+});
+test("15. no model in transcript → falls back to config window (back-compat)", async () => {
+  // Existing transcripts / stubs without a model field must behave exactly
+  // as before: config's modelWindowSize governs.
+  seedState(tmpRoot, { checkCount: 4 });
+  const out = await runMeter({
+    payload: makePayload(),
+    projectRoot: tmpRoot,
+    _loadConfig: () => makeConfig({ modelWindowSize: 200000 }),
+    _parseTranscript: async () => FAKE_PARSED, // no `model` key
+    _estimateTokens: () => ({ inputTokens: 160000 }), // 80% of 200k
+  });
+  assert.equal(out.additionalContext, "next-spawn-headless:true");
+  const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
+  assert.equal(state.modelWindowSize, 200000);
+  assert.equal(state.pct, 80);
+});
+test("16. Haiku session correctly sized at 200k (not over-large 1M)", async () => {
+  seedState(tmpRoot, { checkCount: 4 });
+  const out = await runMeter({
+    payload: makePayload(),
+    projectRoot: tmpRoot,
+    _loadConfig: () => makeConfig(),
+    _parseTranscript: async () => ({
+      ...FAKE_PARSED,
+      model: "claude-haiku-4-5-20251001",
+    }),
+    _estimateTokens: () => ({ inputTokens: 170000 }), // 85% of 200k
+  });
+  assert.equal(out.additionalContext, "next-spawn-headless:true");
+  const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
+  assert.equal(state.modelWindowSize, 200000);
+  assert.equal(state.pct, 85);
+});

package/templates/CLAUDE-global.md CHANGED Viewed

@@ -537,6 +537,12 @@ BEFORE EVERY COMMIT:
   │     YES → Verify test names and paths are referenced in requirements
   ├── Did I change UI, routes, or user flows?
   │     YES → Update affected E2E test specs (Playwright/Cypress)
+  ├── Did I add a new top-level dir, or change build/CI config?
+  │     This is ENFORCED MECHANICALLY by `gsd-t-verify` Step 2.6
+  │     (CI-Parity Gate: `gsd-t build-coverage` + `gsd-t ci-parity`,
+  │     FAIL-blocking). You do NOT self-attest this — verify runs the
+  │     real CI build. It exists because TimeTracking v1.10.12 shipped
+  │     VERIFIED+tagged with a new dir absent from the Dockerfile COPY.
   └── Did I run the affected tests?
         YES → Verify they pass. NO → Run them now.
 ```
@@ -572,6 +578,30 @@ BEFORE reporting "done" or presenting a summary:
 **The test for this gate**: If the user asks "did you update all the documents?" and the answer would be "no, I missed some" — you failed this gate. The user should never need to ask.
+## Effort Estimates — GSD-T-Native Units (MANDATORY)
+**NEVER express effort or scope in developer-hours, dev-days, sprints, story points, or person-weeks.** GSD-T operates on a different cost model — the worker is Claude, not a human team — and human-time estimates have no predictive value for GSD-T workflows. They actively mislead by suggesting a calendar shape that doesn't match how the system runs.
+Use GSD-T-native units instead:
+| Unit | When to use |
+|------|-------------|
+| **Domain count** | Milestone scope (1-2 simple, 3-4 medium, 5+ complex) |
+| **Wave count** | Cross-domain dependency depth — how many serial gates exist |
+| **Parallel-domain count** | How many domains can run concurrently (file-disjoint) |
+| **Spawn count** | Estimated `claude -p` / Task subagent invocations |
+| **Token-spend range** | `$X-Y` dollars based on trailing-3 comparable milestones in `.gsd-t/token-log.md` |
+| **Rate-limit-window count** | If the work might span more than one 5-hour Claude Max rate-limit window |
+Where this applies:
+- `/gsd-t-milestone` Step 4 — Pre-Partition Assessment
+- `/gsd-t-scan` techdebt milestone suggestions
+- `/gsd-t-promote-debt` effort fields
+- `docs/requirements.md`, `progress.md` Decision Log entries
+- Any internal estimate the user might read
+Acceptable: machine-time references (e.g. "5 min cache TTL", "5h rate-limit window", "14 day staleness threshold") — these are concrete system properties, not effort estimates. The rule applies to **effort/scope**, not to **system timeouts**.
 ## Execution Behavior
 - ALWAYS check docs/architecture.md before adding or modifying components.
 - ALWAYS check docs/workflows.md before changing any multi-step process.

package/templates/stacks/design-to-code.md CHANGED Viewed

@@ -176,7 +176,7 @@ MANDATORY:
   │
   ├── For each design requirement, assess:
   │     Supported → stack handles this natively, proceed
-  │     Partial → needs an addon/library — name it, estimate effort
+  │     Partial → needs an addon/library — name it, scope in GSD-T units (domain/wave/spawn/token, NOT dev-hours)
   │     Unsupported → stack CANNOT achieve this — flag as a blocker
   │
   ├── If ANY requirement is Unsupported: