npm - @tekyzinc/gsd-t - Versions diffs - 3.26.11 → 3.29.10 - Mend

@tekyzinc/gsd-t 3.26.11 → 3.29.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/CHANGELOG.md +151 -0
package/README.md +4 -0
package/bin/context-budget-audit.cjs +17 -2
package/bin/gsd-t-build-coverage.cjs +438 -0
package/bin/gsd-t-ci-parity.cjs +500 -0
package/bin/gsd-t-economics.cjs +37 -9
package/bin/gsd-t-test-data-adapters/file-json-array.cjs +56 -0
package/bin/gsd-t-test-data-adapters/localstorage-key-prefix.cjs +44 -0
package/bin/gsd-t-test-data-adapters/sqlite-table-where.cjs +71 -0
package/bin/gsd-t-test-data-ledger.cjs +290 -0
package/bin/gsd-t-time-format.cjs +94 -0
package/bin/gsd-t.js +30 -0
package/bin/model-windows.cjs +99 -0
package/bin/model-windows.test.cjs +75 -0
package/bin/orchestrator.js +4 -1
package/bin/runway-estimator.cjs +35 -5
package/bin/token-budget.cjs +12 -3
package/commands/gsd-t-complete-milestone.md +7 -3
package/commands/gsd-t-help.md +21 -0
package/commands/gsd-t-init.md +1 -1
package/commands/gsd-t-verify.md +90 -0
package/package.json +1 -1
package/scripts/context-meter/transcript-parser.js +12 -2
package/scripts/context-meter/transcript-parser.test.js +51 -4
package/scripts/gsd-t-calibration-hook.js +8 -1
package/scripts/gsd-t-context-meter.e2e.test.js +45 -6
package/scripts/gsd-t-context-meter.js +17 -3
package/scripts/gsd-t-context-meter.test.js +85 -0
package/scripts/gsd-t-date-guard.js +26 -5
package/scripts/gsd-t-design-review-server.js +3 -1
package/templates/CLAUDE-global.md +37 -1
package/templates/progress.md +6 -2
package/templates/test-helpers/README.md +98 -0
package/templates/test-helpers/test-data-fixture.ts +153 -0

package/bin/orchestrator.js CHANGED Viewed

@@ -26,6 +26,7 @@
 const fs = require("fs");
 const path = require("path");
 const { execFileSync, execFile, spawn: cpSpawn } = require("child_process");
+const { localIsoWithOffset } = require(path.join(__dirname, "gsd-t-time-format.cjs"));
 // ─── ANSI Colors ────────────────────────────────────────────────────────────
@@ -1275,11 +1276,13 @@ ${BOLD}Phases:${RESET} ${this.wf.phases.join(" → ")}
       }
       // 6f. Record phase completion
+      // M59 (v3.29.10): completedAt is local-offset ISO (`YYYY-MM-DDTHH:MM:SS±HH:MM`)
+      // rather than UTC `Z` — matches the human-readable progress.md fields.
       state.phaseResults[phase] = {
         completed: true,
         builtPaths,
         reviewCycles: reviewCycle + 1,
-        completedAt: new Date().toISOString(),
+        completedAt: localIsoWithOffset(),
       };
       state.completedPhases.push(phase);
       this.clearQueue(projectDir);

package/bin/runway-estimator.cjs CHANGED Viewed

@@ -26,12 +26,43 @@
 const fs = require('fs');
 const path = require('path');
+const { SAFE_DEFAULT_WINDOW } = require('./model-windows.cjs');
 const DEFAULT_K = 5;
-const DEFAULT_MODEL_CONTEXT_CAP = 200000;
+// The model context cap IS the model's true window. Default to the model-aware
+// safe LARGE window (1M) — the old 200K literal was correct only for pre-4
+// models and made turn-to-compact predictions fire 5× too early on Opus/Sonnet.
+// Callers may still pass an explicit `modelContextCap`, and resolveContextCap()
+// below prefers a fresh Context Meter reading when available.
+const DEFAULT_MODEL_CONTEXT_CAP = SAFE_DEFAULT_WINDOW;
 // Claude Code starts auto-compacting ~8% before the model window fills, so the
 // effective dialog ceiling is 0.92 × modelContextCap.
 const PRE_COMPACT_HEADROOM = 0.92;
+// Context Meter state — its modelWindowSize is model-aware (bin/model-windows.cjs).
+const METER_STATE_REL = '.gsd-t/.context-meter-state.json';
+const METER_STATE_STALE_MS = 5 * 60 * 1000;
+/**
+ * Resolve the effective model context cap. Priority:
+ *   1. explicit opts.modelContextCap (caller override)
+ *   2. fresh Context Meter state modelWindowSize (model-aware, written by the
+ *      meter hook from the running model)
+ *   3. DEFAULT_MODEL_CONTEXT_CAP (safe large fallback)
+ */
+function resolveContextCap(projectDir, optCap) {
+  if (Number.isFinite(optCap) && optCap > 0) return optCap;
+  try {
+    const fp = path.join(projectDir || '.', METER_STATE_REL);
+    const s = JSON.parse(fs.readFileSync(fp, 'utf8'));
+    if (s && typeof s.modelWindowSize === 'number' && s.modelWindowSize > 0 && s.timestamp) {
+      const age = Date.now() - Date.parse(s.timestamp);
+      if (!isNaN(age) && age >= 0 && age <= METER_STATE_STALE_MS) {
+        return s.modelWindowSize;
+      }
+    }
+  } catch (_) { /* fall through to default */ }
+  return DEFAULT_MODEL_CONTEXT_CAP;
+}
 const DEFAULT_WARN_THRESHOLD_TURNS = 5;
 const MIN_HISTORY = 3;
@@ -107,7 +138,8 @@ function _sortTurns(rows) {
  * @param {string} opts.projectDir
  * @param {string} opts.sessionId                required
  * @param {number} [opts.k]                      default 5 (last K turns)
- * @param {number} [opts.modelContextCap]        default 200000
+ * @param {number} [opts.modelContextCap]        default: fresh Context Meter
+ *        modelWindowSize if available, else the model-aware safe window (1M)
  * @param {number} [opts.warnThresholdTurns]     default 5
  * @returns {{
  *   shouldWarn: boolean,
@@ -124,9 +156,7 @@ function estimateDialogGrowth(opts) {
   const projectDir = (opts && opts.projectDir) || '.';
   const sessionId = opts && opts.sessionId;
   const k = (opts && Number.isFinite(opts.k) && opts.k > 0) ? Math.floor(opts.k) : DEFAULT_K;
-  const cap = (opts && Number.isFinite(opts.modelContextCap) && opts.modelContextCap > 0)
-    ? opts.modelContextCap
-    : DEFAULT_MODEL_CONTEXT_CAP;
+  const cap = resolveContextCap(projectDir, opts && opts.modelContextCap);
   const warnThreshold = (opts && Number.isFinite(opts.warnThresholdTurns) && opts.warnThresholdTurns > 0)
     ? opts.warnThresholdTurns
     : DEFAULT_WARN_THRESHOLD_TURNS;

package/bin/token-budget.cjs CHANGED Viewed

@@ -26,11 +26,18 @@
 const fs = require("fs");
 const path = require("path");
+const { SAFE_DEFAULT_WINDOW } = require("./model-windows.cjs");
 // ── Constants ────────────────────────────────────────────────────────────────
 const MODEL_RATIOS = { haiku: 1, sonnet: 5, opus: 25 };
+// Fallback context window when no per-model signal is available. Uses the
+// model-windows safe LARGE default (1M) rather than a bare 200K literal: a
+// too-small fallback re-introduces the premature-headless-handoff bug the
+// model-aware sizing was added to fix. See bin/model-windows.cjs.
+const FALLBACK_WINDOW = SAFE_DEFAULT_WINDOW;
 // Base token estimates per task type (in haiku-equivalent units)
 const BASE_ESTIMATES = {
   execute: 8000,
@@ -106,7 +113,9 @@ function getSessionStatus(projectDir) {
   const real = readContextMeterState(dir);
   if (real) {
     const consumed = real.inputTokens;
-    const window = real.modelWindowSize > 0 ? real.modelWindowSize : 200000;
+    // Primary: the Context Meter writes a model-aware modelWindowSize into
+    // state (bin/model-windows.cjs). Fallback only when state predates that.
+    const window = real.modelWindowSize > 0 ? real.modelWindowSize : FALLBACK_WINDOW;
     const estimated_remaining = Math.max(0, window - consumed);
     const pct = Math.round(real.pct * 10) / 10;
     const threshold = bandFor(pct, thresholdPct);
@@ -145,7 +154,7 @@ function resolveThresholdPct(dir) {
 }
 function getSessionStatusHeuristic(dir, thresholdPct) {
-  const window = 200000;
+  const window = FALLBACK_WINDOW;
   const consumed = readSessionConsumed(dir);
   const estimated_remaining = Math.max(0, window - consumed);
   const pct = window > 0 ? Math.round((consumed / window) * 100 * 10) / 10 : 0;
@@ -180,7 +189,7 @@ function recordUsage(usage) {
  */
 function estimateMilestoneCost(remainingTasks, projectDir) {
   const status = getSessionStatus(projectDir);
-  const window = status.consumed + status.estimated_remaining || 200000;
+  const window = status.consumed + status.estimated_remaining || FALLBACK_WINDOW;
   const estimatedTokens = remainingTasks.reduce((sum, t) => {
     return sum + estimateCost(t.model, t.taskType, { complexity: t.complexity, projectDir });
   }, 0);

package/commands/gsd-t-complete-milestone.md CHANGED Viewed

@@ -336,7 +336,7 @@ Create `summary.md`:
 ```markdown
 # Milestone Complete: {name}
-**Completed**: {date}
+**Completed**: {YYYY-MM-DD HH:MM TZ}
 **Duration**: {start date} → {end date}
 **Status**: {VERIFIED | FORCED}
@@ -419,14 +419,18 @@ Steps to apply the trim:
 # GSD-T Progress
 ## Version: {new version}
+## Status: ACTIVE
+## Date: {YYYY-MM-DD HH:MM TZ — source from live `[GSD-T NOW]`}
 ## Current Milestone
 None — ready for next milestone
 ## Completed Milestones
 | Milestone | Version | Completed | Tag |
 |-----------|---------|-----------|-----|
-| {name} | {version} | {date} | v{version} |
-| {previous} | {version} | {date} | v{version} |
+| {name} | {version} | {YYYY-MM-DD HH:MM TZ} | v{version} |
+| {previous} | {version} | {YYYY-MM-DD HH:MM TZ — keep existing rows as-is; forward-only format} | v{version} |
+<!-- M59 (v3.29.10): "Completed" cells are written `YYYY-MM-DD HH:MM TZ` from this version forward. Pre-3.29.10 rows that read `YYYY-MM-DD` stay as-is (forward-only — never rewrite). Readers MUST accept both. -->
 ## Decision Log

package/commands/gsd-t-help.md CHANGED Viewed

@@ -455,6 +455,27 @@ Use these when user asks for help on a specific command:
 - **Use when**: Final pre-merge gate. Both tracks always run; both report. `ok` is purely deterministic — LLM verdict is advisory.
 - **CLI**: `gsd-t verify-gate [--skip-track1] [--skip-track2] [--max-concurrency N] [--fail-fast] [--json]`. Exit 0/4/2/3.
+### build-coverage (M57)
+- **Summary**: Detects new top-level paths in a milestone commit range not referenced by a real CI build input. Coverage is decided by STRUCTURALLY parsing CI files (Dockerfile COPY/ADD source args incl. relative `--from=`; cloudbuild `args`-positional; workflow `run`-positional via a block-scalar-aware YAML walker) — never substring-matching. `node_modules` never counts. No documented false-negative residual.
+- **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
+- **Files**: `bin/gsd-t-build-coverage.cjs`
+- **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (new `hooks/` dir committed, absent from Dockerfile COPY, shipped broken while verify passed).
+- **CLI**: `gsd-t build-coverage [--json] [--base REF] [--head REF] [--project-dir PATH]`. Exit 0/4/2.
+### ci-parity (M57)
+- **Summary**: Reproduces the project's actual CI build locally instead of assuming warm-cache local tsc/test parity. Auto-detects CI config (cloudbuild → workflows → Dockerfile RUN → package.json scripts), clears build caches (containment-safe — refuses any config-derived delete resolving outside OR equal-to projectRoot), and auto-runs the real `docker build` when a Dockerfile is present (presence is the trigger, no opt-in flag).
+- **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
+- **Files**: `bin/gsd-t-ci-parity.cjs`
+- **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (noImplicitAny regressions passed a warm-cache local tsc but failed CI's cold build).
+- **CLI**: `gsd-t ci-parity [--project-dir PATH] [--timeout-ms MS] [--json]`. Exit 0/4/2.
+### test-data (M58)
+- **Summary**: Append-only test-data ledger + purge engine. Tests register inserts via the `withTestData()` Playwright fixture; `gsd-t-verify` Step 4.5 purges them by adapter before VERDICT. Three built-in adapters: `localStorage-key-prefix`, `file-json-array`, `sqlite-table-where`. Each adapter refuses to delete records whose id does not start with the ledger row's `taggedPrefix` (defense in depth).
+- **Auto-invoked**: Yes — by `gsd-t-verify` Step 4.5 (FAIL-blocking, never warning-only)
+- **Files**: `bin/gsd-t-test-data-ledger.cjs`, `bin/gsd-t-test-data-adapters/*.cjs`, `templates/test-helpers/test-data-fixture.ts`
+- **Use when**: Test data hygiene. Catches the GSD-T-Board class (2442 orphaned `E2E_TEST_*` / `E2E_DRAG_*` ideas left in the production data store after a passing Verify run).
+- **CLI**: `gsd-t test-data --list [--run <id>] [--json]` / `gsd-t test-data --purge --run <id> [--dry-run] [--json] [--project <dir>]`. Exit 0 on success, 4 on adapter errors, 64 on usage error.
 ## Unknown Command
 If user asks for help on unrecognized command:

package/commands/gsd-t-init.md CHANGED Viewed

@@ -180,7 +180,7 @@ Create `.gsd-t/progress.md`:
 ## Project: {name from CLAUDE.md or $ARGUMENTS}
 ## Version: {detected version, or 0.1.00}
 ## Status: INITIALIZED
-## Date: {today}
+## Date: {today YYYY-MM-DD HH:MM TZ — source from the live `[GSD-T NOW]` signal; never date-only}
 ## Milestones
 | # | Milestone | Status | Domains |

package/commands/gsd-t-verify.md CHANGED Viewed

@@ -102,6 +102,52 @@ Defensive on missing `.gsd-t/ratelimit-map.json` — verify-gate falls back to
 Contract: `.gsd-t/contracts/verify-gate-contract.md` v1.0.0 STABLE.
+<!-- M57: CI-parity FAIL-blocking gate -->
+## Step 2.6: CI-Parity Gate (MANDATORY — FAIL-blocking, never warning-only)
+```bash
+node scripts/gsd-t-watch-state.js advance --agent-id "$GSD_T_AGENT_ID" --parent-id "${GSD_T_PARENT_AGENT_ID:-null}" --command gsd-t-verify --step 2 --step-label ".6: CI-Parity Gate" 2>/dev/null || true
+```
+Origin: TimeTracking v1.10.12 shipped VERIFIED + tagged while Cloud Build
+failed (a new top-level `hooks/` dir was committed but never added to the
+Dockerfile `COPY` directives, and `noImplicitAny` regressions passed a
+warm-cache local `tsc` but failed CI's cold build). `gsd-t-verify` must
+reproduce the project's *actual* CI build, not assume local parity.
+Run BOTH checks. **Either failing is a verify FAIL — it blocks
+complete-milestone. This is never a warning-only signal.**
+```bash
+# 1. Build-coverage — every new top-level path in the milestone range must be
+#    referenced by a real CI build input (structural parse, not substring).
+gsd-t build-coverage --json > /tmp/gsd-t-build-coverage.json
+BC_EXIT=$?
+# 2. CI-parity — reproduce the project's actual CI build locally with caches
+#    cleared; auto-runs `docker build` when a Dockerfile is present.
+gsd-t ci-parity --json > /tmp/gsd-t-ci-parity.json
+CP_EXIT=$?
+```
+- `build-coverage` exit **4** (`ok:false`, `missing[]` non-empty) → verify
+  FAIL. Report each uncovered path; the fix is to add the path to the
+  Dockerfile `COPY` / cloudbuild artifact / workflow build input.
+- `ci-parity` exit **4** (`ok:false` — a detected CI command or the real
+  `docker build` failed) → verify FAIL. Report the failing command.
+- exit **0** from both → gate passes.
+- exit **2** (usage error, e.g. not a git repo) → record as a structured
+  note. Do not treat it as a pass by default; investigate before proceeding.
+Both are pure-deterministic CLI checks (no LLM). They consume the same
+preflight envelope as the M55 verify-gate Track 1, so failing here at
+verify mirrors what CI would do — catching the TimeTracking class before
+the milestone is tagged.
+Contracts: `.gsd-t/contracts/cli-build-coverage-contract.md` v2.0.0 STABLE,
+`.gsd-t/contracts/ci-parity-contract.md` v2.0.0 STABLE.
+<!-- /M57: CI-parity FAIL-blocking gate -->
 ## Step 2.5: High-Risk Domain Gate (MANDATORY — Categories 2 and 7)
 ```bash
@@ -257,6 +303,49 @@ const { captureSpawn } = require('./bin/gsd-t-token-capture.cjs');
 `captureSpawn` parses `result.usage` and writes the row to `.gsd-t/token-log.md` under the canonical header. Tokens column renders as `in=N out=N cr=N cc=N $X.XX` or `—`, never `N/A`. Collect all reports, synthesize, create remediation plan.
 ```
+<!-- M58: Test Data Cleanup Gate -->
+## Step 4.5: Test Data Cleanup Gate (MANDATORY — FAIL-blocking, never warning-only)
+```bash
+node scripts/gsd-t-watch-state.js advance --agent-id "$GSD_T_AGENT_ID" --parent-id "${GSD_T_PARENT_AGENT_ID:-null}" --command gsd-t-verify --step 4 --step-label ".5: Test Data Cleanup Gate" 2>/dev/null || true
+```
+Origin: GSD-T-Board v0.1.10 Verify ran the Playwright suite, the suite
+passed, the milestone was tagged VERIFIED — and 2442 `E2E_TEST_*` /
+`E2E_DRAG_*` ideas stayed live in the production data store. The gate
+exists to catch that class: any test data registered during Verify via the
+`withTestData()` Playwright fixture (or by direct calls to
+`appendInsert(...)`) MUST be purged before VERDICT.
+```bash
+# Verify-run id — set at Step 1; if not, derive from milestone + UTC.
+: "${GSD_T_VERIFY_RUN_ID:=verify-${MILESTONE:-current}-$(date -u +%Y%m%dT%H%M%SZ)}"
+export GSD_T_VERIFY_RUN_ID
+# Purge anything the fixture (or appendInsert) registered during this run.
+gsd-t test-data --purge --run "$GSD_T_VERIFY_RUN_ID" --json > /tmp/gsd-t-test-data-purge.json
+TD_EXIT=$?
+```
+- `test-data --purge` exit **0** (`errors:[]`) → record
+  `Test data: purged=<N> skipped=<M>` in the verify report. Gate passes.
+- exit **4** (`errors.length > 0`) → verify FAIL. Append the first 5
+  `errors[].message` values to the verify report and surface the count of
+  remaining records. The fix is either (a) repair the adapter / store
+  configuration so purge succeeds, or (b) update the test to use the
+  fixture so the ledger has accurate entries.
+The gate runs AFTER the E2E suite (Step 4) so any tests that inserted
+test data via `withTestData(...)` have already populated the ledger.
+It runs BEFORE Step 5 (verify report) so the purge counts can land in the
+report. Tests that bypass the fixture and leave un-tagged data will not
+be caught here — they're caught at the project's next dataset audit.
+Pure-deterministic CLI check (no LLM). Contract:
+`.gsd-t/contracts/test-data-ledger-contract.md` v1.0.0 STABLE.
+Tagging: `.gsd-t/contracts/test-data-tagging-contract.md` v1.0.0 STABLE.
+<!-- /M58: Test Data Cleanup Gate -->
 ## Step 5: Compile Verification Report
 ```bash
@@ -276,6 +365,7 @@ Create or update `.gsd-t/verify-report.md`:
 - Code Quality: {PASS/WARN/FAIL} — {N} issues found
 - Unit Tests: {PASS/WARN/FAIL} — {N}/{total} passing
 - E2E Tests: {PASS/WARN/FAIL} — {N}/{total} specs passing
+- Test Data Cleanup: {PASS/FAIL} — purged={N} skipped={M} errors={E}
 - Security: {PASS/WARN/FAIL} — {N} findings
 - Integration: {PASS/WARN/FAIL}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tekyzinc/gsd-t",
-  "version": "3.26.11",
+  "version": "3.29.10",
   "description": "GSD-T: Contract-Driven Development for Claude Code — 54 slash commands with headless-by-default workflow spawning, unattended supervisor relay with event stream, graph-powered code analysis, real-time agent dashboard, task telemetry, doc-ripple enforcement, backlog management, impact analysis, test sync, milestone archival, and PRD generation",
   "author": "Tekyz, Inc.",
   "license": "MIT",

package/scripts/context-meter/transcript-parser.js CHANGED Viewed

@@ -82,9 +82,12 @@ const readline = require("readline");
  * Parse a Claude Code transcript JSONL file.
  *
  * @param {string} transcriptPath - absolute path to a Claude Code transcript .jsonl
- * @returns {Promise<{system: string, messages: Array<{role: string, content: Array}>} | null>}
+ * @returns {Promise<{system: string, messages: Array<{role: string, content: Array}>, model: string|null} | null>}
  *          Resolves to the reconstructed body, or `null` on unreadable file /
  *          catastrophic parse failure. Caller treats `null` as "bail out, fail open".
+ *          `model` is the last-seen assistant `message.model` id (the model the
+ *          orchestrator session is running) or `null` if none observed — the
+ *          context meter uses it to size the context window correctly.
  */
 async function parseTranscript(transcriptPath) {
   if (typeof transcriptPath !== "string" || transcriptPath.length === 0) {
@@ -101,6 +104,10 @@ async function parseTranscript(transcriptPath) {
   const messages = [];
   let system = "";
+  // Last-seen assistant model id. Claude Code records `message.model` on every
+  // assistant turn; the orchestrator session runs one model for its lifetime,
+  // so the last value is authoritative for sizing the context window.
+  let model = null;
   let stream;
   try {
@@ -136,6 +143,9 @@ async function parseTranscript(transcriptPath) {
         if (!msg || typeof msg !== "object") continue;
         const role = msg.role || type;
+        if (type === "assistant" && typeof msg.model === "string" && msg.model.length > 0) {
+          model = msg.model;
+        }
         const content = normalizeContent(msg.content, role);
         if (content === null) continue;
@@ -152,7 +162,7 @@ async function parseTranscript(transcriptPath) {
     return null;
   }
-  return { system, messages: sanitizeToolPairs(messages) };
+  return { system, messages: sanitizeToolPairs(messages), model };
 }
 /**

package/scripts/context-meter/transcript-parser.test.js CHANGED Viewed

@@ -39,17 +39,17 @@ test("empty path / non-string → returns null", async () => {
   assert.equal(await parseTranscript(undefined), null);
 });
-test("empty file → returns { system:'', messages:[] }", async () => {
+test("empty file → returns { system:'', messages:[], model:null }", async () => {
   const { dir, file } = mkTmpFile([]);
   try {
     const got = await parseTranscript(file);
-    assert.deepEqual(got, { system: "", messages: [] });
+    assert.deepEqual(got, { system: "", messages: [], model: null });
   } finally {
     cleanup(dir);
   }
 });
-test("file with only unknown event types → { system:'', messages:[] }", async () => {
+test("file with only unknown event types → { system:'', messages:[], model:null }", async () => {
   const { dir, file } = mkTmpFile([
     { type: "summary", foo: "bar" },
     { type: "system", subtype: "hook", hookInfos: [] },
@@ -60,7 +60,7 @@ test("file with only unknown event types → { system:'', messages:[] }", async
   ]);
   try {
     const got = await parseTranscript(file);
-    assert.deepEqual(got, { system: "", messages: [] });
+    assert.deepEqual(got, { system: "", messages: [], model: null });
   } finally {
     cleanup(dir);
   }
@@ -104,6 +104,53 @@ test("normal conversation — string-content user + text assistant", async () =>
   }
 });
+test("captures last-seen assistant model id (for window sizing)", async () => {
+  const { dir, file } = mkTmpFile([
+    { type: "user", message: { role: "user", content: "hi" } },
+    {
+      type: "assistant",
+      message: {
+        role: "assistant",
+        model: "claude-opus-4-7",
+        content: [{ type: "text", text: "first" }],
+      },
+    },
+    { type: "user", message: { role: "user", content: "more" } },
+    {
+      type: "assistant",
+      message: {
+        role: "assistant",
+        model: "claude-opus-4-7-20260115",
+        content: [{ type: "text", text: "second" }],
+      },
+    },
+  ]);
+  try {
+    const got = await parseTranscript(file);
+    // Last assistant model wins (orchestrator session is single-model, but the
+    // last value is authoritative if a dated id supersedes a bare one).
+    assert.equal(got.model, "claude-opus-4-7-20260115");
+  } finally {
+    cleanup(dir);
+  }
+});
+test("model stays null when no assistant message carries one", async () => {
+  const { dir, file } = mkTmpFile([
+    { type: "user", message: { role: "user", content: "hi" } },
+    {
+      type: "assistant",
+      message: { role: "assistant", content: [{ type: "text", text: "no model field" }] },
+    },
+  ]);
+  try {
+    const got = await parseTranscript(file);
+    assert.equal(got.model, null);
+  } finally {
+    cleanup(dir);
+  }
+});
 test("tool_use / tool_result pairing by tool_use_id preserved in order", async () => {
   const TOOL_ID = "toolu_01ABC";
   const { dir, file } = mkTmpFile([

package/scripts/gsd-t-calibration-hook.js CHANGED Viewed

@@ -39,9 +39,16 @@
 const fs = require("fs");
 const path = require("path");
+const { SAFE_DEFAULT_WINDOW } = require("../bin/model-windows.cjs");
 const MAX_STDIN = 1024 * 1024; // 1 MiB
 const SCHEMA_VERSION = 1;
-const DEFAULT_CW_CEILING_TOKENS = 200000; // input-token budget per CW
+// Input-token budget per CW = the model context window. Default to the
+// model-aware safe window (1M); the old 200K literal was correct only for
+// pre-4 models and skewed every actualCwPct calibration ratio 5× on
+// Opus/Sonnet. Event state may still override via cwCeilingTokens (the
+// economics estimator records the model-aware ceiling it actually used).
+const DEFAULT_CW_CEILING_TOKENS = SAFE_DEFAULT_WINDOW;
 if (require.main === module) {
   let input = "";

package/scripts/gsd-t-context-meter.e2e.test.js CHANGED Viewed

@@ -74,7 +74,7 @@ class Sandbox {
    * The charCount parameter controls how many characters of text content
    * are in the transcript, which determines the estimated token count.
    */
-  writeTranscript(filename = "transcript.jsonl", charCount = 100) {
+  writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
     const userText = "x".repeat(Math.floor(charCount / 2));
     const assistantText = "y".repeat(Math.ceil(charCount / 2));
     const lines = [
@@ -89,7 +89,7 @@ class Sandbox {
         message: {
           role: "assistant",
           content: [{ type: "text", text: assistantText }],
-          model: "claude-opus-4-6",
+          model,
         },
         uuid: "a1",
         sessionId: "sess-1",
@@ -231,7 +231,9 @@ afterEach(async () => {
 /* ──────────────────────────── tests ──────────────────────────── */
 test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
-  // 100 chars of text content → ~29 tokens (100/3.5) → 0.014% of 200K window
+  // 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
+  // EFFECTIVE window is the real 1M (model-aware sizing), not the config's
+  // legacy 200K — ~0.003% of 1M, well below threshold.
   sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
   const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
   assert.equal(state.version, 1);
   assert.ok(state.inputTokens > 0, "should have estimated some tokens");
   assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
-  assert.equal(state.modelWindowSize, 200000);
+  assert.equal(
+    state.modelWindowSize,
+    1_000_000,
+    "window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
+  );
   assert.ok(state.pct < 1, "pct should be well below threshold");
   assert.equal(state.threshold, "normal");
   assert.equal(state.checkCount, 1);
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
 });
 test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
-  // 600K chars → ~171K tokens → 85.7% of 200K window → warn band + additionalContext
+  // Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
+  // threshold band + additionalContext. (Model-aware sizing means we pin a
+  // 200K-window model here rather than relying on a stale config default.)
   sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
-  const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 600000);
+  const transcriptPath = sandbox.writeTranscript(
+    "transcript.jsonl",
+    600000,
+    "claude-haiku-4-5-20251001"
+  );
   const { stdout, code } = await sandbox.runHook({
     payload: { session_id: "test-above", transcript_path: transcriptPath },
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
   assert.equal(sandbox.tmpFileExists(), false);
 });
+test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
+  // The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
+  // hardcoded 200K window this read as ~85% → false headless handoff while
+  // ~83% of context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
+  // 171K is only ~17% → stdout {} → no premature handoff.
+  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
+  const transcriptPath = sandbox.writeTranscript(
+    "transcript.jsonl",
+    600000,
+    "claude-opus-4-7"
+  );
+  const { stdout, code } = await sandbox.runHook({
+    payload: { session_id: "test-regression", transcript_path: transcriptPath },
+  });
+  assert.equal(code, 0);
+  const parsed = JSON.parse(stdout || "{}");
+  assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
+  const state = sandbox.readState();
+  assert.equal(state.modelWindowSize, 1_000_000);
+  assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
+  assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
+  assert.equal(state.threshold, "normal");
+});
 test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
   sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });

package/scripts/gsd-t-context-meter.js CHANGED Viewed

@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
 const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
 const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
 const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
+const { windowForModel } = require("../bin/model-windows.cjs");
 const STATE_VERSION = 1;
@@ -208,6 +209,18 @@ async function runMeter(opts) {
       return {};
     }
+    // 5b. Resolve the EFFECTIVE context window from the model the orchestrator
+    // session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
+    // ship a 1M window; the config default (200k) is a legacy fallback that
+    // overcounts usage 5× and fires the headless handoff far too early. When
+    // the transcript reports a model, its window overrides cfg.modelWindowSize
+    // (even an explicit project value); only a missing model falls back to it.
+    const effectiveWindow =
+      typeof parsed.model === "string" && parsed.model.length > 0
+        ? windowForModel(parsed.model)
+        : cfg.modelWindowSize;
+    state.modelWindowSize = effectiveWindow;
     // 6. Estimate tokens locally (no API call, zero cost).
     let tokenResp;
     try {
@@ -237,7 +250,7 @@ async function runMeter(opts) {
     // 8. Success path — compute pct, band, possibly emit additionalContext.
     const pct = computePct({
       inputTokens: tokenResp.inputTokens,
-      modelWindowSize: cfg.modelWindowSize,
+      modelWindowSize: effectiveWindow,
     });
     const band = bandFor(pct, cfg.thresholdPct);
@@ -251,13 +264,14 @@ async function runMeter(opts) {
       logPath,
       "INFO",
       "measure",
-      `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}`,
+      `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
+        `window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
       clock
     );
     const additionalContext = buildAdditionalContext({
       pct,
-      modelWindowSize: cfg.modelWindowSize,
+      modelWindowSize: effectiveWindow,
       thresholdPct: cfg.thresholdPct,
     });
     if (additionalContext) {