@tekyzinc/gsd-t 3.26.11 → 3.29.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/CHANGELOG.md +151 -0
  2. package/README.md +4 -0
  3. package/bin/context-budget-audit.cjs +17 -2
  4. package/bin/gsd-t-build-coverage.cjs +438 -0
  5. package/bin/gsd-t-ci-parity.cjs +500 -0
  6. package/bin/gsd-t-economics.cjs +37 -9
  7. package/bin/gsd-t-test-data-adapters/file-json-array.cjs +56 -0
  8. package/bin/gsd-t-test-data-adapters/localstorage-key-prefix.cjs +44 -0
  9. package/bin/gsd-t-test-data-adapters/sqlite-table-where.cjs +71 -0
  10. package/bin/gsd-t-test-data-ledger.cjs +290 -0
  11. package/bin/gsd-t-time-format.cjs +94 -0
  12. package/bin/gsd-t.js +30 -0
  13. package/bin/model-windows.cjs +99 -0
  14. package/bin/model-windows.test.cjs +75 -0
  15. package/bin/orchestrator.js +4 -1
  16. package/bin/runway-estimator.cjs +35 -5
  17. package/bin/token-budget.cjs +12 -3
  18. package/commands/gsd-t-complete-milestone.md +7 -3
  19. package/commands/gsd-t-help.md +21 -0
  20. package/commands/gsd-t-init.md +1 -1
  21. package/commands/gsd-t-verify.md +90 -0
  22. package/package.json +1 -1
  23. package/scripts/context-meter/transcript-parser.js +12 -2
  24. package/scripts/context-meter/transcript-parser.test.js +51 -4
  25. package/scripts/gsd-t-calibration-hook.js +8 -1
  26. package/scripts/gsd-t-context-meter.e2e.test.js +45 -6
  27. package/scripts/gsd-t-context-meter.js +17 -3
  28. package/scripts/gsd-t-context-meter.test.js +85 -0
  29. package/scripts/gsd-t-date-guard.js +26 -5
  30. package/scripts/gsd-t-design-review-server.js +3 -1
  31. package/templates/CLAUDE-global.md +37 -1
  32. package/templates/progress.md +6 -2
  33. package/templates/test-helpers/README.md +98 -0
  34. package/templates/test-helpers/test-data-fixture.ts +153 -0
@@ -26,6 +26,7 @@
26
26
  const fs = require("fs");
27
27
  const path = require("path");
28
28
  const { execFileSync, execFile, spawn: cpSpawn } = require("child_process");
29
+ const { localIsoWithOffset } = require(path.join(__dirname, "gsd-t-time-format.cjs"));
29
30
 
30
31
  // ─── ANSI Colors ────────────────────────────────────────────────────────────
31
32
 
@@ -1275,11 +1276,13 @@ ${BOLD}Phases:${RESET} ${this.wf.phases.join(" → ")}
1275
1276
  }
1276
1277
 
1277
1278
  // 6f. Record phase completion
1279
+ // M59 (v3.29.10): completedAt is local-offset ISO (`YYYY-MM-DDTHH:MM:SS±HH:MM`)
1280
+ // rather than UTC `Z` — matches the human-readable progress.md fields.
1278
1281
  state.phaseResults[phase] = {
1279
1282
  completed: true,
1280
1283
  builtPaths,
1281
1284
  reviewCycles: reviewCycle + 1,
1282
- completedAt: new Date().toISOString(),
1285
+ completedAt: localIsoWithOffset(),
1283
1286
  };
1284
1287
  state.completedPhases.push(phase);
1285
1288
  this.clearQueue(projectDir);
@@ -26,12 +26,43 @@
26
26
 
27
27
  const fs = require('fs');
28
28
  const path = require('path');
29
+ const { SAFE_DEFAULT_WINDOW } = require('./model-windows.cjs');
29
30
 
30
31
  const DEFAULT_K = 5;
31
- const DEFAULT_MODEL_CONTEXT_CAP = 200000;
32
+ // The model context cap IS the model's true window. Default to the model-aware
33
+ // safe LARGE window (1M) — the old 200K literal was correct only for pre-4
34
+ // models and made turn-to-compact predictions fire 5× too early on Opus/Sonnet.
35
+ // Callers may still pass an explicit `modelContextCap`, and resolveContextCap()
36
+ // below prefers a fresh Context Meter reading when available.
37
+ const DEFAULT_MODEL_CONTEXT_CAP = SAFE_DEFAULT_WINDOW;
32
38
  // Claude Code starts auto-compacting ~8% before the model window fills, so the
33
39
  // effective dialog ceiling is 0.92 × modelContextCap.
34
40
  const PRE_COMPACT_HEADROOM = 0.92;
41
+ // Context Meter state — its modelWindowSize is model-aware (bin/model-windows.cjs).
42
+ const METER_STATE_REL = '.gsd-t/.context-meter-state.json';
43
+ const METER_STATE_STALE_MS = 5 * 60 * 1000;
44
+
45
+ /**
46
+ * Resolve the effective model context cap. Priority:
47
+ * 1. explicit opts.modelContextCap (caller override)
48
+ * 2. fresh Context Meter state modelWindowSize (model-aware, written by the
49
+ * meter hook from the running model)
50
+ * 3. DEFAULT_MODEL_CONTEXT_CAP (safe large fallback)
51
+ */
52
+ function resolveContextCap(projectDir, optCap) {
53
+ if (Number.isFinite(optCap) && optCap > 0) return optCap;
54
+ try {
55
+ const fp = path.join(projectDir || '.', METER_STATE_REL);
56
+ const s = JSON.parse(fs.readFileSync(fp, 'utf8'));
57
+ if (s && typeof s.modelWindowSize === 'number' && s.modelWindowSize > 0 && s.timestamp) {
58
+ const age = Date.now() - Date.parse(s.timestamp);
59
+ if (!isNaN(age) && age >= 0 && age <= METER_STATE_STALE_MS) {
60
+ return s.modelWindowSize;
61
+ }
62
+ }
63
+ } catch (_) { /* fall through to default */ }
64
+ return DEFAULT_MODEL_CONTEXT_CAP;
65
+ }
35
66
  const DEFAULT_WARN_THRESHOLD_TURNS = 5;
36
67
  const MIN_HISTORY = 3;
37
68
 
@@ -107,7 +138,8 @@ function _sortTurns(rows) {
107
138
  * @param {string} opts.projectDir
108
139
  * @param {string} opts.sessionId required
109
140
  * @param {number} [opts.k] default 5 (last K turns)
110
- * @param {number} [opts.modelContextCap] default 200000
141
+ * @param {number} [opts.modelContextCap] default: fresh Context Meter
142
+ * modelWindowSize if available, else the model-aware safe window (1M)
111
143
  * @param {number} [opts.warnThresholdTurns] default 5
112
144
  * @returns {{
113
145
  * shouldWarn: boolean,
@@ -124,9 +156,7 @@ function estimateDialogGrowth(opts) {
124
156
  const projectDir = (opts && opts.projectDir) || '.';
125
157
  const sessionId = opts && opts.sessionId;
126
158
  const k = (opts && Number.isFinite(opts.k) && opts.k > 0) ? Math.floor(opts.k) : DEFAULT_K;
127
- const cap = (opts && Number.isFinite(opts.modelContextCap) && opts.modelContextCap > 0)
128
- ? opts.modelContextCap
129
- : DEFAULT_MODEL_CONTEXT_CAP;
159
+ const cap = resolveContextCap(projectDir, opts && opts.modelContextCap);
130
160
  const warnThreshold = (opts && Number.isFinite(opts.warnThresholdTurns) && opts.warnThresholdTurns > 0)
131
161
  ? opts.warnThresholdTurns
132
162
  : DEFAULT_WARN_THRESHOLD_TURNS;
@@ -26,11 +26,18 @@
26
26
 
27
27
  const fs = require("fs");
28
28
  const path = require("path");
29
+ const { SAFE_DEFAULT_WINDOW } = require("./model-windows.cjs");
29
30
 
30
31
  // ── Constants ────────────────────────────────────────────────────────────────
31
32
 
32
33
  const MODEL_RATIOS = { haiku: 1, sonnet: 5, opus: 25 };
33
34
 
35
+ // Fallback context window when no per-model signal is available. Uses the
36
+ // model-windows safe LARGE default (1M) rather than a bare 200K literal: a
37
+ // too-small fallback re-introduces the premature-headless-handoff bug the
38
+ // model-aware sizing was added to fix. See bin/model-windows.cjs.
39
+ const FALLBACK_WINDOW = SAFE_DEFAULT_WINDOW;
40
+
34
41
  // Base token estimates per task type (in haiku-equivalent units)
35
42
  const BASE_ESTIMATES = {
36
43
  execute: 8000,
@@ -106,7 +113,9 @@ function getSessionStatus(projectDir) {
106
113
  const real = readContextMeterState(dir);
107
114
  if (real) {
108
115
  const consumed = real.inputTokens;
109
- const window = real.modelWindowSize > 0 ? real.modelWindowSize : 200000;
116
+ // Primary: the Context Meter writes a model-aware modelWindowSize into
117
+ // state (bin/model-windows.cjs). Fallback only when state predates that.
118
+ const window = real.modelWindowSize > 0 ? real.modelWindowSize : FALLBACK_WINDOW;
110
119
  const estimated_remaining = Math.max(0, window - consumed);
111
120
  const pct = Math.round(real.pct * 10) / 10;
112
121
  const threshold = bandFor(pct, thresholdPct);
@@ -145,7 +154,7 @@ function resolveThresholdPct(dir) {
145
154
  }
146
155
 
147
156
  function getSessionStatusHeuristic(dir, thresholdPct) {
148
- const window = 200000;
157
+ const window = FALLBACK_WINDOW;
149
158
  const consumed = readSessionConsumed(dir);
150
159
  const estimated_remaining = Math.max(0, window - consumed);
151
160
  const pct = window > 0 ? Math.round((consumed / window) * 100 * 10) / 10 : 0;
@@ -180,7 +189,7 @@ function recordUsage(usage) {
180
189
  */
181
190
  function estimateMilestoneCost(remainingTasks, projectDir) {
182
191
  const status = getSessionStatus(projectDir);
183
- const window = status.consumed + status.estimated_remaining || 200000;
192
+ const window = status.consumed + status.estimated_remaining || FALLBACK_WINDOW;
184
193
  const estimatedTokens = remainingTasks.reduce((sum, t) => {
185
194
  return sum + estimateCost(t.model, t.taskType, { complexity: t.complexity, projectDir });
186
195
  }, 0);
@@ -336,7 +336,7 @@ Create `summary.md`:
336
336
  ```markdown
337
337
  # Milestone Complete: {name}
338
338
 
339
- **Completed**: {date}
339
+ **Completed**: {YYYY-MM-DD HH:MM TZ}
340
340
  **Duration**: {start date} → {end date}
341
341
  **Status**: {VERIFIED | FORCED}
342
342
 
@@ -419,14 +419,18 @@ Steps to apply the trim:
419
419
  # GSD-T Progress
420
420
 
421
421
  ## Version: {new version}
422
+ ## Status: ACTIVE
423
+ ## Date: {YYYY-MM-DD HH:MM TZ — source from live `[GSD-T NOW]`}
422
424
  ## Current Milestone
423
425
  None — ready for next milestone
424
426
 
425
427
  ## Completed Milestones
426
428
  | Milestone | Version | Completed | Tag |
427
429
  |-----------|---------|-----------|-----|
428
- | {name} | {version} | {date} | v{version} |
429
- | {previous} | {version} | {date} | v{version} |
430
+ | {name} | {version} | {YYYY-MM-DD HH:MM TZ} | v{version} |
431
+ | {previous} | {version} | {YYYY-MM-DD HH:MM TZ — keep existing rows as-is; forward-only format} | v{version} |
432
+ <!-- M59 (v3.29.10): "Completed" cells are written `YYYY-MM-DD HH:MM TZ` from this version forward. Pre-3.29.10 rows that read `YYYY-MM-DD` stay as-is (forward-only — never rewrite). Readers MUST accept both. -->
433
+
430
434
 
431
435
  ## Decision Log
432
436
 
@@ -455,6 +455,27 @@ Use these when user asks for help on a specific command:
455
455
  - **Use when**: Final pre-merge gate. Both tracks always run; both report. `ok` is purely deterministic — LLM verdict is advisory.
456
456
  - **CLI**: `gsd-t verify-gate [--skip-track1] [--skip-track2] [--max-concurrency N] [--fail-fast] [--json]`. Exit 0/4/2/3.
457
457
 
458
+ ### build-coverage (M57)
459
+ - **Summary**: Detects new top-level paths in a milestone commit range not referenced by a real CI build input. Coverage is decided by STRUCTURALLY parsing CI files (Dockerfile COPY/ADD source args incl. relative `--from=`; cloudbuild `args`-positional; workflow `run`-positional via a block-scalar-aware YAML walker) — never substring-matching. `node_modules` never counts. No documented false-negative residual.
460
+ - **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
461
+ - **Files**: `bin/gsd-t-build-coverage.cjs`
462
+ - **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (new `hooks/` dir committed, absent from Dockerfile COPY, shipped broken while verify passed).
463
+ - **CLI**: `gsd-t build-coverage [--json] [--base REF] [--head REF] [--project-dir PATH]`. Exit 0/4/2.
464
+
465
+ ### ci-parity (M57)
466
+ - **Summary**: Reproduces the project's actual CI build locally instead of assuming warm-cache local tsc/test parity. Auto-detects CI config (cloudbuild → workflows → Dockerfile RUN → package.json scripts), clears build caches (containment-safe — refuses any config-derived delete resolving outside OR equal-to projectRoot), and auto-runs the real `docker build` when a Dockerfile is present (presence is the trigger, no opt-in flag).
467
+ - **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
468
+ - **Files**: `bin/gsd-t-ci-parity.cjs`
469
+ - **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (noImplicitAny regressions passed a warm-cache local tsc but failed CI's cold build).
470
+ - **CLI**: `gsd-t ci-parity [--project-dir PATH] [--timeout-ms MS] [--json]`. Exit 0/4/2.
471
+
472
+ ### test-data (M58)
473
+ - **Summary**: Append-only test-data ledger + purge engine. Tests register inserts via the `withTestData()` Playwright fixture; `gsd-t-verify` Step 4.5 purges them by adapter before VERDICT. Three built-in adapters: `localStorage-key-prefix`, `file-json-array`, `sqlite-table-where`. Each adapter refuses to delete records whose id does not start with the ledger row's `taggedPrefix` (defense in depth).
474
+ - **Auto-invoked**: Yes — by `gsd-t-verify` Step 4.5 (FAIL-blocking, never warning-only)
475
+ - **Files**: `bin/gsd-t-test-data-ledger.cjs`, `bin/gsd-t-test-data-adapters/*.cjs`, `templates/test-helpers/test-data-fixture.ts`
476
+ - **Use when**: Test data hygiene. Catches the GSD-T-Board class (2442 orphaned `E2E_TEST_*` / `E2E_DRAG_*` ideas left in the production data store after a passing Verify run).
477
+ - **CLI**: `gsd-t test-data --list [--run <id>] [--json]` / `gsd-t test-data --purge --run <id> [--dry-run] [--json] [--project <dir>]`. Exit 0 on success, 4 on adapter errors, 64 on usage error.
478
+
458
479
  ## Unknown Command
459
480
 
460
481
  If user asks for help on unrecognized command:
@@ -180,7 +180,7 @@ Create `.gsd-t/progress.md`:
180
180
  ## Project: {name from CLAUDE.md or $ARGUMENTS}
181
181
  ## Version: {detected version, or 0.1.00}
182
182
  ## Status: INITIALIZED
183
- ## Date: {today}
183
+ ## Date: {today YYYY-MM-DD HH:MM TZ — source from the live `[GSD-T NOW]` signal; never date-only}
184
184
 
185
185
  ## Milestones
186
186
  | # | Milestone | Status | Domains |
@@ -102,6 +102,52 @@ Defensive on missing `.gsd-t/ratelimit-map.json` — verify-gate falls back to
102
102
 
103
103
  Contract: `.gsd-t/contracts/verify-gate-contract.md` v1.0.0 STABLE.
104
104
 
105
+ <!-- M57: CI-parity FAIL-blocking gate -->
106
+ ## Step 2.6: CI-Parity Gate (MANDATORY — FAIL-blocking, never warning-only)
107
+
108
+ ```bash
109
+ node scripts/gsd-t-watch-state.js advance --agent-id "$GSD_T_AGENT_ID" --parent-id "${GSD_T_PARENT_AGENT_ID:-null}" --command gsd-t-verify --step 2 --step-label ".6: CI-Parity Gate" 2>/dev/null || true
110
+ ```
111
+
112
+ Origin: TimeTracking v1.10.12 shipped VERIFIED + tagged while Cloud Build
113
+ failed (a new top-level `hooks/` dir was committed but never added to the
114
+ Dockerfile `COPY` directives, and `noImplicitAny` regressions passed a
115
+ warm-cache local `tsc` but failed CI's cold build). `gsd-t-verify` must
116
+ reproduce the project's *actual* CI build, not assume local parity.
117
+
118
+ Run BOTH checks. **Either failing is a verify FAIL — it blocks
119
+ complete-milestone. This is never a warning-only signal.**
120
+
121
+ ```bash
122
+ # 1. Build-coverage — every new top-level path in the milestone range must be
123
+ # referenced by a real CI build input (structural parse, not substring).
124
+ gsd-t build-coverage --json > /tmp/gsd-t-build-coverage.json
125
+ BC_EXIT=$?
126
+
127
+ # 2. CI-parity — reproduce the project's actual CI build locally with caches
128
+ # cleared; auto-runs `docker build` when a Dockerfile is present.
129
+ gsd-t ci-parity --json > /tmp/gsd-t-ci-parity.json
130
+ CP_EXIT=$?
131
+ ```
132
+
133
+ - `build-coverage` exit **4** (`ok:false`, `missing[]` non-empty) → verify
134
+ FAIL. Report each uncovered path; the fix is to add the path to the
135
+ Dockerfile `COPY` / cloudbuild artifact / workflow build input.
136
+ - `ci-parity` exit **4** (`ok:false` — a detected CI command or the real
137
+ `docker build` failed) → verify FAIL. Report the failing command.
138
+ - exit **0** from both → gate passes.
139
+ - exit **2** (usage error, e.g. not a git repo) → record as a structured
140
+ note; not a pass-by-default (investigate before proceeding).
141
+
142
+ Both are pure-deterministic CLI checks (no LLM). They consume the same
143
+ preflight envelope as the M55 verify-gate Track 1, so failing here at
144
+ verify mirrors what CI would do — catching the TimeTracking class before
145
+ the milestone is tagged.
146
+
147
+ Contracts: `.gsd-t/contracts/cli-build-coverage-contract.md` v2.0.0 STABLE,
148
+ `.gsd-t/contracts/ci-parity-contract.md` v2.0.0 STABLE.
149
+ <!-- /M57: CI-parity FAIL-blocking gate -->
150
+
105
151
  ## Step 2.5: High-Risk Domain Gate (MANDATORY — Categories 2 and 7)
106
152
 
107
153
  ```bash
@@ -257,6 +303,49 @@ const { captureSpawn } = require('./bin/gsd-t-token-capture.cjs');
257
303
  `captureSpawn` parses `result.usage` and writes the row to `.gsd-t/token-log.md` under the canonical header. Tokens column renders as `in=N out=N cr=N cc=N $X.XX` or `—`, never `N/A`. Collect all reports, synthesize, create remediation plan.
258
304
  ```
259
305
 
306
+ <!-- M58: Test Data Cleanup Gate -->
307
+ ## Step 4.5: Test Data Cleanup Gate (MANDATORY — FAIL-blocking, never warning-only)
308
+
309
+ ```bash
310
+ node scripts/gsd-t-watch-state.js advance --agent-id "$GSD_T_AGENT_ID" --parent-id "${GSD_T_PARENT_AGENT_ID:-null}" --command gsd-t-verify --step 4 --step-label ".5: Test Data Cleanup Gate" 2>/dev/null || true
311
+ ```
312
+
313
+ Origin: GSD-T-Board v0.1.10 Verify ran the Playwright suite, the suite
314
+ passed, the milestone was tagged VERIFIED — and 2442 `E2E_TEST_*` /
315
+ `E2E_DRAG_*` ideas stayed live in the production data store. The gate
316
+ exists to catch that class: any test data registered during Verify via the
317
+ `withTestData()` Playwright fixture (or by direct calls to
318
+ `appendInsert(...)`) MUST be purged before VERDICT.
319
+
320
+ ```bash
321
+ # Verify-run id — set at Step 1; if not, derive from milestone + UTC.
322
+ : "${GSD_T_VERIFY_RUN_ID:=verify-${MILESTONE:-current}-$(date -u +%Y%m%dT%H%M%SZ)}"
323
+ export GSD_T_VERIFY_RUN_ID
324
+
325
+ # Purge anything the fixture (or appendInsert) registered during this run.
326
+ gsd-t test-data --purge --run "$GSD_T_VERIFY_RUN_ID" --json > /tmp/gsd-t-test-data-purge.json
327
+ TD_EXIT=$?
328
+ ```
329
+
330
+ - `test-data --purge` exit **0** (`errors:[]`) → record
331
+ `Test data: purged=<N> skipped=<M>` in the verify report. Gate passes.
332
+ - exit **4** (`errors.length > 0`) → verify FAIL. Append the first 5
333
+ `errors[].message` values to the verify report and surface the count of
334
+ remaining records. The fix is either (a) repair the adapter / store
335
+ configuration so purge succeeds, or (b) update the test to use the
336
+ fixture so the ledger has accurate entries.
337
+
338
+ The gate runs AFTER the E2E suite (Step 4) so any tests that inserted
339
+ test data via `withTestData(...)` have already populated the ledger.
340
+ It runs BEFORE Step 5 (verify report) so the purge counts can land in the
341
+ report. Tests that bypass the fixture and leave un-tagged data will not
342
+ be caught here — they're caught at the project's next dataset audit.
343
+
344
+ Pure-deterministic CLI check (no LLM). Contract:
345
+ `.gsd-t/contracts/test-data-ledger-contract.md` v1.0.0 STABLE.
346
+ Tagging: `.gsd-t/contracts/test-data-tagging-contract.md` v1.0.0 STABLE.
347
+ <!-- /M58: Test Data Cleanup Gate -->
348
+
260
349
  ## Step 5: Compile Verification Report
261
350
 
262
351
  ```bash
@@ -276,6 +365,7 @@ Create or update `.gsd-t/verify-report.md`:
276
365
  - Code Quality: {PASS/WARN/FAIL} — {N} issues found
277
366
  - Unit Tests: {PASS/WARN/FAIL} — {N}/{total} passing
278
367
  - E2E Tests: {PASS/WARN/FAIL} — {N}/{total} specs passing
368
+ - Test Data Cleanup: {PASS/FAIL} — purged={N} skipped={M} errors={E}
279
369
  - Security: {PASS/WARN/FAIL} — {N} findings
280
370
  - Integration: {PASS/WARN/FAIL}
281
371
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tekyzinc/gsd-t",
3
- "version": "3.26.11",
3
+ "version": "3.29.10",
4
4
  "description": "GSD-T: Contract-Driven Development for Claude Code — 54 slash commands with headless-by-default workflow spawning, unattended supervisor relay with event stream, graph-powered code analysis, real-time agent dashboard, task telemetry, doc-ripple enforcement, backlog management, impact analysis, test sync, milestone archival, and PRD generation",
5
5
  "author": "Tekyz, Inc.",
6
6
  "license": "MIT",
@@ -82,9 +82,12 @@ const readline = require("readline");
82
82
  * Parse a Claude Code transcript JSONL file.
83
83
  *
84
84
  * @param {string} transcriptPath - absolute path to a Claude Code transcript .jsonl
85
- * @returns {Promise<{system: string, messages: Array<{role: string, content: Array}>} | null>}
85
+ * @returns {Promise<{system: string, messages: Array<{role: string, content: Array}>, model: string|null} | null>}
86
86
  * Resolves to the reconstructed body, or `null` on unreadable file /
87
87
  * catastrophic parse failure. Caller treats `null` as "bail out, fail open".
88
+ * `model` is the last-seen assistant `message.model` id (the model the
89
+ * orchestrator session is running) or `null` if none observed — the
90
+ * context meter uses it to size the context window correctly.
88
91
  */
89
92
  async function parseTranscript(transcriptPath) {
90
93
  if (typeof transcriptPath !== "string" || transcriptPath.length === 0) {
@@ -101,6 +104,10 @@ async function parseTranscript(transcriptPath) {
101
104
 
102
105
  const messages = [];
103
106
  let system = "";
107
+ // Last-seen assistant model id. Claude Code records `message.model` on every
108
+ // assistant turn; the orchestrator session runs one model for its lifetime,
109
+ // so the last value is authoritative for sizing the context window.
110
+ let model = null;
104
111
 
105
112
  let stream;
106
113
  try {
@@ -136,6 +143,9 @@ async function parseTranscript(transcriptPath) {
136
143
  if (!msg || typeof msg !== "object") continue;
137
144
 
138
145
  const role = msg.role || type;
146
+ if (type === "assistant" && typeof msg.model === "string" && msg.model.length > 0) {
147
+ model = msg.model;
148
+ }
139
149
  const content = normalizeContent(msg.content, role);
140
150
  if (content === null) continue;
141
151
 
@@ -152,7 +162,7 @@ async function parseTranscript(transcriptPath) {
152
162
  return null;
153
163
  }
154
164
 
155
- return { system, messages: sanitizeToolPairs(messages) };
165
+ return { system, messages: sanitizeToolPairs(messages), model };
156
166
  }
157
167
 
158
168
  /**
@@ -39,17 +39,17 @@ test("empty path / non-string → returns null", async () => {
39
39
  assert.equal(await parseTranscript(undefined), null);
40
40
  });
41
41
 
42
- test("empty file → returns { system:'', messages:[] }", async () => {
42
+ test("empty file → returns { system:'', messages:[], model:null }", async () => {
43
43
  const { dir, file } = mkTmpFile([]);
44
44
  try {
45
45
  const got = await parseTranscript(file);
46
- assert.deepEqual(got, { system: "", messages: [] });
46
+ assert.deepEqual(got, { system: "", messages: [], model: null });
47
47
  } finally {
48
48
  cleanup(dir);
49
49
  }
50
50
  });
51
51
 
52
- test("file with only unknown event types → { system:'', messages:[] }", async () => {
52
+ test("file with only unknown event types → { system:'', messages:[], model:null }", async () => {
53
53
  const { dir, file } = mkTmpFile([
54
54
  { type: "summary", foo: "bar" },
55
55
  { type: "system", subtype: "hook", hookInfos: [] },
@@ -60,7 +60,7 @@ test("file with only unknown event types → { system:'', messages:[] }", async
60
60
  ]);
61
61
  try {
62
62
  const got = await parseTranscript(file);
63
- assert.deepEqual(got, { system: "", messages: [] });
63
+ assert.deepEqual(got, { system: "", messages: [], model: null });
64
64
  } finally {
65
65
  cleanup(dir);
66
66
  }
@@ -104,6 +104,53 @@ test("normal conversation — string-content user + text assistant", async () =>
104
104
  }
105
105
  });
106
106
 
107
+ test("captures last-seen assistant model id (for window sizing)", async () => {
108
+ const { dir, file } = mkTmpFile([
109
+ { type: "user", message: { role: "user", content: "hi" } },
110
+ {
111
+ type: "assistant",
112
+ message: {
113
+ role: "assistant",
114
+ model: "claude-opus-4-7",
115
+ content: [{ type: "text", text: "first" }],
116
+ },
117
+ },
118
+ { type: "user", message: { role: "user", content: "more" } },
119
+ {
120
+ type: "assistant",
121
+ message: {
122
+ role: "assistant",
123
+ model: "claude-opus-4-7-20260115",
124
+ content: [{ type: "text", text: "second" }],
125
+ },
126
+ },
127
+ ]);
128
+ try {
129
+ const got = await parseTranscript(file);
130
+ // Last assistant model wins (orchestrator session is single-model, but the
131
+ // last value is authoritative if a dated id supersedes a bare one).
132
+ assert.equal(got.model, "claude-opus-4-7-20260115");
133
+ } finally {
134
+ cleanup(dir);
135
+ }
136
+ });
137
+
138
+ test("model stays null when no assistant message carries one", async () => {
139
+ const { dir, file } = mkTmpFile([
140
+ { type: "user", message: { role: "user", content: "hi" } },
141
+ {
142
+ type: "assistant",
143
+ message: { role: "assistant", content: [{ type: "text", text: "no model field" }] },
144
+ },
145
+ ]);
146
+ try {
147
+ const got = await parseTranscript(file);
148
+ assert.equal(got.model, null);
149
+ } finally {
150
+ cleanup(dir);
151
+ }
152
+ });
153
+
107
154
  test("tool_use / tool_result pairing by tool_use_id preserved in order", async () => {
108
155
  const TOOL_ID = "toolu_01ABC";
109
156
  const { dir, file } = mkTmpFile([
@@ -39,9 +39,16 @@
39
39
  const fs = require("fs");
40
40
  const path = require("path");
41
41
 
42
+ const { SAFE_DEFAULT_WINDOW } = require("../bin/model-windows.cjs");
43
+
42
44
  const MAX_STDIN = 1024 * 1024; // 1 MiB
43
45
  const SCHEMA_VERSION = 1;
44
- const DEFAULT_CW_CEILING_TOKENS = 200000; // input-token budget per CW
46
+ // Input-token budget per CW = the model context window. Default to the
47
+ // model-aware safe window (1M); the old 200K literal was correct only for
48
+ // pre-4 models and skewed every actualCwPct calibration ratio 5× on
49
+ // Opus/Sonnet. Event state may still override via cwCeilingTokens (the
50
+ // economics estimator records the model-aware ceiling it actually used).
51
+ const DEFAULT_CW_CEILING_TOKENS = SAFE_DEFAULT_WINDOW;
45
52
 
46
53
  if (require.main === module) {
47
54
  let input = "";
@@ -74,7 +74,7 @@ class Sandbox {
74
74
  * The charCount parameter controls how many characters of text content
75
75
  * are in the transcript, which determines the estimated token count.
76
76
  */
77
- writeTranscript(filename = "transcript.jsonl", charCount = 100) {
77
+ writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
78
78
  const userText = "x".repeat(Math.floor(charCount / 2));
79
79
  const assistantText = "y".repeat(Math.ceil(charCount / 2));
80
80
  const lines = [
@@ -89,7 +89,7 @@ class Sandbox {
89
89
  message: {
90
90
  role: "assistant",
91
91
  content: [{ type: "text", text: assistantText }],
92
- model: "claude-opus-4-6",
92
+ model,
93
93
  },
94
94
  uuid: "a1",
95
95
  sessionId: "sess-1",
@@ -231,7 +231,9 @@ afterEach(async () => {
231
231
  /* ──────────────────────────── tests ──────────────────────────── */
232
232
 
233
233
  test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
234
- // 100 chars of text content → ~29 tokens (100/3.5) 0.014% of 200K window
234
+ // 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
235
+ // EFFECTIVE window is the real 1M (model-aware sizing), not the config's
236
+ // legacy 200K — ~0.003% of 1M, well below threshold.
235
237
  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
236
238
  const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
237
239
 
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
248
250
  assert.equal(state.version, 1);
249
251
  assert.ok(state.inputTokens > 0, "should have estimated some tokens");
250
252
  assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
251
- assert.equal(state.modelWindowSize, 200000);
253
+ assert.equal(
254
+ state.modelWindowSize,
255
+ 1_000_000,
256
+ "window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
257
+ );
252
258
  assert.ok(state.pct < 1, "pct should be well below threshold");
253
259
  assert.equal(state.threshold, "normal");
254
260
  assert.equal(state.checkCount, 1);
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
258
264
  });
259
265
 
260
266
  test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
261
- // 600K chars → ~171K tokens → 85.7% of 200K window warn band + additionalContext
267
+ // Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
268
+ // threshold band + additionalContext. (Model-aware sizing means we pin a
269
+ // 200K-window model here rather than relying on a stale config default.)
262
270
  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
263
- const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 600000);
271
+ const transcriptPath = sandbox.writeTranscript(
272
+ "transcript.jsonl",
273
+ 600000,
274
+ "claude-haiku-4-5-20251001"
275
+ );
264
276
 
265
277
  const { stdout, code } = await sandbox.runHook({
266
278
  payload: { session_id: "test-above", transcript_path: transcriptPath },
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
283
295
  assert.equal(sandbox.tmpFileExists(), false);
284
296
  });
285
297
 
298
+ test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
299
+ // The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
300
+ // hardcoded 200K window this read as ~85% → false headless handoff while
301
+ // ~64% of context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
302
+ // 171K is only ~17% → stdout {} → no premature handoff.
303
+ sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
304
+ const transcriptPath = sandbox.writeTranscript(
305
+ "transcript.jsonl",
306
+ 600000,
307
+ "claude-opus-4-7"
308
+ );
309
+
310
+ const { stdout, code } = await sandbox.runHook({
311
+ payload: { session_id: "test-regression", transcript_path: transcriptPath },
312
+ });
313
+
314
+ assert.equal(code, 0);
315
+ const parsed = JSON.parse(stdout || "{}");
316
+ assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
317
+
318
+ const state = sandbox.readState();
319
+ assert.equal(state.modelWindowSize, 1_000_000);
320
+ assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
321
+ assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
322
+ assert.equal(state.threshold, "normal");
323
+ });
324
+
286
325
  test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
287
326
  sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });
288
327
 
@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
37
37
  const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
38
38
  const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
39
39
  const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
40
+ const { windowForModel } = require("../bin/model-windows.cjs");
40
41
 
41
42
  const STATE_VERSION = 1;
42
43
 
@@ -208,6 +209,18 @@ async function runMeter(opts) {
208
209
  return {};
209
210
  }
210
211
 
212
+ // 5b. Resolve the EFFECTIVE context window from the model the orchestrator
213
+ // session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
214
+ // ship a 1M window; the config default (200k) is a legacy fallback that
215
+ // overcounts usage 5× and fires the headless handoff far too early. We
216
+ // only override when the transcript reports a model — a reported model wins
217
+ // even over an explicit project config value; otherwise cfg.modelWindowSize is used.
218
+ const effectiveWindow =
219
+ typeof parsed.model === "string" && parsed.model.length > 0
220
+ ? windowForModel(parsed.model)
221
+ : cfg.modelWindowSize;
222
+ state.modelWindowSize = effectiveWindow;
223
+
211
224
  // 6. Estimate tokens locally (no API call, zero cost).
212
225
  let tokenResp;
213
226
  try {
@@ -237,7 +250,7 @@ async function runMeter(opts) {
237
250
  // 8. Success path — compute pct, band, possibly emit additionalContext.
238
251
  const pct = computePct({
239
252
  inputTokens: tokenResp.inputTokens,
240
- modelWindowSize: cfg.modelWindowSize,
253
+ modelWindowSize: effectiveWindow,
241
254
  });
242
255
  const band = bandFor(pct, cfg.thresholdPct);
243
256
 
@@ -251,13 +264,14 @@ async function runMeter(opts) {
251
264
  logPath,
252
265
  "INFO",
253
266
  "measure",
254
- `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}`,
267
+ `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
268
+ `window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
255
269
  clock
256
270
  );
257
271
 
258
272
  const additionalContext = buildAdditionalContext({
259
273
  pct,
260
- modelWindowSize: cfg.modelWindowSize,
274
+ modelWindowSize: effectiveWindow,
261
275
  thresholdPct: cfg.thresholdPct,
262
276
  });
263
277
  if (additionalContext) {