@tekyzinc/gsd-t 3.26.11 → 3.27.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +47 -0
- package/README.md +2 -0
- package/bin/context-budget-audit.cjs +17 -2
- package/bin/gsd-t-build-coverage.cjs +438 -0
- package/bin/gsd-t-ci-parity.cjs +500 -0
- package/bin/gsd-t-economics.cjs +37 -9
- package/bin/gsd-t.js +21 -0
- package/bin/model-windows.cjs +99 -0
- package/bin/model-windows.test.cjs +75 -0
- package/bin/runway-estimator.cjs +35 -5
- package/bin/token-budget.cjs +12 -3
- package/commands/gsd-t-help.md +14 -0
- package/commands/gsd-t-verify.md +46 -0
- package/package.json +1 -1
- package/scripts/context-meter/transcript-parser.js +12 -2
- package/scripts/context-meter/transcript-parser.test.js +51 -4
- package/scripts/gsd-t-calibration-hook.js +8 -1
- package/scripts/gsd-t-context-meter.e2e.test.js +45 -6
- package/scripts/gsd-t-context-meter.js +17 -3
- package/scripts/gsd-t-context-meter.test.js +85 -0
- package/templates/CLAUDE-global.md +6 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bin/model-windows.cjs
|
|
3
|
+
*
|
|
4
|
+
* Single source of truth for Claude model → context-window size (in input
|
|
5
|
+
* tokens). The context meter and every downstream budget module must size the
|
|
6
|
+
* window from the MODEL ACTUALLY RUNNING, not a hardcoded constant.
|
|
7
|
+
*
|
|
8
|
+
* Why this exists
|
|
9
|
+
* ---------------
|
|
10
|
+
* Prior to this module every budget site hardcoded `200000` with a comment
|
|
11
|
+
* "claude-opus-4-6 default". Opus 4.6 and 4.7 (and Sonnet 4.x) ship a
|
|
12
|
+
* 1,000,000-token context window. Hardcoding 200k made the context meter
|
|
13
|
+
* overcount usage 5× and fire the headless handoff at ~64% of context
|
|
14
|
+
* REMAINING. This map fixes that at the source.
|
|
15
|
+
*
|
|
16
|
+
* Resolution strategy
|
|
17
|
+
* -------------------
|
|
18
|
+
* GSD-T jumps between models per-subagent, so a static config value is wrong.
|
|
19
|
+
* The orchestrator session whose transcript the meter reads, however, runs a
|
|
20
|
+
* single model for its lifetime, and every assistant message in the transcript
|
|
21
|
+
* records its `model` id. `windowForModel(modelId)` maps that id to a window.
|
|
22
|
+
*
|
|
23
|
+
* Matching is by longest-prefix so versioned ids resolve even if a future
|
|
24
|
+
* dated suffix appears (e.g. "claude-opus-4-7-20260115" → opus 4.x entry).
|
|
25
|
+
* Unknown / missing model → SAFE_DEFAULT_WINDOW (the large 1M window). A guard
|
|
26
|
+
* that fires a little late is less harmful than one that fires prematurely on
|
|
27
|
+
* a real 1M session, so we deliberately pick the large default so that the
|
|
28
|
+
* meter does NOT regress to premature handoffs on an unrecognized new model.
|
|
29
|
+
*
|
|
30
|
+
* Zero dependencies. CommonJS. Pure functions.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
"use strict";
|
|
34
|
+
|
|
35
|
+
// The conservative fallback when a model can't be resolved. We choose the
|
|
36
|
+
// LARGE window (1M) on purpose: the bug we are fixing is premature handoff
|
|
37
|
+
// from a too-SMALL assumed window. An unknown future model is far more likely
|
|
38
|
+
// to have a >=1M window than a 200k one, and an over-large window degrades
|
|
39
|
+
// gracefully (handoff a little late) whereas an under-small one breaks the
|
|
40
|
+
// workflow (handoff way too early, the reported symptom).
|
|
41
|
+
const SAFE_DEFAULT_WINDOW = 1_000_000;
|
|
42
|
+
|
|
43
|
+
// The legacy small window, kept as a named export for the few call sites that
|
|
44
|
+
// must preserve old behavior explicitly (e.g. fixtures, back-compat configs).
|
|
45
|
+
const LEGACY_SMALL_WINDOW = 200_000;
|
|
46
|
+
|
|
47
|
+
// Longest-prefix map: key is a model-id prefix, value is the input-token
|
|
48
|
+
// context window for that model family. Order does not matter — resolution
|
|
49
|
+
// picks the LONGEST matching prefix.
|
|
50
|
+
const MODEL_WINDOWS = Object.freeze({
|
|
51
|
+
// Opus 4.6 / 4.7 — 1M context window.
|
|
52
|
+
"claude-opus-4-6": 1_000_000,
|
|
53
|
+
"claude-opus-4-7": 1_000_000,
|
|
54
|
+
// Generic opus-4 fallback (covers any 4.x point release not listed above).
|
|
55
|
+
"claude-opus-4": 1_000_000,
|
|
56
|
+
|
|
57
|
+
// Sonnet 4.x — 1M context window.
|
|
58
|
+
"claude-sonnet-4": 1_000_000,
|
|
59
|
+
|
|
60
|
+
// Haiku 4.x — 200k context window.
|
|
61
|
+
"claude-haiku-4": 200_000,
|
|
62
|
+
|
|
63
|
+
// Pre-4 families (defensive — older long sessions / replayed transcripts).
|
|
64
|
+
"claude-3-7-sonnet": 200_000,
|
|
65
|
+
"claude-3-5-sonnet": 200_000,
|
|
66
|
+
"claude-3-5-haiku": 200_000,
|
|
67
|
+
"claude-3-opus": 200_000,
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
/**
 * Resolve a context-window size (input tokens) for a Claude model id.
 *
 * Matching is case-insensitive, ignores surrounding whitespace, and picks the
 * LONGEST key of MODEL_WINDOWS that prefixes the id, so dated ids such as
 * "claude-opus-4-7-20260115" resolve to their family entry.
 *
 * Provider-qualified ids (e.g. "us.anthropic.claude-haiku-4-5-20251001-v1:0")
 * are matched from the first "claude-" onward. Every key in MODEL_WINDOWS
 * starts with "claude-", so without this step such ids could never match and
 * a small-window model would silently receive the large default.
 *
 * @param {string|null|undefined} modelId e.g. "claude-opus-4-7" or
 *   "claude-opus-4-7-20260115". Non-string / empty → SAFE_DEFAULT_WINDOW.
 * @returns {number} the window of the longest matching prefix, or
 *   SAFE_DEFAULT_WINDOW when no prefix matches.
 */
function windowForModel(modelId) {
  if (typeof modelId !== "string" || modelId.length === 0) {
    return SAFE_DEFAULT_WINDOW;
  }
  const normalized = modelId.trim().toLowerCase();

  // Drop any provider / region qualifier that precedes the family name.
  // Ids that already start with "claude-" (index 0) are left untouched.
  const familyStart = normalized.indexOf("claude-");
  const id = familyStart > 0 ? normalized.slice(familyStart) : normalized;

  let best = null;
  let bestLen = -1;
  for (const prefix of Object.keys(MODEL_WINDOWS)) {
    // Strictly-longer comparison makes the result independent of key order.
    if (prefix.length > bestLen && id.startsWith(prefix)) {
      best = MODEL_WINDOWS[prefix];
      bestLen = prefix.length;
    }
  }
  return best != null ? best : SAFE_DEFAULT_WINDOW;
}
|
|
93
|
+
|
|
94
|
+
module.exports = {
|
|
95
|
+
windowForModel,
|
|
96
|
+
MODEL_WINDOWS,
|
|
97
|
+
SAFE_DEFAULT_WINDOW,
|
|
98
|
+
LEGACY_SMALL_WINDOW,
|
|
99
|
+
};
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for bin/model-windows.cjs — model → context-window resolution.
|
|
3
|
+
*
|
|
4
|
+
* The bug this fixes: the context meter hardcoded a 200k window so an Opus 4.7
|
|
5
|
+
* session (1M window) read as 5× over budget, firing the headless handoff at
|
|
6
|
+
* ~64% of context REMAINING. These tests pin the corrected windows.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
"use strict";
|
|
10
|
+
|
|
11
|
+
const test = require("node:test");
|
|
12
|
+
const assert = require("node:assert/strict");
|
|
13
|
+
|
|
14
|
+
const {
|
|
15
|
+
windowForModel,
|
|
16
|
+
MODEL_WINDOWS,
|
|
17
|
+
SAFE_DEFAULT_WINDOW,
|
|
18
|
+
LEGACY_SMALL_WINDOW,
|
|
19
|
+
} = require("./model-windows.cjs");
|
|
20
|
+
|
|
21
|
+
test("Opus 4.7 resolves to a 1M window (the reported regression)", () => {
|
|
22
|
+
assert.equal(windowForModel("claude-opus-4-7"), 1_000_000);
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
test("Opus 4.6 resolves to a 1M window", () => {
|
|
26
|
+
assert.equal(windowForModel("claude-opus-4-6"), 1_000_000);
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
test("dated/versioned suffix still resolves via longest-prefix", () => {
|
|
30
|
+
assert.equal(windowForModel("claude-opus-4-7-20260115"), 1_000_000);
|
|
31
|
+
assert.equal(windowForModel("claude-sonnet-4-6-20251201"), 1_000_000);
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
test("Sonnet 4.x resolves to a 1M window", () => {
|
|
35
|
+
assert.equal(windowForModel("claude-sonnet-4-6"), 1_000_000);
|
|
36
|
+
assert.equal(windowForModel("claude-sonnet-4"), 1_000_000);
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
test("Haiku 4.x resolves to the 200k window", () => {
|
|
40
|
+
assert.equal(windowForModel("claude-haiku-4-5-20251001"), 200_000);
|
|
41
|
+
assert.equal(windowForModel("claude-haiku-4"), 200_000);
|
|
42
|
+
});
|
|
43
|
+
|
|
44
|
+
test("longest-prefix wins over a shorter generic prefix", () => {
|
|
45
|
+
// "claude-opus-4-7" (15 chars) must beat "claude-opus-4" (13 chars). Both map
|
|
46
|
+
// to 1M today, so this cannot tell which prefix won; comparing against the
|
|
47
|
+
// specific entry means a future divergence between the two would surface here.
|
|
48
|
+
assert.equal(windowForModel("claude-opus-4-7"), MODEL_WINDOWS["claude-opus-4-7"]);
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
test("case-insensitive and whitespace-tolerant", () => {
|
|
52
|
+
assert.equal(windowForModel(" CLAUDE-OPUS-4-7 "), 1_000_000);
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
test("unknown / missing model falls back to the SAFE large default", () => {
|
|
56
|
+
assert.equal(windowForModel("claude-future-99"), SAFE_DEFAULT_WINDOW);
|
|
57
|
+
assert.equal(windowForModel(""), SAFE_DEFAULT_WINDOW);
|
|
58
|
+
assert.equal(windowForModel(null), SAFE_DEFAULT_WINDOW);
|
|
59
|
+
assert.equal(windowForModel(undefined), SAFE_DEFAULT_WINDOW);
|
|
60
|
+
assert.equal(windowForModel(42), SAFE_DEFAULT_WINDOW);
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
test("SAFE_DEFAULT_WINDOW is the large (1M) window, not the legacy 200k", () => {
|
|
64
|
+
// Core anti-regression assertion: the fallback must NOT reintroduce the
|
|
65
|
+
// premature-handoff bug for an unrecognized model.
|
|
66
|
+
assert.equal(SAFE_DEFAULT_WINDOW, 1_000_000);
|
|
67
|
+
assert.equal(LEGACY_SMALL_WINDOW, 200_000);
|
|
68
|
+
assert.notEqual(SAFE_DEFAULT_WINDOW, LEGACY_SMALL_WINDOW);
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
test("every mapped window is a positive integer", () => {
|
|
72
|
+
for (const [k, v] of Object.entries(MODEL_WINDOWS)) {
|
|
73
|
+
assert.ok(Number.isInteger(v) && v > 0, `${k} → ${v} must be a positive int`);
|
|
74
|
+
}
|
|
75
|
+
});
|
package/bin/runway-estimator.cjs
CHANGED
|
@@ -26,12 +26,43 @@
|
|
|
26
26
|
|
|
27
27
|
const fs = require('fs');
|
|
28
28
|
const path = require('path');
|
|
29
|
+
const { SAFE_DEFAULT_WINDOW } = require('./model-windows.cjs');
|
|
29
30
|
|
|
30
31
|
const DEFAULT_K = 5;
|
|
31
|
-
|
|
32
|
+
// The model context cap IS the model's true window. Default to the model-aware
|
|
33
|
+
// safe LARGE window (1M) — the old 200K literal was correct only for pre-4
|
|
34
|
+
// models and made turn-to-compact predictions fire 5× too early on Opus/Sonnet.
|
|
35
|
+
// Callers may still pass an explicit `modelContextCap`, and resolveContextCap()
|
|
36
|
+
// below prefers a fresh Context Meter reading when available.
|
|
37
|
+
const DEFAULT_MODEL_CONTEXT_CAP = SAFE_DEFAULT_WINDOW;
|
|
32
38
|
// Claude Code starts auto-compacting ~8% before the model window fills, so the
|
|
33
39
|
// effective dialog ceiling is 0.92 × modelContextCap.
|
|
34
40
|
const PRE_COMPACT_HEADROOM = 0.92;
|
|
41
|
+
// Context Meter state — its modelWindowSize is model-aware (bin/model-windows.cjs).
|
|
42
|
+
const METER_STATE_REL = '.gsd-t/.context-meter-state.json';
|
|
43
|
+
const METER_STATE_STALE_MS = 5 * 60 * 1000;
|
|
44
|
+
|
|
45
|
+
/**
 * Resolve the effective model context cap. Priority:
 *   1. explicit opts.modelContextCap (caller override)
 *   2. fresh Context Meter state modelWindowSize (model-aware, written by the
 *      meter hook from the running model)
 *   3. DEFAULT_MODEL_CONTEXT_CAP (safe large fallback)
 *
 * A state reading counts as fresh when its `timestamp` parses to a time that
 * is not in the future and is at most METER_STATE_STALE_MS old.
 *
 * @param {string} [projectDir] directory containing METER_STATE_REL; a falsy
 *   value is treated as '.'.
 * @param {number} [optCap] caller override; used only when finite and > 0.
 * @returns {number} the resolved cap in tokens.
 */
function resolveContextCap(projectDir, optCap) {
  if (Number.isFinite(optCap) && optCap > 0) return optCap;
  try {
    const statePath = path.join(projectDir || '.', METER_STATE_REL);
    const state = JSON.parse(fs.readFileSync(statePath, 'utf8'));
    const size = state && state.modelWindowSize;
    // Same validation as the explicit override: a finite positive number.
    // JSON such as `1e999` parses to Infinity and must not become the cap.
    if (Number.isFinite(size) && size > 0 && state.timestamp) {
      const age = Date.now() - Date.parse(state.timestamp);
      // An unparseable timestamp yields NaN, which fails both comparisons.
      if (age >= 0 && age <= METER_STATE_STALE_MS) {
        return size;
      }
    }
  } catch (_) { /* missing, unreadable or malformed state: fall through to default */ }
  return DEFAULT_MODEL_CONTEXT_CAP;
}
|
|
35
66
|
const DEFAULT_WARN_THRESHOLD_TURNS = 5;
|
|
36
67
|
const MIN_HISTORY = 3;
|
|
37
68
|
|
|
@@ -107,7 +138,8 @@ function _sortTurns(rows) {
|
|
|
107
138
|
* @param {string} opts.projectDir
|
|
108
139
|
* @param {string} opts.sessionId required
|
|
109
140
|
* @param {number} [opts.k] default 5 (last K turns)
|
|
110
|
-
* @param {number} [opts.modelContextCap] default
|
|
141
|
+
* @param {number} [opts.modelContextCap] default: fresh Context Meter
|
|
142
|
+
* modelWindowSize if available, else the model-aware safe window (1M)
|
|
111
143
|
* @param {number} [opts.warnThresholdTurns] default 5
|
|
112
144
|
* @returns {{
|
|
113
145
|
* shouldWarn: boolean,
|
|
@@ -124,9 +156,7 @@ function estimateDialogGrowth(opts) {
|
|
|
124
156
|
const projectDir = (opts && opts.projectDir) || '.';
|
|
125
157
|
const sessionId = opts && opts.sessionId;
|
|
126
158
|
const k = (opts && Number.isFinite(opts.k) && opts.k > 0) ? Math.floor(opts.k) : DEFAULT_K;
|
|
127
|
-
const cap = (
|
|
128
|
-
? opts.modelContextCap
|
|
129
|
-
: DEFAULT_MODEL_CONTEXT_CAP;
|
|
159
|
+
const cap = resolveContextCap(projectDir, opts && opts.modelContextCap);
|
|
130
160
|
const warnThreshold = (opts && Number.isFinite(opts.warnThresholdTurns) && opts.warnThresholdTurns > 0)
|
|
131
161
|
? opts.warnThresholdTurns
|
|
132
162
|
: DEFAULT_WARN_THRESHOLD_TURNS;
|
package/bin/token-budget.cjs
CHANGED
|
@@ -26,11 +26,18 @@
|
|
|
26
26
|
|
|
27
27
|
const fs = require("fs");
|
|
28
28
|
const path = require("path");
|
|
29
|
+
const { SAFE_DEFAULT_WINDOW } = require("./model-windows.cjs");
|
|
29
30
|
|
|
30
31
|
// ── Constants ────────────────────────────────────────────────────────────────
|
|
31
32
|
|
|
32
33
|
const MODEL_RATIOS = { haiku: 1, sonnet: 5, opus: 25 };
|
|
33
34
|
|
|
35
|
+
// Fallback context window when no per-model signal is available. Uses the
|
|
36
|
+
// model-windows safe LARGE default (1M) rather than a bare 200K literal: a
|
|
37
|
+
// too-small fallback re-introduces the premature-headless-handoff bug the
|
|
38
|
+
// model-aware sizing was added to fix. See bin/model-windows.cjs.
|
|
39
|
+
const FALLBACK_WINDOW = SAFE_DEFAULT_WINDOW;
|
|
40
|
+
|
|
34
41
|
// Base token estimates per task type (in haiku-equivalent units)
|
|
35
42
|
const BASE_ESTIMATES = {
|
|
36
43
|
execute: 8000,
|
|
@@ -106,7 +113,9 @@ function getSessionStatus(projectDir) {
|
|
|
106
113
|
const real = readContextMeterState(dir);
|
|
107
114
|
if (real) {
|
|
108
115
|
const consumed = real.inputTokens;
|
|
109
|
-
|
|
116
|
+
// Primary: the Context Meter writes a model-aware modelWindowSize into
|
|
117
|
+
// state (bin/model-windows.cjs). Fallback only when state predates that.
|
|
118
|
+
const window = real.modelWindowSize > 0 ? real.modelWindowSize : FALLBACK_WINDOW;
|
|
110
119
|
const estimated_remaining = Math.max(0, window - consumed);
|
|
111
120
|
const pct = Math.round(real.pct * 10) / 10;
|
|
112
121
|
const threshold = bandFor(pct, thresholdPct);
|
|
@@ -145,7 +154,7 @@ function resolveThresholdPct(dir) {
|
|
|
145
154
|
}
|
|
146
155
|
|
|
147
156
|
function getSessionStatusHeuristic(dir, thresholdPct) {
|
|
148
|
-
const window =
|
|
157
|
+
const window = FALLBACK_WINDOW;
|
|
149
158
|
const consumed = readSessionConsumed(dir);
|
|
150
159
|
const estimated_remaining = Math.max(0, window - consumed);
|
|
151
160
|
const pct = window > 0 ? Math.round((consumed / window) * 100 * 10) / 10 : 0;
|
|
@@ -180,7 +189,7 @@ function recordUsage(usage) {
|
|
|
180
189
|
*/
|
|
181
190
|
function estimateMilestoneCost(remainingTasks, projectDir) {
|
|
182
191
|
const status = getSessionStatus(projectDir);
|
|
183
|
-
const window = status.consumed + status.estimated_remaining ||
|
|
192
|
+
const window = status.consumed + status.estimated_remaining || FALLBACK_WINDOW;
|
|
184
193
|
const estimatedTokens = remainingTasks.reduce((sum, t) => {
|
|
185
194
|
return sum + estimateCost(t.model, t.taskType, { complexity: t.complexity, projectDir });
|
|
186
195
|
}, 0);
|
package/commands/gsd-t-help.md
CHANGED
|
@@ -455,6 +455,20 @@ Use these when user asks for help on a specific command:
|
|
|
455
455
|
- **Use when**: Final pre-merge gate. Both tracks always run; both report. `ok` is purely deterministic — LLM verdict is advisory.
|
|
456
456
|
- **CLI**: `gsd-t verify-gate [--skip-track1] [--skip-track2] [--max-concurrency N] [--fail-fast] [--json]`. Exit 0/4/2/3.
|
|
457
457
|
|
|
458
|
+
### build-coverage (M57)
|
|
459
|
+
- **Summary**: Detects new top-level paths in a milestone commit range not referenced by a real CI build input. Coverage is decided by STRUCTURALLY parsing CI files (Dockerfile COPY/ADD source args incl. relative `--from=`; cloudbuild `args`-positional; workflow `run`-positional via a block-scalar-aware YAML walker) — never substring-matching. `node_modules` never counts. No documented false-negative residual.
|
|
460
|
+
- **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
|
|
461
|
+
- **Files**: `bin/gsd-t-build-coverage.cjs`
|
|
462
|
+
- **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (new `hooks/` dir committed, absent from Dockerfile COPY, shipped broken while verify passed).
|
|
463
|
+
- **CLI**: `gsd-t build-coverage [--json] [--base REF] [--head REF] [--project-dir PATH]`. Exit 0/4/2.
|
|
464
|
+
|
|
465
|
+
### ci-parity (M57)
|
|
466
|
+
- **Summary**: Reproduces the project's actual CI build locally instead of assuming warm-cache local tsc/test parity. Auto-detects CI config (cloudbuild → workflows → Dockerfile RUN → package.json scripts), clears build caches (containment-safe — refuses any config-derived delete resolving outside OR equal-to projectRoot), and auto-runs the real `docker build` when a Dockerfile is present (presence is the trigger, no opt-in flag).
|
|
467
|
+
- **Auto-invoked**: Yes — by `gsd-t-verify` Step 2.6 (FAIL-blocking, never warning-only)
|
|
468
|
+
- **Files**: `bin/gsd-t-ci-parity.cjs`
|
|
469
|
+
- **Use when**: Final pre-merge gate. Catches the TimeTracking v1.10.12 class (noImplicitAny regressions passed a warm-cache local tsc but failed CI's cold build).
|
|
470
|
+
- **CLI**: `gsd-t ci-parity [--project-dir PATH] [--timeout-ms MS] [--json]`. Exit 0/4/2.
|
|
471
|
+
|
|
458
472
|
## Unknown Command
|
|
459
473
|
|
|
460
474
|
If user asks for help on unrecognized command:
|
package/commands/gsd-t-verify.md
CHANGED
|
@@ -102,6 +102,52 @@ Defensive on missing `.gsd-t/ratelimit-map.json` — verify-gate falls back to
|
|
|
102
102
|
|
|
103
103
|
Contract: `.gsd-t/contracts/verify-gate-contract.md` v1.0.0 STABLE.
|
|
104
104
|
|
|
105
|
+
<!-- M57: CI-parity FAIL-blocking gate -->
|
|
106
|
+
## Step 2.6: CI-Parity Gate (MANDATORY — FAIL-blocking, never warning-only)
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
node scripts/gsd-t-watch-state.js advance --agent-id "$GSD_T_AGENT_ID" --parent-id "${GSD_T_PARENT_AGENT_ID:-null}" --command gsd-t-verify --step 2 --step-label ".6: CI-Parity Gate" 2>/dev/null || true
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Origin: TimeTracking v1.10.12 shipped VERIFIED + tagged while Cloud Build
|
|
113
|
+
failed (a new top-level `hooks/` dir was committed but never added to the
|
|
114
|
+
Dockerfile `COPY` directives, and `noImplicitAny` regressions passed a
|
|
115
|
+
warm-cache local `tsc` but failed CI's cold build). `gsd-t-verify` must
|
|
116
|
+
reproduce the project's *actual* CI build, not assume local parity.
|
|
117
|
+
|
|
118
|
+
Run BOTH checks. **Either failing is a verify FAIL — it blocks
|
|
119
|
+
complete-milestone. This is never a warning-only signal.**
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# 1. Build-coverage — every new top-level path in the milestone range must be
|
|
123
|
+
# referenced by a real CI build input (structural parse, not substring).
|
|
124
|
+
gsd-t build-coverage --json > /tmp/gsd-t-build-coverage.json
|
|
125
|
+
BC_EXIT=$?
|
|
126
|
+
|
|
127
|
+
# 2. CI-parity — reproduce the project's actual CI build locally with caches
|
|
128
|
+
# cleared; auto-runs `docker build` when a Dockerfile is present.
|
|
129
|
+
gsd-t ci-parity --json > /tmp/gsd-t-ci-parity.json
|
|
130
|
+
CP_EXIT=$?
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
- `build-coverage` exit **4** (`ok:false`, `missing[]` non-empty) → verify
|
|
134
|
+
FAIL. Report each uncovered path; the fix is to add the path to the
|
|
135
|
+
Dockerfile `COPY` / cloudbuild artifact / workflow build input.
|
|
136
|
+
- `ci-parity` exit **4** (`ok:false` — a detected CI command or the real
|
|
137
|
+
`docker build` failed) → verify FAIL. Report the failing command.
|
|
138
|
+
- exit **0** from both → gate passes.
|
|
139
|
+
- exit **2** (usage error, e.g. not a git repo) → NOT a pass: record a
|
|
140
|
+
structured note and investigate before proceeding.
|
|
141
|
+
|
|
142
|
+
Both are pure-deterministic CLI checks (no LLM). They consume the same
|
|
143
|
+
preflight envelope as the M55 verify-gate Track 1, so failing here at
|
|
144
|
+
verify mirrors what CI would do — catching the TimeTracking class before
|
|
145
|
+
the milestone is tagged.
|
|
146
|
+
|
|
147
|
+
Contracts: `.gsd-t/contracts/cli-build-coverage-contract.md` v2.0.0 STABLE,
|
|
148
|
+
`.gsd-t/contracts/ci-parity-contract.md` v2.0.0 STABLE.
|
|
149
|
+
<!-- /M57: CI-parity FAIL-blocking gate -->
|
|
150
|
+
|
|
105
151
|
## Step 2.5: High-Risk Domain Gate (MANDATORY — Categories 2 and 7)
|
|
106
152
|
|
|
107
153
|
```bash
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tekyzinc/gsd-t",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.27.10",
|
|
4
4
|
"description": "GSD-T: Contract-Driven Development for Claude Code — 54 slash commands with headless-by-default workflow spawning, unattended supervisor relay with event stream, graph-powered code analysis, real-time agent dashboard, task telemetry, doc-ripple enforcement, backlog management, impact analysis, test sync, milestone archival, and PRD generation",
|
|
5
5
|
"author": "Tekyz, Inc.",
|
|
6
6
|
"license": "MIT",
|
|
@@ -82,9 +82,12 @@ const readline = require("readline");
|
|
|
82
82
|
* Parse a Claude Code transcript JSONL file.
|
|
83
83
|
*
|
|
84
84
|
* @param {string} transcriptPath - absolute path to a Claude Code transcript .jsonl
|
|
85
|
-
* @returns {Promise<{system: string, messages: Array<{role: string, content: Array}
|
|
85
|
+
* @returns {Promise<{system: string, messages: Array<{role: string, content: Array}>, model: string|null} | null>}
|
|
86
86
|
* Resolves to the reconstructed body, or `null` on unreadable file /
|
|
87
87
|
* catastrophic parse failure. Caller treats `null` as "bail out, fail open".
|
|
88
|
+
* `model` is the last-seen assistant `message.model` id (the model the
|
|
89
|
+
* orchestrator session is running) or `null` if none observed — the
|
|
90
|
+
* context meter uses it to size the context window correctly.
|
|
88
91
|
*/
|
|
89
92
|
async function parseTranscript(transcriptPath) {
|
|
90
93
|
if (typeof transcriptPath !== "string" || transcriptPath.length === 0) {
|
|
@@ -101,6 +104,10 @@ async function parseTranscript(transcriptPath) {
|
|
|
101
104
|
|
|
102
105
|
const messages = [];
|
|
103
106
|
let system = "";
|
|
107
|
+
// Last-seen assistant model id. Claude Code records `message.model` on every
|
|
108
|
+
// assistant turn; the orchestrator session runs one model for its lifetime,
|
|
109
|
+
// so the last value is authoritative for sizing the context window.
|
|
110
|
+
let model = null;
|
|
104
111
|
|
|
105
112
|
let stream;
|
|
106
113
|
try {
|
|
@@ -136,6 +143,9 @@ async function parseTranscript(transcriptPath) {
|
|
|
136
143
|
if (!msg || typeof msg !== "object") continue;
|
|
137
144
|
|
|
138
145
|
const role = msg.role || type;
|
|
146
|
+
if (type === "assistant" && typeof msg.model === "string" && msg.model.length > 0) {
|
|
147
|
+
model = msg.model;
|
|
148
|
+
}
|
|
139
149
|
const content = normalizeContent(msg.content, role);
|
|
140
150
|
if (content === null) continue;
|
|
141
151
|
|
|
@@ -152,7 +162,7 @@ async function parseTranscript(transcriptPath) {
|
|
|
152
162
|
return null;
|
|
153
163
|
}
|
|
154
164
|
|
|
155
|
-
return { system, messages: sanitizeToolPairs(messages) };
|
|
165
|
+
return { system, messages: sanitizeToolPairs(messages), model };
|
|
156
166
|
}
|
|
157
167
|
|
|
158
168
|
/**
|
|
@@ -39,17 +39,17 @@ test("empty path / non-string → returns null", async () => {
|
|
|
39
39
|
assert.equal(await parseTranscript(undefined), null);
|
|
40
40
|
});
|
|
41
41
|
|
|
42
|
-
test("empty file → returns { system:'', messages:[] }", async () => {
|
|
42
|
+
test("empty file → returns { system:'', messages:[], model:null }", async () => {
|
|
43
43
|
const { dir, file } = mkTmpFile([]);
|
|
44
44
|
try {
|
|
45
45
|
const got = await parseTranscript(file);
|
|
46
|
-
assert.deepEqual(got, { system: "", messages: [] });
|
|
46
|
+
assert.deepEqual(got, { system: "", messages: [], model: null });
|
|
47
47
|
} finally {
|
|
48
48
|
cleanup(dir);
|
|
49
49
|
}
|
|
50
50
|
});
|
|
51
51
|
|
|
52
|
-
test("file with only unknown event types → { system:'', messages:[] }", async () => {
|
|
52
|
+
test("file with only unknown event types → { system:'', messages:[], model:null }", async () => {
|
|
53
53
|
const { dir, file } = mkTmpFile([
|
|
54
54
|
{ type: "summary", foo: "bar" },
|
|
55
55
|
{ type: "system", subtype: "hook", hookInfos: [] },
|
|
@@ -60,7 +60,7 @@ test("file with only unknown event types → { system:'', messages:[] }", async
|
|
|
60
60
|
]);
|
|
61
61
|
try {
|
|
62
62
|
const got = await parseTranscript(file);
|
|
63
|
-
assert.deepEqual(got, { system: "", messages: [] });
|
|
63
|
+
assert.deepEqual(got, { system: "", messages: [], model: null });
|
|
64
64
|
} finally {
|
|
65
65
|
cleanup(dir);
|
|
66
66
|
}
|
|
@@ -104,6 +104,53 @@ test("normal conversation — string-content user + text assistant", async () =>
|
|
|
104
104
|
}
|
|
105
105
|
});
|
|
106
106
|
|
|
107
|
+
test("captures last-seen assistant model id (for window sizing)", async () => {
|
|
108
|
+
const { dir, file } = mkTmpFile([
|
|
109
|
+
{ type: "user", message: { role: "user", content: "hi" } },
|
|
110
|
+
{
|
|
111
|
+
type: "assistant",
|
|
112
|
+
message: {
|
|
113
|
+
role: "assistant",
|
|
114
|
+
model: "claude-opus-4-7",
|
|
115
|
+
content: [{ type: "text", text: "first" }],
|
|
116
|
+
},
|
|
117
|
+
},
|
|
118
|
+
{ type: "user", message: { role: "user", content: "more" } },
|
|
119
|
+
{
|
|
120
|
+
type: "assistant",
|
|
121
|
+
message: {
|
|
122
|
+
role: "assistant",
|
|
123
|
+
model: "claude-opus-4-7-20260115",
|
|
124
|
+
content: [{ type: "text", text: "second" }],
|
|
125
|
+
},
|
|
126
|
+
},
|
|
127
|
+
]);
|
|
128
|
+
try {
|
|
129
|
+
const got = await parseTranscript(file);
|
|
130
|
+
// Last assistant model wins (orchestrator session is single-model, but the
|
|
131
|
+
// last value is authoritative if a dated id supersedes a bare one).
|
|
132
|
+
assert.equal(got.model, "claude-opus-4-7-20260115");
|
|
133
|
+
} finally {
|
|
134
|
+
cleanup(dir);
|
|
135
|
+
}
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
test("model stays null when no assistant message carries one", async () => {
|
|
139
|
+
const { dir, file } = mkTmpFile([
|
|
140
|
+
{ type: "user", message: { role: "user", content: "hi" } },
|
|
141
|
+
{
|
|
142
|
+
type: "assistant",
|
|
143
|
+
message: { role: "assistant", content: [{ type: "text", text: "no model field" }] },
|
|
144
|
+
},
|
|
145
|
+
]);
|
|
146
|
+
try {
|
|
147
|
+
const got = await parseTranscript(file);
|
|
148
|
+
assert.equal(got.model, null);
|
|
149
|
+
} finally {
|
|
150
|
+
cleanup(dir);
|
|
151
|
+
}
|
|
152
|
+
});
|
|
153
|
+
|
|
107
154
|
test("tool_use / tool_result pairing by tool_use_id preserved in order", async () => {
|
|
108
155
|
const TOOL_ID = "toolu_01ABC";
|
|
109
156
|
const { dir, file } = mkTmpFile([
|
|
@@ -39,9 +39,16 @@
|
|
|
39
39
|
const fs = require("fs");
|
|
40
40
|
const path = require("path");
|
|
41
41
|
|
|
42
|
+
const { SAFE_DEFAULT_WINDOW } = require("../bin/model-windows.cjs");
|
|
43
|
+
|
|
42
44
|
const MAX_STDIN = 1024 * 1024; // 1 MiB
|
|
43
45
|
const SCHEMA_VERSION = 1;
|
|
44
|
-
|
|
46
|
+
// Input-token budget per CW = the model context window. Default to the
|
|
47
|
+
// model-aware safe window (1M); the old 200K literal was correct only for
|
|
48
|
+
// pre-4 models and skewed every actualCwPct calibration ratio 5× on
|
|
49
|
+
// Opus/Sonnet. Event state may still override via cwCeilingTokens (the
|
|
50
|
+
// economics estimator records the model-aware ceiling it actually used).
|
|
51
|
+
const DEFAULT_CW_CEILING_TOKENS = SAFE_DEFAULT_WINDOW;
|
|
45
52
|
|
|
46
53
|
if (require.main === module) {
|
|
47
54
|
let input = "";
|
|
@@ -74,7 +74,7 @@ class Sandbox {
|
|
|
74
74
|
* The charCount parameter controls how many characters of text content
|
|
75
75
|
* are in the transcript, which determines the estimated token count.
|
|
76
76
|
*/
|
|
77
|
-
writeTranscript(filename = "transcript.jsonl", charCount = 100) {
|
|
77
|
+
writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
|
|
78
78
|
const userText = "x".repeat(Math.floor(charCount / 2));
|
|
79
79
|
const assistantText = "y".repeat(Math.ceil(charCount / 2));
|
|
80
80
|
const lines = [
|
|
@@ -89,7 +89,7 @@ class Sandbox {
|
|
|
89
89
|
message: {
|
|
90
90
|
role: "assistant",
|
|
91
91
|
content: [{ type: "text", text: assistantText }],
|
|
92
|
-
model
|
|
92
|
+
model,
|
|
93
93
|
},
|
|
94
94
|
uuid: "a1",
|
|
95
95
|
sessionId: "sess-1",
|
|
@@ -231,7 +231,9 @@ afterEach(async () => {
|
|
|
231
231
|
/* ──────────────────────────── tests ──────────────────────────── */
|
|
232
232
|
|
|
233
233
|
test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
|
|
234
|
-
// 100 chars
|
|
234
|
+
// 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
|
|
235
|
+
// EFFECTIVE window is the real 1M (model-aware sizing), not the config's
|
|
236
|
+
// legacy 200K — ~0.003% of 1M, well below threshold.
|
|
235
237
|
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
236
238
|
const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
|
|
237
239
|
|
|
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
|
|
|
248
250
|
assert.equal(state.version, 1);
|
|
249
251
|
assert.ok(state.inputTokens > 0, "should have estimated some tokens");
|
|
250
252
|
assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
|
|
251
|
-
assert.equal(
|
|
253
|
+
assert.equal(
|
|
254
|
+
state.modelWindowSize,
|
|
255
|
+
1_000_000,
|
|
256
|
+
"window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
|
|
257
|
+
);
|
|
252
258
|
assert.ok(state.pct < 1, "pct should be well below threshold");
|
|
253
259
|
assert.equal(state.threshold, "normal");
|
|
254
260
|
assert.equal(state.checkCount, 1);
|
|
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
|
|
|
258
264
|
});
|
|
259
265
|
|
|
260
266
|
test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
|
|
261
|
-
// 600K chars → ~171K tokens → 85.7% of 200K
|
|
267
|
+
// Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
|
|
268
|
+
// threshold band + additionalContext. (Model-aware sizing means we pin a
|
|
269
|
+
// 200K-window model here rather than relying on a stale config default.)
|
|
262
270
|
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
263
|
-
const transcriptPath = sandbox.writeTranscript(
|
|
271
|
+
const transcriptPath = sandbox.writeTranscript(
|
|
272
|
+
"transcript.jsonl",
|
|
273
|
+
600000,
|
|
274
|
+
"claude-haiku-4-5-20251001"
|
|
275
|
+
);
|
|
264
276
|
|
|
265
277
|
const { stdout, code } = await sandbox.runHook({
|
|
266
278
|
payload: { session_id: "test-above", transcript_path: transcriptPath },
|
|
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
|
|
|
283
295
|
assert.equal(sandbox.tmpFileExists(), false);
|
|
284
296
|
});
|
|
285
297
|
|
|
298
|
+
test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
|
|
299
|
+
// The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
|
|
300
|
+
// hardcoded 200K window this read as ~85% → false headless handoff while
|
|
301
|
+
// ~64% of context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
|
|
302
|
+
// 171K is only ~17% → stdout {} → no premature handoff.
|
|
303
|
+
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
304
|
+
const transcriptPath = sandbox.writeTranscript(
|
|
305
|
+
"transcript.jsonl",
|
|
306
|
+
600000,
|
|
307
|
+
"claude-opus-4-7"
|
|
308
|
+
);
|
|
309
|
+
|
|
310
|
+
const { stdout, code } = await sandbox.runHook({
|
|
311
|
+
payload: { session_id: "test-regression", transcript_path: transcriptPath },
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
assert.equal(code, 0);
|
|
315
|
+
const parsed = JSON.parse(stdout || "{}");
|
|
316
|
+
assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
|
|
317
|
+
|
|
318
|
+
const state = sandbox.readState();
|
|
319
|
+
assert.equal(state.modelWindowSize, 1_000_000);
|
|
320
|
+
assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
|
|
321
|
+
assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
|
|
322
|
+
assert.equal(state.threshold, "normal");
|
|
323
|
+
});
|
|
324
|
+
|
|
286
325
|
test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
|
|
287
326
|
sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });
|
|
288
327
|
|
|
@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
|
|
|
37
37
|
const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
|
|
38
38
|
const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
|
|
39
39
|
const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
|
|
40
|
+
const { windowForModel } = require("../bin/model-windows.cjs");
|
|
40
41
|
|
|
41
42
|
const STATE_VERSION = 1;
|
|
42
43
|
|
|
@@ -208,6 +209,18 @@ async function runMeter(opts) {
|
|
|
208
209
|
return {};
|
|
209
210
|
}
|
|
210
211
|
|
|
212
|
+
// 5b. Resolve the EFFECTIVE context window from the model the orchestrator
|
|
213
|
+
// session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
|
|
214
|
+
// ship a 1M window; the config default (200k) is a legacy fallback that
|
|
215
|
+
// overcounts usage 5× and fires the headless handoff far too early. We
|
|
216
|
+
// only override when the transcript reports a model — a missing model or
|
|
217
|
+
// an empty one falls through to cfg.modelWindowSize; a reported model is always sized by windowForModel, even over an explicit config value.
|
|
218
|
+
const effectiveWindow =
|
|
219
|
+
typeof parsed.model === "string" && parsed.model.length > 0
|
|
220
|
+
? windowForModel(parsed.model)
|
|
221
|
+
: cfg.modelWindowSize;
|
|
222
|
+
state.modelWindowSize = effectiveWindow;
|
|
223
|
+
|
|
211
224
|
// 6. Estimate tokens locally (no API call, zero cost).
|
|
212
225
|
let tokenResp;
|
|
213
226
|
try {
|
|
@@ -237,7 +250,7 @@ async function runMeter(opts) {
|
|
|
237
250
|
// 8. Success path — compute pct, band, possibly emit additionalContext.
|
|
238
251
|
const pct = computePct({
|
|
239
252
|
inputTokens: tokenResp.inputTokens,
|
|
240
|
-
modelWindowSize:
|
|
253
|
+
modelWindowSize: effectiveWindow,
|
|
241
254
|
});
|
|
242
255
|
const band = bandFor(pct, cfg.thresholdPct);
|
|
243
256
|
|
|
@@ -251,13 +264,14 @@ async function runMeter(opts) {
|
|
|
251
264
|
logPath,
|
|
252
265
|
"INFO",
|
|
253
266
|
"measure",
|
|
254
|
-
`tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}
|
|
267
|
+
`tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
|
|
268
|
+
`window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
|
|
255
269
|
clock
|
|
256
270
|
);
|
|
257
271
|
|
|
258
272
|
const additionalContext = buildAdditionalContext({
|
|
259
273
|
pct,
|
|
260
|
-
modelWindowSize:
|
|
274
|
+
modelWindowSize: effectiveWindow,
|
|
261
275
|
thresholdPct: cfg.thresholdPct,
|
|
262
276
|
});
|
|
263
277
|
if (additionalContext) {
|