@tekyzinc/gsd-t 3.26.10 → 3.27.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +64 -0
- package/README.md +2 -0
- package/bin/context-budget-audit.cjs +17 -2
- package/bin/gsd-t-build-coverage.cjs +438 -0
- package/bin/gsd-t-ci-parity.cjs +500 -0
- package/bin/gsd-t-economics.cjs +37 -9
- package/bin/gsd-t.js +21 -0
- package/bin/model-windows.cjs +99 -0
- package/bin/model-windows.test.cjs +75 -0
- package/bin/runway-estimator.cjs +35 -5
- package/bin/token-budget.cjs +12 -3
- package/commands/gsd-t-help.md +14 -0
- package/commands/gsd-t-milestone.md +21 -5
- package/commands/gsd-t-promote-debt.md +1 -1
- package/commands/gsd-t-scan.md +6 -6
- package/commands/gsd-t-verify.md +46 -0
- package/package.json +1 -1
- package/scripts/context-meter/transcript-parser.js +12 -2
- package/scripts/context-meter/transcript-parser.test.js +51 -4
- package/scripts/gsd-t-calibration-hook.js +8 -1
- package/scripts/gsd-t-context-meter.e2e.test.js +45 -6
- package/scripts/gsd-t-context-meter.js +17 -3
- package/scripts/gsd-t-context-meter.test.js +85 -0
- package/templates/CLAUDE-global.md +30 -0
- package/templates/stacks/design-to-code.md +1 -1
|
@@ -74,7 +74,7 @@ class Sandbox {
|
|
|
74
74
|
* The charCount parameter controls how many characters of text content
|
|
75
75
|
* are in the transcript, which determines the estimated token count.
|
|
76
76
|
*/
|
|
77
|
-
writeTranscript(filename = "transcript.jsonl", charCount = 100) {
|
|
77
|
+
writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
|
|
78
78
|
const userText = "x".repeat(Math.floor(charCount / 2));
|
|
79
79
|
const assistantText = "y".repeat(Math.ceil(charCount / 2));
|
|
80
80
|
const lines = [
|
|
@@ -89,7 +89,7 @@ class Sandbox {
|
|
|
89
89
|
message: {
|
|
90
90
|
role: "assistant",
|
|
91
91
|
content: [{ type: "text", text: assistantText }],
|
|
92
|
-
model
|
|
92
|
+
model,
|
|
93
93
|
},
|
|
94
94
|
uuid: "a1",
|
|
95
95
|
sessionId: "sess-1",
|
|
@@ -231,7 +231,9 @@ afterEach(async () => {
|
|
|
231
231
|
/* ──────────────────────────── tests ──────────────────────────── */
|
|
232
232
|
|
|
233
233
|
test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
|
|
234
|
-
// 100 chars
|
|
234
|
+
// 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
|
|
235
|
+
// EFFECTIVE window is the real 1M (model-aware sizing), not the config's
|
|
236
|
+
// legacy 200K — ~0.003% of 1M, well below threshold.
|
|
235
237
|
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
236
238
|
const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
|
|
237
239
|
|
|
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
|
|
|
248
250
|
assert.equal(state.version, 1);
|
|
249
251
|
assert.ok(state.inputTokens > 0, "should have estimated some tokens");
|
|
250
252
|
assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
|
|
251
|
-
assert.equal(
|
|
253
|
+
assert.equal(
|
|
254
|
+
state.modelWindowSize,
|
|
255
|
+
1_000_000,
|
|
256
|
+
"window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
|
|
257
|
+
);
|
|
252
258
|
assert.ok(state.pct < 1, "pct should be well below threshold");
|
|
253
259
|
assert.equal(state.threshold, "normal");
|
|
254
260
|
assert.equal(state.checkCount, 1);
|
|
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
|
|
|
258
264
|
});
|
|
259
265
|
|
|
260
266
|
test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
|
|
261
|
-
// 600K chars → ~171K tokens → 85.7% of 200K
|
|
267
|
+
// Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
|
|
268
|
+
// threshold band + additionalContext. (Model-aware sizing means we pin a
|
|
269
|
+
// 200K-window model here rather than relying on a stale config default.)
|
|
262
270
|
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
263
|
-
const transcriptPath = sandbox.writeTranscript(
|
|
271
|
+
const transcriptPath = sandbox.writeTranscript(
|
|
272
|
+
"transcript.jsonl",
|
|
273
|
+
600000,
|
|
274
|
+
"claude-haiku-4-5-20251001"
|
|
275
|
+
);
|
|
264
276
|
|
|
265
277
|
const { stdout, code } = await sandbox.runHook({
|
|
266
278
|
payload: { session_id: "test-above", transcript_path: transcriptPath },
|
|
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
|
|
|
283
295
|
assert.equal(sandbox.tmpFileExists(), false);
|
|
284
296
|
});
|
|
285
297
|
|
|
298
|
+
test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
|
|
299
|
+
// The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
|
|
300
|
+
// hardcoded 200K window this read as ~85% → false headless handoff while
|
|
301
|
+
// ~83% of context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
|
|
302
|
+
// 171K is only ~17% → stdout {} → no premature handoff.
|
|
303
|
+
sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
|
|
304
|
+
const transcriptPath = sandbox.writeTranscript(
|
|
305
|
+
"transcript.jsonl",
|
|
306
|
+
600000,
|
|
307
|
+
"claude-opus-4-7"
|
|
308
|
+
);
|
|
309
|
+
|
|
310
|
+
const { stdout, code } = await sandbox.runHook({
|
|
311
|
+
payload: { session_id: "test-regression", transcript_path: transcriptPath },
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
assert.equal(code, 0);
|
|
315
|
+
const parsed = JSON.parse(stdout || "{}");
|
|
316
|
+
assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
|
|
317
|
+
|
|
318
|
+
const state = sandbox.readState();
|
|
319
|
+
assert.equal(state.modelWindowSize, 1_000_000);
|
|
320
|
+
assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
|
|
321
|
+
assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
|
|
322
|
+
assert.equal(state.threshold, "normal");
|
|
323
|
+
});
|
|
324
|
+
|
|
286
325
|
test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
|
|
287
326
|
sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });
|
|
288
327
|
|
|
@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
|
|
|
37
37
|
const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
|
|
38
38
|
const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
|
|
39
39
|
const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
|
|
40
|
+
const { windowForModel } = require("../bin/model-windows.cjs");
|
|
40
41
|
|
|
41
42
|
const STATE_VERSION = 1;
|
|
42
43
|
|
|
@@ -208,6 +209,18 @@ async function runMeter(opts) {
|
|
|
208
209
|
return {};
|
|
209
210
|
}
|
|
210
211
|
|
|
212
|
+
// 5b. Resolve the EFFECTIVE context window from the model the orchestrator
|
|
213
|
+
// session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
|
|
214
|
+
// ship a 1M window; the config default (200k) is a legacy fallback that
|
|
215
|
+
// overcounts usage 5× and fires the headless handoff far too early. We
|
|
216
|
+
// only override when the transcript reports a model — a missing model or
|
|
217
|
+
// an empty model string falls through to cfg.modelWindowSize. (A reported model wins even over an explicit project config value.)
|
|
218
|
+
const effectiveWindow =
|
|
219
|
+
typeof parsed.model === "string" && parsed.model.length > 0
|
|
220
|
+
? windowForModel(parsed.model)
|
|
221
|
+
: cfg.modelWindowSize;
|
|
222
|
+
state.modelWindowSize = effectiveWindow;
|
|
223
|
+
|
|
211
224
|
// 6. Estimate tokens locally (no API call, zero cost).
|
|
212
225
|
let tokenResp;
|
|
213
226
|
try {
|
|
@@ -237,7 +250,7 @@ async function runMeter(opts) {
|
|
|
237
250
|
// 8. Success path — compute pct, band, possibly emit additionalContext.
|
|
238
251
|
const pct = computePct({
|
|
239
252
|
inputTokens: tokenResp.inputTokens,
|
|
240
|
-
modelWindowSize:
|
|
253
|
+
modelWindowSize: effectiveWindow,
|
|
241
254
|
});
|
|
242
255
|
const band = bandFor(pct, cfg.thresholdPct);
|
|
243
256
|
|
|
@@ -251,13 +264,14 @@ async function runMeter(opts) {
|
|
|
251
264
|
logPath,
|
|
252
265
|
"INFO",
|
|
253
266
|
"measure",
|
|
254
|
-
`tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}
|
|
267
|
+
`tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
|
|
268
|
+
`window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
|
|
255
269
|
clock
|
|
256
270
|
);
|
|
257
271
|
|
|
258
272
|
const additionalContext = buildAdditionalContext({
|
|
259
273
|
pct,
|
|
260
|
-
modelWindowSize:
|
|
274
|
+
modelWindowSize: effectiveWindow,
|
|
261
275
|
thresholdPct: cfg.thresholdPct,
|
|
262
276
|
});
|
|
263
277
|
if (additionalContext) {
|
|
@@ -384,3 +384,88 @@ test("12. clock injection — timestamp uses injected clock", async () => {
|
|
|
384
384
|
const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
|
|
385
385
|
assert.equal(state.timestamp, fixed.toISOString());
|
|
386
386
|
});
|
|
387
|
+
|
|
388
|
+
/* ── M-fix: model-aware context window (the reported regression) ───────── */
|
|
389
|
+
|
|
390
|
+
test("13. Opus 4.7 @ ~36% of a 1M window stays 'normal' (regression repro)", async () => {
|
|
391
|
+
// The exact reported symptom: ~360k tokens used on an Opus 4.7 session.
|
|
392
|
+
// With the old hardcoded 200k window this computed 180% → premature
|
|
393
|
+
// headless handoff at ~64% of context REMAINING. With model-aware sizing
|
|
394
|
+
// the window is 1M, so 360k = 36% = normal, no handoff.
|
|
395
|
+
seedState(tmpRoot, { checkCount: 4 });
|
|
396
|
+
|
|
397
|
+
const out = await runMeter({
|
|
398
|
+
payload: makePayload(),
|
|
399
|
+
projectRoot: tmpRoot,
|
|
400
|
+
_loadConfig: () => makeConfig(), // config still says 200k — must be overridden
|
|
401
|
+
_parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7" }),
|
|
402
|
+
_estimateTokens: () => ({ inputTokens: 360000 }),
|
|
403
|
+
});
|
|
404
|
+
|
|
405
|
+
// No handoff marker — this is the whole point of the fix.
|
|
406
|
+
assert.deepEqual(out, {});
|
|
407
|
+
|
|
408
|
+
const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
|
|
409
|
+
assert.equal(state.modelWindowSize, 1_000_000, "window resolved from model, not config");
|
|
410
|
+
assert.equal(state.pct, 36, "360k / 1M = 36%");
|
|
411
|
+
assert.equal(state.threshold, "normal");
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
test("14. Opus 4.7 @ 80% of the true 1M window DOES hand off", async () => {
|
|
415
|
+
// The handoff must still fire at the real 75% threshold against the
|
|
416
|
+
// corrected window — we keep the guard, we just size it correctly.
|
|
417
|
+
seedState(tmpRoot, { checkCount: 4 });
|
|
418
|
+
|
|
419
|
+
const out = await runMeter({
|
|
420
|
+
payload: makePayload(),
|
|
421
|
+
projectRoot: tmpRoot,
|
|
422
|
+
_loadConfig: () => makeConfig(),
|
|
423
|
+
_parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7-20260115" }),
|
|
424
|
+
_estimateTokens: () => ({ inputTokens: 800000 }), // 80% of 1M > 75%
|
|
425
|
+
});
|
|
426
|
+
|
|
427
|
+
assert.equal(out.additionalContext, "next-spawn-headless:true");
|
|
428
|
+
const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
|
|
429
|
+
assert.equal(state.modelWindowSize, 1_000_000);
|
|
430
|
+
assert.equal(state.pct, 80);
|
|
431
|
+
assert.equal(state.threshold, "threshold");
|
|
432
|
+
});
|
|
433
|
+
|
|
434
|
+
test("15. no model in transcript → falls back to config window (back-compat)", async () => {
|
|
435
|
+
// Existing transcripts / stubs without a model field must behave exactly
|
|
436
|
+
// as before: config's modelWindowSize governs.
|
|
437
|
+
seedState(tmpRoot, { checkCount: 4 });
|
|
438
|
+
|
|
439
|
+
const out = await runMeter({
|
|
440
|
+
payload: makePayload(),
|
|
441
|
+
projectRoot: tmpRoot,
|
|
442
|
+
_loadConfig: () => makeConfig({ modelWindowSize: 200000 }),
|
|
443
|
+
_parseTranscript: async () => FAKE_PARSED, // no `model` key
|
|
444
|
+
_estimateTokens: () => ({ inputTokens: 160000 }), // 80% of 200k
|
|
445
|
+
});
|
|
446
|
+
|
|
447
|
+
assert.equal(out.additionalContext, "next-spawn-headless:true");
|
|
448
|
+
const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
|
|
449
|
+
assert.equal(state.modelWindowSize, 200000);
|
|
450
|
+
assert.equal(state.pct, 80);
|
|
451
|
+
});
|
|
452
|
+
|
|
453
|
+
test("16. Haiku session correctly sized at 200k (not over-large 1M)", async () => {
|
|
454
|
+
seedState(tmpRoot, { checkCount: 4 });
|
|
455
|
+
|
|
456
|
+
const out = await runMeter({
|
|
457
|
+
payload: makePayload(),
|
|
458
|
+
projectRoot: tmpRoot,
|
|
459
|
+
_loadConfig: () => makeConfig(),
|
|
460
|
+
_parseTranscript: async () => ({
|
|
461
|
+
...FAKE_PARSED,
|
|
462
|
+
model: "claude-haiku-4-5-20251001",
|
|
463
|
+
}),
|
|
464
|
+
_estimateTokens: () => ({ inputTokens: 170000 }), // 85% of 200k
|
|
465
|
+
});
|
|
466
|
+
|
|
467
|
+
assert.equal(out.additionalContext, "next-spawn-headless:true");
|
|
468
|
+
const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
|
|
469
|
+
assert.equal(state.modelWindowSize, 200000);
|
|
470
|
+
assert.equal(state.pct, 85);
|
|
471
|
+
});
|
|
@@ -537,6 +537,12 @@ BEFORE EVERY COMMIT:
|
|
|
537
537
|
│ YES → Verify test names and paths are referenced in requirements
|
|
538
538
|
├── Did I change UI, routes, or user flows?
|
|
539
539
|
│ YES → Update affected E2E test specs (Playwright/Cypress)
|
|
540
|
+
├── Did I add a new top-level dir, or change build/CI config?
|
|
541
|
+
│ This is ENFORCED MECHANICALLY by `gsd-t-verify` Step 2.6
|
|
542
|
+
│ (CI-Parity Gate: `gsd-t build-coverage` + `gsd-t ci-parity`,
|
|
543
|
+
│ FAIL-blocking). You do NOT self-attest this — verify runs the
|
|
544
|
+
│ real CI build. It exists because TimeTracking v1.10.12 shipped
|
|
545
|
+
│ VERIFIED+tagged with a new dir absent from the Dockerfile COPY.
|
|
540
546
|
└── Did I run the affected tests?
|
|
541
547
|
YES → Verify they pass. NO → Run them now.
|
|
542
548
|
```
|
|
@@ -572,6 +578,30 @@ BEFORE reporting "done" or presenting a summary:
|
|
|
572
578
|
|
|
573
579
|
**The test for this gate**: If the user asks "did you update all the documents?" and the answer would be "no, I missed some" — you failed this gate. The user should never need to ask.
|
|
574
580
|
|
|
581
|
+
## Effort Estimates — GSD-T-Native Units (MANDATORY)
|
|
582
|
+
|
|
583
|
+
**NEVER express effort or scope in developer-hours, dev-days, sprints, story points, or person-weeks.** GSD-T operates on a different cost model — the worker is Claude, not a human team — and human-time estimates have no predictive value for GSD-T workflows. They actively mislead by suggesting a calendar shape that doesn't match how the system runs.
|
|
584
|
+
|
|
585
|
+
Use GSD-T-native units instead:
|
|
586
|
+
|
|
587
|
+
| Unit | When to use |
|
|
588
|
+
|------|-------------|
|
|
589
|
+
| **Domain count** | Milestone scope (1-2 simple, 3-4 medium, 5+ complex) |
|
|
590
|
+
| **Wave count** | Cross-domain dependency depth — how many serial gates exist |
|
|
591
|
+
| **Parallel-domain count** | How many domains can run concurrently (file-disjoint) |
|
|
592
|
+
| **Spawn count** | Estimated `claude -p` / Task subagent invocations |
|
|
593
|
+
| **Token-spend range** | `$X-Y` dollars based on trailing-3 comparable milestones in `.gsd-t/token-log.md` |
|
|
594
|
+
| **Rate-limit-window count** | If the work might span more than one 5h Claude Max window |
|
|
595
|
+
|
|
596
|
+
Where this applies:
|
|
597
|
+
- `/gsd-t-milestone` Step 4 — Pre-Partition Assessment
|
|
598
|
+
- `/gsd-t-scan` techdebt milestone suggestions
|
|
599
|
+
- `/gsd-t-promote-debt` effort fields
|
|
600
|
+
- `docs/requirements.md`, `progress.md` Decision Log entries
|
|
601
|
+
- Any internal estimate the user might read
|
|
602
|
+
|
|
603
|
+
Acceptable: machine-time references (e.g. "5 min cache TTL", "5h rate-limit window", "14 day staleness threshold") — these are concrete system properties, not effort estimates. The rule applies to **effort/scope**, not to **system timeouts**.
|
|
604
|
+
|
|
575
605
|
## Execution Behavior
|
|
576
606
|
- ALWAYS check docs/architecture.md before adding or modifying components.
|
|
577
607
|
- ALWAYS check docs/workflows.md before changing any multi-step process.
|
|
@@ -176,7 +176,7 @@ MANDATORY:
|
|
|
176
176
|
│
|
|
177
177
|
├── For each design requirement, assess:
|
|
178
178
|
│ Supported → stack handles this natively, proceed
|
|
179
|
-
│ Partial → needs an addon/library — name it,
|
|
179
|
+
│ Partial → needs an addon/library — name it, scope in GSD-T units (domain/wave/spawn/token, NOT dev-hours)
|
|
180
180
|
│ Unsupported → stack CANNOT achieve this — flag as a blocker
|
|
181
181
|
│
|
|
182
182
|
├── If ANY requirement is Unsupported:
|