@tekyzinc/gsd-t 3.26.10 → 3.27.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,7 +74,7 @@ class Sandbox {
74
74
  * The charCount parameter controls how many characters of text content
75
75
  * are in the transcript, which determines the estimated token count.
76
76
  */
77
- writeTranscript(filename = "transcript.jsonl", charCount = 100) {
77
+ writeTranscript(filename = "transcript.jsonl", charCount = 100, model = "claude-opus-4-6") {
78
78
  const userText = "x".repeat(Math.floor(charCount / 2));
79
79
  const assistantText = "y".repeat(Math.ceil(charCount / 2));
80
80
  const lines = [
@@ -89,7 +89,7 @@ class Sandbox {
89
89
  message: {
90
90
  role: "assistant",
91
91
  content: [{ type: "text", text: assistantText }],
92
- model: "claude-opus-4-6",
92
+ model,
93
93
  },
94
94
  uuid: "a1",
95
95
  sessionId: "sess-1",
@@ -231,7 +231,9 @@ afterEach(async () => {
231
231
  /* ──────────────────────────── tests ──────────────────────────── */
232
232
 
233
233
  test("E2E 1. below threshold — stdout {} and state reflects estimate", async () => {
234
- // 100 chars of text content → ~29 tokens (100/3.5) 0.014% of 200K window
234
+ // 100 chars → ~29 tokens. The transcript declares claude-opus-4-6, so the
235
+ // EFFECTIVE window is the real 1M (model-aware sizing), not the config's
236
+ // legacy 200K — ~0.003% of 1M, well below threshold.
235
237
  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
236
238
  const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 100);
237
239
 
@@ -248,7 +250,11 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
248
250
  assert.equal(state.version, 1);
249
251
  assert.ok(state.inputTokens > 0, "should have estimated some tokens");
250
252
  assert.ok(state.inputTokens < 1000, "small transcript should estimate < 1K tokens");
251
- assert.equal(state.modelWindowSize, 200000);
253
+ assert.equal(
254
+ state.modelWindowSize,
255
+ 1_000_000,
256
+ "window resolved from the transcript's claude-opus-4-6 model (1M), not config 200K"
257
+ );
252
258
  assert.ok(state.pct < 1, "pct should be well below threshold");
253
259
  assert.equal(state.threshold, "normal");
254
260
  assert.equal(state.checkCount, 1);
@@ -258,9 +264,15 @@ test("E2E 1. below threshold — stdout {} and state reflects estimate", async (
258
264
  });
259
265
 
260
266
  test("E2E 2. above threshold — stdout additionalContext with large transcript", async () => {
261
- // 600K chars → ~171K tokens → 85.7% of 200K window warn band + additionalContext
267
+ // Haiku → real 200K window. 600K chars → ~171K tokens → ~85.7% of 200K →
268
+ // threshold band + additionalContext. (Model-aware sizing means we pin a
269
+ // 200K-window model here rather than relying on a stale config default.)
262
270
  sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
263
- const transcriptPath = sandbox.writeTranscript("transcript.jsonl", 600000);
271
+ const transcriptPath = sandbox.writeTranscript(
272
+ "transcript.jsonl",
273
+ 600000,
274
+ "claude-haiku-4-5-20251001"
275
+ );
264
276
 
265
277
  const { stdout, code } = await sandbox.runHook({
266
278
  payload: { session_id: "test-above", transcript_path: transcriptPath },
@@ -283,6 +295,33 @@ test("E2E 2. above threshold — stdout additionalContext with large transcript"
283
295
  assert.equal(sandbox.tmpFileExists(), false);
284
296
  });
285
297
 
298
+ test("E2E 2b. REGRESSION — large Opus transcript stays 'normal' on the 1M window", async () => {
299
+ // The reported bug, end-to-end: ~600K chars → ~171K tokens. Under the old
300
+ // hardcoded 200K window this read as ~85% → false headless handoff while
301
+ // ~83% of the real 1M context REMAINED. With model-aware sizing (claude-opus-4-7 → 1M),
302
+ // 171K is only ~17% → stdout {} → no premature handoff.
303
+ sandbox.writeConfig({ thresholdPct: 75, modelWindowSize: 200000, checkFrequency: 1 });
304
+ const transcriptPath = sandbox.writeTranscript(
305
+ "transcript.jsonl",
306
+ 600000,
307
+ "claude-opus-4-7"
308
+ );
309
+
310
+ const { stdout, code } = await sandbox.runHook({
311
+ payload: { session_id: "test-regression", transcript_path: transcriptPath },
312
+ });
313
+
314
+ assert.equal(code, 0);
315
+ const parsed = JSON.parse(stdout || "{}");
316
+ assert.deepEqual(parsed, {}, "must NOT hand off — the reported regression");
317
+
318
+ const state = sandbox.readState();
319
+ assert.equal(state.modelWindowSize, 1_000_000);
320
+ assert.ok(state.inputTokens > 100000, "large transcript, >100K tokens");
321
+ assert.ok(state.pct < 75, `pct ${state.pct} must be below threshold on a 1M window`);
322
+ assert.equal(state.threshold, "normal");
323
+ });
324
+
286
325
  test("E2E 3. missing transcript — stdout {}, state has parse error", async () => {
287
326
  sandbox.writeConfig({ thresholdPct: 75, checkFrequency: 1 });
288
327
 
@@ -37,6 +37,7 @@ const { loadConfig: realLoadConfig } = require("../bin/context-meter-config.cjs"
37
37
  const { parseTranscript: realParseTranscript } = require("./context-meter/transcript-parser");
38
38
  const { estimateTokens: realEstimateTokens } = require("./context-meter/estimate-tokens");
39
39
  const { computePct, bandFor, buildAdditionalContext } = require("./context-meter/threshold");
40
+ const { windowForModel } = require("../bin/model-windows.cjs");
40
41
 
41
42
  const STATE_VERSION = 1;
42
43
 
@@ -208,6 +209,18 @@ async function runMeter(opts) {
208
209
  return {};
209
210
  }
210
211
 
212
+ // 5b. Resolve the EFFECTIVE context window from the model the orchestrator
213
+ // session is actually running (parsed.model). Opus 4.6/4.7 and Sonnet 4.x
214
+ // ship a 1M window; the config default (200k) is a legacy fallback that
215
+ // overcounts usage 5× and fires the headless handoff far too early. We
216
+ // only override when the transcript reports a model — a missing or empty
217
+ // model falls through to cfg.modelWindowSize; a reported model wins over config.
218
+ const effectiveWindow =
219
+ typeof parsed.model === "string" && parsed.model.length > 0
220
+ ? windowForModel(parsed.model)
221
+ : cfg.modelWindowSize;
222
+ state.modelWindowSize = effectiveWindow;
223
+
211
224
  // 6. Estimate tokens locally (no API call, zero cost).
212
225
  let tokenResp;
213
226
  try {
@@ -237,7 +250,7 @@ async function runMeter(opts) {
237
250
  // 8. Success path — compute pct, band, possibly emit additionalContext.
238
251
  const pct = computePct({
239
252
  inputTokens: tokenResp.inputTokens,
240
- modelWindowSize: cfg.modelWindowSize,
253
+ modelWindowSize: effectiveWindow,
241
254
  });
242
255
  const band = bandFor(pct, cfg.thresholdPct);
243
256
 
@@ -251,13 +264,14 @@ async function runMeter(opts) {
251
264
  logPath,
252
265
  "INFO",
253
266
  "measure",
254
- `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band}`,
267
+ `tokens=${tokenResp.inputTokens} pct=${pct.toFixed(1)} band=${band} ` +
268
+ `window=${effectiveWindow}${parsed.model ? ` model=${parsed.model}` : ""}`,
255
269
  clock
256
270
  );
257
271
 
258
272
  const additionalContext = buildAdditionalContext({
259
273
  pct,
260
- modelWindowSize: cfg.modelWindowSize,
274
+ modelWindowSize: effectiveWindow,
261
275
  thresholdPct: cfg.thresholdPct,
262
276
  });
263
277
  if (additionalContext) {
@@ -384,3 +384,88 @@ test("12. clock injection — timestamp uses injected clock", async () => {
384
384
  const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
385
385
  assert.equal(state.timestamp, fixed.toISOString());
386
386
  });
387
+
388
+ /* ── M-fix: model-aware context window (the reported regression) ───────── */
389
+
390
+ test("13. Opus 4.7 @ ~36% of a 1M window stays 'normal' (regression repro)", async () => {
391
+ // The exact reported symptom: ~360k tokens used on an Opus 4.7 session.
392
+ // With the old hardcoded 200k window this computed 180% → premature
393
+ // headless handoff at ~64% of context REMAINING. With model-aware sizing
394
+ // the window is 1M, so 360k = 36% = normal, no handoff.
395
+ seedState(tmpRoot, { checkCount: 4 });
396
+
397
+ const out = await runMeter({
398
+ payload: makePayload(),
399
+ projectRoot: tmpRoot,
400
+ _loadConfig: () => makeConfig(), // config still says 200k — must be overridden
401
+ _parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7" }),
402
+ _estimateTokens: () => ({ inputTokens: 360000 }),
403
+ });
404
+
405
+ // No handoff marker — this is the whole point of the fix.
406
+ assert.deepEqual(out, {});
407
+
408
+ const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
409
+ assert.equal(state.modelWindowSize, 1_000_000, "window resolved from model, not config");
410
+ assert.equal(state.pct, 36, "360k / 1M = 36%");
411
+ assert.equal(state.threshold, "normal");
412
+ });
413
+
414
+ test("14. Opus 4.7 @ 80% of the true 1M window DOES hand off", async () => {
415
+ // The handoff must still fire at the real 75% threshold against the
416
+ // corrected window — we keep the guard, we just size it correctly.
417
+ seedState(tmpRoot, { checkCount: 4 });
418
+
419
+ const out = await runMeter({
420
+ payload: makePayload(),
421
+ projectRoot: tmpRoot,
422
+ _loadConfig: () => makeConfig(),
423
+ _parseTranscript: async () => ({ ...FAKE_PARSED, model: "claude-opus-4-7-20260115" }),
424
+ _estimateTokens: () => ({ inputTokens: 800000 }), // 80% of 1M > 75%
425
+ });
426
+
427
+ assert.equal(out.additionalContext, "next-spawn-headless:true");
428
+ const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
429
+ assert.equal(state.modelWindowSize, 1_000_000);
430
+ assert.equal(state.pct, 80);
431
+ assert.equal(state.threshold, "threshold");
432
+ });
433
+
434
+ test("15. no model in transcript → falls back to config window (back-compat)", async () => {
435
+ // Existing transcripts / stubs without a model field must behave exactly
436
+ // as before: config's modelWindowSize governs.
437
+ seedState(tmpRoot, { checkCount: 4 });
438
+
439
+ const out = await runMeter({
440
+ payload: makePayload(),
441
+ projectRoot: tmpRoot,
442
+ _loadConfig: () => makeConfig({ modelWindowSize: 200000 }),
443
+ _parseTranscript: async () => FAKE_PARSED, // no `model` key
444
+ _estimateTokens: () => ({ inputTokens: 160000 }), // 80% of 200k
445
+ });
446
+
447
+ assert.equal(out.additionalContext, "next-spawn-headless:true");
448
+ const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
449
+ assert.equal(state.modelWindowSize, 200000);
450
+ assert.equal(state.pct, 80);
451
+ });
452
+
453
+ test("16. Haiku session correctly sized at 200k (not over-large 1M)", async () => {
454
+ seedState(tmpRoot, { checkCount: 4 });
455
+
456
+ const out = await runMeter({
457
+ payload: makePayload(),
458
+ projectRoot: tmpRoot,
459
+ _loadConfig: () => makeConfig(),
460
+ _parseTranscript: async () => ({
461
+ ...FAKE_PARSED,
462
+ model: "claude-haiku-4-5-20251001",
463
+ }),
464
+ _estimateTokens: () => ({ inputTokens: 170000 }), // 85% of 200k
465
+ });
466
+
467
+ assert.equal(out.additionalContext, "next-spawn-headless:true");
468
+ const state = JSON.parse(fs.readFileSync(stateFile(tmpRoot), "utf8"));
469
+ assert.equal(state.modelWindowSize, 200000);
470
+ assert.equal(state.pct, 85);
471
+ });
@@ -537,6 +537,12 @@ BEFORE EVERY COMMIT:
537
537
  │ YES → Verify test names and paths are referenced in requirements
538
538
  ├── Did I change UI, routes, or user flows?
539
539
  │ YES → Update affected E2E test specs (Playwright/Cypress)
540
+ ├── Did I add a new top-level dir, or change build/CI config?
541
+ │ This is ENFORCED MECHANICALLY by `gsd-t-verify` Step 2.6
542
+ │ (CI-Parity Gate: `gsd-t build-coverage` + `gsd-t ci-parity`,
543
+ │ FAIL-blocking). You do NOT self-attest this — verify runs the
544
+ │ real CI build. It exists because TimeTracking v1.10.12 shipped
545
+ │ VERIFIED+tagged with a new dir absent from the Dockerfile COPY.
540
546
  └── Did I run the affected tests?
541
547
  YES → Verify they pass. NO → Run them now.
542
548
  ```
@@ -572,6 +578,30 @@ BEFORE reporting "done" or presenting a summary:
572
578
 
573
579
  **The test for this gate**: If the user asks "did you update all the documents?" and the answer would be "no, I missed some" — you failed this gate. The user should never need to ask.
574
580
 
581
+ ## Effort Estimates — GSD-T-Native Units (MANDATORY)
582
+
583
+ **NEVER express effort or scope in developer-hours, dev-days, sprints, story points, or person-weeks.** GSD-T operates on a different cost model — the worker is Claude, not a human team — and human-time estimates have no predictive value for GSD-T workflows. They actively mislead by suggesting a calendar shape that doesn't match how the system runs.
584
+
585
+ Use GSD-T-native units instead:
586
+
587
+ | Unit | When to use |
588
+ |------|-------------|
589
+ | **Domain count** | Milestone scope (1-2 simple, 3-4 medium, 5+ complex) |
590
+ | **Wave count** | Cross-domain dependency depth — how many serial gates exist |
591
+ | **Parallel-domain count** | How many domains can run concurrently (file-disjoint) |
592
+ | **Spawn count** | Estimated `claude -p` / Task subagent invocations |
593
+ | **Token-spend range** | `$X-Y` dollars based on trailing-3 comparable milestones in `.gsd-t/token-log.md` |
594
+ | **Rate-limit-window count** | If the work might span more than one 5h Claude Max rate-limit window |
595
+
596
+ Where this applies:
597
+ - `/gsd-t-milestone` Step 4 — Pre-Partition Assessment
598
+ - `/gsd-t-scan` techdebt milestone suggestions
599
+ - `/gsd-t-promote-debt` effort fields
600
+ - `docs/requirements.md`, `progress.md` Decision Log entries
601
+ - Any internal estimate the user might read
602
+
603
+ Acceptable: machine-time references (e.g. "5 min cache TTL", "5h rate-limit window", "14 day staleness threshold") — these are concrete system properties, not effort estimates. The rule applies to **effort/scope**, not to **system timeouts**.
604
+
575
605
  ## Execution Behavior
576
606
  - ALWAYS check docs/architecture.md before adding or modifying components.
577
607
  - ALWAYS check docs/workflows.md before changing any multi-step process.
@@ -176,7 +176,7 @@ MANDATORY:
176
176
 
177
177
  ├── For each design requirement, assess:
178
178
  │ Supported → stack handles this natively, proceed
179
- │ Partial → needs an addon/library — name it, estimate effort
179
+ │ Partial → needs an addon/library — name it, scope in GSD-T units (domain/wave/spawn/token, NOT dev-hours)
180
180
  │ Unsupported → stack CANNOT achieve this — flag as a blocker
181
181
 
182
182
  ├── If ANY requirement is Unsupported: