switchroom 0.13.4 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@
19
19
  import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
20
20
  import { dirname, join } from 'node:path'
21
21
  import { captureEvent } from './analytics-posthog.js'
22
+ import type { PokeLevel } from './silence-poke.js'
22
23
 
23
24
  export type RuntimeMetricEvent =
24
25
  /**
@@ -62,28 +63,33 @@ export type RuntimeMetricEvent =
62
63
  ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
63
64
  }
64
65
  /**
65
- * Framework safety-net: a silence-poke was armed at 75s (soft) or
66
- * 180s (firm). The system-reminder appended to the next tool result
67
- * nudges the model to send an update. Doubles as a design-health
68
- * signal — if these fire frequently, the conversational-pacing
69
- * prompt isn't doing its job.
66
+ * Framework safety-net: a silence-poke was armed. `ack` is the early
67
+ * (~10s) ack-budget poke — the model has sent NOTHING this turn and is
68
+ * leaving the user on a silent chat. `soft` (75s) / `firm` (180s) are
69
+ * the silence-since-last-outbound ladder. The system-reminder appended
70
+ * to the next tool result nudges the model to send an update. Doubles
71
+ * as a design-health signal — if these fire frequently, the
72
+ * conversational-pacing prompt isn't doing its job.
70
73
  */
71
74
  | {
72
75
  kind: 'silence_poke_fired'
73
76
  key: string
74
- level: 'soft' | 'firm'
77
+ level: PokeLevel
75
78
  silence_ms: number
76
79
  subagent_wait: boolean
77
80
  }
78
81
  /**
79
82
  * The model sent an outbound message within the success window
80
83
  * (default 15s) after a poke fired. Pair with `silence_poke_fired`
81
- * to compute success rate — the design target is >80%.
84
+ * to compute success rate — the design target is >80%. (`ack`-level
85
+ * success is not currently emitted — the ack poke sits outside the
86
+ * `pokesFired` ladder noteOutbound measures against; the type admits
87
+ * `ack` only so the silence-poke metric union stays assignable.)
82
88
  */
83
89
  | {
84
90
  kind: 'silence_poke_succeeded'
85
91
  key: string
86
- level: 'soft' | 'firm'
92
+ level: PokeLevel
87
93
  latency_ms: number
88
94
  }
89
95
  /**
@@ -43,7 +43,7 @@
43
43
  * pacing prompt still applies; only the framework safety net is off.
44
44
  */
45
45
 
46
- export type PokeLevel = 'soft' | 'firm'
46
+ export type PokeLevel = 'ack' | 'soft' | 'firm'
47
47
 
48
48
  /** #1292: snapshot of an in-flight tool call, surfaced in the 300s
49
49
  * framework-fallback message so the user sees the actual observable
@@ -76,6 +76,10 @@ export interface SilencePokeState {
76
76
  lastThinkingAt: number | null
77
77
  /** True once the 300s framework fallback has fired this turn. */
78
78
  fallbackFired: boolean
79
+ /** True once the early ack-budget poke has fired this turn. One-shot:
80
+ * the ack nudge is specifically about the *first* outbound, so it
81
+ * never re-arms even after the model later goes quiet again. */
82
+ ackPokeFired: boolean
79
83
  /** Wall-clock ms of last poke fire — used for poke-success latency. */
80
84
  lastPokeFiredAt: number | null
81
85
  /** #1292: in-flight tool calls keyed by toolUseId. Populated by
@@ -91,6 +95,12 @@ export interface SilencePokeState {
91
95
  }
92
96
 
93
97
  export interface ThresholdsMs {
98
+ /** Ack budget: if NO outbound at all has landed this many ms after
99
+ * turn start, arm an 'ack' poke. This is the framework enforcing the
100
+ * human-baseline "acknowledge within a beat" — far tighter than the
101
+ * 75s `soft` threshold, which measures silence-since-last-outbound
102
+ * and is the wrong instrument for "you never said hello." */
103
+ ack: number
94
104
  soft: number
95
105
  firm: number
96
106
  fallback: number
@@ -101,6 +111,7 @@ export interface ThresholdsMs {
101
111
  }
102
112
 
103
113
  export const DEFAULT_THRESHOLDS: ThresholdsMs = {
114
+ ack: 10_000,
104
115
  soft: 75_000,
105
116
  firm: 180_000,
106
117
  fallback: 300_000,
@@ -176,6 +187,7 @@ export function startTurn(key: string, now: number): void {
176
187
  subagentDispatchActive: false,
177
188
  lastThinkingAt: null,
178
189
  fallbackFired: false,
190
+ ackPokeFired: false,
179
191
  lastPokeFiredAt: null,
180
192
  inFlightTools: new Map(),
181
193
  })
@@ -340,6 +352,16 @@ export function endTurn(key: string): void {
340
352
 
341
353
  /** Verbatim poke text. Wording is load-bearing — see issue #1122 design. */
342
354
  export function formatPokeText(level: PokeLevel): string {
355
+ if (level === 'ack') {
356
+ return (
357
+ "[silence-poke] You haven't sent the user anything yet this turn — "
358
+ + 'they are looking at a silent chat. Send a short, human one-line '
359
+ + 'acknowledgement now via `reply` (e.g. "on it — checking"), in your '
360
+ + "persona's voice, before you do any more work. A good colleague "
361
+ + "answers in a beat; don't leave the message hanging while you think. "
362
+ + 'If the full answer is genuinely seconds away, send that instead.'
363
+ )
364
+ }
343
365
  if (level === 'soft') {
344
366
  return (
345
367
  "[silence-poke] You've been silent to the user for 75s. If you're "
@@ -437,6 +459,32 @@ function tick(now: number): void {
437
459
  ? thresholds.subagentSoft
438
460
  : thresholds.soft
439
461
 
462
+ // Ack budget — the framework enforcing the human-baseline "answer
463
+ // in a beat." Fires once, only when NOTHING has been sent this turn
464
+ // (`lastOutboundAt == null`), well before the 75s `soft` threshold.
465
+ // `soft` measures silence-since-last-outbound and is the wrong
466
+ // instrument for "you never acknowledged me." Independent of the
467
+ // soft/firm/fallback ladder: if the model never acks, it still
468
+ // escalates soft → firm → fallback on schedule after this.
469
+ if (
470
+ !s.ackPokeFired
471
+ && s.lastOutboundAt == null
472
+ && s.pokesFired === 0
473
+ && silence >= thresholds.ack
474
+ ) {
475
+ s.pokeArmed = { level: 'ack' }
476
+ s.ackPokeFired = true
477
+ s.lastPokeFiredAt = now
478
+ activeDeps.emitMetric({
479
+ kind: 'silence_poke_fired',
480
+ key,
481
+ level: 'ack',
482
+ silence_ms: silence,
483
+ subagent_wait: s.subagentDispatchActive,
484
+ })
485
+ continue
486
+ }
487
+
440
488
  if (s.pokesFired === 0 && silence >= softThreshold) {
441
489
  s.pokeArmed = { level: 'soft' }
442
490
  s.pokesFired = 1
@@ -33,7 +33,15 @@ function setupDeps(opts?: { thresholds?: Partial<typeof DEFAULT_THRESHOLDS> }):
33
33
  __setDepsForTests({
34
34
  emitMetric: (e) => fixtures.emitted.push(e),
35
35
  onFrameworkFallback: (ctx) => { fixtures.fallbacks.push(ctx) },
36
- thresholdsMs: { ...DEFAULT_THRESHOLDS, ...(opts?.thresholds ?? {}) },
36
+ // The ack budget (a new poke that fires *earlier* than `soft`) is
37
+ // disabled by default in this fixture so the soft/firm/fallback
38
+ // ladder tests stay isolated from it. The 'ack budget' describe
39
+ // block opts back in with a real value.
40
+ thresholdsMs: {
41
+ ...DEFAULT_THRESHOLDS,
42
+ ack: Number.MAX_SAFE_INTEGER,
43
+ ...(opts?.thresholds ?? {}),
44
+ },
37
45
  })
38
46
  return fixtures
39
47
  }
@@ -139,6 +147,127 @@ describe('silence-poke — escalation ladder', () => {
139
147
  })
140
148
  })
141
149
 
150
+ // PR1 (human-feel UX epic): the ack budget. A person you message
151
+ // answers in a beat — the framework enforces that baseline by arming an
152
+ // 'ack' poke if NOTHING has been sent within `thresholds.ack` of turn
153
+ // start. It is a one-shot nudge (the model still authors every word),
154
+ // deliberately OUTSIDE the soft/firm/fallback `pokesFired` ladder: if
155
+ // the model never acks, the ladder still escalates on its own schedule.
156
+ // See `reference/conversational-pacing.md` and the "Open with an
157
+ // acknowledgement" bullet in `profiles/_shared/telegram-style.md.hbs`.
158
+ //
159
+ // NB: `setupDeps` disables the ack budget by default (ack = MAX_SAFE);
160
+ // every test here opts back in with a real `ack` threshold.
161
+ describe('silence-poke — ack budget (PR1 human-feel UX)', () => {
162
+ it('arms an ack poke at the ack threshold when nothing has been sent', () => {
163
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
164
+ startTurn('chat:0', 0)
165
+
166
+ __tickForTests(9_000) // before the ack budget
167
+ expect(consumeArmedPoke()).toBeNull()
168
+ expect(fx.emitted).toHaveLength(0)
169
+
170
+ __tickForTests(10_000) // at the ack budget
171
+ expect(fx.emitted).toEqual([
172
+ expect.objectContaining({ kind: 'silence_poke_fired', level: 'ack' }),
173
+ ])
174
+ const text = consumeArmedPoke()
175
+ expect(text).toContain('[silence-poke]')
176
+ expect(text).toContain('reply')
177
+ })
178
+
179
+ it('does NOT arm an ack poke if an outbound landed before the budget', () => {
180
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
181
+ startTurn('chat:0', 0)
182
+ noteOutbound('chat:0', 3_000) // model acked fast — inside the budget
183
+ __tickForTests(10_000)
184
+ __tickForTests(20_000)
185
+ expect(consumeArmedPoke()).toBeNull()
186
+ expect(
187
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
188
+ ).toHaveLength(0)
189
+ })
190
+
191
+ it('is one-shot — never re-arms even if the model goes quiet again', () => {
192
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
193
+ startTurn('chat:0', 0)
194
+ __tickForTests(10_000) // ack fires
195
+ consumeArmedPoke() // drain it
196
+ noteOutbound('chat:0', 12_000) // model finally acks
197
+ // The model goes quiet again. The ack poke is specifically about the
198
+ // FIRST outbound — it must not fire twice. A later silence is the
199
+ // soft poke's job, not the ack budget's.
200
+ __tickForTests(40_000)
201
+ expect(
202
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
203
+ ).toHaveLength(1)
204
+ })
205
+
206
+ it('ackPokeFired resets across turns even when endTurn was skipped (CC-5 invariant)', () => {
207
+ // Mirrors the subagentDispatchActive CC-5 guard: `ackPokeFired` is a
208
+ // turn-scoped one-shot flag, and the only thing that keeps it from
209
+ // leaking into the next turn (when an abnormal abort skips endTurn)
210
+ // is startTurn's unconditional state overwrite. Pin that here so a
211
+ // future read-modify-write refactor of startTurn fails loud.
212
+ setupDeps({ thresholds: { ack: 10_000 } })
213
+ startTurn('k', 0)
214
+ __tickForTests(10_000) // ack fires
215
+ expect(__getStateForTests('k')?.ackPokeFired).toBe(true)
216
+ // Turn 2 in the same key, no endTurn — startTurn MUST clear the flag.
217
+ startTurn('k', 1_000_000)
218
+ expect(__getStateForTests('k')?.ackPokeFired).toBe(false)
219
+ })
220
+
221
+ it('does not advance the ladder — soft still requires a full 75s of silence', () => {
222
+ // The ack poke is deliberately outside `pokesFired`. After it fires,
223
+ // a soft poke must still wait the normal 75s.
224
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
225
+ startTurn('chat:0', 0)
226
+ __tickForTests(10_000) // ack
227
+ consumeArmedPoke()
228
+ __tickForTests(70_000) // 70s total — under the 75s soft threshold
229
+ expect(
230
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
231
+ ).toHaveLength(0)
232
+ __tickForTests(75_000)
233
+ expect(
234
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
235
+ ).toHaveLength(1)
236
+ })
237
+
238
+ it('still escalates ack -> soft -> firm -> fallback on a turn that never acks', () => {
239
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
240
+ startTurn('chat:0', 0)
241
+ __tickForTests(10_000) // ack
242
+ consumeArmedPoke()
243
+ __tickForTests(75_000) // soft
244
+ consumeArmedPoke()
245
+ __tickForTests(180_000) // firm
246
+ consumeArmedPoke()
247
+ __tickForTests(300_000) // fallback
248
+ const trail = fx.emitted.map((e) =>
249
+ e.kind === 'silence_poke_fired'
250
+ ? `poke:${e.level}`
251
+ : e.kind === 'silence_fallback_sent'
252
+ ? `fallback:${e.fallback_kind}`
253
+ : e.kind,
254
+ )
255
+ expect(trail).toEqual([
256
+ 'poke:ack',
257
+ 'poke:soft',
258
+ 'poke:firm',
259
+ 'fallback:working',
260
+ ])
261
+ })
262
+
263
+ it('formatPokeText("ack") nudges for a human acknowledgement via reply', () => {
264
+ const text = formatPokeText('ack')
265
+ expect(text).toContain('[silence-poke]')
266
+ expect(text.toLowerCase()).toContain('acknowledg')
267
+ expect(text).toContain('reply')
268
+ })
269
+ })
270
+
142
271
  describe('silence-poke — outbound resets clock + success measurement', () => {
143
272
  it('noteOutbound resets the silence clock', () => {
144
273
  setupDeps()
@@ -608,7 +737,9 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
608
737
  __setDepsForTests({
609
738
  emitMetric: (e) => fx.emitted.push(e),
610
739
  onFrameworkFallback: () => { throw new Error('oh no') },
611
- thresholdsMs: DEFAULT_THRESHOLDS,
740
+ // ack budget out of the way — this test exercises the
741
+ // soft/firm/fallback ladder under a throwing fallback handler.
742
+ thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
612
743
  })
613
744
  startTurn('k', 0)
614
745
  expect(() => {
@@ -625,7 +756,8 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
625
756
  __setDepsForTests({
626
757
  emitMetric: (e) => fx.emitted.push(e),
627
758
  onFrameworkFallback: () => Promise.reject(new Error('async fail')),
628
- thresholdsMs: DEFAULT_THRESHOLDS,
759
+ // ack budget out of the way — see the throwing-handler test above.
760
+ thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
629
761
  })
630
762
  startTurn('k', 0)
631
763
  __tickForTests(75_000)
@@ -0,0 +1,217 @@
1
+ /**
2
+ * JTBD scenario — guaranteed fast acknowledgement (human-feel UX epic).
3
+ *
4
+ * Serves: `reference/conversational-pacing.md` and the JTBD
5
+ * "talking to my agent feels like talking to a capable person".
6
+ *
7
+ * A person you message answers in a beat — "got it", "on it, checking
8
+ * now" — before the work is done. PR #1633 made that opening
9
+ * acknowledgement a *guarantee*, split across two layers:
10
+ *
11
+ * - the conversational-pacing prompt teaches the model to open with
12
+ * a short human one-liner unless the real answer lands in a second
13
+ * or two;
14
+ * - the silence-poke subsystem *enforces* it — a ~10s ack-budget
15
+ * poke fires when nothing at all has been sent this turn, nudging
16
+ * the model to acknowledge before it does more work.
17
+ *
18
+ * This UAT drives a FUZZY set of non-trivial prompt shapes — research,
19
+ * multi-step compute, open-ended advice, code, reflective asks. Every
20
+ * one needs real work, so a turn that goes silent for tens of seconds
21
+ * is a black box. The invariant under test: the user sees a sign of
22
+ * life FAST, every time, across every prompt shape.
23
+ *
24
+ * ## Targets
25
+ *
26
+ * - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
27
+ * for every prompt. This is a tight *latency target*, not a
28
+ * framework guarantee. The silence-poke ack rung is a *nudge*
29
+ * piggybacked on the model's next tool result (`consumeArmedPoke`
30
+ * drained at the gateway tool-result chokepoint) — not a
31
+ * framework-composed send. It helps the model along, but a
32
+ * pure-reasoning prompt that issues no tool call never drains the
33
+ * nudge, so the bound ultimately depends on model latency. It
34
+ * still has teeth: pre-#1633 a slow prompt's first outbound was
35
+ * the full answer, often 30-60s out, so 20s cleanly separates the
36
+ * fixed behaviour from a regression. A failure here means the
37
+ * agent left the user on a silent chat — a real pacing defect.
38
+ * - **Vision target (soft, per-case forensic):** the first outbound
39
+ * lands within `ACK_VISION_MS` and is short — a genuine
40
+ * acknowledgement, not a full-answer dump. The model self-acking
41
+ * quickly is what makes it *feel* human. Logged, not failed: real
42
+ * model runs vary, and the prompt explicitly lets a turn skip the
43
+ * ack when the answer itself arrives in the first couple seconds.
44
+ *
45
+ * ## Relationship to adjacent UATs
46
+ *
47
+ * - `jtbd-fast-trivial-dm.test.ts` — TRIVIAL prompts: the answer
48
+ * itself should land fast, no ack ceremony. This file is the
49
+ * non-trivial inverse: real work, but a fast *acknowledgement*.
50
+ * - `jtbd-soft-commit-dm.test.ts` — the predecessor: a single slow
51
+ * prompt, a looser "first reply within 30s" floor. This file is
52
+ * the stronger, fuzzed successor of that contract.
53
+ *
54
+ * Each case is a single inbound; cases run sequentially. As with the
55
+ * other fuzz files, a prior turn may still be finishing in the
56
+ * background when the next case starts — an accepted, noted risk.
57
+ */
58
+
59
+ import { describe, it, expect } from "vitest";
60
+ import { spinUp } from "../harness.js";
61
+
62
+ const AGENT = "test-harness";
63
+
64
+ // Hard contract: a sign of life within this budget, every prompt.
65
+ // A tight latency target — well above a healthy self-ack (~3-8s on a
66
+ // warm agent) and well below the pre-#1633 silent-then-dump regression
67
+ // (30-60s). Model-dependent, not a framework guarantee (see header
68
+ // doc), so it carries generous headroom for mtcute polling jitter and
69
+ // for a model that leans on the ack-poke nudge instead of self-acking.
70
+ const ACK_HARD_MS = 20_000;
71
+
72
+ // Vision target: the model self-acknowledges in a beat, fast enough
73
+ // that the ack-poke nudge never has to come into it.
74
+ const ACK_VISION_MS = 8_000;
75
+
76
+ // A first outbound at or under this length reads as an acknowledgement
77
+ // one-liner rather than a full-answer dump. Mirrors the >200-char
78
+ // "long answer" heuristic in jtbd-soft-commit-dm, with headroom for a
79
+ // persona-voiced ack ("on it — pulling the os-release and hostname now").
80
+ const ACK_LEN_CEILING = 320;
81
+
82
+ interface AckCase {
83
+ name: string;
84
+ /** A prompt that genuinely needs more than a second or two of work,
85
+ * so an instant full answer is not a legitimate ack-skip. */
86
+ prompt: string;
87
+ }
88
+
89
+ const ACK_CASES: readonly AckCase[] = [
90
+ // ─── Research / multi-source read ─────────────────────────────
91
+ {
92
+ name: "machine-summary research",
93
+ prompt:
94
+ "Read /etc/os-release and /etc/hostname, then tell me in one "
95
+ + "sentence what kind of machine this is.",
96
+ },
97
+ // ─── Multi-step compute ───────────────────────────────────────
98
+ {
99
+ name: "compound date math",
100
+ prompt:
101
+ "Work out what day of the week it is today, then tell me how "
102
+ + "many days are left until the end of this month.",
103
+ },
104
+ // ─── Open-ended advice ("take your time") ─────────────────────
105
+ {
106
+ name: "open-ended prioritisation",
107
+ prompt:
108
+ "I've got a free afternoon and three half-finished side "
109
+ + "projects. Help me decide what to focus on. Take your time.",
110
+ },
111
+ // ─── Summarise / explain ──────────────────────────────────────
112
+ {
113
+ name: "plain-language summary",
114
+ prompt:
115
+ "Give me a 3-bullet summary of what a Linux container actually "
116
+ + "is, in plain language.",
117
+ },
118
+ // ─── Code task ────────────────────────────────────────────────
119
+ {
120
+ name: "bash one-liner with explanation",
121
+ prompt:
122
+ "Write me a small bash one-liner that counts the total number "
123
+ + "of lines across all .ts files under the current directory, "
124
+ + "and explain how it works.",
125
+ },
126
+ // ─── Reflective / vague-but-real ──────────────────────────────
127
+ {
128
+ name: "reflective open ask",
129
+ prompt:
130
+ "Something feels off with how I'm spending my mornings lately. "
131
+ + "Help me think through it.",
132
+ },
133
+ // ─── Comparison / judgement ───────────────────────────────────
134
+ {
135
+ name: "tech comparison",
136
+ prompt:
137
+ "Compare REST and GraphQL for a small side project — which "
138
+ + "would you pick and why?",
139
+ },
140
+ // ─── Investigate the box ──────────────────────────────────────
141
+ {
142
+ name: "disk-usage investigation",
143
+ prompt:
144
+ "Have a look at what's taking up the most space under /var/log "
145
+ + "and summarise what you find.",
146
+ },
147
+ ];
148
+
149
+ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
150
+ for (const tc of ACK_CASES) {
151
+ it(
152
+ `[ack] ${tc.name} — sign of life within ${ACK_HARD_MS / 1000}s`,
153
+ async () => {
154
+ const sc = await spinUp({ agent: AGENT });
155
+ try {
156
+ const sendStart = Date.now();
157
+ await sc.sendDM(tc.prompt);
158
+
159
+ const firstOutbound = await sc.expectMessage(/\S/, {
160
+ from: "bot",
161
+ timeout: ACK_HARD_MS + 6_000,
162
+ });
163
+ const ttfo = Date.now() - sendStart;
164
+ const len = firstOutbound.text.trim().length;
165
+
166
+ // Invariant: the outbound is a real, non-empty message.
167
+ expect(len).toBeGreaterThan(0);
168
+
169
+ // Hard contract: a sign of life FAST. A latency target, not
170
+ // a framework guarantee (see header doc) — but a failure
171
+ // here is a real pacing defect, so it fails the build.
172
+ if (ttfo >= ACK_HARD_MS) {
173
+ throw new Error(
174
+ `[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
175
+ + `contract ${ACK_HARD_MS}ms — the user sat on a silent `
176
+ + `chat. The fast-ack path (pacing prompt + ack-poke `
177
+ + `nudge) is not delivering. First outbound: `
178
+ + `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
179
+ );
180
+ }
181
+ expect(ttfo).toBeLessThan(ACK_HARD_MS);
182
+
183
+ // Forensic, soft: did the model self-acknowledge in a beat,
184
+ // or did it only get there with the ack-poke nudge?
185
+ const looksLikeAck = len <= ACK_LEN_CEILING;
186
+ if (ttfo < ACK_VISION_MS && looksLikeAck) {
187
+ console.log(
188
+ `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
189
+ + `short acknowledgement. Feels human.`,
190
+ );
191
+ } else if (ttfo < ACK_VISION_MS && !looksLikeAck) {
192
+ // Fast but long: the answer itself arrived quickly. The
193
+ // pacing prompt explicitly sanctions skipping the ack when
194
+ // the answer lands in the first couple of seconds.
195
+ console.log(
196
+ `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
197
+ + `full answer (legitimate ack-skip).`,
198
+ );
199
+ } else {
200
+ // Passed the hard contract but slower than the vision
201
+ // target — the canary for the model needing the ack-poke
202
+ // nudge instead of acknowledging promptly on its own.
203
+ console.warn(
204
+ `[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
205
+ + `<${ACK_VISION_MS}ms), ${len} chars`
206
+ + `${looksLikeAck ? "" : " — and long, not an ack one-liner"}`
207
+ + `. The model did not acknowledge promptly on its own.`,
208
+ );
209
+ }
210
+ } finally {
211
+ await sc.tearDown();
212
+ }
213
+ },
214
+ ACK_HARD_MS + 45_000,
215
+ );
216
+ }
217
+ });
@@ -1,16 +1,21 @@
1
1
  /**
2
- * JTBD scenario — soft-commit for slow turns.
2
+ * JTBD scenario — first sign of life on a slow turn.
3
3
  *
4
- * The new conversational-pacing prompt (#1122) instructs the agent
5
- * to send a one-liner "let me check, back in a few" before slow
6
- * work. This UAT exercises that behaviour: send a prompt that
7
- * obviously needs >15s, expect the FIRST outbound to be a short
8
- * soft-commit message, with the final answer landing later.
4
+ * The conversational-pacing prompt instructs the agent to open with
5
+ * an acknowledgement before slow work. (The original ">15s soft
6
+ * commit" bullet this file was named for was superseded by the
7
+ * guaranteed "Open with an acknowledgement" bullet in PR #1633 —
8
+ * acknowledge every turn unless the answer lands in a second or two.)
9
9
  *
10
- * Not strict — the agent's allowed to skip the soft-commit if it
11
- * judges the work is fast enough. The assertion is "the user does
12
- * NOT see a long silent gap before the first sign of life": either
13
- * a soft-commit OR the actual reply lands within 20s.
10
+ * This UAT exercises a single slow prompt and asserts the loose
11
+ * floor: the user does NOT see a long silent gap before the first
12
+ * sign of life — a reply lands within 30s.
13
+ *
14
+ * The stronger, fuzzed successor of this contract is
15
+ * `jtbd-fast-ack-dm.test.ts` — varied prompt shapes, a tight 20s
16
+ * hard latency target (a tight target, not a framework guarantee —
17
+ * see that file's header). This file is retained as a minimal
18
+ * single-prompt floor.
14
19
  */
15
20
 
16
21
  import { describe, it, expect } from "vitest";
@@ -26,7 +31,7 @@ const SLOW_PROMPT = (
26
31
 
27
32
  describe("uat: soft-commit pacing", () => {
28
33
  it(
29
- "user asks slow question → first reply lands within 20s",
34
+ "user asks slow question → first reply lands within 30s",
30
35
  async () => {
31
36
  const sc = await spinUp({ agent: "test-harness" });
32
37
  try {