switchroom 0.13.4 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -45
- package/dist/agent-scheduler/index.js +80 -80
- package/dist/auth-broker/index.js +80 -80
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +361 -357
- package/dist/host-control/main.js +99 -99
- package/dist/vault/approvals/kernel-server.js +82 -82
- package/dist/vault/broker/server.js +83 -83
- package/package.json +1 -1
- package/profiles/_shared/telegram-style.md.hbs +1 -1
- package/profiles/_shared/vault-protocol.md.hbs +12 -0
- package/profiles/default/CLAUDE.md +192 -0
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +210 -192
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/runtime-metrics.ts +14 -8
- package/telegram-plugin/silence-poke.ts +49 -1
- package/telegram-plugin/tests/silence-poke.test.ts +135 -3
- package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +217 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +16 -11
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
|
|
20
20
|
import { dirname, join } from 'node:path'
|
|
21
21
|
import { captureEvent } from './analytics-posthog.js'
|
|
22
|
+
import type { PokeLevel } from './silence-poke.js'
|
|
22
23
|
|
|
23
24
|
export type RuntimeMetricEvent =
|
|
24
25
|
/**
|
|
@@ -62,28 +63,33 @@ export type RuntimeMetricEvent =
|
|
|
62
63
|
ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
|
|
63
64
|
}
|
|
64
65
|
/**
|
|
65
|
-
* Framework safety-net: a silence-poke was armed
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
*
|
|
69
|
-
*
|
|
66
|
+
* Framework safety-net: a silence-poke was armed. `ack` is the early
|
|
67
|
+
* (~10s) ack-budget poke — the model has sent NOTHING this turn and is
|
|
68
|
+
* leaving the user on a silent chat. `soft` (75s) / `firm` (180s) are
|
|
69
|
+
* the silence-since-last-outbound ladder. The system-reminder appended
|
|
70
|
+
* to the next tool result nudges the model to send an update. Doubles
|
|
71
|
+
* as a design-health signal — if these fire frequently, the
|
|
72
|
+
* conversational-pacing prompt isn't doing its job.
|
|
70
73
|
*/
|
|
71
74
|
| {
|
|
72
75
|
kind: 'silence_poke_fired'
|
|
73
76
|
key: string
|
|
74
|
-
level:
|
|
77
|
+
level: PokeLevel
|
|
75
78
|
silence_ms: number
|
|
76
79
|
subagent_wait: boolean
|
|
77
80
|
}
|
|
78
81
|
/**
|
|
79
82
|
* The model sent an outbound message within the success window
|
|
80
83
|
* (default 15s) after a poke fired. Pair with `silence_poke_fired`
|
|
81
|
-
* to compute success rate — the design target is >80%.
|
|
84
|
+
* to compute success rate — the design target is >80%. (`ack`-level
|
|
85
|
+
* success is not currently emitted — the ack poke sits outside the
|
|
86
|
+
* `pokesFired` ladder that `noteOutbound` measures against; the type admits
|
|
87
|
+
* `ack` only so the silence-poke metric union stays assignable.)
|
|
82
88
|
*/
|
|
83
89
|
| {
|
|
84
90
|
kind: 'silence_poke_succeeded'
|
|
85
91
|
key: string
|
|
86
|
-
level:
|
|
92
|
+
level: PokeLevel
|
|
87
93
|
latency_ms: number
|
|
88
94
|
}
|
|
89
95
|
/**
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
* pacing prompt still applies; only the framework safety net is off.
|
|
44
44
|
*/
|
|
45
45
|
|
|
46
|
-
export type PokeLevel = 'soft' | 'firm'
|
|
46
|
+
export type PokeLevel = 'ack' | 'soft' | 'firm'
|
|
47
47
|
|
|
48
48
|
/** #1292: snapshot of an in-flight tool call, surfaced in the 300s
|
|
49
49
|
* framework-fallback message so the user sees the actual observable
|
|
@@ -76,6 +76,10 @@ export interface SilencePokeState {
|
|
|
76
76
|
lastThinkingAt: number | null
|
|
77
77
|
/** True once the 300s framework fallback has fired this turn. */
|
|
78
78
|
fallbackFired: boolean
|
|
79
|
+
/** True once the early ack-budget poke has fired this turn. One-shot:
|
|
80
|
+
* the ack nudge is specifically about the *first* outbound, so it
|
|
81
|
+
* never re-arms even after the model later goes quiet again. */
|
|
82
|
+
ackPokeFired: boolean
|
|
79
83
|
/** Wall-clock ms of last poke fire — used for poke-success latency. */
|
|
80
84
|
lastPokeFiredAt: number | null
|
|
81
85
|
/** #1292: in-flight tool calls keyed by toolUseId. Populated by
|
|
@@ -91,6 +95,12 @@ export interface SilencePokeState {
|
|
|
91
95
|
}
|
|
92
96
|
|
|
93
97
|
export interface ThresholdsMs {
|
|
98
|
+
/** Ack budget: if NO outbound at all has landed this many ms after
|
|
99
|
+
* turn start, arm an 'ack' poke. This is the framework enforcing the
|
|
100
|
+
* human-baseline "acknowledge within a beat" — far tighter than the
|
|
101
|
+
* 75s `soft` threshold, which measures silence-since-last-outbound
|
|
102
|
+
* and is the wrong instrument for "you never said hello." */
|
|
103
|
+
ack: number
|
|
94
104
|
soft: number
|
|
95
105
|
firm: number
|
|
96
106
|
fallback: number
|
|
@@ -101,6 +111,7 @@ export interface ThresholdsMs {
|
|
|
101
111
|
}
|
|
102
112
|
|
|
103
113
|
export const DEFAULT_THRESHOLDS: ThresholdsMs = {
|
|
114
|
+
ack: 10_000,
|
|
104
115
|
soft: 75_000,
|
|
105
116
|
firm: 180_000,
|
|
106
117
|
fallback: 300_000,
|
|
@@ -176,6 +187,7 @@ export function startTurn(key: string, now: number): void {
|
|
|
176
187
|
subagentDispatchActive: false,
|
|
177
188
|
lastThinkingAt: null,
|
|
178
189
|
fallbackFired: false,
|
|
190
|
+
ackPokeFired: false,
|
|
179
191
|
lastPokeFiredAt: null,
|
|
180
192
|
inFlightTools: new Map(),
|
|
181
193
|
})
|
|
@@ -340,6 +352,16 @@ export function endTurn(key: string): void {
|
|
|
340
352
|
|
|
341
353
|
/** Verbatim poke text. Wording is load-bearing — see issue #1122 design. */
|
|
342
354
|
export function formatPokeText(level: PokeLevel): string {
|
|
355
|
+
if (level === 'ack') {
|
|
356
|
+
return (
|
|
357
|
+
"[silence-poke] You haven't sent the user anything yet this turn — "
|
|
358
|
+
+ 'they are looking at a silent chat. Send a short, human one-line '
|
|
359
|
+
+ 'acknowledgement now via `reply` (e.g. "on it — checking"), in your '
|
|
360
|
+
+ "persona's voice, before you do any more work. A good colleague "
|
|
361
|
+
+ "answers in a beat; don't leave the message hanging while you think. "
|
|
362
|
+
+ 'If the full answer is genuinely seconds away, send that instead.'
|
|
363
|
+
)
|
|
364
|
+
}
|
|
343
365
|
if (level === 'soft') {
|
|
344
366
|
return (
|
|
345
367
|
"[silence-poke] You've been silent to the user for 75s. If you're "
|
|
@@ -437,6 +459,32 @@ function tick(now: number): void {
|
|
|
437
459
|
? thresholds.subagentSoft
|
|
438
460
|
: thresholds.soft
|
|
439
461
|
|
|
462
|
+
// Ack budget — the framework enforcing the human-baseline "answer
|
|
463
|
+
// in a beat." Fires once, only when NOTHING has been sent this turn
|
|
464
|
+
// (`lastOutboundAt == null`), well before the 75s `soft` threshold.
|
|
465
|
+
// `soft` measures silence-since-last-outbound and is the wrong
|
|
466
|
+
// instrument for "you never acknowledged me." Independent of the
|
|
467
|
+
// soft/firm/fallback ladder: if the model never acks, it still
|
|
468
|
+
// escalates soft → firm → fallback on schedule after this.
|
|
469
|
+
if (
|
|
470
|
+
!s.ackPokeFired
|
|
471
|
+
&& s.lastOutboundAt == null
|
|
472
|
+
&& s.pokesFired === 0
|
|
473
|
+
&& silence >= thresholds.ack
|
|
474
|
+
) {
|
|
475
|
+
s.pokeArmed = { level: 'ack' }
|
|
476
|
+
s.ackPokeFired = true
|
|
477
|
+
s.lastPokeFiredAt = now
|
|
478
|
+
activeDeps.emitMetric({
|
|
479
|
+
kind: 'silence_poke_fired',
|
|
480
|
+
key,
|
|
481
|
+
level: 'ack',
|
|
482
|
+
silence_ms: silence,
|
|
483
|
+
subagent_wait: s.subagentDispatchActive,
|
|
484
|
+
})
|
|
485
|
+
continue
|
|
486
|
+
}
|
|
487
|
+
|
|
440
488
|
if (s.pokesFired === 0 && silence >= softThreshold) {
|
|
441
489
|
s.pokeArmed = { level: 'soft' }
|
|
442
490
|
s.pokesFired = 1
|
|
@@ -33,7 +33,15 @@ function setupDeps(opts?: { thresholds?: Partial<typeof DEFAULT_THRESHOLDS> }):
|
|
|
33
33
|
__setDepsForTests({
|
|
34
34
|
emitMetric: (e) => fixtures.emitted.push(e),
|
|
35
35
|
onFrameworkFallback: (ctx) => { fixtures.fallbacks.push(ctx) },
|
|
36
|
-
|
|
36
|
+
// The ack budget (a new poke that fires *earlier* than `soft`) is
|
|
37
|
+
// disabled by default in this fixture so the soft/firm/fallback
|
|
38
|
+
// ladder tests stay isolated from it. The 'ack budget' describe
|
|
39
|
+
// block opts back in with a real value.
|
|
40
|
+
thresholdsMs: {
|
|
41
|
+
...DEFAULT_THRESHOLDS,
|
|
42
|
+
ack: Number.MAX_SAFE_INTEGER,
|
|
43
|
+
...(opts?.thresholds ?? {}),
|
|
44
|
+
},
|
|
37
45
|
})
|
|
38
46
|
return fixtures
|
|
39
47
|
}
|
|
@@ -139,6 +147,127 @@ describe('silence-poke — escalation ladder', () => {
|
|
|
139
147
|
})
|
|
140
148
|
})
|
|
141
149
|
|
|
150
|
+
// PR1 (human-feel UX epic): the ack budget. A person you message
|
|
151
|
+
// answers in a beat — the framework enforces that baseline by arming an
|
|
152
|
+
// 'ack' poke if NOTHING has been sent within `thresholds.ack` of turn
|
|
153
|
+
// start. It is a one-shot nudge (the model still authors every word),
|
|
154
|
+
// deliberately OUTSIDE the soft/firm/fallback `pokesFired` ladder: if
|
|
155
|
+
// the model never acks, the ladder still escalates on its own schedule.
|
|
156
|
+
// See `reference/conversational-pacing.md` and the "Open with an
|
|
157
|
+
// acknowledgement" bullet in `profiles/_shared/telegram-style.md.hbs`.
|
|
158
|
+
//
|
|
159
|
+
// NB: `setupDeps` disables the ack budget by default (ack = Number.MAX_SAFE_INTEGER);
|
|
160
|
+
// every test here opts back in with a real `ack` threshold.
|
|
161
|
+
describe('silence-poke — ack budget (PR1 human-feel UX)', () => {
|
|
162
|
+
it('arms an ack poke at the ack threshold when nothing has been sent', () => {
|
|
163
|
+
const fx = setupDeps({ thresholds: { ack: 10_000 } })
|
|
164
|
+
startTurn('chat:0', 0)
|
|
165
|
+
|
|
166
|
+
__tickForTests(9_000) // before the ack budget
|
|
167
|
+
expect(consumeArmedPoke()).toBeNull()
|
|
168
|
+
expect(fx.emitted).toHaveLength(0)
|
|
169
|
+
|
|
170
|
+
__tickForTests(10_000) // at the ack budget
|
|
171
|
+
expect(fx.emitted).toEqual([
|
|
172
|
+
expect.objectContaining({ kind: 'silence_poke_fired', level: 'ack' }),
|
|
173
|
+
])
|
|
174
|
+
const text = consumeArmedPoke()
|
|
175
|
+
expect(text).toContain('[silence-poke]')
|
|
176
|
+
expect(text).toContain('reply')
|
|
177
|
+
})
|
|
178
|
+
|
|
179
|
+
it('does NOT arm an ack poke if an outbound landed before the budget', () => {
|
|
180
|
+
const fx = setupDeps({ thresholds: { ack: 10_000 } })
|
|
181
|
+
startTurn('chat:0', 0)
|
|
182
|
+
noteOutbound('chat:0', 3_000) // model acked fast — inside the budget
|
|
183
|
+
__tickForTests(10_000)
|
|
184
|
+
__tickForTests(20_000)
|
|
185
|
+
expect(consumeArmedPoke()).toBeNull()
|
|
186
|
+
expect(
|
|
187
|
+
fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
|
|
188
|
+
).toHaveLength(0)
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
it('is one-shot — never re-arms even if the model goes quiet again', () => {
|
|
192
|
+
const fx = setupDeps({ thresholds: { ack: 10_000 } })
|
|
193
|
+
startTurn('chat:0', 0)
|
|
194
|
+
__tickForTests(10_000) // ack fires
|
|
195
|
+
consumeArmedPoke() // drain it
|
|
196
|
+
noteOutbound('chat:0', 12_000) // model finally acks
|
|
197
|
+
// The model goes quiet again. The ack poke is specifically about the
|
|
198
|
+
// FIRST outbound — it must not fire twice. A later silence is the
|
|
199
|
+
// soft poke's job, not the ack budget's.
|
|
200
|
+
__tickForTests(40_000)
|
|
201
|
+
expect(
|
|
202
|
+
fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
|
|
203
|
+
).toHaveLength(1)
|
|
204
|
+
})
|
|
205
|
+
|
|
206
|
+
it('ackPokeFired resets across turns even when endTurn was skipped (CC-5 invariant)', () => {
|
|
207
|
+
// Mirrors the subagentDispatchActive CC-5 guard: `ackPokeFired` is a
|
|
208
|
+
// turn-scoped one-shot flag, and the only thing that keeps it from
|
|
209
|
+
// leaking into the next turn (when an abnormal abort skips endTurn)
|
|
210
|
+
// is startTurn's unconditional state overwrite. Pin that here so a
|
|
211
|
+
// future read-modify-write refactor of startTurn fails loud.
|
|
212
|
+
setupDeps({ thresholds: { ack: 10_000 } })
|
|
213
|
+
startTurn('k', 0)
|
|
214
|
+
__tickForTests(10_000) // ack fires
|
|
215
|
+
expect(__getStateForTests('k')?.ackPokeFired).toBe(true)
|
|
216
|
+
// Turn 2 in the same key, no endTurn — startTurn MUST clear the flag.
|
|
217
|
+
startTurn('k', 1_000_000)
|
|
218
|
+
expect(__getStateForTests('k')?.ackPokeFired).toBe(false)
|
|
219
|
+
})
|
|
220
|
+
|
|
221
|
+
it('does not advance the ladder — soft still requires a full 75s of silence', () => {
|
|
222
|
+
// The ack poke is deliberately outside `pokesFired`. After it fires,
|
|
223
|
+
// a soft poke must still wait the normal 75s.
|
|
224
|
+
const fx = setupDeps({ thresholds: { ack: 10_000 } })
|
|
225
|
+
startTurn('chat:0', 0)
|
|
226
|
+
__tickForTests(10_000) // ack
|
|
227
|
+
consumeArmedPoke()
|
|
228
|
+
__tickForTests(70_000) // 70s total — under the 75s soft threshold
|
|
229
|
+
expect(
|
|
230
|
+
fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
|
|
231
|
+
).toHaveLength(0)
|
|
232
|
+
__tickForTests(75_000)
|
|
233
|
+
expect(
|
|
234
|
+
fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
|
|
235
|
+
).toHaveLength(1)
|
|
236
|
+
})
|
|
237
|
+
|
|
238
|
+
it('still escalates ack -> soft -> firm -> fallback on a turn that never acks', () => {
|
|
239
|
+
const fx = setupDeps({ thresholds: { ack: 10_000 } })
|
|
240
|
+
startTurn('chat:0', 0)
|
|
241
|
+
__tickForTests(10_000) // ack
|
|
242
|
+
consumeArmedPoke()
|
|
243
|
+
__tickForTests(75_000) // soft
|
|
244
|
+
consumeArmedPoke()
|
|
245
|
+
__tickForTests(180_000) // firm
|
|
246
|
+
consumeArmedPoke()
|
|
247
|
+
__tickForTests(300_000) // fallback
|
|
248
|
+
const trail = fx.emitted.map((e) =>
|
|
249
|
+
e.kind === 'silence_poke_fired'
|
|
250
|
+
? `poke:${e.level}`
|
|
251
|
+
: e.kind === 'silence_fallback_sent'
|
|
252
|
+
? `fallback:${e.fallback_kind}`
|
|
253
|
+
: e.kind,
|
|
254
|
+
)
|
|
255
|
+
expect(trail).toEqual([
|
|
256
|
+
'poke:ack',
|
|
257
|
+
'poke:soft',
|
|
258
|
+
'poke:firm',
|
|
259
|
+
'fallback:working',
|
|
260
|
+
])
|
|
261
|
+
})
|
|
262
|
+
|
|
263
|
+
it('formatPokeText("ack") nudges for a human acknowledgement via reply', () => {
|
|
264
|
+
const text = formatPokeText('ack')
|
|
265
|
+
expect(text).toContain('[silence-poke]')
|
|
266
|
+
expect(text.toLowerCase()).toContain('acknowledg')
|
|
267
|
+
expect(text).toContain('reply')
|
|
268
|
+
})
|
|
269
|
+
})
|
|
270
|
+
|
|
142
271
|
describe('silence-poke — outbound resets clock + success measurement', () => {
|
|
143
272
|
it('noteOutbound resets the silence clock', () => {
|
|
144
273
|
setupDeps()
|
|
@@ -608,7 +737,9 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
|
|
|
608
737
|
__setDepsForTests({
|
|
609
738
|
emitMetric: (e) => fx.emitted.push(e),
|
|
610
739
|
onFrameworkFallback: () => { throw new Error('oh no') },
|
|
611
|
-
|
|
740
|
+
// ack budget out of the way — this test exercises the
|
|
741
|
+
// soft/firm/fallback ladder under a throwing fallback handler.
|
|
742
|
+
thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
|
|
612
743
|
})
|
|
613
744
|
startTurn('k', 0)
|
|
614
745
|
expect(() => {
|
|
@@ -625,7 +756,8 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
|
|
|
625
756
|
__setDepsForTests({
|
|
626
757
|
emitMetric: (e) => fx.emitted.push(e),
|
|
627
758
|
onFrameworkFallback: () => Promise.reject(new Error('async fail')),
|
|
628
|
-
|
|
759
|
+
// ack budget out of the way — see the throwing-handler test above.
|
|
760
|
+
thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
|
|
629
761
|
})
|
|
630
762
|
startTurn('k', 0)
|
|
631
763
|
__tickForTests(75_000)
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD scenario — guaranteed fast acknowledgement (human-feel UX epic).
|
|
3
|
+
*
|
|
4
|
+
* Serves: `reference/conversational-pacing.md` and the JTBD
|
|
5
|
+
* "talking to my agent feels like talking to a capable person".
|
|
6
|
+
*
|
|
7
|
+
* A person you message answers in a beat — "got it", "on it, checking
|
|
8
|
+
* now" — before the work is done. PR #1633 made that opening
|
|
9
|
+
* acknowledgement a *guarantee*, split across two layers:
|
|
10
|
+
*
|
|
11
|
+
* - the conversational-pacing prompt teaches the model to open with
|
|
12
|
+
* a short human one-liner unless the real answer lands in a second
|
|
13
|
+
* or two;
|
|
14
|
+
* - the silence-poke subsystem *enforces* it — a ~10s ack-budget
|
|
15
|
+
* poke fires when nothing at all has been sent this turn, nudging
|
|
16
|
+
* the model to acknowledge before it does more work.
|
|
17
|
+
*
|
|
18
|
+
* This UAT drives a FUZZY set of non-trivial prompt shapes — research,
|
|
19
|
+
* multi-step compute, open-ended advice, code, reflective asks. Every
|
|
20
|
+
* one needs real work, so a turn that goes silent for tens of seconds
|
|
21
|
+
* is a black box. The invariant under test: the user sees a sign of
|
|
22
|
+
* life FAST, every time, across every prompt shape.
|
|
23
|
+
*
|
|
24
|
+
* ## Targets
|
|
25
|
+
*
|
|
26
|
+
* - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
|
|
27
|
+
* for every prompt. This is a tight *latency target*, not a
|
|
28
|
+
* framework guarantee. The silence-poke ack rung is a *nudge*
|
|
29
|
+
* piggybacked on the model's next tool result (`consumeArmedPoke`
|
|
30
|
+
* drained at the gateway tool-result chokepoint) — not a
|
|
31
|
+
* framework-composed send. It helps the model along, but a
|
|
32
|
+
* pure-reasoning prompt that issues no tool call never drains the
|
|
33
|
+
* nudge, so the bound ultimately depends on model latency. It
|
|
34
|
+
* still has teeth: pre-#1633 a slow prompt's first outbound was
|
|
35
|
+
* the full answer, often 30-60s out, so 20s cleanly separates the
|
|
36
|
+
* fixed behaviour from a regression. A failure here means the
|
|
37
|
+
* agent left the user on a silent chat — a real pacing defect.
|
|
38
|
+
* - **Vision target (soft, per-case forensic):** the first outbound
|
|
39
|
+
* lands within `ACK_VISION_MS` and is short — a genuine
|
|
40
|
+
* acknowledgement, not a full-answer dump. The model self-acking
|
|
41
|
+
* quickly is what makes it *feel* human. Logged, not failed: real
|
|
42
|
+
* model runs vary, and the prompt explicitly lets a turn skip the
|
|
43
|
+
* ack when the answer itself arrives in the first couple seconds.
|
|
44
|
+
*
|
|
45
|
+
* ## Relationship to adjacent UATs
|
|
46
|
+
*
|
|
47
|
+
* - `jtbd-fast-trivial-dm.test.ts` — TRIVIAL prompts: the answer
|
|
48
|
+
* itself should land fast, no ack ceremony. This file is the
|
|
49
|
+
* non-trivial inverse: real work, but a fast *acknowledgement*.
|
|
50
|
+
* - `jtbd-soft-commit-dm.test.ts` — the predecessor: a single slow
|
|
51
|
+
* prompt, a looser "first reply within 30s" floor. This file is
|
|
52
|
+
* the stronger, fuzzed successor of that contract.
|
|
53
|
+
*
|
|
54
|
+
* Each case is a single inbound; cases run sequentially. As with the
|
|
55
|
+
* other fuzz files, a prior turn may still be finishing in the
|
|
56
|
+
* background when the next case starts — an accepted, noted risk.
|
|
57
|
+
*/
|
|
58
|
+
|
|
59
|
+
import { describe, it, expect } from "vitest";
|
|
60
|
+
import { spinUp } from "../harness.js";
|
|
61
|
+
|
|
62
|
+
const AGENT = "test-harness";
|
|
63
|
+
|
|
64
|
+
// Hard contract: a sign of life within this budget, every prompt.
|
|
65
|
+
// A tight latency target — well above a healthy self-ack (~3-8s on a
|
|
66
|
+
// warm agent) and well below the pre-#1633 silent-then-dump regression
|
|
67
|
+
// (30-60s). Model-dependent, not a framework guarantee (see header
|
|
68
|
+
// doc), so it carries generous headroom for mtcute polling jitter and
|
|
69
|
+
// for a model that leans on the ack-poke nudge instead of self-acking.
|
|
70
|
+
const ACK_HARD_MS = 20_000;
|
|
71
|
+
|
|
72
|
+
// Vision target: the model self-acknowledges in a beat, fast enough
|
|
73
|
+
// that the ack-poke nudge never has to come into it.
|
|
74
|
+
const ACK_VISION_MS = 8_000;
|
|
75
|
+
|
|
76
|
+
// A first outbound at or under this length reads as an acknowledgement
|
|
77
|
+
// one-liner rather than a full-answer dump. Mirrors the >200-char
|
|
78
|
+
// "long answer" heuristic in jtbd-soft-commit-dm, with headroom for a
|
|
79
|
+
// persona-voiced ack ("on it — pulling the os-release and hostname now").
|
|
80
|
+
const ACK_LEN_CEILING = 320;
|
|
81
|
+
|
|
82
|
+
interface AckCase {
|
|
83
|
+
name: string;
|
|
84
|
+
/** A prompt that genuinely needs more than a second or two of work,
|
|
85
|
+
* so an instant full answer is not a legitimate ack-skip. */
|
|
86
|
+
prompt: string;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
const ACK_CASES: readonly AckCase[] = [
|
|
90
|
+
// ─── Research / multi-source read ─────────────────────────────
|
|
91
|
+
{
|
|
92
|
+
name: "machine-summary research",
|
|
93
|
+
prompt:
|
|
94
|
+
"Read /etc/os-release and /etc/hostname, then tell me in one "
|
|
95
|
+
+ "sentence what kind of machine this is.",
|
|
96
|
+
},
|
|
97
|
+
// ─── Multi-step compute ───────────────────────────────────────
|
|
98
|
+
{
|
|
99
|
+
name: "compound date math",
|
|
100
|
+
prompt:
|
|
101
|
+
"Work out what day of the week it is today, then tell me how "
|
|
102
|
+
+ "many days are left until the end of this month.",
|
|
103
|
+
},
|
|
104
|
+
// ─── Open-ended advice ("take your time") ─────────────────────
|
|
105
|
+
{
|
|
106
|
+
name: "open-ended prioritisation",
|
|
107
|
+
prompt:
|
|
108
|
+
"I've got a free afternoon and three half-finished side "
|
|
109
|
+
+ "projects. Help me decide what to focus on. Take your time.",
|
|
110
|
+
},
|
|
111
|
+
// ─── Summarise / explain ──────────────────────────────────────
|
|
112
|
+
{
|
|
113
|
+
name: "plain-language summary",
|
|
114
|
+
prompt:
|
|
115
|
+
"Give me a 3-bullet summary of what a Linux container actually "
|
|
116
|
+
+ "is, in plain language.",
|
|
117
|
+
},
|
|
118
|
+
// ─── Code task ────────────────────────────────────────────────
|
|
119
|
+
{
|
|
120
|
+
name: "bash one-liner with explanation",
|
|
121
|
+
prompt:
|
|
122
|
+
"Write me a small bash one-liner that counts the total number "
|
|
123
|
+
+ "of lines across all .ts files under the current directory, "
|
|
124
|
+
+ "and explain how it works.",
|
|
125
|
+
},
|
|
126
|
+
// ─── Reflective / vague-but-real ──────────────────────────────
|
|
127
|
+
{
|
|
128
|
+
name: "reflective open ask",
|
|
129
|
+
prompt:
|
|
130
|
+
"Something feels off with how I'm spending my mornings lately. "
|
|
131
|
+
+ "Help me think through it.",
|
|
132
|
+
},
|
|
133
|
+
// ─── Comparison / judgement ───────────────────────────────────
|
|
134
|
+
{
|
|
135
|
+
name: "tech comparison",
|
|
136
|
+
prompt:
|
|
137
|
+
"Compare REST and GraphQL for a small side project — which "
|
|
138
|
+
+ "would you pick and why?",
|
|
139
|
+
},
|
|
140
|
+
// ─── Investigate the box ──────────────────────────────────────
|
|
141
|
+
{
|
|
142
|
+
name: "disk-usage investigation",
|
|
143
|
+
prompt:
|
|
144
|
+
"Have a look at what's taking up the most space under /var/log "
|
|
145
|
+
+ "and summarise what you find.",
|
|
146
|
+
},
|
|
147
|
+
];
|
|
148
|
+
|
|
149
|
+
describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
|
|
150
|
+
for (const tc of ACK_CASES) {
|
|
151
|
+
it(
|
|
152
|
+
`[ack] ${tc.name} — sign of life within ${ACK_HARD_MS / 1000}s`,
|
|
153
|
+
async () => {
|
|
154
|
+
const sc = await spinUp({ agent: AGENT });
|
|
155
|
+
try {
|
|
156
|
+
const sendStart = Date.now();
|
|
157
|
+
await sc.sendDM(tc.prompt);
|
|
158
|
+
|
|
159
|
+
const firstOutbound = await sc.expectMessage(/\S/, {
|
|
160
|
+
from: "bot",
|
|
161
|
+
timeout: ACK_HARD_MS + 6_000,
|
|
162
|
+
});
|
|
163
|
+
const ttfo = Date.now() - sendStart;
|
|
164
|
+
const len = firstOutbound.text.trim().length;
|
|
165
|
+
|
|
166
|
+
// Invariant: the outbound is a real, non-empty message.
|
|
167
|
+
expect(len).toBeGreaterThan(0);
|
|
168
|
+
|
|
169
|
+
// Hard contract: a sign of life FAST. A latency target, not
|
|
170
|
+
// a framework guarantee (see header doc) — but a failure
|
|
171
|
+
// here is a real pacing defect, so it fails the build.
|
|
172
|
+
if (ttfo >= ACK_HARD_MS) {
|
|
173
|
+
throw new Error(
|
|
174
|
+
`[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
|
|
175
|
+
+ `contract ${ACK_HARD_MS}ms — the user sat on a silent `
|
|
176
|
+
+ `chat. The fast-ack path (pacing prompt + ack-poke `
|
|
177
|
+
+ `nudge) is not delivering. First outbound: `
|
|
178
|
+
+ `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
|
|
179
|
+
);
|
|
180
|
+
}
|
|
181
|
+
expect(ttfo).toBeLessThan(ACK_HARD_MS);
|
|
182
|
+
|
|
183
|
+
// Forensic, soft: did the model self-acknowledge in a beat,
|
|
184
|
+
// or did it only get there with the ack-poke nudge?
|
|
185
|
+
const looksLikeAck = len <= ACK_LEN_CEILING;
|
|
186
|
+
if (ttfo < ACK_VISION_MS && looksLikeAck) {
|
|
187
|
+
console.log(
|
|
188
|
+
`[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
|
|
189
|
+
+ `short acknowledgement. Feels human.`,
|
|
190
|
+
);
|
|
191
|
+
} else if (ttfo < ACK_VISION_MS && !looksLikeAck) {
|
|
192
|
+
// Fast but long: the answer itself arrived quickly. The
|
|
193
|
+
// pacing prompt explicitly sanctions skipping the ack when
|
|
194
|
+
// the answer lands in the first couple of seconds.
|
|
195
|
+
console.log(
|
|
196
|
+
`[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
|
|
197
|
+
+ `full answer (legitimate ack-skip).`,
|
|
198
|
+
);
|
|
199
|
+
} else {
|
|
200
|
+
// Passed the hard contract but slower than the vision
|
|
201
|
+
// target — the canary for the model needing the ack-poke
|
|
202
|
+
// nudge instead of acknowledging promptly on its own.
|
|
203
|
+
console.warn(
|
|
204
|
+
`[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
|
|
205
|
+
+ `<${ACK_VISION_MS}ms), ${len} chars`
|
|
206
|
+
+ `${looksLikeAck ? "" : " — and long, not an ack one-liner"}`
|
|
207
|
+
+ `. The model did not acknowledge promptly on its own.`,
|
|
208
|
+
);
|
|
209
|
+
}
|
|
210
|
+
} finally {
|
|
211
|
+
await sc.tearDown();
|
|
212
|
+
}
|
|
213
|
+
},
|
|
214
|
+
ACK_HARD_MS + 45_000,
|
|
215
|
+
);
|
|
216
|
+
}
|
|
217
|
+
});
|
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* JTBD scenario —
|
|
2
|
+
* JTBD scenario — first sign of life on a slow turn.
|
|
3
3
|
*
|
|
4
|
-
* The
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* The conversational-pacing prompt instructs the agent to open with
|
|
5
|
+
* an acknowledgement before slow work. (The original ">15s soft
|
|
6
|
+
* commit" bullet this file was named for was superseded by the
|
|
7
|
+
* guaranteed "Open with an acknowledgement" bullet in PR #1633 —
|
|
8
|
+
* acknowledge every turn unless the answer lands in a second or two.)
|
|
9
9
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
10
|
+
* This UAT exercises a single slow prompt and asserts the loose
|
|
11
|
+
* floor: the user does NOT see a long silent gap before the first
|
|
12
|
+
* sign of life — a reply lands within 30s.
|
|
13
|
+
*
|
|
14
|
+
* The stronger, fuzzed successor of this contract is
|
|
15
|
+
* `jtbd-fast-ack-dm.test.ts` — varied prompt shapes, a tight 20s
|
|
16
|
+
* hard latency target (model-dependent, not a framework guarantee —
|
|
17
|
+
* see that file's header). This file is retained as a minimal
|
|
18
|
+
* single-prompt floor.
|
|
14
19
|
*/
|
|
15
20
|
|
|
16
21
|
import { describe, it, expect } from "vitest";
|
|
@@ -26,7 +31,7 @@ const SLOW_PROMPT = (
|
|
|
26
31
|
|
|
27
32
|
describe("uat: soft-commit pacing", () => {
|
|
28
33
|
it(
|
|
29
|
-
"user asks slow question → first reply lands within
|
|
34
|
+
"user asks slow question → first reply lands within 30s",
|
|
30
35
|
async () => {
|
|
31
36
|
const sc = await spinUp({ agent: "test-harness" });
|
|
32
37
|
try {
|