switchroom 0.13.4 → 0.13.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ /**
2
+ * JTBD scenario — guaranteed fast acknowledgement (human-feel UX epic).
3
+ *
4
+ * Serves: `reference/conversational-pacing.md` and the JTBD
5
+ * "talking to my agent feels like talking to a capable person".
6
+ *
7
+ * A person you message answers in a beat — "got it", "on it, checking
8
+ * now" — before the work is done. PR #1633 made that opening
9
+ * acknowledgement a *guarantee*, split across two layers:
10
+ *
11
+ * - the conversational-pacing prompt teaches the model to open with
12
+ * a short human one-liner unless the real answer lands in a second
13
+ * or two;
14
+ * - the silence-poke subsystem *enforces* it — a ~10s ack-budget
15
+ * poke fires when nothing at all has been sent this turn, nudging
16
+ * the model to acknowledge before it does more work.
17
+ *
18
+ * This UAT drives a FUZZY set of non-trivial prompt shapes — research,
19
+ * multi-step compute, open-ended advice, code, reflective asks. Every
20
+ * one needs real work, so a turn that goes silent for tens of seconds
21
+ * is a black box. The invariant under test: the user sees a sign of
22
+ * life FAST, every time, across every prompt shape.
23
+ *
24
+ * ## Targets
25
+ *
26
+ * - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
27
+ * for every prompt. This is a tight *latency target*, not a
28
+ * framework guarantee. The silence-poke ack rung is a *nudge*
29
+ * piggybacked on the model's next tool result (`consumeArmedPoke`
30
+ * drained at the gateway tool-result chokepoint) — not a
31
+ * framework-composed send. It helps the model along, but a
32
+ * pure-reasoning prompt that issues no tool call never drains the
33
+ * nudge, so the bound ultimately depends on model latency. It
34
+ * still has teeth: pre-#1633 a slow prompt's first outbound was
35
+ * the full answer, often 30-60s out, so 20s cleanly separates the
36
+ * fixed behaviour from a regression. A failure here means the
37
+ * agent left the user on a silent chat — a real pacing defect.
38
+ * - **Vision target (soft, per-case forensic):** the first outbound
39
+ * lands within `ACK_VISION_MS` and is short — a genuine
40
+ * acknowledgement, not a full-answer dump. The model self-acking
41
+ * quickly is what makes it *feel* human. Logged, not failed: real
42
+ * model runs vary, and the prompt explicitly lets a turn skip the
43
+ * ack when the answer itself arrives in the first couple seconds.
44
+ *
45
+ * ## Relationship to adjacent UATs
46
+ *
47
+ * - `jtbd-fast-trivial-dm.test.ts` — TRIVIAL prompts: the answer
48
+ * itself should land fast, no ack ceremony. This file is the
49
+ * non-trivial inverse: real work, but a fast *acknowledgement*.
50
+ * - `jtbd-soft-commit-dm.test.ts` — the predecessor: a single slow
51
+ * prompt, a looser "first reply within 30s" floor. This file is
52
+ * the stronger, fuzzed successor of that contract.
53
+ *
54
+ * Each case is a single inbound; cases run sequentially. As with the
55
+ * other fuzz files, a prior turn may still be finishing in the
56
+ * background when the next case starts — an accepted, noted risk.
57
+ */
58
+
59
+ import { describe, it, expect } from "vitest";
60
+ import { spinUp } from "../harness.js";
61
+
62
+ const AGENT = "test-harness";
63
+
64
+ // Hard contract: a sign of life within this budget, every prompt.
65
+ // A tight latency target — well above a healthy self-ack (~3-8s on a
66
+ // warm agent) and well below the pre-#1633 silent-then-dump regression
67
+ // (30-60s). Model-dependent, not a framework guarantee (see header
68
+ // doc), so it carries generous headroom for mtcute polling jitter and
69
+ // for a model that leans on the ack-poke nudge instead of self-acking.
70
+ const ACK_HARD_MS = 20_000;
71
+
72
+ // Vision target: the model self-acknowledges in a beat, fast enough
73
+ // that the ack-poke nudge never has to come into it.
74
+ const ACK_VISION_MS = 8_000;
75
+
76
+ // A first outbound at or under this length reads as an acknowledgement
77
+ // one-liner rather than a full-answer dump. Mirrors the >200-char
78
+ // "long answer" heuristic in jtbd-soft-commit-dm, with headroom for a
79
+ // persona-voiced ack ("on it — pulling the os-release and hostname now").
80
+ const ACK_LEN_CEILING = 320;
81
+
82
+ interface AckCase {
83
+ name: string;
84
+ /** A prompt that genuinely needs more than a second or two of work,
85
+ * so an instant full answer is not a legitimate ack-skip. */
86
+ prompt: string;
87
+ }
88
+
89
+ const ACK_CASES: readonly AckCase[] = [
90
+ // ─── Research / multi-source read ─────────────────────────────
91
+ {
92
+ name: "machine-summary research",
93
+ prompt:
94
+ "Read /etc/os-release and /etc/hostname, then tell me in one "
95
+ + "sentence what kind of machine this is.",
96
+ },
97
+ // ─── Multi-step compute ───────────────────────────────────────
98
+ {
99
+ name: "compound date math",
100
+ prompt:
101
+ "Work out what day of the week it is today, then tell me how "
102
+ + "many days are left until the end of this month.",
103
+ },
104
+ // ─── Open-ended advice ("take your time") ─────────────────────
105
+ {
106
+ name: "open-ended prioritisation",
107
+ prompt:
108
+ "I've got a free afternoon and three half-finished side "
109
+ + "projects. Help me decide what to focus on. Take your time.",
110
+ },
111
+ // ─── Summarise / explain ──────────────────────────────────────
112
+ {
113
+ name: "plain-language summary",
114
+ prompt:
115
+ "Give me a 3-bullet summary of what a Linux container actually "
116
+ + "is, in plain language.",
117
+ },
118
+ // ─── Code task ────────────────────────────────────────────────
119
+ {
120
+ name: "bash one-liner with explanation",
121
+ prompt:
122
+ "Write me a small bash one-liner that counts the total number "
123
+ + "of lines across all .ts files under the current directory, "
124
+ + "and explain how it works.",
125
+ },
126
+ // ─── Reflective / vague-but-real ──────────────────────────────
127
+ {
128
+ name: "reflective open ask",
129
+ prompt:
130
+ "Something feels off with how I'm spending my mornings lately. "
131
+ + "Help me think through it.",
132
+ },
133
+ // ─── Comparison / judgement ───────────────────────────────────
134
+ {
135
+ name: "tech comparison",
136
+ prompt:
137
+ "Compare REST and GraphQL for a small side project — which "
138
+ + "would you pick and why?",
139
+ },
140
+ // ─── Investigate the box ──────────────────────────────────────
141
+ {
142
+ name: "disk-usage investigation",
143
+ prompt:
144
+ "Have a look at what's taking up the most space under /var/log "
145
+ + "and summarise what you find.",
146
+ },
147
+ ];
148
+
149
+ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
150
+ for (const tc of ACK_CASES) {
151
+ it(
152
+ `[ack] ${tc.name} — sign of life within ${ACK_HARD_MS / 1000}s`,
153
+ async () => {
154
+ const sc = await spinUp({ agent: AGENT });
155
+ try {
156
+ const sendStart = Date.now();
157
+ await sc.sendDM(tc.prompt);
158
+
159
+ const firstOutbound = await sc.expectMessage(/\S/, {
160
+ from: "bot",
161
+ timeout: ACK_HARD_MS + 6_000,
162
+ });
163
+ const ttfo = Date.now() - sendStart;
164
+ const len = firstOutbound.text.trim().length;
165
+
166
+ // Invariant: the outbound is a real, non-empty message.
167
+ expect(len).toBeGreaterThan(0);
168
+
169
+ // Hard contract: a sign of life FAST. A latency target, not
170
+ // a framework guarantee (see header doc) — but a failure
171
+ // here is a real pacing defect, so it fails the build.
172
+ if (ttfo >= ACK_HARD_MS) {
173
+ throw new Error(
174
+ `[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
175
+ + `contract ${ACK_HARD_MS}ms — the user sat on a silent `
176
+ + `chat. The fast-ack path (pacing prompt + ack-poke `
177
+ + `nudge) is not delivering. First outbound: `
178
+ + `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
179
+ );
180
+ }
181
+ expect(ttfo).toBeLessThan(ACK_HARD_MS);
182
+
183
+ // Forensic, soft: did the model self-acknowledge in a beat,
184
+ // or did it only get there with the ack-poke nudge?
185
+ const looksLikeAck = len <= ACK_LEN_CEILING;
186
+ if (ttfo < ACK_VISION_MS && looksLikeAck) {
187
+ console.log(
188
+ `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
189
+ + `short acknowledgement. Feels human.`,
190
+ );
191
+ } else if (ttfo < ACK_VISION_MS && !looksLikeAck) {
192
+ // Fast but long: the answer itself arrived quickly. The
193
+ // pacing prompt explicitly sanctions skipping the ack when
194
+ // the answer lands in the first couple of seconds.
195
+ console.log(
196
+ `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
197
+ + `full answer (legitimate ack-skip).`,
198
+ );
199
+ } else {
200
+ // Passed the hard contract but slower than the vision
201
+ // target — the canary for the model needing the ack-poke
202
+ // nudge instead of acknowledging promptly on its own.
203
+ console.warn(
204
+ `[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
205
+ + `<${ACK_VISION_MS}ms), ${len} chars`
206
+ + `${looksLikeAck ? "" : " — and long, not an ack one-liner"}`
207
+ + `. The model did not acknowledge promptly on its own.`,
208
+ );
209
+ }
210
+ } finally {
211
+ await sc.tearDown();
212
+ }
213
+ },
214
+ ACK_HARD_MS + 45_000,
215
+ );
216
+ }
217
+ });
@@ -1,16 +1,21 @@
1
1
  /**
2
- * JTBD scenario — soft-commit for slow turns.
2
+ * JTBD scenario — first sign of life on a slow turn.
3
3
  *
4
- * The new conversational-pacing prompt (#1122) instructs the agent
5
- * to send a one-liner "let me check, back in a few" before slow
6
- * work. This UAT exercises that behaviour: send a prompt that
7
- * obviously needs >15s, expect the FIRST outbound to be a short
8
- * soft-commit message, with the final answer landing later.
4
+ * The conversational-pacing prompt instructs the agent to open with
5
+ * an acknowledgement before slow work. (The original ">15s soft
6
+ * commit" bullet this file was named for was superseded by the
7
+ * guaranteed "Open with an acknowledgement" bullet in PR #1633 —
8
+ * acknowledge every turn unless the answer lands in a second or two.)
9
9
  *
10
- * Not strict — the agent's allowed to skip the soft-commit if it
11
- * judges the work is fast enough. The assertion is "the user does
12
- * NOT see a long silent gap before the first sign of life": either
13
- * a soft-commit OR the actual reply lands within 20s.
10
+ * This UAT exercises a single slow prompt and asserts the loose
11
+ * floor: the user does NOT see a long silent gap before the first
12
+ * sign of life — a reply lands within 30s.
13
+ *
14
+ * The stronger, fuzzed successor of this contract is
15
+ * `jtbd-fast-ack-dm.test.ts` — varied prompt shapes, a tight 20s
16
+ * hard latency target (a tight target, not a framework guarantee —
17
+ * see that file's header). This file is retained as a minimal
18
+ * single-prompt floor.
14
19
  */
15
20
 
16
21
  import { describe, it, expect } from "vitest";
@@ -26,7 +31,7 @@ const SLOW_PROMPT = (
26
31
 
27
32
  describe("uat: soft-commit pacing", () => {
28
33
  it(
29
- "user asks slow question → first reply lands within 20s",
34
+ "user asks slow question → first reply lands within 30s",
30
35
  async () => {
31
36
  const sc = await spinUp({ agent: "test-harness" });
32
37
  try {