switchroom 0.14.10 → 0.14.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,14 +6,18 @@
6
6
  *
7
7
  * A person you message answers in a beat — "got it", "on it, checking
8
8
  * now" — before the work is done. PR #1633 made that opening
9
- * acknowledgement a *guarantee*, split across two layers:
9
+ * acknowledgement a *guarantee*; the enforcement has since moved off
10
+ * the silence-poke subsystem entirely:
10
11
  *
11
12
  * - the conversational-pacing prompt teaches the model to open with
12
13
  * a short human one-liner unless the real answer lands in a second
13
14
  * or two;
14
- * - the silence-poke subsystem *enforces* it — a ~10s ack-budget
15
- * poke fires when nothing at all has been sent this turn, nudging
16
- * the model to acknowledge before it does more work.
15
+ * - the live-updating reply/draft carries the acknowledgement beat
16
+ * natively — the user watches the message begin to compose itself,
17
+ * which IS the sign of life. The old ~10s ack-budget poke (a
18
+ * model-targeted nudge) was retired along with the rest of the
19
+ * nudge ladder; only the 300s framework fallback remains, and that
20
+ * is a wedge-breaker, not an ack mechanism.
17
21
  *
18
22
  * This UAT drives a FUZZY set of non-trivial prompt shapes — research,
19
23
  * multi-step compute, open-ended advice, code, reflective asks. Every
@@ -25,15 +29,11 @@
25
29
  *
26
30
  * - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
27
31
  * for every prompt. This is a tight *latency target*, not a
28
- * framework guarantee. The silence-poke ack rung is a *nudge*
29
- * piggybacked on the model's next tool result (`consumeArmedPoke`
30
- * drained at the gateway tool-result chokepoint) — not a
31
- * framework-composed send. It helps the model along, but a
32
- * pure-reasoning prompt that issues no tool call never drains the
33
- * nudge, so the bound ultimately depends on model latency. It
34
- * still has teeth: pre-#1633 a slow prompt's first outbound was
35
- * the full answer, often 30-60s out, so 20s cleanly separates the
36
- * fixed behaviour from a regression. A failure here means the
32
+ * framework guarantee — the bound ultimately depends on model
33
+ * latency and on the pacing prompt + draft transport doing their
34
+ * job. It still has teeth: pre-#1633 a slow prompt's first outbound
35
+ * was the full answer, often 30-60s out, so 20s cleanly separates
36
+ * the fixed behaviour from a regression. A failure here means the
37
37
  * agent left the user on a silent chat — a real pacing defect.
38
38
  * - **Vision target (soft, per-case forensic):** the first outbound
39
39
  * lands within `ACK_VISION_MS` and is short — a genuine
@@ -65,12 +65,11 @@ const AGENT = "test-harness";
65
65
  // A tight latency target — well above a healthy self-ack (~3-8s on a
66
66
  // warm agent) and well below the pre-#1633 silent-then-dump regression
67
67
  // (30-60s). Model-dependent, not a framework guarantee (see header
68
- // doc), so it carries generous headroom for mtcute polling jitter and
69
- // for a model that leans on the ack-poke nudge instead of self-acking.
68
+ // doc), so it carries generous headroom for mtcute polling jitter.
70
69
  const ACK_HARD_MS = 20_000;
71
70
 
72
- // Vision target: the model self-acknowledges in a beat, fast enough
73
- // that the ack-poke nudge never has to come into it.
71
+ // Vision target: the model self-acknowledges in a beat — the draft
72
+ // begins composing fast enough that the user never feels a gap.
74
73
  const ACK_VISION_MS = 8_000;
75
74
 
76
75
  // A first outbound at or under this length reads as an acknowledgement
@@ -173,15 +172,14 @@ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
173
172
  throw new Error(
174
173
  `[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
175
174
  + `contract ${ACK_HARD_MS}ms — the user sat on a silent `
176
- + `chat. The fast-ack path (pacing prompt + ack-poke `
177
- + `nudge) is not delivering. First outbound: `
175
+ + `chat. The fast-ack path (pacing prompt + live draft) `
176
+ + `is not delivering. First outbound: `
178
177
  + `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
179
178
  );
180
179
  }
181
180
  expect(ttfo).toBeLessThan(ACK_HARD_MS);
182
181
 
183
- // Forensic, soft: did the model self-acknowledge in a beat,
184
- // or did it only get there with the ack-poke nudge?
182
+ // Forensic, soft: did the model self-acknowledge in a beat?
185
183
  const looksLikeAck = len <= ACK_LEN_CEILING;
186
184
  if (ttfo < ACK_VISION_MS && looksLikeAck) {
187
185
  console.log(
@@ -198,8 +196,8 @@ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
198
196
  );
199
197
  } else {
200
198
  // Passed the hard contract but slower than the vision
201
- // target — the canary for the model needing the ack-poke
202
- // nudge instead of acknowledging promptly on its own.
199
+ // target — the canary for the model not acknowledging
200
+ // promptly on its own (draft slow to start composing).
203
201
  console.warn(
204
202
  `[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
205
203
  + `<${ACK_VISION_MS}ms), ${len} chars`
@@ -1,155 +0,0 @@
1
- /**
2
- * Silence-poke soft-fire end-to-end scenario.
3
- *
4
- * Goal context: cause class CC-3 in `docs/status-ask-cause-classes.md`
5
- * — the L3 safety net. Unit tests (`silence-poke.test.ts`) cover the
6
- * state machine: tick semantics, ladder thresholds, success measurement.
7
- * They DO NOT cover the wire path between `consumeArmedPoke()` (in
8
- * `silence-poke.ts`) and the model actually receiving the
9
- * `[silence-poke]` system-reminder block on its next tool result.
10
- *
11
- * The wire path lives at `gateway.ts:2740`:
12
- *
13
- * onToolCall → executeToolCall(...) → consumeArmedPoke() →
14
- * append `<system-reminder>[silence-poke] ...</system-reminder>`
15
- * to the tool-result text.
16
- *
17
- * If that integration ever breaks — a refactor swaps `executeToolCall`
18
- * for a path that doesn't call `consumeArmedPoke`, the result-content
19
- * shape mutation gets dropped, MCP framing changes — the unit tests
20
- * still pass but the model never sees the nudge, the user goes silent
21
- * past 75s, and `inbound_status_query` ticks. This UAT closes that
22
- * regression window end-to-end.
23
- *
24
- * ## Strategy
25
- *
26
- * Force the agent into a stretch of silent tool churn that exceeds the
27
- * 75s soft threshold without the model emitting any outbound `reply`.
28
- * The conversational-pacing prompt instructs the model to soft-commit
29
- * fast turns, so we have to explicitly suppress that:
30
- *
31
- * - Prompt instructs three sequential 30s `sleep` Bash calls, NO
32
- * mid-turn replies, single final reply when done.
33
- * - Total silent stretch is ~90s + tool overhead, comfortably past
34
- * the 75s soft threshold.
35
- * - If the silence-poke wire works: the model sees the
36
- * `[silence-poke]` system-reminder appended to the result of the
37
- * first or second sleep, breaks the no-reply rule, sends a brief
38
- * update. We observe a reply in the [70s, 200s] window.
39
- * - If the wire is broken: model never receives the nudge, no
40
- * reply until the third sleep ends at ~90s+, OR the framework
41
- * fallback at 300s fires. We catch the latter as a separate
42
- * failure (the framework fallback is the FLOOR, not the goal).
43
- *
44
- * ## Tolerances
45
- *
46
- * Real-Telegram UAT against a real Claude model has variability:
47
- *
48
- * - Model may insert one soft-commit "on it" reply at start; that
49
- * resets the silence clock. Three 30s sleeps still pushes the
50
- * post-commit silence past 75s as long as the commit lands
51
- * within the first ~10s. We tolerate this.
52
- * - Model may decline to follow the "no replies" instruction and
53
- * send updates organically; if the FIRST reply still lands in
54
- * [70s, 200s], the conversational pacing layer is doing its job
55
- * and the test passes regardless of whether silence-poke
56
- * specifically fired.
57
- * - Window is generous (70-200s) to absorb 5s poll interval,
58
- * mtcute receive lag, Telegram delivery jitter.
59
- *
60
- * ## Failure shapes the assertion catches
61
- *
62
- * 1. Wire path broken — first reply lands >200s after sendDM
63
- * because the framework fallback (300s) is the only thing that
64
- * eventually breaks the silence.
65
- * 2. Soft poke armed but not drained — first reply lands at >200s
66
- * similarly.
67
- * 3. Model misbehavior — first reply is the FINAL answer (long
68
- * text after all three sleeps complete at ~90s+); strictly that
69
- * passes the window check, but the test also asserts the first
70
- * reply is brief (<400 chars) as a sanity floor on "this is
71
- * actually a poke response, not the final answer." Skip strict
72
- * length if the prompt happens to be so simple the final
73
- * answer IS brief.
74
- *
75
- * Requires the same env as `smoke-dm-reply.test.ts` (see
76
- * `uat/SETUP.md` §6). Long-running: outer budget 4 min.
77
- */
78
-
79
- import { describe, expect, it } from "vitest";
80
- import { spinUp } from "../harness.js";
81
-
82
- const SOFT_WINDOW_MIN_MS = 70_000;
83
- const SOFT_WINDOW_MAX_MS = 200_000;
84
-
85
- // Explicit instruction shape. Mirrors the `BG_DISPATCH_PROMPT` pattern
86
- // in `bg-sub-agent-dispatch-dm.test.ts` — pin the tool + the sequence
87
- // so behaviour is deterministic enough to test the *infra*, not the
88
- // model's free-form judgement.
89
- const SILENT_CHURN_PROMPT =
90
- "I need you to test something. Run THREE separate Bash tool calls " +
91
- "in sequence: first `sleep 30`, then `sleep 30`, then `sleep 30`. " +
92
- "Critical: do NOT send any `reply` or `stream_reply` between or " +
93
- "during the sleeps — no soft commit, no progress updates, no " +
94
- "narration. Just the three Bash calls back-to-back. Once all three " +
95
- "complete, send ONE brief final reply saying 'done' so I know " +
96
- "you're back.";
97
-
98
- describe("uat: silence-poke soft fires + reaches the model wire", () => {
99
- it(
100
- "agent breaks self-imposed silence in [70s, 200s] window via silence-poke",
101
- async () => {
102
- const sc = await spinUp({ agent: "test-harness" });
103
- try {
104
- const sendStart = Date.now();
105
- await sc.sendDM(SILENT_CHURN_PROMPT);
106
-
107
- // Wait for the FIRST reply. If silence-poke + the wire path
108
- // are working, this lands between ~75s and ~110s as the
109
- // model responds to the [silence-poke] system-reminder
110
- // appended to the first or second sleep's tool result.
111
- const firstReply = await sc.expectMessage(/\S/, {
112
- from: "bot",
113
- timeout: SOFT_WINDOW_MAX_MS + 20_000,
114
- });
115
- const elapsed = Date.now() - sendStart;
116
-
117
- expect(firstReply.text.length).toBeGreaterThan(0);
118
-
119
- // Primary window assertion.
120
- expect(
121
- elapsed,
122
- `first bot reply lands at ${elapsed}ms (target window ` +
123
- `[${SOFT_WINDOW_MIN_MS}, ${SOFT_WINDOW_MAX_MS}]). ` +
124
- `Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
125
- ).toBeGreaterThanOrEqual(SOFT_WINDOW_MIN_MS);
126
- expect(
127
- elapsed,
128
- `first bot reply lands at ${elapsed}ms — above ${SOFT_WINDOW_MAX_MS}ms ` +
129
- `ceiling. Either silence-poke wire is broken (poke armed but ` +
130
- `not drained at gateway.ts:onToolCall) or the framework ` +
131
- `fallback at 300s was the first thing to break silence. ` +
132
- `Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
133
- ).toBeLessThanOrEqual(SOFT_WINDOW_MAX_MS);
134
-
135
- // Sanity floor: the first reply should be brief — proves it's
136
- // a poke-driven update, not the final "done" answer after all
137
- // three sleeps finished naturally. ~400 char ceiling allows a
138
- // verbose model to add a sentence of context. Bump this if it
139
- // flakes on perfectly valid short answers.
140
- if (firstReply.text.length > 400) {
141
- console.warn(
142
- `[silence-poke] first reply at ${elapsed}ms is ${firstReply.text.length} ` +
143
- `chars — longer than expected for a poke-driven update. The ` +
144
- `window assertion still passed, but consider whether the model ` +
145
- `bypassed the silence stretch (e.g. ran the sleeps in one ` +
146
- `Bash call, dodging the per-call result poke chokepoint).`,
147
- );
148
- }
149
- } finally {
150
- await sc.tearDown();
151
- }
152
- },
153
- 240_000,
154
- );
155
- });