switchroom 0.14.10 → 0.14.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +81 -80
- package/dist/auth-broker/index.js +81 -80
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/notion-write-pretool.mjs +83 -82
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +958 -912
- package/dist/host-control/main.js +149 -148
- package/dist/vault/approvals/kernel-server.js +83 -82
- package/dist/vault/broker/server.js +84 -83
- package/package.json +1 -1
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +195 -356
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/gateway/boot-card.ts +15 -11
- package/telegram-plugin/gateway/gateway.ts +9 -67
- package/telegram-plugin/runtime-metrics.ts +8 -52
- package/telegram-plugin/silence-poke.ts +39 -312
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +27 -30
- package/telegram-plugin/tests/silence-poke.test.ts +54 -569
- package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +21 -23
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +0 -155
|
@@ -6,14 +6,18 @@
|
|
|
6
6
|
*
|
|
7
7
|
* A person you message answers in a beat — "got it", "on it, checking
|
|
8
8
|
* now" — before the work is done. PR #1633 made that opening
|
|
9
|
-
* acknowledgement a *guarantee
|
|
9
|
+
* acknowledgement a *guarantee*; the enforcement has since moved off
|
|
10
|
+
* the silence-poke subsystem entirely:
|
|
10
11
|
*
|
|
11
12
|
* - the conversational-pacing prompt teaches the model to open with
|
|
12
13
|
* a short human one-liner unless the real answer lands in a second
|
|
13
14
|
* or two;
|
|
14
|
-
* - the
|
|
15
|
-
*
|
|
16
|
-
* the
|
|
15
|
+
* - the live-updating reply/draft carries the acknowledgement beat
|
|
16
|
+
* natively — the user watches the message begin to compose itself,
|
|
17
|
+
* which IS the sign of life. The old ~10s ack-budget poke (a
|
|
18
|
+
* model-targeted nudge) was retired along with the rest of the
|
|
19
|
+
* nudge ladder; only the 300s framework fallback remains, and that
|
|
20
|
+
* is a wedge-breaker, not an ack mechanism.
|
|
17
21
|
*
|
|
18
22
|
* This UAT drives a FUZZY set of non-trivial prompt shapes — research,
|
|
19
23
|
* multi-step compute, open-ended advice, code, reflective asks. Every
|
|
@@ -25,15 +29,11 @@
|
|
|
25
29
|
*
|
|
26
30
|
* - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
|
|
27
31
|
* for every prompt. This is a tight *latency target*, not a
|
|
28
|
-
* framework guarantee
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
* nudge, so the bound ultimately depends on model latency. It
|
|
34
|
-
* still has teeth: pre-#1633 a slow prompt's first outbound was
|
|
35
|
-
* the full answer, often 30-60s out, so 20s cleanly separates the
|
|
36
|
-
* fixed behaviour from a regression. A failure here means the
|
|
32
|
+
* framework guarantee — the bound ultimately depends on model
|
|
33
|
+
* latency and on the pacing prompt + draft transport doing their
|
|
34
|
+
* job. It still has teeth: pre-#1633 a slow prompt's first outbound
|
|
35
|
+
* was the full answer, often 30-60s out, so 20s cleanly separates
|
|
36
|
+
* the fixed behaviour from a regression. A failure here means the
|
|
37
37
|
* agent left the user on a silent chat — a real pacing defect.
|
|
38
38
|
* - **Vision target (soft, per-case forensic):** the first outbound
|
|
39
39
|
* lands within `ACK_VISION_MS` and is short — a genuine
|
|
@@ -65,12 +65,11 @@ const AGENT = "test-harness";
|
|
|
65
65
|
// A tight latency target — well above a healthy self-ack (~3-8s on a
|
|
66
66
|
// warm agent) and well below the pre-#1633 silent-then-dump regression
|
|
67
67
|
// (30-60s). Model-dependent, not a framework guarantee (see header
|
|
68
|
-
// doc), so it carries generous headroom for mtcute polling jitter and
|
|
69
|
-
// for a model that leans on the ack-poke nudge instead of self-acking.
|
|
68
|
+
// doc), so it carries generous headroom for mtcute polling jitter.
|
|
70
69
|
const ACK_HARD_MS = 20_000;
|
|
71
70
|
|
|
72
|
-
// Vision target: the model self-acknowledges in a beat
|
|
73
|
-
// that the
|
|
71
|
+
// Vision target: the model self-acknowledges in a beat — the draft
|
|
72
|
+
// begins composing fast enough that the user never feels a gap.
|
|
74
73
|
const ACK_VISION_MS = 8_000;
|
|
75
74
|
|
|
76
75
|
// A first outbound at or under this length reads as an acknowledgement
|
|
@@ -173,15 +172,14 @@ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
|
|
|
173
172
|
throw new Error(
|
|
174
173
|
`[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
|
|
175
174
|
+ `contract ${ACK_HARD_MS}ms — the user sat on a silent `
|
|
176
|
-
+ `chat. The fast-ack path (pacing prompt +
|
|
177
|
-
+ `
|
|
175
|
+
+ `chat. The fast-ack path (pacing prompt + live draft) `
|
|
176
|
+
+ `is not delivering. First outbound: `
|
|
178
177
|
+ `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
|
|
179
178
|
);
|
|
180
179
|
}
|
|
181
180
|
expect(ttfo).toBeLessThan(ACK_HARD_MS);
|
|
182
181
|
|
|
183
|
-
// Forensic, soft: did the model self-acknowledge in a beat
|
|
184
|
-
// or did it only get there with the ack-poke nudge?
|
|
182
|
+
// Forensic, soft: did the model self-acknowledge in a beat?
|
|
185
183
|
const looksLikeAck = len <= ACK_LEN_CEILING;
|
|
186
184
|
if (ttfo < ACK_VISION_MS && looksLikeAck) {
|
|
187
185
|
console.log(
|
|
@@ -198,8 +196,8 @@ describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
|
|
|
198
196
|
);
|
|
199
197
|
} else {
|
|
200
198
|
// Passed the hard contract but slower than the vision
|
|
201
|
-
// target — the canary for the model
|
|
202
|
-
//
|
|
199
|
+
// target — the canary for the model not acknowledging
|
|
200
|
+
// promptly on its own (draft slow to start composing).
|
|
203
201
|
console.warn(
|
|
204
202
|
`[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
|
|
205
203
|
+ `<${ACK_VISION_MS}ms), ${len} chars`
|
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Silence-poke soft-fire end-to-end scenario.
|
|
3
|
-
*
|
|
4
|
-
* Goal context: cause class CC-3 in `docs/status-ask-cause-classes.md`
|
|
5
|
-
* — the L3 safety net. Unit tests (`silence-poke.test.ts`) cover the
|
|
6
|
-
* state machine: tick semantics, ladder thresholds, success measurement.
|
|
7
|
-
* They DO NOT cover the wire path between `consumeArmedPoke()` (in
|
|
8
|
-
* `silence-poke.ts`) and the model actually receiving the
|
|
9
|
-
* `[silence-poke]` system-reminder block on its next tool result.
|
|
10
|
-
*
|
|
11
|
-
* The wire path lives at `gateway.ts:2740`:
|
|
12
|
-
*
|
|
13
|
-
* onToolCall → executeToolCall(...) → consumeArmedPoke() →
|
|
14
|
-
* append `<system-reminder>[silence-poke] ...</system-reminder>`
|
|
15
|
-
* to the tool-result text.
|
|
16
|
-
*
|
|
17
|
-
* If that integration ever breaks — a refactor swaps `executeToolCall`
|
|
18
|
-
* for a path that doesn't call `consumeArmedPoke`, the result-content
|
|
19
|
-
* shape mutation gets dropped, MCP framing changes — the unit tests
|
|
20
|
-
* still pass but the model never sees the nudge, the user goes silent
|
|
21
|
-
* past 75s, and `inbound_status_query` ticks. This UAT closes that
|
|
22
|
-
* regression window end-to-end.
|
|
23
|
-
*
|
|
24
|
-
* ## Strategy
|
|
25
|
-
*
|
|
26
|
-
* Force the agent into a stretch of silent tool churn that exceeds the
|
|
27
|
-
* 75s soft threshold without the model emitting any outbound `reply`.
|
|
28
|
-
* The conversational-pacing prompt instructs the model to soft-commit
|
|
29
|
-
* fast turns, so we have to explicitly suppress that:
|
|
30
|
-
*
|
|
31
|
-
* - Prompt instructs three sequential 30s `sleep` Bash calls, NO
|
|
32
|
-
* mid-turn replies, single final reply when done.
|
|
33
|
-
* - Total silent stretch is ~90s + tool overhead, comfortably past
|
|
34
|
-
* the 75s soft threshold.
|
|
35
|
-
* - If the silence-poke wire works: the model sees the
|
|
36
|
-
* `[silence-poke]` system-reminder appended to the result of the
|
|
37
|
-
* first or second sleep, breaks the no-reply rule, sends a brief
|
|
38
|
-
* update. We observe a reply in the [70s, 200s] window.
|
|
39
|
-
* - If the wire is broken: model never receives the nudge, no
|
|
40
|
-
* reply until the third sleep ends at ~90s+, OR the framework
|
|
41
|
-
* fallback at 300s fires. We catch the latter as a separate
|
|
42
|
-
* failure (the framework fallback is the FLOOR, not the goal).
|
|
43
|
-
*
|
|
44
|
-
* ## Tolerances
|
|
45
|
-
*
|
|
46
|
-
* Real-Telegram UAT against a real Claude model has variability:
|
|
47
|
-
*
|
|
48
|
-
* - Model may insert one soft-commit "on it" reply at start; that
|
|
49
|
-
* resets the silence clock. Three 30s sleeps still pushes the
|
|
50
|
-
* post-commit silence past 75s as long as the commit lands
|
|
51
|
-
* within the first ~10s. We tolerate this.
|
|
52
|
-
* - Model may decline to follow the "no replies" instruction and
|
|
53
|
-
* send updates organically; if the FIRST reply still lands in
|
|
54
|
-
* [70s, 200s], the conversational pacing layer is doing its job
|
|
55
|
-
* and the test passes regardless of whether silence-poke
|
|
56
|
-
* specifically fired.
|
|
57
|
-
* - Window is generous (70-200s) to absorb 5s poll interval,
|
|
58
|
-
* mtcute receive lag, Telegram delivery jitter.
|
|
59
|
-
*
|
|
60
|
-
* ## Failure shapes the assertion catches
|
|
61
|
-
*
|
|
62
|
-
* 1. Wire path broken — first reply lands >200s after sendDM
|
|
63
|
-
* because the framework fallback (300s) is the only thing that
|
|
64
|
-
* eventually breaks the silence.
|
|
65
|
-
* 2. Soft poke armed but not drained — first reply lands at >200s
|
|
66
|
-
* similarly.
|
|
67
|
-
* 3. Model misbehavior — first reply is the FINAL answer (long
|
|
68
|
-
* text after all three sleeps complete at ~90s+); strictly that
|
|
69
|
-
* passes the window check, but the test also asserts the first
|
|
70
|
-
* reply is brief (<400 chars) as a sanity floor on "this is
|
|
71
|
-
* actually a poke response, not the final answer." Skip strict
|
|
72
|
-
* length if the prompt happens to be so simple the final
|
|
73
|
-
* answer IS brief.
|
|
74
|
-
*
|
|
75
|
-
* Requires the same env as `smoke-dm-reply.test.ts` (see
|
|
76
|
-
* `uat/SETUP.md` §6). Long-running: outer budget 4 min.
|
|
77
|
-
*/
|
|
78
|
-
|
|
79
|
-
import { describe, expect, it } from "vitest";
|
|
80
|
-
import { spinUp } from "../harness.js";
|
|
81
|
-
|
|
82
|
-
const SOFT_WINDOW_MIN_MS = 70_000;
|
|
83
|
-
const SOFT_WINDOW_MAX_MS = 200_000;
|
|
84
|
-
|
|
85
|
-
// Explicit instruction shape. Mirrors the `BG_DISPATCH_PROMPT` pattern
|
|
86
|
-
// in `bg-sub-agent-dispatch-dm.test.ts` — pin the tool + the sequence
|
|
87
|
-
// so behaviour is deterministic enough to test the *infra*, not the
|
|
88
|
-
// model's free-form judgement.
|
|
89
|
-
const SILENT_CHURN_PROMPT =
|
|
90
|
-
"I need you to test something. Run THREE separate Bash tool calls " +
|
|
91
|
-
"in sequence: first `sleep 30`, then `sleep 30`, then `sleep 30`. " +
|
|
92
|
-
"Critical: do NOT send any `reply` or `stream_reply` between or " +
|
|
93
|
-
"during the sleeps — no soft commit, no progress updates, no " +
|
|
94
|
-
"narration. Just the three Bash calls back-to-back. Once all three " +
|
|
95
|
-
"complete, send ONE brief final reply saying 'done' so I know " +
|
|
96
|
-
"you're back.";
|
|
97
|
-
|
|
98
|
-
describe("uat: silence-poke soft fires + reaches the model wire", () => {
|
|
99
|
-
it(
|
|
100
|
-
"agent breaks self-imposed silence in [70s, 200s] window via silence-poke",
|
|
101
|
-
async () => {
|
|
102
|
-
const sc = await spinUp({ agent: "test-harness" });
|
|
103
|
-
try {
|
|
104
|
-
const sendStart = Date.now();
|
|
105
|
-
await sc.sendDM(SILENT_CHURN_PROMPT);
|
|
106
|
-
|
|
107
|
-
// Wait for the FIRST reply. If silence-poke + the wire path
|
|
108
|
-
// are working, this lands between ~75s and ~110s as the
|
|
109
|
-
// model responds to the [silence-poke] system-reminder
|
|
110
|
-
// appended to the first or second sleep's tool result.
|
|
111
|
-
const firstReply = await sc.expectMessage(/\S/, {
|
|
112
|
-
from: "bot",
|
|
113
|
-
timeout: SOFT_WINDOW_MAX_MS + 20_000,
|
|
114
|
-
});
|
|
115
|
-
const elapsed = Date.now() - sendStart;
|
|
116
|
-
|
|
117
|
-
expect(firstReply.text.length).toBeGreaterThan(0);
|
|
118
|
-
|
|
119
|
-
// Primary window assertion.
|
|
120
|
-
expect(
|
|
121
|
-
elapsed,
|
|
122
|
-
`first bot reply lands at ${elapsed}ms (target window ` +
|
|
123
|
-
`[${SOFT_WINDOW_MIN_MS}, ${SOFT_WINDOW_MAX_MS}]). ` +
|
|
124
|
-
`Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
|
|
125
|
-
).toBeGreaterThanOrEqual(SOFT_WINDOW_MIN_MS);
|
|
126
|
-
expect(
|
|
127
|
-
elapsed,
|
|
128
|
-
`first bot reply lands at ${elapsed}ms — above ${SOFT_WINDOW_MAX_MS}ms ` +
|
|
129
|
-
`ceiling. Either silence-poke wire is broken (poke armed but ` +
|
|
130
|
-
`not drained at gateway.ts:onToolCall) or the framework ` +
|
|
131
|
-
`fallback at 300s was the first thing to break silence. ` +
|
|
132
|
-
`Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
|
|
133
|
-
).toBeLessThanOrEqual(SOFT_WINDOW_MAX_MS);
|
|
134
|
-
|
|
135
|
-
// Sanity floor: the first reply should be brief — proves it's
|
|
136
|
-
// a poke-driven update, not the final "done" answer after all
|
|
137
|
-
// three sleeps finished naturally. ~400 char ceiling allows a
|
|
138
|
-
// verbose model to add a sentence of context. Bump this if it
|
|
139
|
-
// flakes on perfectly valid short answers.
|
|
140
|
-
if (firstReply.text.length > 400) {
|
|
141
|
-
console.warn(
|
|
142
|
-
`[silence-poke] first reply at ${elapsed}ms is ${firstReply.text.length} ` +
|
|
143
|
-
`chars — longer than expected for a poke-driven update. The ` +
|
|
144
|
-
`window assertion still passed, but consider whether the model ` +
|
|
145
|
-
`bypassed the silence stretch (e.g. ran the sleeps in one ` +
|
|
146
|
-
`Bash call, dodging the per-call result poke chokepoint).`,
|
|
147
|
-
);
|
|
148
|
-
}
|
|
149
|
-
} finally {
|
|
150
|
-
await sc.tearDown();
|
|
151
|
-
}
|
|
152
|
-
},
|
|
153
|
-
240_000,
|
|
154
|
-
);
|
|
155
|
-
});
|