switchroom 0.13.19 → 0.13.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/profiles/_shared/telegram-style.md.hbs +3 -3
- package/telegram-plugin/dist/gateway/gateway.js +201 -31
- package/telegram-plugin/gateway/disconnect-flush.ts +37 -0
- package/telegram-plugin/gateway/gateway.ts +138 -8
- package/telegram-plugin/gateway/inbound-delivery-gate.ts +37 -4
- package/telegram-plugin/handoff-continuity.ts +8 -2
- package/telegram-plugin/recent-outbound-dedup.ts +51 -5
- package/telegram-plugin/runtime-metrics.ts +22 -0
- package/telegram-plugin/subagent-watcher.ts +25 -3
- package/telegram-plugin/tests/gateway-disconnect-flush.test.ts +114 -0
- package/telegram-plugin/tests/handoff-continuity.test.ts +15 -2
- package/telegram-plugin/tests/inbound-delivery-gate.test.ts +77 -4
- package/telegram-plugin/tests/recent-outbound-dedup.test.ts +72 -0
- package/telegram-plugin/tests/subagent-watcher-enoent-deregister.test.ts +152 -0
- package/telegram-plugin/tests/text-voice-scrub.test.ts +174 -0
- package/telegram-plugin/text-voice-scrub.ts +199 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +72 -45
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* text-voice-scrub.ts — deterministic prose-style enforcement at the
|
|
3
|
+
* gateway.
|
|
4
|
+
*
|
|
5
|
+
* Background. Despite three landed soft fixes (SOUL.md.hbs "never use
|
|
6
|
+
* em-dashes" rule, PR #1177 voice consolidation, the /humanizer skill),
|
|
7
|
+
* sampling 2,867 recent fleet outbound replies on 2026-05-23 showed
|
|
8
|
+
* em-dashes still present in 73% of agent messages (3.23 per 1k chars).
|
|
9
|
+
* Soft layer was not winning. The operator's framing is the same one
|
|
10
|
+
* that drove the over-ping safety net (#1674) and the silent-reply
|
|
11
|
+
* auto-edit (#1677): when the model authors voice and the framework
|
|
12
|
+
* owns enforcement, soft instructions fail under load. Make the
|
|
13
|
+
* framework do it.
|
|
14
|
+
*
|
|
15
|
+
* Scope. Em / en dashes only. The wider "AI-tell phrase denylist"
|
|
16
|
+
* (smoking gun, by design, etc.) was scoped OUT after data showed
|
|
17
|
+
* those phrases land in <0.5% of fleet messages and substituting
|
|
18
|
+
* them risks semantic loss. Em-dash → comma/period is a pure
|
|
19
|
+
* mechanical transform with no semantic loss when the surrounding
|
|
20
|
+
* text is whitespace-separated prose, and a no-op when the dash
|
|
21
|
+
* is inside code or a URL.
|
|
22
|
+
*
|
|
23
|
+
* Pipeline integration. Apply BEFORE markdownToHtml so the scrub
|
|
24
|
+
* runs on the original model text, not on rendered HTML where
|
|
25
|
+
* the dash might already be tag-escaped or live inside a parked
|
|
26
|
+
* code-block placeholder. Apply BEFORE outboundDedup.check so
|
|
27
|
+
* dedup keys see the post-scrub content (same text from a retry
|
|
28
|
+
* collapses cleanly).
|
|
29
|
+
*
|
|
30
|
+
* Code-region awareness. The scrubber MUST preserve dashes inside:
|
|
31
|
+
* - fenced code blocks: ```lang\n...\n```
|
|
32
|
+
* - inline code: `...`
|
|
33
|
+
* - explicit Telegram HTML code tags: <code>...</code>, <pre>...</pre>
|
|
34
|
+
* - URLs (rare to contain em-dashes, but technically valid IDN)
|
|
35
|
+
* The strategy is to park each protected region with a sentinel,
|
|
36
|
+
* scrub the rest, then restore. Mirrors the well-trodden
|
|
37
|
+
* markdownToHtml() codeBlocks/inlineCode placeholder pattern at
|
|
38
|
+
* format.ts:254-272.
|
|
39
|
+
*
|
|
40
|
+
 * Kill switch. `SWITCHROOM_DISABLE_VOICE_SCRUB=1` (or `=true`) returns the input
|
|
41
|
+
* unchanged and reports zero replacements. Same shape every other
|
|
42
|
+
* gateway safety net uses; rollback is one env var + agent restart.
|
|
43
|
+
*/
|
|
44
|
+
|
|
45
|
+
/** Outcome of one `scrubVoice` call. */
export interface VoiceScrubResult {
  /** The scrubbed text. Equal to the input when no replacements were
   *  made or when the kill switch is set. */
  scrubbed: string
  /** Count of dash replacements made across the whole input; 0 when the
   *  input is returned unchanged. Surfaces to the runtime-metrics
   *  fan-out so the cadence dashboard can track fleet-wide voice-scrub
   *  rate over time. */
  replaced: number
}
|
|
54
|
+
|
|
55
|
+
// Sentinel byte that opens and closes every placeholder emitted by
// park(). A placeholder has the shape `<prefix><index><NULL>`, where the
// prefix itself starts with NULL.
const NULL = '\x00'

// One placeholder prefix per kind of protected region.
const FENCE_PH = NULL + 'VS_FENCE'
const INLINE_PH = NULL + 'VS_INLINE'
const HTML_CODE_PH = NULL + 'VS_HTMLCODE'
const HTML_PRE_PH = NULL + 'VS_HTMLPRE'
const URL_PH = NULL + 'VS_URL'

// An http(s) URL: the scheme followed by a run of non-whitespace chars.
const URL_RE = /https?:\/\/\S+/g
|
|
63
|
+
|
|
64
|
+
/**
 * Whether the voice scrub should run. Reads the kill-switch env var on
 * every call; only the exact values `1` and `true` disable the scrub.
 */
function enabled(): boolean {
  const flag = process.env.SWITCHROOM_DISABLE_VOICE_SCRUB
  if (flag === '1') return false
  if (flag === 'true') return false
  return true
}
|
|
68
|
+
|
|
69
|
+
/**
 * Swap every code-like region of `text` for an opaque placeholder so
 * the dash-replacement pass cannot reach inside it.
 *
 * @param text Raw outbound text.
 * @returns `parked`, the text with protected regions replaced by
 *   `<prefix><idx><NULL>` placeholders, and `parts`, the original
 *   fragments in insertion order (`idx` equals the array position).
 */
function park(text: string): {
  parked: string
  parts: Array<{ prefix: string; idx: number; raw: string }>
} {
  const parts: Array<{ prefix: string; idx: number; raw: string }> = []

  // Builds a replacer that records the matched fragment under `prefix`
  // and returns the placeholder that stands in for it.
  const stash = (prefix: string) => (match: string): string => {
    const idx = parts.length
    parts.push({ prefix, idx, raw: match })
    return `${prefix}${idx}${NULL}`
  }

  // Pass order is significant: fences go first so a backtick inside a
  // fence is never read as the start of inline code; then <pre>, then
  // <code>, then inline backticks, and URLs last.
  const passes: Array<[RegExp, string]> = [
    [/```[\s\S]*?```/g, FENCE_PH],
    [/<pre>[\s\S]*?<\/pre>/gi, HTML_PRE_PH],
    [/<code[^>]*>[\s\S]*?<\/code>/gi, HTML_CODE_PH],
    [/`[^`\n]+`/g, INLINE_PH],
    [URL_RE, URL_PH],
  ]

  let parked = text
  for (const [pattern, prefix] of passes) {
    parked = parked.replace(pattern, stash(prefix))
  }

  return { parked, parts }
}
|
|
112
|
+
|
|
113
|
+
/**
 * Put the parked fragments back into `text`.
 *
 * Fragments are restored newest-first: a later-parked region may itself
 * contain the placeholder of an earlier one (e.g. a <pre> wrapping an
 * already-parked fence), so the outer region has to be expanded before
 * the inner placeholder becomes visible again.
 *
 * @param text Scrubbed text that still contains placeholders.
 * @param parts Fragments recorded by `park`, in insertion order.
 * @returns The text with every placeholder replaced by its raw fragment.
 */
function restore(
  text: string,
  parts: Array<{ prefix: string; idx: number; raw: string }>,
): string {
  return parts.reduceRight((acc, part) => {
    const token = `${part.prefix}${part.idx}${NULL}`
    // Function replacer: `$` sequences in the raw fragment stay literal.
    return acc.replace(token, () => part.raw)
  }, text)
}
|
|
126
|
+
|
|
127
|
+
/**
 * Replace em / en dashes with context-appropriate punctuation.
 *
 * Rules, applied in order:
 *   1. ` — ` / ` – ` (flanked by single spaces, non-space on both sides)
 *      → `. ` when the next character is an uppercase letter (reads as
 *      a new sentence); otherwise `, ` (mid-clause continuation).
 *   2. End-of-line dash (` —\n` / ` –\n`) → `.\n`, treated as a full stop.
 *   3. Bare dash with no flanking spaces:
 *        a. between two digits ("10–20") → `-`, so a numeric range stays
 *           a range instead of turning into a list;
 *        b. between two word characters ("word—word") → `, `, the
 *           missing-space form of #1.
 *   4. Any surviving dash (e.g. at sentence start "— note") → `-` so no
 *      typographic dash escapes the gate.
 *
 * Rules 1 and 3 only look ahead at the following character instead of
 * consuming it, so chained dashes around one-character tokens
 * ("a — b — c", "a—b—c") are all rewritten by the intended rule rather
 * than the second one falling through to #4.
 *
 * @param text Text whose protected regions have already been parked.
 * @returns `out`, the rewritten text, and `replaced`, the number of
 *   dashes rewritten (0 means `out` equals `text`).
 */
function replaceDashes(text: string): { out: string; replaced: number } {
  let replaced = 0
  let out = text

  // #1: spaced dash mid-prose. The following character is captured via
  // lookahead only, so it remains available as the "before" character
  // of an immediately following dash.
  out = out.replace(/(\S) [—–] (?=(\S))/gu, (_m, before: string, after: string) => {
    replaced++
    // Any uppercase letter (not just ASCII) marks a sentence start.
    const sentenceStart = /\p{Lu}/u.test(after)
    return sentenceStart ? `${before}. ` : `${before}, `
  })

  // #2: dash at end of line. Treat as full stop, keep trailing whitespace.
  out = out.replace(/ [—–](\s*\n)/g, (_m, ws: string) => {
    replaced++
    return `.${ws}`
  })

  // #3a: dash between digits is a range ("10–20"); a comma would change
  // the meaning, so fall back to an ASCII hyphen.
  out = out.replace(/(\d)[—–](?=\d)/g, (_m, before: string) => {
    replaced++
    return `${before}-`
  })

  // #3b: bare dash between word characters. Unicode-aware so accented
  // letters count as word characters ("café—bar").
  out = out.replace(/([\p{L}\p{N}_])[—–](?=[\p{L}\p{N}_])/gu, (_m, before: string) => {
    replaced++
    return `${before}, `
  })

  // #4: anything still standing becomes an ASCII hyphen. Rare path;
  // covers leading "— note", quoted dashes, etc.
  out = out.replace(/[—–]/g, () => {
    replaced++
    return '-'
  })

  return { out, replaced }
}
|
|
180
|
+
|
|
181
|
+
/**
 * Public entry: strip em / en dashes from outbound text while leaving
 * dashes inside code regions and URLs untouched.
 *
 * Deterministic for a given input and environment; holds no
 * module-scope mutable state. The kill switch is re-read from the
 * environment on every call, so flipping the env var takes effect on
 * the next call within the same process.
 *
 * @param text Outbound message text.
 * @returns The scrubbed text and the number of dashes replaced. The
 *   input is returned unchanged with `replaced: 0` when the scrub is
 *   disabled, the input is empty, or nothing needed replacing.
 */
export function scrubVoice(text: string): VoiceScrubResult {
  if (text.length === 0 || !enabled()) {
    return { scrubbed: text, replaced: 0 }
  }

  const shielded = park(text)
  const pass = replaceDashes(shielded.parked)

  return pass.replaced === 0
    ? { scrubbed: text, replaced: 0 }
    : { scrubbed: restore(pass.out, shielded.parts), replaced: pass.replaced }
}
|
|
@@ -1,38 +1,35 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* JTBD scenario — rapid follow-ups (steering vs queued classification).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Live contract codified in `_shared/telegram-style.md.hbs` and
|
|
5
|
+
* `reference/steer-or-queue-mid-flight.md` (default-flip commits
|
|
6
|
+
* `4fff90bf` + `597a58af`, 2026-04-17):
|
|
5
7
|
*
|
|
6
|
-
* - A follow-up
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
* - A follow-up prefixed with `/
|
|
10
|
-
*
|
|
11
|
-
* in-flight
|
|
8
|
+
* - A mid-turn follow-up with NO prefix is `queued="true"` — new
|
|
9
|
+
* independent task. The agent should NOT reference the in-flight
|
|
10
|
+
* work.
|
|
11
|
+
* - A mid-turn follow-up prefixed with `/steer ` or `/s ` is
|
|
12
|
+
* `steering="true"` — course-correction; the agent continues the
|
|
13
|
+
* in-flight task incorporating the new guidance.
|
|
14
|
+
* - Legacy `/queue ` / `/q ` is a redundant alias for the default;
|
|
15
|
+
* still works.
|
|
12
16
|
*
|
|
13
|
-
* This UAT fires both shapes and asserts the agent
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
* top of its reply. So we can pattern-match on that.
|
|
17
|
+
* This UAT fires both shapes and asserts the agent narrates the
|
|
18
|
+
* classification correctly. The prior version of this scenario
|
|
19
|
+
* (2026-05-13 / PR #1132) tested the pre-flip contract with
|
|
20
|
+
* too-loose assertions (`/md5/i` regex passes on the queued path
|
|
21
|
+
* by coincidence — the model answers "use md5" fresh and the reply
|
|
22
|
+
* contains "md5"). After unskipping with the corrected contract,
|
|
23
|
+
* the assertions check for the italic classification line the
|
|
24
|
+
* prompt instructs the agent to emit.
|
|
22
25
|
*/
|
|
23
26
|
|
|
24
27
|
import { describe, it, expect } from "vitest";
|
|
25
28
|
import { spinUp } from "../harness.js";
|
|
26
29
|
|
|
27
|
-
|
|
28
|
-
// surface "md5"; queued didn't produce the expected fresh-task reply).
|
|
29
|
-
// May be real classification bugs, may be prompt fragility — neither
|
|
30
|
-
// has been root-caused. Excluded from the buildkite gate so it doesn't
|
|
31
|
-
// block every PR touching telegram-plugin/. Run locally via
|
|
32
|
-
// `bun run test:uat` once classification has been investigated.
|
|
33
|
-
describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
30
|
+
describe("uat: rapid follow-ups — steering vs queued classification", () => {
|
|
34
31
|
it(
|
|
35
|
-
"follow-up
|
|
32
|
+
"follow-up with /steer prefix → agent self-narrates as steering",
|
|
36
33
|
async () => {
|
|
37
34
|
const sc = await spinUp({ agent: "test-harness" });
|
|
38
35
|
try {
|
|
@@ -43,26 +40,39 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
43
40
|
+ "Show the work step by step with a 2-second pause between.",
|
|
44
41
|
);
|
|
45
42
|
await new Promise((r) => setTimeout(r, 3_000));
|
|
46
|
-
// Steer: change the algorithm
|
|
47
|
-
await sc.sendDM("actually use md5 not sha256");
|
|
43
|
+
// Steer: change the algorithm using the explicit /steer prefix.
|
|
44
|
+
await sc.sendDM("/steer actually use md5 not sha256");
|
|
48
45
|
|
|
49
|
-
// The agent should reply mentioning md5
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
46
|
+
// The agent should reply mentioning md5 AND surface the italic
|
|
47
|
+
// classification line per the prompt
|
|
48
|
+
// ("_↪️ treating as steer on the prior task_" or similar).
|
|
49
|
+
// We match either explicit-steer narration OR the steer emoji
|
|
50
|
+
// (`↪️`) to allow for natural-language variation while still
|
|
51
|
+
// failing if no narration appears (the previous version of
|
|
52
|
+
// this UAT was too loose — bare `/md5/i` passed by coincidence
|
|
53
|
+
// on the queued path).
|
|
54
|
+
const reply = await sc.expectMessage(
|
|
55
|
+
(m) => {
|
|
56
|
+
const txt = m.text;
|
|
57
|
+
const mentionsMd5 = /\bmd5\b/i.test(txt);
|
|
58
|
+
const narratesSteer =
|
|
59
|
+
/↪️|\bsteer(ing)?\b|continuing the (prior|original|in-flight) task|amendment|course[- ]correct/i.test(
|
|
60
|
+
txt,
|
|
61
|
+
);
|
|
62
|
+
return mentionsMd5 && narratesSteer;
|
|
63
|
+
},
|
|
64
|
+
{ from: "bot", timeout: 120_000 },
|
|
65
|
+
);
|
|
56
66
|
expect(reply.text.toLowerCase()).toContain("md5");
|
|
57
67
|
} finally {
|
|
58
68
|
await sc.tearDown();
|
|
59
69
|
}
|
|
60
70
|
},
|
|
61
|
-
|
|
71
|
+
180_000,
|
|
62
72
|
);
|
|
63
73
|
|
|
64
74
|
it(
|
|
65
|
-
"follow-up
|
|
75
|
+
"follow-up with no prefix mid-turn → agent treats as queued (new task)",
|
|
66
76
|
async () => {
|
|
67
77
|
const sc = await spinUp({ agent: "test-harness" });
|
|
68
78
|
try {
|
|
@@ -71,9 +81,10 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
71
81
|
+ "Use bash.",
|
|
72
82
|
);
|
|
73
83
|
await new Promise((r) => setTimeout(r, 3_000));
|
|
74
|
-
//
|
|
75
|
-
//
|
|
76
|
-
|
|
84
|
+
// No prefix — the default-flipped contract says this is a
|
|
85
|
+
// QUEUED new task. The agent should NOT reference the
|
|
86
|
+
// counting work.
|
|
87
|
+
await sc.sendDM("what is 2+2?");
|
|
77
88
|
|
|
78
89
|
// First reply should be from the counting task (still
|
|
79
90
|
// in-flight). Then a second reply for the queued task.
|
|
@@ -81,16 +92,32 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
81
92
|
from: "bot",
|
|
82
93
|
timeout: 60_000,
|
|
83
94
|
});
|
|
84
|
-
|
|
85
|
-
//
|
|
86
|
-
//
|
|
95
|
+
|
|
96
|
+
// Second reply: the queued task's answer. We want to see
|
|
97
|
+
// EITHER the italic queued-narration line OR a fresh "4"
|
|
98
|
+
// answer that doesn't reference the counting work.
|
|
87
99
|
const secondReply = await sc.expectMessage(
|
|
88
|
-
(m) =>
|
|
89
|
-
m.messageId
|
|
90
|
-
|
|
100
|
+
(m) => {
|
|
101
|
+
if (m.messageId <= firstReply.messageId) return false;
|
|
102
|
+
const txt = m.text;
|
|
103
|
+
const answersTheQuestion =
|
|
104
|
+
/\b4\b|\bfour\b|two\s+plus\s+two|2\s*\+\s*2/i.test(txt);
|
|
105
|
+
const narratesQueued =
|
|
106
|
+
/📥|\bqueued\b|new\s+(?:independent\s+)?task|fresh\s+task/i.test(
|
|
107
|
+
txt,
|
|
108
|
+
);
|
|
109
|
+
// Pass if either: the explicit narration is present, OR the
|
|
110
|
+
// reply answers cleanly without referencing the counting
|
|
111
|
+
// task. The latter is the substantive behavioural check —
|
|
112
|
+
// the queued task is isolated from the in-flight context.
|
|
113
|
+
const isolatedFromCounting = !/\bcount(ing)?\b|\bsleep\b/i.test(
|
|
114
|
+
txt,
|
|
115
|
+
);
|
|
116
|
+
return answersTheQuestion && (narratesQueued || isolatedFromCounting);
|
|
117
|
+
},
|
|
91
118
|
{ from: "bot", timeout: 120_000 },
|
|
92
119
|
);
|
|
93
|
-
expect(secondReply.text).toMatch(/4|
|
|
120
|
+
expect(secondReply.text).toMatch(/4|four|2\s*\+\s*2/i);
|
|
94
121
|
} finally {
|
|
95
122
|
await sc.tearDown();
|
|
96
123
|
}
|