switchroom 0.13.19 → 0.13.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ /**
2
+ * text-voice-scrub.ts — deterministic prose-style enforcement at the
3
+ * gateway.
4
+ *
5
+ * Background. Despite three landed soft fixes (SOUL.md.hbs "never use
6
+ * em-dashes" rule, PR #1177 voice consolidation, the /humanizer skill),
7
+ * sampling 2,867 recent fleet outbound replies on 2026-05-23 showed
8
+ * em-dashes still present in 73% of agent messages (3.23 per 1k chars).
9
+ * Soft layer was not winning. The operator's framing is the same one
10
+ * that drove the over-ping safety net (#1674) and the silent-reply
11
+ * auto-edit (#1677): when the model authors voice and the framework
12
+ * owns enforcement, soft instructions fail under load. Make the
13
+ * framework do it.
14
+ *
15
+ * Scope. Em / en dashes only. The wider "AI-tell phrase denylist"
16
+ * (smoking gun, by design, etc.) was scoped OUT after data showed
17
+ * those phrases land in <0.5% of fleet messages and substituting
18
+ * them risks semantic loss. Em-dash → comma/period is a pure
19
+ * mechanical transform with no semantic loss when the surrounding
20
+ * text is whitespace-separated prose, and a no-op when the dash
21
+ * is inside code or a URL.
22
+ *
23
+ * Pipeline integration. Apply BEFORE markdownToHtml so the scrub
24
+ * runs on the original model text, not on rendered HTML where
25
+ * the dash might already be tag-escaped or live inside a parked
26
+ * code-block placeholder. Apply BEFORE outboundDedup.check so
27
+ * dedup keys see the post-scrub content (same text from a retry
28
+ * collapses cleanly).
29
+ *
30
+ * Code-region awareness. The scrubber MUST preserve dashes inside:
31
+ * - fenced code blocks: ```lang\n...\n```
32
+ * - inline code: `...`
33
+ * - explicit Telegram HTML code tags: <code>...</code>, <pre>...</pre>
34
+ * - URLs (rare to contain em-dashes, but technically valid IDN)
35
+ * The strategy is to park each protected region with a sentinel,
36
+ * scrub the rest, then restore. Mirrors the well-trodden
37
+ * markdownToHtml() codeBlocks/inlineCode placeholder pattern at
38
+ * format.ts:254-272.
39
+ *
40
+ * Kill switch. `SWITCHROOM_DISABLE_VOICE_SCRUB=1` returns the input
41
+ * unchanged and reports zero replacements. Same shape every other
42
+ * gateway safety net uses; rollback is one env var + agent restart.
43
+ */
44
+
/**
 * Outcome of a single voice-scrub pass over one outbound message.
 */
export interface VoiceScrubResult {
  /**
   * Text after scrubbing. Identical to the input when nothing was
   * replaced or when the kill switch is set.
   */
  scrubbed: string
  /**
   * Total number of dash replacements performed on the input. Fed to
   * the runtime-metrics fan-out so the cadence dashboard can chart the
   * fleet-wide voice-scrub rate over time.
   */
  replaced: number
}
54
+
// Sentinel character that opens and closes every placeholder token.
const NULL = '\x00'

// Placeholder prefixes, one per kind of protected region. A full token
// is `<prefix><index><NULL>`.
const FENCE_PH = NULL + 'VS_FENCE'
const INLINE_PH = NULL + 'VS_INLINE'
const HTML_CODE_PH = NULL + 'VS_HTMLCODE'
const HTML_PRE_PH = NULL + 'VS_HTMLPRE'
const URL_PH = NULL + 'VS_URL'

// An http(s) URL, running up to the next whitespace character.
const URL_RE = /https?:\/\/\S+/g
63
+
/**
 * Report whether the voice scrub should run.
 *
 * Reads `SWITCHROOM_DISABLE_VOICE_SCRUB` on every call. The scrub is
 * disabled when the variable is `1` or `true`; the comparison ignores
 * letter case and surrounding whitespace so that `TRUE`, `True` or a
 * value with a stray trailing newline still trips the kill switch.
 * Any other value, or an unset variable, leaves the scrub enabled.
 *
 * @returns `true` when scrubbing is active, `false` when the kill
 *   switch is set.
 */
function enabled(): boolean {
  const raw = process.env.SWITCHROOM_DISABLE_VOICE_SCRUB
  if (raw === undefined) return true
  const flag = raw.trim().toLowerCase()
  return !(flag === '1' || flag === 'true')
}
68
+
/**
 * Hide code-like regions behind placeholder tokens so the dash pass
 * cannot modify them.
 *
 * @param text Raw outbound text.
 * @returns `parked`, the text with every protected region replaced by
 *   a `<prefix><idx><NULL>` token, and `parts`, the original fragments
 *   in insertion order (each entry's `idx` equals its array position).
 */
function park(text: string): {
  parked: string
  parts: Array<{ prefix: string; idx: number; raw: string }>
} {
  const parts: Array<{ prefix: string; idx: number; raw: string }> = []

  // Replace every match of `pattern` in `input` with a token built
  // from `prefix`, recording the matched text in `parts`.
  const stash = (input: string, pattern: RegExp, prefix: string): string =>
    input.replace(pattern, (match: string) => {
      const idx = parts.length
      parts.push({ prefix, idx, raw: match })
      return `${prefix}${idx}${NULL}`
    })

  // Pass order is significant: fences go first so a backtick inside a
  // fence is not read as the start of inline code; HTML <pre>/<code>
  // follow, then inline backticks, and URLs last.
  const passes: Array<[RegExp, string]> = [
    [/```[\s\S]*?```/g, FENCE_PH],
    [/<pre>[\s\S]*?<\/pre>/gi, HTML_PRE_PH],
    [/<code[^>]*>[\s\S]*?<\/code>/gi, HTML_CODE_PH],
    [/`[^`\n]+`/g, INLINE_PH],
    [URL_RE, URL_PH],
  ]

  let parked = text
  for (const [pattern, prefix] of passes) {
    parked = stash(parked, pattern, prefix)
  }

  return { parked, parts }
}
112
+
/**
 * Put parked fragments back in place of their placeholder tokens.
 *
 * Fragments are restored last-parked-first: a later fragment (e.g. a
 * URL) may itself contain the token of an earlier one, and that inner
 * token only becomes visible once the outer fragment is back in the
 * text.
 *
 * @param text Text containing placeholder tokens.
 * @param parts Fragments recorded when the text was parked.
 * @returns The text with each token's first occurrence replaced by its
 *   original fragment.
 */
function restore(
  text: string,
  parts: Array<{ prefix: string; idx: number; raw: string }>,
): string {
  return parts.reduceRight(
    // A function replacer keeps `$` sequences in the fragment literal.
    (acc, part) => acc.replace(`${part.prefix}${part.idx}${NULL}`, () => part.raw),
    text,
  )
}
126
+
/**
 * Replace em / en dashes with context-appropriate punctuation.
 *
 * Rules, applied in order:
 *   1. Spaced dash mid-prose (` — ` / ` – ` followed by a non-space
 *      character) → `. ` when the next character is an ASCII uppercase
 *      letter (reads as a new sentence), otherwise `, ` (reads as a
 *      mid-clause continuation). If the character before the dash is
 *      already `.,;:!?` the dash is simply dropped so punctuation is
 *      not doubled ("Really? — Yes" → "Really? Yes").
 *   2. Spaced dash at end of line or end of input (` —\n`, ` —`) → `.`
 *      followed by the original trailing whitespace; dropped instead
 *      when the preceding character is already punctuation.
 *   3. Unspaced dash between two digits ("10–20") → `-`, so numeric
 *      ranges keep their meaning.
 *   4. Unspaced dash between two word characters ("word—word") → `, `.
 *   5. Any dash still standing (e.g. leading "— note") → `-`.
 *
 * Rules 1, 3 and 4 inspect the following character with a lookahead
 * rather than consuming it, so back-to-back dashes separated by a
 * single character ("a — b — c", "1–2–3") are all rewritten by the
 * intended rule instead of the second one falling through to rule 5.
 *
 * @param text Text to rewrite; callers are expected to have parked
 *   code and URL regions beforehand.
 * @returns `out`, the rewritten text, and `replaced`, the number of
 *   dashes that were rewritten.
 */
function replaceDashes(text: string): { out: string; replaced: number } {
  let replaced = 0
  let out = text

  // Punctuation that already pauses or ends a clause on its own.
  const clausePunct = /[.,;:!?]/

  // #1: spaced dash mid-prose. The following character is captured
  // inside a lookahead so it stays available to the next match.
  out = out.replace(/(\S) [—–] (?=(\S))/g, (_m, before: string, after: string) => {
    replaced++
    if (clausePunct.test(before)) return `${before} `
    return /[A-Z]/.test(after) ? `${before}. ` : `${before}, `
  })

  // #2: spaced dash at end of line or end of input. Treat as a full
  // stop unless the text already ends in punctuation.
  out = out.replace(/([.,;:!?]?) [—–](\s*\n|\s*$)/g, (_m, punct: string, ws: string) => {
    replaced++
    return punct === '' ? `.${ws}` : `${punct}${ws}`
  })

  // #3: digit–digit is a numeric range; a comma would change the
  // meaning, an ASCII hyphen does not.
  out = out.replace(/(\d)[—–](?=\d)/g, (_m, before: string) => {
    replaced++
    return `${before}-`
  })

  // #4: unspaced dash between word characters. Treated as the
  // missing-space form of #1; comma is the safe fallback.
  out = out.replace(/(\w)[—–](?=\w)/g, (_m, before: string) => {
    replaced++
    return `${before}, `
  })

  // #5: anything left becomes an ASCII hyphen so no typographic dash
  // gets through.
  out = out.replace(/[—–]/g, () => {
    replaced++
    return '-'
  })

  return { out, replaced }
}
180
+
/**
 * Public entry point: remove em / en dashes from outbound text while
 * leaving dashes inside code regions and URLs untouched.
 *
 * Performs no I/O and keeps no state between calls. The kill switch
 * is consulted on every call, so changing the environment variable
 * takes effect for the next message without reloading the module.
 *
 * @param text Outbound message text.
 * @returns The scrubbed text and the number of replacements. The input
 *   is returned as-is, with a count of zero, when the kill switch is
 *   set, the text is empty, or no dash was rewritten.
 */
export function scrubVoice(text: string): VoiceScrubResult {
  const untouched: VoiceScrubResult = { scrubbed: text, replaced: 0 }

  const skip = !enabled() || text.length === 0
  if (skip) return untouched

  const shielded = park(text)
  const pass = replaceDashes(shielded.parked)

  return pass.replaced === 0
    ? untouched
    : { scrubbed: restore(pass.out, shielded.parts), replaced: pass.replaced }
}
@@ -1,38 +1,35 @@
1
1
  /**
2
2
  * JTBD scenario — rapid follow-ups (steering vs queued classification).
3
3
  *
4
- * Production behaviour codified in `_shared/telegram-style.md.hbs`:
4
+ * Live contract codified in `_shared/telegram-style.md.hbs` and
5
+ * `reference/steer-or-queue-mid-flight.md` (default-flip commits
6
+ * `4fff90bf` + `597a58af`, 2026-04-17):
5
7
  *
6
- * - A follow-up message arriving while a turn is in flight, with no
7
- * `/queue` prefix, is `steering="true"` treated as a course
8
- * correction on the in-flight task.
9
- * - A follow-up prefixed with `/queue ` or `/q ` is `queued="true"` —
10
- * a new independent task; the agent should NOT reference the
11
- * in-flight work.
8
+ * - A mid-turn follow-up with NO prefix is `queued="true"`: a new
9
+ * independent task. The agent should NOT reference the in-flight
10
+ * work.
11
+ * - A mid-turn follow-up prefixed with `/steer ` or `/s ` is
12
+ * `steering="true"`: a course-correction; the agent continues the
13
+ * in-flight task incorporating the new guidance.
14
+ * - Legacy `/queue ` / `/q ` is a redundant alias for the default;
15
+ * still works.
12
16
  *
13
- * This UAT fires both shapes and asserts the agent responds in a way
14
- * that reflects the classification for steering it should mention
15
- * the correction; for queued it should treat the new task fresh.
16
- *
17
- * We can't assert directly on the internal channel meta (`steering`,
18
- * `queued`) from the driver side without inspecting the gateway log
19
- * but the conversational pacing prompt instructs the agent to
20
- * "self-narrate the classification" with a small italic line at the
21
- * top of its reply. So we can pattern-match on that.
17
+ * This UAT fires both shapes and asserts the agent narrates the
18
+ * classification correctly. The prior version of this scenario
19
+ * (2026-05-13 / PR #1132) tested the pre-flip contract with
20
+ * too-loose assertions (`/md5/i` regex passes on the queued path
21
+ * by coincidence: the model answers "use md5" fresh and the reply
22
+ * contains "md5"). After unskipping with the corrected contract,
23
+ * the assertions check for the italic classification line the
24
+ * prompt instructs the agent to emit.
22
25
  */
23
26
 
24
27
  import { describe, it, expect } from "vitest";
25
28
  import { spinUp } from "../harness.js";
26
29
 
27
- // Skipped in CI: both cases failed in #1132 overnight (steering didn't
28
- // surface "md5"; queued didn't produce the expected fresh-task reply).
29
- // May be real classification bugs, may be prompt fragility — neither
30
- // has been root-caused. Excluded from the buildkite gate so it doesn't
31
- // block every PR touching telegram-plugin/. Run locally via
32
- // `bun run test:uat` once classification has been investigated.
33
- describe.skip("uat: rapid follow-ups — steering vs queued", () => {
30
+ describe("uat: rapid follow-ups steering vs queued classification", () => {
34
31
  it(
35
- "follow-up WITHOUT /queue → agent treats as steering",
32
+ "follow-up with /steer prefix → agent self-narrates as steering",
36
33
  async () => {
37
34
  const sc = await spinUp({ agent: "test-harness" });
38
35
  try {
@@ -43,26 +40,39 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
43
40
  + "Show the work step by step with a 2-second pause between.",
44
41
  );
45
42
  await new Promise((r) => setTimeout(r, 3_000));
46
- // Steer: change the algorithm
47
- await sc.sendDM("actually use md5 not sha256");
43
+ // Steer: change the algorithm using the explicit /steer prefix.
44
+ await sc.sendDM("/steer actually use md5 not sha256");
48
45
 
49
- // The agent should reply mentioning md5 (the steered
50
- // algorithm), AND ideally surface the italic classification
51
- // line per the prompt.
52
- const reply = await sc.expectMessage(/md5/i, {
53
- from: "bot",
54
- timeout: 120_000,
55
- });
46
+ // The agent should reply mentioning md5 AND surface the italic
47
+ // classification line per the prompt
48
+ // ("_↪️ treating as steer on the prior task_" or similar).
49
+ // We match either explicit-steer narration OR the steer emoji
50
+ // (`↪️`) to allow for natural-language variation while still
51
+ // failing if no narration appears (the previous version of
52
+ // this UAT was too loose — bare `/md5/i` passed by coincidence
53
+ // on the queued path).
54
+ const reply = await sc.expectMessage(
55
+ (m) => {
56
+ const txt = m.text;
57
+ const mentionsMd5 = /\bmd5\b/i.test(txt);
58
+ const narratesSteer =
59
+ /↪️|\bsteer(ing)?\b|continuing the (prior|original|in-flight) task|amendment|course[- ]correct/i.test(
60
+ txt,
61
+ );
62
+ return mentionsMd5 && narratesSteer;
63
+ },
64
+ { from: "bot", timeout: 120_000 },
65
+ );
56
66
  expect(reply.text.toLowerCase()).toContain("md5");
57
67
  } finally {
58
68
  await sc.tearDown();
59
69
  }
60
70
  },
61
- 150_000,
71
+ 180_000,
62
72
  );
63
73
 
64
74
  it(
65
- "follow-up WITH /queue → agent treats as new task",
75
+ "follow-up with no prefix mid-turn → agent treats as queued (new task)",
66
76
  async () => {
67
77
  const sc = await spinUp({ agent: "test-harness" });
68
78
  try {
@@ -71,9 +81,10 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
71
81
  + "Use bash.",
72
82
  );
73
83
  await new Promise((r) => setTimeout(r, 3_000));
74
- // Queued: completely independent task. The agent should NOT
75
- // reference the counting task.
76
- await sc.sendDM("/queue what is 2+2?");
84
+ // No prefix: the default-flipped contract says this is a
85
+ // QUEUED new task. The agent should NOT reference the
86
+ // counting work.
87
+ await sc.sendDM("what is 2+2?");
77
88
 
78
89
  // First reply should be from the counting task (still
79
90
  // in-flight). Then a second reply for the queued task.
@@ -81,16 +92,32 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
81
92
  from: "bot",
82
93
  timeout: 60_000,
83
94
  });
84
- // Then we expect another reply (the queued task's answer).
85
- // /queue is treated as a new task per the prompt answer
86
- // should be "4" or mention 2+2.
95
+
96
+ // Second reply: the queued task's answer. We want to see
97
+ // EITHER the italic queued-narration line OR a fresh "4"
98
+ // answer that doesn't reference the counting work.
87
99
  const secondReply = await sc.expectMessage(
88
- (m) =>
89
- m.messageId > firstReply.messageId
90
- && /\b4\b|two\s+plus\s+two|2\s*\+\s*2/i.test(m.text),
100
+ (m) => {
101
+ if (m.messageId <= firstReply.messageId) return false;
102
+ const txt = m.text;
103
+ const answersTheQuestion =
104
+ /\b4\b|\bfour\b|two\s+plus\s+two|2\s*\+\s*2/i.test(txt);
105
+ const narratesQueued =
106
+ /📥|\bqueued\b|new\s+(?:independent\s+)?task|fresh\s+task/i.test(
107
+ txt,
108
+ );
109
+ // Pass if either: the explicit narration is present, OR the
110
+ // reply answers cleanly without referencing the counting
111
+ // task. The latter is the substantive behavioural check —
112
+ // the queued task is isolated from the in-flight context.
113
+ const isolatedFromCounting = !/\bcount(ing)?\b|\bsleep\b/i.test(
114
+ txt,
115
+ );
116
+ return answersTheQuestion && (narratesQueued || isolatedFromCounting);
117
+ },
91
118
  { from: "bot", timeout: 120_000 },
92
119
  );
93
- expect(secondReply.text).toMatch(/4|two|2\s*\+\s*2/i);
120
+ expect(secondReply.text).toMatch(/4|four|2\s*\+\s*2/i);
94
121
  } finally {
95
122
  await sc.tearDown();
96
123
  }