switchroom 0.13.20 → 0.13.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,38 +1,35 @@
1
1
  /**
2
2
  * JTBD scenario — rapid follow-ups (steering vs queued classification).
3
3
  *
4
- * Production behaviour codified in `_shared/telegram-style.md.hbs`:
4
+ * Live contract codified in `_shared/telegram-style.md.hbs` and
5
+ * `reference/steer-or-queue-mid-flight.md` (default-flip commits
6
+ * `4fff90bf` + `597a58af`, 2026-04-17):
5
7
  *
6
- * - A follow-up message arriving while a turn is in flight, with no
7
- * `/queue` prefix, is `steering="true"` — treated as a course
8
- * correction on the in-flight task.
9
- * - A follow-up prefixed with `/queue ` or `/q ` is `queued="true"` —
10
- * a new independent task; the agent should NOT reference the
11
- * in-flight work.
8
+ * - A mid-turn follow-up with NO prefix is `queued="true"` — a new
9
+ * independent task. The agent should NOT reference the in-flight
10
+ * work.
11
+ * - A mid-turn follow-up prefixed with `/steer ` or `/s ` is
12
+ * `steering="true"` — a course-correction; the agent continues the
13
+ * in-flight task incorporating the new guidance.
14
+ * - Legacy `/queue ` / `/q ` is a redundant alias for the default;
15
+ * still works.
12
16
  *
13
- * This UAT fires both shapes and asserts the agent responds in a way
14
- * that reflects the classification — for steering it should mention
15
- * the correction; for queued it should treat the new task fresh.
16
- *
17
- * We can't assert directly on the internal channel meta (`steering`,
18
- * `queued`) from the driver side without inspecting the gateway log —
19
- * but the conversational pacing prompt instructs the agent to
20
- * "self-narrate the classification" with a small italic line at the
21
- * top of its reply. So we can pattern-match on that.
17
+ * This UAT fires both shapes and asserts the agent narrates the
18
+ * classification correctly. The prior version of this scenario
19
+ * (2026-05-13 / PR #1132) tested the pre-flip contract with
20
+ * too-loose assertions (`/md5/i` regex passes on the queued path
21
+ * by coincidence — the model answers "use md5" fresh and the reply
22
+ * contains "md5"). After unskipping with the corrected contract,
23
+ * the assertions check for the italic classification line the
24
+ * prompt instructs the agent to emit.
22
25
  */
23
26
 
24
27
  import { describe, it, expect } from "vitest";
25
28
  import { spinUp } from "../harness.js";
26
29
 
27
- // Skipped in CI: both cases failed in #1132 overnight (steering didn't
28
- // surface "md5"; queued didn't produce the expected fresh-task reply).
29
- // May be real classification bugs, may be prompt fragility — neither
30
- // has been root-caused. Excluded from the buildkite gate so it doesn't
31
- // block every PR touching telegram-plugin/. Run locally via
32
- // `bun run test:uat` once classification has been investigated.
33
- describe.skip("uat: rapid follow-ups — steering vs queued", () => {
30
+ describe("uat: rapid follow-ups steering vs queued classification", () => {
34
31
  it(
35
- "follow-up WITHOUT /queue → agent treats as steering",
32
+ "follow-up with /steer prefix → agent self-narrates as steering",
36
33
  async () => {
37
34
  const sc = await spinUp({ agent: "test-harness" });
38
35
  try {
@@ -43,26 +40,39 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
43
40
  + "Show the work step by step with a 2-second pause between.",
44
41
  );
45
42
  await new Promise((r) => setTimeout(r, 3_000));
46
- // Steer: change the algorithm
47
- await sc.sendDM("actually use md5 not sha256");
43
+ // Steer: change the algorithm using the explicit /steer prefix.
44
+ await sc.sendDM("/steer actually use md5 not sha256");
48
45
 
49
- // The agent should reply mentioning md5 (the steered
50
- // algorithm), AND ideally surface the italic classification
51
- // line per the prompt.
52
- const reply = await sc.expectMessage(/md5/i, {
53
- from: "bot",
54
- timeout: 120_000,
55
- });
46
+ // The agent should reply mentioning md5 AND surface the italic
47
+ // classification line per the prompt
48
+ // ("_↪️ treating as steer on the prior task_" or similar).
49
+ // We match either explicit-steer narration OR the steer emoji
50
+ // (`↪️`) to allow for natural-language variation while still
51
+ // failing if no narration appears (the previous version of
52
+ // this UAT was too loose — bare `/md5/i` passed by coincidence
53
+ // on the queued path).
54
+ const reply = await sc.expectMessage(
55
+ (m) => {
56
+ const txt = m.text;
57
+ const mentionsMd5 = /\bmd5\b/i.test(txt);
58
+ const narratesSteer =
59
+ /↪️|\bsteer(ing)?\b|continuing the (prior|original|in-flight) task|amendment|course[- ]correct/i.test(
60
+ txt,
61
+ );
62
+ return mentionsMd5 && narratesSteer;
63
+ },
64
+ { from: "bot", timeout: 120_000 },
65
+ );
56
66
  expect(reply.text.toLowerCase()).toContain("md5");
57
67
  } finally {
58
68
  await sc.tearDown();
59
69
  }
60
70
  },
61
- 150_000,
71
+ 180_000,
62
72
  );
63
73
 
64
74
  it(
65
- "follow-up WITH /queue → agent treats as new task",
75
+ "follow-up with no prefix mid-turn → agent treats as queued (new task)",
66
76
  async () => {
67
77
  const sc = await spinUp({ agent: "test-harness" });
68
78
  try {
@@ -71,9 +81,10 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
71
81
  + "Use bash.",
72
82
  );
73
83
  await new Promise((r) => setTimeout(r, 3_000));
74
- // Queued: completely independent task. The agent should NOT
75
- // reference the counting task.
76
- await sc.sendDM("/queue what is 2+2?");
84
+ // No prefix — the default-flipped contract says this is a
85
+ // QUEUED new task. The agent should NOT reference the
86
+ // counting work.
87
+ await sc.sendDM("what is 2+2?");
77
88
 
78
89
  // First reply should be from the counting task (still
79
90
  // in-flight). Then a second reply for the queued task.
@@ -81,16 +92,32 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
81
92
  from: "bot",
82
93
  timeout: 60_000,
83
94
  });
84
- // Then we expect another reply (the queued task's answer).
85
- // /queue is treated as a new task per the prompt — answer
86
- // should be "4" or mention 2+2.
95
+
96
+ // Second reply: the queued task's answer. We want to see
97
+ // EITHER the italic queued-narration line OR a fresh "4"
98
+ // answer that doesn't reference the counting work.
87
99
  const secondReply = await sc.expectMessage(
88
- (m) =>
89
- m.messageId > firstReply.messageId
90
- && /\b4\b|two\s+plus\s+two|2\s*\+\s*2/i.test(m.text),
100
+ (m) => {
101
+ if (m.messageId <= firstReply.messageId) return false;
102
+ const txt = m.text;
103
+ const answersTheQuestion =
104
+ /\b4\b|\bfour\b|two\s+plus\s+two|2\s*\+\s*2/i.test(txt);
105
+ const narratesQueued =
106
+ /📥|\bqueued\b|new\s+(?:independent\s+)?task|fresh\s+task/i.test(
107
+ txt,
108
+ );
109
+ // Pass if either: the explicit narration is present, OR the
110
+ // reply answers cleanly without referencing the counting
111
+ // task. The latter is the substantive behavioural check —
112
+ // the queued task is isolated from the in-flight context.
113
+ const isolatedFromCounting = !/\bcount(ing)?\b|\bsleep\b/i.test(
114
+ txt,
115
+ );
116
+ return answersTheQuestion && (narratesQueued || isolatedFromCounting);
117
+ },
91
118
  { from: "bot", timeout: 120_000 },
92
119
  );
93
- expect(secondReply.text).toMatch(/4|two|2\s*\+\s*2/i);
120
+ expect(secondReply.text).toMatch(/4|four|2\s*\+\s*2/i);
94
121
  } finally {
95
122
  await sc.tearDown();
96
123
  }