switchroom 0.8.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/README.md +54 -61
  2. package/bin/timezone-hook.sh +9 -7
  3. package/dist/agent-scheduler/index.js +285 -45
  4. package/dist/auth-broker/index.js +13932 -0
  5. package/dist/cli/drive-write-pretool.mjs +5418 -0
  6. package/dist/cli/switchroom.js +8890 -5560
  7. package/dist/host-control/main.js +582 -43
  8. package/dist/vault/approvals/kernel-server.js +276 -47
  9. package/dist/vault/broker/server.js +333 -69
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +6 -4
  16. package/profiles/_base/start.sh.hbs +3 -3
  17. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  18. package/profiles/default/CLAUDE.md +10 -0
  19. package/profiles/default/CLAUDE.md.hbs +16 -0
  20. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  21. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  22. package/skills/buildkite-api/SKILL.md +31 -8
  23. package/skills/buildkite-cli/SKILL.md +27 -9
  24. package/skills/buildkite-migration/SKILL.md +22 -9
  25. package/skills/buildkite-pipelines/SKILL.md +26 -9
  26. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  27. package/skills/buildkite-test-engine/SKILL.md +25 -8
  28. package/skills/docx/SKILL.md +1 -1
  29. package/skills/file-bug/SKILL.md +34 -6
  30. package/skills/humanizer/SKILL.md +15 -0
  31. package/skills/humanizer-calibrate/SKILL.md +7 -1
  32. package/skills/mcp-builder/SKILL.md +1 -1
  33. package/skills/pdf/SKILL.md +1 -1
  34. package/skills/pptx/SKILL.md +1 -1
  35. package/skills/skill-creator/SKILL.md +21 -1
  36. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  37. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  38. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  39. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  40. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  41. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  42. package/skills/switchroom-cli/SKILL.md +63 -64
  43. package/skills/switchroom-health/SKILL.md +23 -10
  44. package/skills/switchroom-install/SKILL.md +3 -3
  45. package/skills/switchroom-manage/SKILL.md +26 -19
  46. package/skills/switchroom-runtime/SKILL.md +67 -15
  47. package/skills/switchroom-status/SKILL.md +26 -1
  48. package/skills/telegram-test-harness/SKILL.md +3 -0
  49. package/skills/webapp-testing/SKILL.md +31 -1
  50. package/skills/xlsx/SKILL.md +1 -1
  51. package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
  52. package/telegram-plugin/admin-commands/index.ts +9 -5
  53. package/telegram-plugin/auth-snapshot-format.ts +612 -0
  54. package/telegram-plugin/auto-fallback-fleet.ts +215 -0
  55. package/telegram-plugin/auto-fallback.ts +28 -301
  56. package/telegram-plugin/dist/gateway/gateway.js +17453 -15100
  57. package/telegram-plugin/fleet-fallback-gate.ts +105 -0
  58. package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
  59. package/telegram-plugin/gateway/approval-callback.ts +31 -3
  60. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  61. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  62. package/telegram-plugin/gateway/auth-command.ts +905 -0
  63. package/telegram-plugin/gateway/auth-line.ts +123 -0
  64. package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
  65. package/telegram-plugin/gateway/boot-card.ts +23 -37
  66. package/telegram-plugin/gateway/boot-probes.ts +9 -12
  67. package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
  68. package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
  69. package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
  70. package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
  71. package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
  72. package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
  73. package/telegram-plugin/gateway/gateway.ts +1156 -938
  74. package/telegram-plugin/gateway/hostd-dispatch.ts +244 -0
  75. package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
  76. package/telegram-plugin/gateway/ipc-server.ts +69 -0
  77. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
  78. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  79. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  80. package/telegram-plugin/model-unavailable.ts +28 -12
  81. package/telegram-plugin/permission-title.ts +56 -0
  82. package/telegram-plugin/quota-check.ts +19 -41
  83. package/telegram-plugin/scripts/build.mjs +0 -1
  84. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  85. package/telegram-plugin/silence-poke.ts +153 -1
  86. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  87. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  88. package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
  89. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  90. package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
  91. package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
  92. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
  93. package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
  94. package/telegram-plugin/tests/boot-probes.test.ts +27 -22
  95. package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
  96. package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
  97. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  98. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  99. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
  100. package/telegram-plugin/tests/silence-poke.test.ts +237 -0
  101. package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
  102. package/telegram-plugin/turn-flush-safety.ts +55 -1
  103. package/telegram-plugin/uat/SETUP.md +35 -1
  104. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  105. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  106. package/telegram-plugin/uat/runners/report.ts +150 -0
  107. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  108. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  109. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  110. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  111. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  112. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
  113. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
  114. package/telegram-plugin/auth-dashboard.ts +0 -1104
  115. package/telegram-plugin/auth-slot-parser.ts +0 -497
  116. package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
  117. package/telegram-plugin/dist/foreman/foreman.js +0 -31358
  118. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  119. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  120. package/telegram-plugin/foreman/foreman.ts +0 -1165
  121. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  122. package/telegram-plugin/foreman/setup-state.ts +0 -239
  123. package/telegram-plugin/foreman/state.ts +0 -203
  124. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  125. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  126. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  127. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  128. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  129. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  130. package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
  131. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  132. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  133. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  134. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  135. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  136. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  137. package/telegram-plugin/tests/setup-state.test.ts +0 -146
@@ -70,6 +70,7 @@ describe('sandbox-hint-posttool', () => {
70
70
  tool_name: 'Bash',
71
71
  tool_use_id: 'toolu_003',
72
72
  tool_response: {
73
+ exit_code: 100,
73
74
  stderr:
74
75
  'E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?',
75
76
  },
@@ -141,15 +142,224 @@ describe('sandbox-hint-posttool', () => {
141
142
  it('caps the scan window for huge tool_response payloads', () => {
142
143
  // 100 KiB of harmless output followed by an EROFS — we cap at 64 KiB
143
144
  // so this should NOT match. Keeps a runaway tool_response from
144
- // pinning the hook on a regex scan.
145
+ // pinning the hook on a regex scan. The exit_code is set so the
146
+ // failure-classifier reaches the scan path — without it, #1303's
147
+ // success-gate would return early for a different reason.
145
148
  const huge = 'x'.repeat(100 * 1024) + ' EROFS happened'
146
149
  const result = runHook({
147
150
  tool_name: 'Bash',
148
151
  tool_use_id: 'toolu_007',
149
- tool_response: { stdout: huge },
152
+ tool_response: { exit_code: 1, stdout: huge },
150
153
  })
151
154
 
152
155
  expect(result.status).toBe(0)
153
156
  expect(result.stdout.trim()).toBe('')
154
157
  })
158
+
159
+ // #1303 — the hook used to fire on every tool whose payload merely
160
+ // MENTIONED EROFS / read-only-fs / EACCES /usr / dpkg, regardless of
161
+ // whether the tool actually failed. Concrete repro: reading a file
162
+ // whose content describes the sandbox model triggered the advisory
163
+ // every time. Fix: classify tool_response as success-or-failure FIRST
164
+ // (only failures can have hit a kernel boundary), AND gate on
165
+ // write-capable tools only (Read/Grep/Glob can't EROFS).
166
+ describe('#1303 — false-positive guard', () => {
167
+ it('does NOT emit when a Read on a file MENTIONS EROFS (Read is not write-capable)', () => {
168
+ const result = runHook({
169
+ tool_name: 'Read',
170
+ tool_use_id: 'toolu_fp_read',
171
+ // Realistic: an Edit on a file whose Read returns content that
172
+ // happens to talk about the sandbox model. Pre-fix this fired.
173
+ tool_response: {
174
+ file: '/state/agent/home/some-doc.md',
175
+ content:
176
+ '# Sandbox notes\n\nWhen a write hits EROFS we say "Read-only file system".\n',
177
+ },
178
+ })
179
+
180
+ expect(result.status).toBe(0)
181
+ expect(result.stdout.trim()).toBe('')
182
+ })
183
+
184
+ it('does NOT emit when a Grep finds a line containing "Read-only file system"', () => {
185
+ const result = runHook({
186
+ tool_name: 'Grep',
187
+ tool_use_id: 'toolu_fp_grep',
188
+ tool_response: { stdout: 'docs/sandbox.md:42: Read-only file system semantics' },
189
+ })
190
+
191
+ expect(result.status).toBe(0)
192
+ expect(result.stdout.trim()).toBe('')
193
+ })
194
+
195
+ it('does NOT emit when a successful Bash mentions EROFS in stdout (exit_code=0)', () => {
196
+ const result = runHook({
197
+ tool_name: 'Bash',
198
+ tool_use_id: 'toolu_fp_bash_success',
199
+ tool_response: {
200
+ exit_code: 0,
201
+ stdout: 'I tested EROFS handling: all good.',
202
+ },
203
+ })
204
+
205
+ expect(result.status).toBe(0)
206
+ expect(result.stdout.trim()).toBe('')
207
+ })
208
+
209
+ it('does NOT emit when a successful Edit echoes new content containing "EROFS"', () => {
210
+ // The Edit tool's tool_response echoes the modified content. If
211
+ // the new content mentions EROFS — e.g. when editing this very
212
+ // hook source — the pre-fix logic fired falsely on every keystroke.
213
+ const result = runHook({
214
+ tool_name: 'Edit',
215
+ tool_use_id: 'toolu_fp_edit_success',
216
+ tool_response: {
217
+ // is_error explicitly false; no error field; no exit_code.
218
+ is_error: false,
219
+ file_path: '/state/agent/home/hook.mjs',
220
+ old_string: '// old',
221
+ new_string: '// new code mentioning EROFS and read-only file system semantics',
222
+ },
223
+ })
224
+
225
+ expect(result.status).toBe(0)
226
+ expect(result.stdout.trim()).toBe('')
227
+ })
228
+
229
+ it('still emits when an Edit FAILED with is_error=true on a real EROFS', () => {
230
+ const result = runHook({
231
+ tool_name: 'Edit',
232
+ tool_use_id: 'toolu_real_failure',
233
+ tool_response: {
234
+ is_error: true,
235
+ error: "EROFS: read-only file system, open '/opt/switchroom/skills/foo.md'",
236
+ },
237
+ })
238
+
239
+ expect(result.status).toBe(0)
240
+ const ctx = parseContext(result.stdout)
241
+ expect(ctx).toContain('Sandbox boundary hit')
242
+ })
243
+
244
+ it('still emits when a Bash FAILED with non-zero exit_code and stderr containing EROFS', () => {
245
+ const result = runHook({
246
+ tool_name: 'Bash',
247
+ tool_use_id: 'toolu_real_bash_failure',
248
+ tool_response: {
249
+ exit_code: 1,
250
+ stderr: "mkdir: cannot create directory '/opt/foo': Read-only file system",
251
+ stdout: '',
252
+ },
253
+ })
254
+
255
+ expect(result.status).toBe(0)
256
+ const ctx = parseContext(result.stdout)
257
+ expect(ctx).toContain('Sandbox boundary hit')
258
+ })
259
+
260
+ it('does NOT emit for tools not in the write-capable allowlist, even on failure-shaped payload', () => {
261
+ // Even a payload that LOOKS like a failure — `is_error: true` —
262
+ // cannot reflect a kernel sandbox hit if the tool isn't write-
263
+ // capable. Read can't EROFS. We refuse to advise.
264
+ const result = runHook({
265
+ tool_name: 'WebFetch',
266
+ tool_use_id: 'toolu_fp_webfetch',
267
+ tool_response: { is_error: true, error: 'EROFS lookalike in HTTP body' },
268
+ })
269
+
270
+ expect(result.status).toBe(0)
271
+ expect(result.stdout.trim()).toBe('')
272
+ })
273
+
274
+ it('DOES emit for an MCP tool failure (proxies can write)', () => {
275
+ const result = runHook({
276
+ tool_name: 'mcp__some-server__write_file',
277
+ tool_use_id: 'toolu_mcp_failure',
278
+ tool_response: {
279
+ is_error: true,
280
+ error: 'EROFS: read-only file system on /opt/foo',
281
+ },
282
+ })
283
+
284
+ expect(result.status).toBe(0)
285
+ const ctx = parseContext(result.stdout)
286
+ expect(ctx).toContain('Sandbox boundary hit')
287
+ })
288
+ })
289
+
290
+ // Direct unit tests on the classifier helper.
291
+ describe('classifyFailure', () => {
292
+ it('returns null for a successful object response', async () => {
293
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
294
+ expect(mod.__internals.classifyFailure({ exit_code: 0, stdout: 'EROFS mentioned' }))
295
+ .toBeNull()
296
+ expect(mod.__internals.classifyFailure({ is_error: false, content: 'EROFS mentioned' }))
297
+ .toBeNull()
298
+ })
299
+
300
+ it('returns a structured-failure for is_error=true', async () => {
301
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
302
+ const got = mod.__internals.classifyFailure({
303
+ is_error: true,
304
+ error: 'EROFS: ...',
305
+ })
306
+ expect(got?.kind).toBe('structured-failure')
307
+ expect(got?.body).toContain('EROFS')
308
+ })
309
+
310
+ it('returns a structured-failure for non-zero exit_code with stderr', async () => {
311
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
312
+ const got = mod.__internals.classifyFailure({
313
+ exit_code: 1,
314
+ stderr: 'Read-only file system',
315
+ stdout: 'also relevant context',
316
+ })
317
+ expect(got?.kind).toBe('structured-failure')
318
+ // Both stderr and stdout included on failed Bash.
319
+ expect(got?.body).toContain('Read-only file system')
320
+ expect(got?.body).toContain('also relevant context')
321
+ })
322
+
323
+ it('treats a bare string as a candidate to scan', async () => {
324
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
325
+ const got = mod.__internals.classifyFailure('mkdir: Read-only file system')
326
+ expect(got?.kind).toBe('bare-string')
327
+ expect(got?.body).toContain('Read-only file system')
328
+ })
329
+
330
+ it('returns null for null / undefined / primitives', async () => {
331
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
332
+ expect(mod.__internals.classifyFailure(null)).toBeNull()
333
+ expect(mod.__internals.classifyFailure(undefined)).toBeNull()
334
+ expect(mod.__internals.classifyFailure(42)).toBeNull()
335
+ })
336
+ })
337
+
338
+ describe('isWriteCapableTool', () => {
339
+ it('returns true for the canonical write tools', async () => {
340
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
341
+ for (const n of ['Edit', 'MultiEdit', 'Write', 'NotebookEdit', 'Bash']) {
342
+ expect(mod.__internals.isWriteCapableTool(n)).toBe(true)
343
+ }
344
+ })
345
+
346
+ it('returns false for read-only tools', async () => {
347
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
348
+ for (const n of ['Read', 'Grep', 'Glob', 'WebFetch', 'WebSearch', 'TodoWrite']) {
349
+ expect(mod.__internals.isWriteCapableTool(n)).toBe(false)
350
+ }
351
+ })
352
+
353
+ it('returns true for any MCP tool (proxy writes possible)', async () => {
354
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
355
+ expect(mod.__internals.isWriteCapableTool('mcp__server__do_thing')).toBe(true)
356
+ })
357
+
358
+ it('returns false for empty / non-string', async () => {
359
+ const mod = await import('../hooks/sandbox-hint-posttool.mjs')
360
+ expect(mod.__internals.isWriteCapableTool('')).toBe(false)
361
+ expect(mod.__internals.isWriteCapableTool(null as any)).toBe(false)
362
+ expect(mod.__internals.isWriteCapableTool(undefined as any)).toBe(false)
363
+ })
364
+ })
155
365
  })
@@ -4,6 +4,9 @@ import {
4
4
  noteOutbound,
5
5
  noteSubagentDispatch,
6
6
  noteThinking,
7
+ noteToolStart,
8
+ noteToolEnd,
9
+ noteToolLabel,
7
10
  consumeArmedPoke,
8
11
  endTurn,
9
12
  silencePokeEnabled,
@@ -309,6 +312,240 @@ describe('silence-poke — abnormal turn-end invariants (CC-5 follow-up)', () =>
309
312
  ).toHaveLength(1) // unchanged: only the original soft
310
313
  expect(fx.fallbacks).toHaveLength(0)
311
314
  })
315
+
316
+ // #1289: the flush-backstop turn-end branch in the gateway (the path
317
+ // taken when the agent emits assistant text but never calls the reply
318
+ // tool) was retrofitted in #1067 to null `currentTurn` early but never
319
+ // had `silencePoke.endTurn` added — leaving state2 populated so the
320
+ // 300s framework fallback fired after the gateway already flushed the
321
+ // captured prose and considered the turn over. Pin the contract at
322
+ // the silence-poke level: a turn that records an outbound (the
323
+ // flushed message) and then calls endTurn must not later fire a
324
+ // fallback even if 300s elapses from the original turn start.
325
+ it('#1289: flush-backstop turn-end (outbound + endTurn) suppresses the 300s fallback', () => {
326
+ const fx = setupDeps()
327
+ startTurn('k', 0)
328
+ // Some time passes while the agent generates prose without calling
329
+ // the reply tool. No soft/firm armed yet.
330
+ __tickForTests(60_000)
331
+ // Gateway turn-flush fires: captured text is sent as an outbound,
332
+ // then the flush branch nulls currentTurn AND (post-fix) calls
333
+ // signalTracker.clear + silencePoke.endTurn.
334
+ noteOutbound('k', 60_000)
335
+ endTurn('k')
336
+ // 300s elapses from the original turn start. Pre-fix: the framework
337
+ // fallback fired here. Post-fix: the state is drained, no fallback.
338
+ __tickForTests(240_000)
339
+ expect(fx.fallbacks).toHaveLength(0)
340
+ expect(
341
+ fx.emitted.filter((e) => e.kind === 'silence_fallback_sent'),
342
+ ).toHaveLength(0)
343
+ })
344
+ })
345
+
346
+ // #1292 — drive a deterministic, tool-aware fallback message from the
347
+ // gateway's `tool_use` / `tool_result` event stream. The progress card
348
+ // was retired in #1122 PR3 in favour of the conversational shape; the
349
+ // remaining honesty gap was that the 300s framework fallback said
350
+ // "still working… no update in 5 min" on turns where the agent was
351
+ // clearly grinding through tool calls. These tests pin the behaviour:
352
+ // the silence clock is NOT reset by tool churn (header invariant
353
+ // preserved), but the fallback message body becomes tool-aware so the
354
+ // user sees the actual observable.
355
+ describe('silence-poke — #1292 tool-aware framework fallback', () => {
356
+ it('fallback context exposes in-flight tool snapshot with duration', () => {
357
+ const fx = setupDeps()
358
+ startTurn('k', 0)
359
+ noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
360
+ __tickForTests(75_000)
361
+ __tickForTests(180_000)
362
+ __tickForTests(305_000)
363
+ expect(fx.fallbacks).toHaveLength(1)
364
+ const ctx = fx.fallbacks[0]!
365
+ expect(ctx.inFlightTools).toHaveLength(1)
366
+ expect(ctx.inFlightTools[0]!.name).toBe('Grep')
367
+ expect(ctx.inFlightTools[0]!.label).toBe('foo')
368
+ expect(ctx.inFlightTools[0]!.durationMs).toBe(305_000 - 30_000)
369
+ })
370
+
371
+ it('formatFrameworkFallbackText names the longest-running tool with duration', () => {
372
+ const text = formatFrameworkFallbackText('working', 305_000, [
373
+ { name: 'Grep', label: '"foo"', durationMs: 275_000 },
374
+ ])
375
+ expect(text).toBe(
376
+ 'running Grep "foo" for 5m (no update from agent in 5 min)',
377
+ )
378
+ })
379
+
380
+ it('multiple in-flight tools render as "+ N more"', () => {
381
+ const text = formatFrameworkFallbackText('working', 305_000, [
382
+ { name: 'Grep', label: '"foo"', durationMs: 275_000 },
383
+ { name: 'Read', label: 'config.ts', durationMs: 120_000 },
384
+ { name: 'Bash', label: null, durationMs: 60_000 },
385
+ ])
386
+ expect(text).toBe(
387
+ 'running Grep "foo" + 2 more for 5m (no update from agent in 5 min)',
388
+ )
389
+ })
390
+
391
+ it('tool with no label renders the bare name', () => {
392
+ const text = formatFrameworkFallbackText('working', 305_000, [
393
+ { name: 'Bash', label: null, durationMs: 305_000 },
394
+ ])
395
+ expect(text).toBe(
396
+ 'running Bash for 5m (no update from agent in 5 min)',
397
+ )
398
+ })
399
+
400
+ it('empty inFlightTools falls back to the base "still working" wording', () => {
401
+ expect(
402
+ formatFrameworkFallbackText('working', 305_000, []),
403
+ ).toBe('still working… (no update from agent in 5 min)')
404
+ expect(
405
+ formatFrameworkFallbackText('thinking', 305_000, []),
406
+ ).toBe('still thinking… (no update from agent in 5 min)')
407
+ // No third arg → same as empty array.
408
+ expect(
409
+ formatFrameworkFallbackText('working', 305_000),
410
+ ).toBe('still working… (no update from agent in 5 min)')
411
+ })
412
+
413
+ it('tool-aware wording wins over "thinking" — the actual observable beats the inferred kind', () => {
414
+ const text = formatFrameworkFallbackText('thinking', 305_000, [
415
+ { name: 'Grep', label: '"foo"', durationMs: 305_000 },
416
+ ])
417
+ expect(text.startsWith('running Grep')).toBe(true)
418
+ expect(text).not.toContain('still thinking')
419
+ })
420
+
421
+ it('tool completed before the fallback → empty snapshot → base wording', () => {
422
+ const fx = setupDeps()
423
+ startTurn('k', 0)
424
+ noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
425
+ noteToolEnd('k', 'T1', 200_000)
426
+ __tickForTests(75_000)
427
+ __tickForTests(180_000)
428
+ __tickForTests(305_000)
429
+ expect(fx.fallbacks).toHaveLength(1)
430
+ expect(fx.fallbacks[0]!.inFlightTools).toHaveLength(0)
431
+ })
432
+
433
+ it('late noteToolLabel updates the in-flight entry in place', () => {
434
+ const fx = setupDeps()
435
+ startTurn('k', 0)
436
+ noteToolStart('k', 'T1', 'Grep', null, 30_000)
437
+ noteToolLabel('k', 'T1', '"refined-from-sidecar"')
438
+ __tickForTests(75_000)
439
+ __tickForTests(180_000)
440
+ __tickForTests(305_000)
441
+ expect(fx.fallbacks[0]!.inFlightTools[0]!.label).toBe('"refined-from-sidecar"')
442
+ })
443
+
444
+ it('endTurn drains inFlightTools', () => {
445
+ setupDeps()
446
+ startTurn('k', 0)
447
+ noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
448
+ expect(__getStateForTests('k')!.inFlightTools.size).toBe(1)
449
+ endTurn('k')
450
+ // A fresh turn under the same key has an empty map.
451
+ startTurn('k', 1_000_000)
452
+ expect(__getStateForTests('k')!.inFlightTools.size).toBe(0)
453
+ })
454
+
455
+ it('parallel tools sort by startedAt ascending — longest-running rendered first', () => {
456
+ const fx = setupDeps()
457
+ startTurn('k', 0)
458
+ // Order intentionally NOT chronological to verify sort.
459
+ noteToolStart('k', 'T-late', 'Read', 'recent.ts', 250_000)
460
+ noteToolStart('k', 'T-early', 'Grep', '"oldest"', 20_000)
461
+ noteToolStart('k', 'T-mid', 'Bash', null, 100_000)
462
+ __tickForTests(75_000)
463
+ __tickForTests(180_000)
464
+ __tickForTests(305_000)
465
+ const snap = fx.fallbacks[0]!.inFlightTools
466
+ expect(snap.map(t => t.name)).toEqual(['Grep', 'Bash', 'Read'])
467
+ })
468
+
469
+ it('tool churn does NOT reset the silence clock (header invariant preserved)', () => {
470
+ // The whole point of #1292 (b) over (a) is that we enrich the
471
+ // fallback TEXT, never the timing. Tool activity must not delay
472
+ // or suppress the soft/firm/fallback escalation ladder.
473
+ const fx = setupDeps()
474
+ startTurn('k', 0)
475
+ // A constant stream of tool churn through the entire 5min window —
476
+ // each tool ends quickly so inFlightTools is empty by fallback.
477
+ for (let t = 5_000; t <= 295_000; t += 10_000) {
478
+ noteToolStart('k', `T-${t}`, 'Grep', 'foo', t)
479
+ noteToolEnd('k', `T-${t}`, t + 500)
480
+ }
481
+ __tickForTests(75_000) // soft
482
+ __tickForTests(180_000) // firm
483
+ __tickForTests(305_000) // fallback
484
+ expect(
485
+ fx.emitted.filter(e => e.kind === 'silence_poke_fired'),
486
+ ).toHaveLength(2)
487
+ expect(fx.fallbacks).toHaveLength(1)
488
+ })
489
+
490
+ it('Task tool sets subagentDispatchActive AND populates inFlightTools', () => {
491
+ // Two flags are independent: the soft-threshold extension still
492
+ // works for sub-agent waits (existing behaviour), AND the fallback
493
+ // message names the Task tool as the actual observable.
494
+ const fx = setupDeps()
495
+ startTurn('k', 0)
496
+ // Gateway calls both for a Task tool_use (mirrors the wiring at
497
+ // gateway.ts onSessionEvent).
498
+ noteSubagentDispatch('k')
499
+ noteToolStart('k', 'T1', 'Task', 'spinning up @researcher', 10_000)
500
+ // Soft threshold extends to 300s under subagent — so no soft poke
501
+ // fires at 75s and no firm fires at 180s (firm requires pokesFired===1,
502
+ // i.e. soft must fire first). Once we cross the 300s subagent-soft,
503
+ // soft fires; each tick fires one level via the `continue` in tick(),
504
+ // so we need three ticks to walk soft → firm → fallback.
505
+ __tickForTests(75_000) // suppressed by subagent
506
+ __tickForTests(180_000) // still suppressed
507
+ __tickForTests(305_000) // soft fires (subagent soft = 300s)
508
+ __tickForTests(305_001) // firm fires
509
+ __tickForTests(305_002) // fallback fires
510
+ expect(fx.fallbacks).toHaveLength(1)
511
+ const snap = fx.fallbacks[0]!.inFlightTools
512
+ expect(snap[0]!.name).toBe('Task')
513
+ expect(snap[0]!.label).toBe('spinning up @researcher')
514
+ })
515
+
516
+ it('noteToolStart on an unknown key is a no-op (no crash, no state)', () => {
517
+ setupDeps()
518
+ // No startTurn first — silence-poke ignores the call.
519
+ noteToolStart('k-never-started', 'T1', 'Grep', 'foo', 30_000)
520
+ expect(__getStateForTests('k-never-started')).toBeUndefined()
521
+ })
522
+
523
+ it('noteToolEnd on an unknown id is a no-op', () => {
524
+ setupDeps()
525
+ startTurn('k', 0)
526
+ noteToolEnd('k', 'never-started', 100_000)
527
+ expect(__getStateForTests('k')!.inFlightTools.size).toBe(0)
528
+ })
529
+
530
+ it('formatFrameworkFallbackText sub-minute durations render as "Ns"', () => {
531
+ const text = formatFrameworkFallbackText('working', 305_000, [
532
+ { name: 'Grep', label: 'foo', durationMs: 12_000 },
533
+ ])
534
+ expect(text).toBe(
535
+ 'running Grep foo for 12s (no update from agent in 5 min)',
536
+ )
537
+ })
538
+
539
+ it('formatFrameworkFallbackText truncates very long labels', () => {
540
+ const longLabel = '"' + 'x'.repeat(120) + '"'
541
+ const text = formatFrameworkFallbackText('working', 305_000, [
542
+ { name: 'Grep', label: longLabel, durationMs: 305_000 },
543
+ ])
544
+ // 60-char cap (with trailing ellipsis) — verify clipping without
545
+ // pinning exact bytes.
546
+ expect(text.length).toBeLessThan(120)
547
+ expect(text).toContain('…')
548
+ })
312
549
  })
313
550
 
314
551
  describe('silence-poke — consumeArmedPoke draining', () => {
@@ -137,6 +137,118 @@ describe('decideTurnFlush', () => {
137
137
  }),
138
138
  ).toEqual({ kind: 'skip', reason: 'reply-called' })
139
139
  })
140
+
141
+ // #1291 — when the model emits a soft-commit reply ("on it, back in a
142
+ // few") and then composes the real substantive answer in terminal text
143
+ // only, the pre-#1291 behaviour skipped flush entirely because
144
+ // replyCalled was true. The fix: track capturedTextLenAtLastReply and
145
+ // flush the post-reply tail when it meets the substantive threshold.
146
+ describe('#1291 — post-reply tail flush', () => {
147
+ it('flushes the post-reply tail when it meets the substantive threshold', () => {
148
+ const decision = decideTurnFlush({
149
+ chatId: '700',
150
+ replyCalled: true,
151
+ // Index 0 = the captured text BEFORE the reply tool was called
152
+ // (some thinking-as-text). Indices 1..2 are post-reply.
153
+ capturedText: [
154
+ 'thinking out loud before the reply',
155
+ 'Now here is the actual substantive answer the model composed ',
156
+ 'in terminal text only after the interim reply call.',
157
+ ],
158
+ capturedTextLenAtLastReply: 1,
159
+ })
160
+ expect(decision).toEqual({
161
+ kind: 'flush',
162
+ text:
163
+ 'Now here is the actual substantive answer the model composed ' +
164
+ '\nin terminal text only after the interim reply call.',
165
+ })
166
+ })
167
+
168
+ it('skips with reply-called-no-new-text when post-reply tail is below threshold', () => {
169
+ const decision = decideTurnFlush({
170
+ chatId: '701',
171
+ replyCalled: true,
172
+ capturedText: ['the pre-reply scratch', 'ok.'], // tail = "ok." (3 chars)
173
+ capturedTextLenAtLastReply: 1,
174
+ })
175
+ expect(decision).toEqual({
176
+ kind: 'skip',
177
+ reason: 'reply-called-no-new-text',
178
+ })
179
+ })
180
+
181
+ it('skips with reply-called when there is no post-reply text at all', () => {
182
+ const decision = decideTurnFlush({
183
+ chatId: '702',
184
+ replyCalled: true,
185
+ capturedText: ['everything-was-before-the-reply'],
186
+ capturedTextLenAtLastReply: 1, // tail slice is empty
187
+ })
188
+ expect(decision).toEqual({ kind: 'skip', reason: 'reply-called' })
189
+ })
190
+
191
+ it('post-reply tail honors a silent marker (skip)', () => {
192
+ const decision = decideTurnFlush({
193
+ chatId: '703',
194
+ replyCalled: true,
195
+ capturedText: ['real answer pre-reply', 'NO_REPLY'],
196
+ capturedTextLenAtLastReply: 1,
197
+ replyCalledTailMinChars: 1, // force the marker check
198
+ })
199
+ expect(decision).toEqual({ kind: 'skip', reason: 'silent-marker' })
200
+ })
201
+
202
+ it('post-reply tail with null chatId still skips (no-inbound-chat)', () => {
203
+ const decision = decideTurnFlush({
204
+ chatId: null,
205
+ replyCalled: true,
206
+ capturedText: [
207
+ 'pre',
208
+ 'this tail would have been substantive enough to flush normally',
209
+ ],
210
+ capturedTextLenAtLastReply: 1,
211
+ })
212
+ expect(decision).toEqual({ kind: 'skip', reason: 'no-inbound-chat' })
213
+ })
214
+
215
+ it('preserves pre-#1291 behaviour when capturedTextLenAtLastReply is omitted', () => {
216
+ // Legacy caller doesn't track the marker — defaults to
217
+ // capturedText.length, so the tail slice is empty and we skip
218
+ // with reason 'reply-called' (the original behaviour).
219
+ const decision = decideTurnFlush({
220
+ chatId: '704',
221
+ replyCalled: true,
222
+ capturedText: ['some answer the model emitted'],
223
+ })
224
+ expect(decision).toEqual({ kind: 'skip', reason: 'reply-called' })
225
+ })
226
+
227
+ it('respects a custom replyCalledTailMinChars threshold', () => {
228
+ const decision = decideTurnFlush({
229
+ chatId: '705',
230
+ replyCalled: true,
231
+ capturedText: ['pre-reply', 'short but substantive in this test'],
232
+ capturedTextLenAtLastReply: 1,
233
+ replyCalledTailMinChars: 10,
234
+ })
235
+ expect(decision.kind).toBe('flush')
236
+ })
237
+
238
+ it('feature flag off still wins over post-reply tail flush', () => {
239
+ const decision = decideTurnFlush({
240
+ chatId: '706',
241
+ replyCalled: true,
242
+ capturedText: [
243
+ 'pre',
244
+ 'a long substantive post-reply tail that would otherwise flush',
245
+ ],
246
+ capturedTextLenAtLastReply: 1,
247
+ flushEnabled: false,
248
+ })
249
+ expect(decision).toEqual({ kind: 'skip', reason: 'flag-disabled' })
250
+ })
251
+ })
140
252
  })
141
253
 
142
254
  describe('isSilentFlushMarker', () => {