switchroom 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/agent-scheduler/index.js +2 -2
- package/dist/auth-broker/index.js +125 -3
- package/dist/cli/drive-write-pretool.mjs +5436 -0
- package/dist/cli/switchroom.js +231 -29
- package/dist/host-control/main.js +2 -2
- package/dist/vault/approvals/kernel-server.js +2 -2
- package/dist/vault/broker/server.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
- package/telegram-plugin/admin-commands/index.ts +2 -0
- package/telegram-plugin/auth-snapshot-format.ts +612 -0
- package/telegram-plugin/auto-fallback-fleet.ts +215 -0
- package/telegram-plugin/auto-fallback.ts +28 -301
- package/telegram-plugin/dist/gateway/gateway.js +4314 -2143
- package/telegram-plugin/fleet-fallback-gate.ts +105 -0
- package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
- package/telegram-plugin/gateway/approval-callback.ts +31 -3
- package/telegram-plugin/gateway/auth-broker-client.ts +2 -0
- package/telegram-plugin/gateway/auth-command.ts +131 -10
- package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
- package/telegram-plugin/gateway/boot-card.ts +1 -1
- package/telegram-plugin/gateway/boot-probes.ts +6 -9
- package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
- package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
- package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
- package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
- package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
- package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
- package/telegram-plugin/gateway/gateway.ts +903 -173
- package/telegram-plugin/gateway/hostd-dispatch.ts +137 -2
- package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
- package/telegram-plugin/gateway/ipc-server.ts +69 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
- package/telegram-plugin/model-unavailable.ts +28 -12
- package/telegram-plugin/silence-poke.ts +153 -1
- package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
- package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
- package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
- package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
- package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
- package/telegram-plugin/tests/boot-probes.test.ts +16 -18
- package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
- package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
- package/telegram-plugin/tests/silence-poke.test.ts +237 -0
- package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
- package/telegram-plugin/turn-flush-safety.ts +55 -1
- package/telegram-plugin/uat/SETUP.md +16 -12
- package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
- package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
- package/telegram-plugin/tests/hostd-dispatch.test.ts +0 -129
|
@@ -4,6 +4,9 @@ import {
|
|
|
4
4
|
noteOutbound,
|
|
5
5
|
noteSubagentDispatch,
|
|
6
6
|
noteThinking,
|
|
7
|
+
noteToolStart,
|
|
8
|
+
noteToolEnd,
|
|
9
|
+
noteToolLabel,
|
|
7
10
|
consumeArmedPoke,
|
|
8
11
|
endTurn,
|
|
9
12
|
silencePokeEnabled,
|
|
@@ -309,6 +312,240 @@ describe('silence-poke — abnormal turn-end invariants (CC-5 follow-up)', () =>
|
|
|
309
312
|
).toHaveLength(1) // unchanged: only the original soft
|
|
310
313
|
expect(fx.fallbacks).toHaveLength(0)
|
|
311
314
|
})
|
|
315
|
+
|
|
316
|
+
// #1289: the flush-backstop turn-end branch in the gateway (the path
|
|
317
|
+
// taken when the agent emits assistant text but never calls the reply
|
|
318
|
+
// tool) was retrofitted in #1067 to null `currentTurn` early but never
|
|
319
|
+
// had `silencePoke.endTurn` added — leaving the per-key silence-poke state populated so the
|
|
320
|
+
// 300s framework fallback fired after the gateway already flushed the
|
|
321
|
+
// captured prose and considered the turn over. Pin the contract at
|
|
322
|
+
// the silence-poke level: a turn that records an outbound (the
|
|
323
|
+
// flushed message) and then calls endTurn must not later fire a
|
|
324
|
+
// fallback even if 300s elapses from the original turn start.
|
|
325
|
+
it('#1289: flush-backstop turn-end (outbound + endTurn) suppresses the 300s fallback', () => {
|
|
326
|
+
const fx = setupDeps()
|
|
327
|
+
startTurn('k', 0)
|
|
328
|
+
// Some time passes while the agent generates prose without calling
|
|
329
|
+
// the reply tool. No soft/firm armed yet.
|
|
330
|
+
__tickForTests(60_000)
|
|
331
|
+
// Gateway turn-flush fires: captured text is sent as an outbound,
|
|
332
|
+
// then the flush branch nulls currentTurn AND (post-fix) calls
|
|
333
|
+
// signalTracker.clear + silencePoke.endTurn.
|
|
334
|
+
noteOutbound('k', 60_000)
|
|
335
|
+
endTurn('k')
|
|
336
|
+
// 300s elapses from the original turn start. Pre-fix: the framework
|
|
337
|
+
// fallback fired here. Post-fix: the state is drained, no fallback.
|
|
338
|
+
__tickForTests(240_000)
|
|
339
|
+
expect(fx.fallbacks).toHaveLength(0)
|
|
340
|
+
expect(
|
|
341
|
+
fx.emitted.filter((e) => e.kind === 'silence_fallback_sent'),
|
|
342
|
+
).toHaveLength(0)
|
|
343
|
+
})
|
|
344
|
+
})
|
|
345
|
+
|
|
346
|
+
// #1292 — drive a deterministic, tool-aware fallback message from the
|
|
347
|
+
// gateway's `tool_use` / `tool_result` event stream. The progress card
|
|
348
|
+
// was retired in #1122 PR3 in favour of the conversational shape; the
|
|
349
|
+
// remaining honesty gap was that the 300s framework fallback said
|
|
350
|
+
// "still working… no update in 5 min" on turns where the agent was
|
|
351
|
+
// clearly grinding through tool calls. These tests pin the behaviour:
|
|
352
|
+
// the silence clock is NOT reset by tool churn (header invariant
|
|
353
|
+
// preserved), but the fallback message body becomes tool-aware so the
|
|
354
|
+
// user sees the actual observable.
|
|
355
|
+
describe('silence-poke — #1292 tool-aware framework fallback', () => {
|
|
356
|
+
it('fallback context exposes in-flight tool snapshot with duration', () => {
|
|
357
|
+
const fx = setupDeps()
|
|
358
|
+
startTurn('k', 0)
|
|
359
|
+
noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
|
|
360
|
+
__tickForTests(75_000)
|
|
361
|
+
__tickForTests(180_000)
|
|
362
|
+
__tickForTests(305_000)
|
|
363
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
364
|
+
const ctx = fx.fallbacks[0]!
|
|
365
|
+
expect(ctx.inFlightTools).toHaveLength(1)
|
|
366
|
+
expect(ctx.inFlightTools[0]!.name).toBe('Grep')
|
|
367
|
+
expect(ctx.inFlightTools[0]!.label).toBe('foo')
|
|
368
|
+
expect(ctx.inFlightTools[0]!.durationMs).toBe(305_000 - 30_000)
|
|
369
|
+
})
|
|
370
|
+
|
|
371
|
+
it('formatFrameworkFallbackText names the longest-running tool with duration', () => {
|
|
372
|
+
const text = formatFrameworkFallbackText('working', 305_000, [
|
|
373
|
+
{ name: 'Grep', label: '"foo"', durationMs: 275_000 },
|
|
374
|
+
])
|
|
375
|
+
expect(text).toBe(
|
|
376
|
+
'running Grep "foo" for 5m (no update from agent in 5 min)',
|
|
377
|
+
)
|
|
378
|
+
})
|
|
379
|
+
|
|
380
|
+
it('multiple in-flight tools render as "+ N more"', () => {
|
|
381
|
+
const text = formatFrameworkFallbackText('working', 305_000, [
|
|
382
|
+
{ name: 'Grep', label: '"foo"', durationMs: 275_000 },
|
|
383
|
+
{ name: 'Read', label: 'config.ts', durationMs: 120_000 },
|
|
384
|
+
{ name: 'Bash', label: null, durationMs: 60_000 },
|
|
385
|
+
])
|
|
386
|
+
expect(text).toBe(
|
|
387
|
+
'running Grep "foo" + 2 more for 5m (no update from agent in 5 min)',
|
|
388
|
+
)
|
|
389
|
+
})
|
|
390
|
+
|
|
391
|
+
it('tool with no label renders the bare name', () => {
|
|
392
|
+
const text = formatFrameworkFallbackText('working', 305_000, [
|
|
393
|
+
{ name: 'Bash', label: null, durationMs: 305_000 },
|
|
394
|
+
])
|
|
395
|
+
expect(text).toBe(
|
|
396
|
+
'running Bash for 5m (no update from agent in 5 min)',
|
|
397
|
+
)
|
|
398
|
+
})
|
|
399
|
+
|
|
400
|
+
it('empty inFlightTools falls back to the base "still working" wording', () => {
|
|
401
|
+
expect(
|
|
402
|
+
formatFrameworkFallbackText('working', 305_000, []),
|
|
403
|
+
).toBe('still working… (no update from agent in 5 min)')
|
|
404
|
+
expect(
|
|
405
|
+
formatFrameworkFallbackText('thinking', 305_000, []),
|
|
406
|
+
).toBe('still thinking… (no update from agent in 5 min)')
|
|
407
|
+
// No third arg → same as empty array.
|
|
408
|
+
expect(
|
|
409
|
+
formatFrameworkFallbackText('working', 305_000),
|
|
410
|
+
).toBe('still working… (no update from agent in 5 min)')
|
|
411
|
+
})
|
|
412
|
+
|
|
413
|
+
it('tool-aware wording wins over "thinking" — the actual observable beats the inferred kind', () => {
|
|
414
|
+
const text = formatFrameworkFallbackText('thinking', 305_000, [
|
|
415
|
+
{ name: 'Grep', label: '"foo"', durationMs: 305_000 },
|
|
416
|
+
])
|
|
417
|
+
expect(text.startsWith('running Grep')).toBe(true)
|
|
418
|
+
expect(text).not.toContain('still thinking')
|
|
419
|
+
})
|
|
420
|
+
|
|
421
|
+
it('tool completed before the fallback → empty snapshot → base wording', () => {
|
|
422
|
+
const fx = setupDeps()
|
|
423
|
+
startTurn('k', 0)
|
|
424
|
+
noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
|
|
425
|
+
noteToolEnd('k', 'T1', 200_000)
|
|
426
|
+
__tickForTests(75_000)
|
|
427
|
+
__tickForTests(180_000)
|
|
428
|
+
__tickForTests(305_000)
|
|
429
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
430
|
+
expect(fx.fallbacks[0]!.inFlightTools).toHaveLength(0)
|
|
431
|
+
})
|
|
432
|
+
|
|
433
|
+
it('late noteToolLabel updates the in-flight entry in place', () => {
|
|
434
|
+
const fx = setupDeps()
|
|
435
|
+
startTurn('k', 0)
|
|
436
|
+
noteToolStart('k', 'T1', 'Grep', null, 30_000)
|
|
437
|
+
noteToolLabel('k', 'T1', '"refined-from-sidecar"')
|
|
438
|
+
__tickForTests(75_000)
|
|
439
|
+
__tickForTests(180_000)
|
|
440
|
+
__tickForTests(305_000)
|
|
441
|
+
expect(fx.fallbacks[0]!.inFlightTools[0]!.label).toBe('"refined-from-sidecar"')
|
|
442
|
+
})
|
|
443
|
+
|
|
444
|
+
it('endTurn drains inFlightTools', () => {
|
|
445
|
+
setupDeps()
|
|
446
|
+
startTurn('k', 0)
|
|
447
|
+
noteToolStart('k', 'T1', 'Grep', 'foo', 30_000)
|
|
448
|
+
expect(__getStateForTests('k')!.inFlightTools.size).toBe(1)
|
|
449
|
+
endTurn('k')
|
|
450
|
+
// A fresh turn under the same key has an empty map.
|
|
451
|
+
startTurn('k', 1_000_000)
|
|
452
|
+
expect(__getStateForTests('k')!.inFlightTools.size).toBe(0)
|
|
453
|
+
})
|
|
454
|
+
|
|
455
|
+
it('parallel tools sort by startedAt ascending — longest-running rendered first', () => {
|
|
456
|
+
const fx = setupDeps()
|
|
457
|
+
startTurn('k', 0)
|
|
458
|
+
// Order intentionally NOT chronological to verify sort.
|
|
459
|
+
noteToolStart('k', 'T-late', 'Read', 'recent.ts', 250_000)
|
|
460
|
+
noteToolStart('k', 'T-early', 'Grep', '"oldest"', 20_000)
|
|
461
|
+
noteToolStart('k', 'T-mid', 'Bash', null, 100_000)
|
|
462
|
+
__tickForTests(75_000)
|
|
463
|
+
__tickForTests(180_000)
|
|
464
|
+
__tickForTests(305_000)
|
|
465
|
+
const snap = fx.fallbacks[0]!.inFlightTools
|
|
466
|
+
expect(snap.map(t => t.name)).toEqual(['Grep', 'Bash', 'Read'])
|
|
467
|
+
})
|
|
468
|
+
|
|
469
|
+
it('tool churn does NOT reset the silence clock (header invariant preserved)', () => {
|
|
470
|
+
// The whole point of #1292 (b) over (a) is that we enrich the
|
|
471
|
+
// fallback TEXT, never the timing. Tool activity must not delay
|
|
472
|
+
// or suppress the soft/firm/fallback escalation ladder.
|
|
473
|
+
const fx = setupDeps()
|
|
474
|
+
startTurn('k', 0)
|
|
475
|
+
// A constant stream of tool churn through the entire 5min window —
|
|
476
|
+
// each tool ends quickly so inFlightTools is empty by fallback.
|
|
477
|
+
for (let t = 5_000; t <= 295_000; t += 10_000) {
|
|
478
|
+
noteToolStart('k', `T-${t}`, 'Grep', 'foo', t)
|
|
479
|
+
noteToolEnd('k', `T-${t}`, t + 500)
|
|
480
|
+
}
|
|
481
|
+
__tickForTests(75_000) // soft
|
|
482
|
+
__tickForTests(180_000) // firm
|
|
483
|
+
__tickForTests(305_000) // fallback
|
|
484
|
+
expect(
|
|
485
|
+
fx.emitted.filter(e => e.kind === 'silence_poke_fired'),
|
|
486
|
+
).toHaveLength(2)
|
|
487
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
488
|
+
})
|
|
489
|
+
|
|
490
|
+
it('Task tool sets subagentDispatchActive AND populates inFlightTools', () => {
|
|
491
|
+
// Two flags are independent: the soft-threshold extension still
|
|
492
|
+
// works for sub-agent waits (existing behaviour), AND the fallback
|
|
493
|
+
// message names the Task tool as the actual observable.
|
|
494
|
+
const fx = setupDeps()
|
|
495
|
+
startTurn('k', 0)
|
|
496
|
+
// Gateway calls both for a Task tool_use (mirrors the wiring at
|
|
497
|
+
// gateway.ts onSessionEvent).
|
|
498
|
+
noteSubagentDispatch('k')
|
|
499
|
+
noteToolStart('k', 'T1', 'Task', 'spinning up @researcher', 10_000)
|
|
500
|
+
// Soft threshold extends to 300s under subagent — so no soft poke
|
|
501
|
+
// fires at 75s and no firm fires at 180s (firm requires pokesFired===1,
|
|
502
|
+
// i.e. soft must fire first). Once we cross the 300s subagent-soft,
|
|
503
|
+
// soft fires; each tick fires one level via the `continue` in tick(),
|
|
504
|
+
// so we need three ticks to walk soft → firm → fallback.
|
|
505
|
+
__tickForTests(75_000) // suppressed by subagent
|
|
506
|
+
__tickForTests(180_000) // still suppressed
|
|
507
|
+
__tickForTests(305_000) // soft fires (subagent soft = 300s)
|
|
508
|
+
__tickForTests(305_001) // firm fires
|
|
509
|
+
__tickForTests(305_002) // fallback fires
|
|
510
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
511
|
+
const snap = fx.fallbacks[0]!.inFlightTools
|
|
512
|
+
expect(snap[0]!.name).toBe('Task')
|
|
513
|
+
expect(snap[0]!.label).toBe('spinning up @researcher')
|
|
514
|
+
})
|
|
515
|
+
|
|
516
|
+
it('noteToolStart on an unknown key is a no-op (no crash, no state)', () => {
|
|
517
|
+
setupDeps()
|
|
518
|
+
// No startTurn first — silence-poke ignores the call.
|
|
519
|
+
noteToolStart('k-never-started', 'T1', 'Grep', 'foo', 30_000)
|
|
520
|
+
expect(__getStateForTests('k-never-started')).toBeUndefined()
|
|
521
|
+
})
|
|
522
|
+
|
|
523
|
+
it('noteToolEnd on an unknown id is a no-op', () => {
|
|
524
|
+
setupDeps()
|
|
525
|
+
startTurn('k', 0)
|
|
526
|
+
noteToolEnd('k', 'never-started', 100_000)
|
|
527
|
+
expect(__getStateForTests('k')!.inFlightTools.size).toBe(0)
|
|
528
|
+
})
|
|
529
|
+
|
|
530
|
+
it('formatFrameworkFallbackText sub-minute durations render as "Ns"', () => {
|
|
531
|
+
const text = formatFrameworkFallbackText('working', 305_000, [
|
|
532
|
+
{ name: 'Grep', label: 'foo', durationMs: 12_000 },
|
|
533
|
+
])
|
|
534
|
+
expect(text).toBe(
|
|
535
|
+
'running Grep foo for 12s (no update from agent in 5 min)',
|
|
536
|
+
)
|
|
537
|
+
})
|
|
538
|
+
|
|
539
|
+
it('formatFrameworkFallbackText truncates very long labels', () => {
|
|
540
|
+
const longLabel = '"' + 'x'.repeat(120) + '"'
|
|
541
|
+
const text = formatFrameworkFallbackText('working', 305_000, [
|
|
542
|
+
{ name: 'Grep', label: longLabel, durationMs: 305_000 },
|
|
543
|
+
])
|
|
544
|
+
// 60-char cap (with trailing ellipsis) — verify clipping without
|
|
545
|
+
// pinning exact bytes.
|
|
546
|
+
expect(text.length).toBeLessThan(120)
|
|
547
|
+
expect(text).toContain('…')
|
|
548
|
+
})
|
|
312
549
|
})
|
|
313
550
|
|
|
314
551
|
describe('silence-poke — consumeArmedPoke draining', () => {
|
|
@@ -137,6 +137,118 @@ describe('decideTurnFlush', () => {
|
|
|
137
137
|
}),
|
|
138
138
|
).toEqual({ kind: 'skip', reason: 'reply-called' })
|
|
139
139
|
})
|
|
140
|
+
|
|
141
|
+
// #1291 — when the model emits a soft-commit reply ("on it, back in a
|
|
142
|
+
// few") and then composes the real substantive answer in terminal text
|
|
143
|
+
// only, the pre-#1291 behaviour skipped flush entirely because
|
|
144
|
+
// replyCalled was true. The fix: track capturedTextLenAtLastReply and
|
|
145
|
+
// flush the post-reply tail when it meets the substantive threshold.
|
|
146
|
+
describe('#1291 — post-reply tail flush', () => {
|
|
147
|
+
it('flushes the post-reply tail when it meets the substantive threshold', () => {
|
|
148
|
+
const decision = decideTurnFlush({
|
|
149
|
+
chatId: '700',
|
|
150
|
+
replyCalled: true,
|
|
151
|
+
// Index 0 = the captured text BEFORE the reply tool was called
|
|
152
|
+
// (some thinking-as-text). Indices 1..2 are post-reply.
|
|
153
|
+
capturedText: [
|
|
154
|
+
'thinking out loud before the reply',
|
|
155
|
+
'Now here is the actual substantive answer the model composed ',
|
|
156
|
+
'in terminal text only after the interim reply call.',
|
|
157
|
+
],
|
|
158
|
+
capturedTextLenAtLastReply: 1,
|
|
159
|
+
})
|
|
160
|
+
expect(decision).toEqual({
|
|
161
|
+
kind: 'flush',
|
|
162
|
+
text:
|
|
163
|
+
'Now here is the actual substantive answer the model composed ' +
|
|
164
|
+
'\nin terminal text only after the interim reply call.',
|
|
165
|
+
})
|
|
166
|
+
})
|
|
167
|
+
|
|
168
|
+
it('skips with reply-called-no-new-text when post-reply tail is below threshold', () => {
|
|
169
|
+
const decision = decideTurnFlush({
|
|
170
|
+
chatId: '701',
|
|
171
|
+
replyCalled: true,
|
|
172
|
+
capturedText: ['the pre-reply scratch', 'ok.'], // tail = "ok." (3 chars)
|
|
173
|
+
capturedTextLenAtLastReply: 1,
|
|
174
|
+
})
|
|
175
|
+
expect(decision).toEqual({
|
|
176
|
+
kind: 'skip',
|
|
177
|
+
reason: 'reply-called-no-new-text',
|
|
178
|
+
})
|
|
179
|
+
})
|
|
180
|
+
|
|
181
|
+
it('skips with reply-called when there is no post-reply text at all', () => {
|
|
182
|
+
const decision = decideTurnFlush({
|
|
183
|
+
chatId: '702',
|
|
184
|
+
replyCalled: true,
|
|
185
|
+
capturedText: ['everything-was-before-the-reply'],
|
|
186
|
+
capturedTextLenAtLastReply: 1, // tail slice is empty
|
|
187
|
+
})
|
|
188
|
+
expect(decision).toEqual({ kind: 'skip', reason: 'reply-called' })
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
it('post-reply tail honors a silent marker (skip)', () => {
|
|
192
|
+
const decision = decideTurnFlush({
|
|
193
|
+
chatId: '703',
|
|
194
|
+
replyCalled: true,
|
|
195
|
+
capturedText: ['real answer pre-reply', 'NO_REPLY'],
|
|
196
|
+
capturedTextLenAtLastReply: 1,
|
|
197
|
+
replyCalledTailMinChars: 1, // force the marker check
|
|
198
|
+
})
|
|
199
|
+
expect(decision).toEqual({ kind: 'skip', reason: 'silent-marker' })
|
|
200
|
+
})
|
|
201
|
+
|
|
202
|
+
it('post-reply tail with null chatId still skips (no-inbound-chat)', () => {
|
|
203
|
+
const decision = decideTurnFlush({
|
|
204
|
+
chatId: null,
|
|
205
|
+
replyCalled: true,
|
|
206
|
+
capturedText: [
|
|
207
|
+
'pre',
|
|
208
|
+
'this tail would have been substantive enough to flush normally',
|
|
209
|
+
],
|
|
210
|
+
capturedTextLenAtLastReply: 1,
|
|
211
|
+
})
|
|
212
|
+
expect(decision).toEqual({ kind: 'skip', reason: 'no-inbound-chat' })
|
|
213
|
+
})
|
|
214
|
+
|
|
215
|
+
it('preserves pre-#1291 behaviour when capturedTextLenAtLastReply is omitted', () => {
|
|
216
|
+
// Legacy caller doesn't track the marker — defaults to
|
|
217
|
+
// capturedText.length, so the tail slice is empty and we skip
|
|
218
|
+
// with reason 'reply-called' (the original behaviour).
|
|
219
|
+
const decision = decideTurnFlush({
|
|
220
|
+
chatId: '704',
|
|
221
|
+
replyCalled: true,
|
|
222
|
+
capturedText: ['some answer the model emitted'],
|
|
223
|
+
})
|
|
224
|
+
expect(decision).toEqual({ kind: 'skip', reason: 'reply-called' })
|
|
225
|
+
})
|
|
226
|
+
|
|
227
|
+
it('respects a custom replyCalledTailMinChars threshold', () => {
|
|
228
|
+
const decision = decideTurnFlush({
|
|
229
|
+
chatId: '705',
|
|
230
|
+
replyCalled: true,
|
|
231
|
+
capturedText: ['pre-reply', 'short but substantive in this test'],
|
|
232
|
+
capturedTextLenAtLastReply: 1,
|
|
233
|
+
replyCalledTailMinChars: 10,
|
|
234
|
+
})
|
|
235
|
+
expect(decision.kind).toBe('flush')
|
|
236
|
+
})
|
|
237
|
+
|
|
238
|
+
it('feature flag off still wins over post-reply tail flush', () => {
|
|
239
|
+
const decision = decideTurnFlush({
|
|
240
|
+
chatId: '706',
|
|
241
|
+
replyCalled: true,
|
|
242
|
+
capturedText: [
|
|
243
|
+
'pre',
|
|
244
|
+
'a long substantive post-reply tail that would otherwise flush',
|
|
245
|
+
],
|
|
246
|
+
capturedTextLenAtLastReply: 1,
|
|
247
|
+
flushEnabled: false,
|
|
248
|
+
})
|
|
249
|
+
expect(decision).toEqual({ kind: 'skip', reason: 'flag-disabled' })
|
|
250
|
+
})
|
|
251
|
+
})
|
|
140
252
|
})
|
|
141
253
|
|
|
142
254
|
describe('isSilentFlushMarker', () => {
|
|
@@ -57,6 +57,7 @@ export type FlushDecision =
|
|
|
57
57
|
export type FlushSkipReason =
|
|
58
58
|
| 'flag-disabled'
|
|
59
59
|
| 'reply-called'
|
|
60
|
+
| 'reply-called-no-new-text'
|
|
60
61
|
| 'no-inbound-chat'
|
|
61
62
|
| 'empty-text'
|
|
62
63
|
| 'silent-marker'
|
|
@@ -71,10 +72,33 @@ export interface FlushDecisionInput {
|
|
|
71
72
|
/** Raw text content blocks accumulated from assistant events across the
|
|
72
73
|
* turn. Joined + trimmed internally. */
|
|
73
74
|
capturedText: string[]
|
|
75
|
+
/** Snapshot of `capturedText.length` at the moment of the most recent
|
|
76
|
+
* reply / stream_reply tool call in this turn. Indices
|
|
77
|
+
* `[capturedTextLenAtLastReply, capturedText.length)` are the post-reply tail
|
|
78
|
+
* — substantive content the model emitted AFTER the reply (e.g. soft
|
|
79
|
+
* commit "on it, back in a few" followed by the real answer in
|
|
80
|
+
* terminal text only, the #1291 repro). When the tail meets
|
|
81
|
+
* `replyCalledTailMinChars` we flush it; otherwise we skip.
|
|
82
|
+
*
|
|
83
|
+
* Defaults to `capturedText.length` (treat all captured text as
|
|
84
|
+
* pre-reply, preserve the pre-#1291 behaviour where any reply tool
|
|
85
|
+
* call suppressed flush entirely) so callers that don't track the
|
|
86
|
+
* marker keep the old contract. */
|
|
87
|
+
capturedTextLenAtLastReply?: number
|
|
88
|
+
/** Minimum trimmed-tail length to qualify a post-reply tail flush.
|
|
89
|
+
* Defaults to `REPLY_CALLED_TAIL_MIN_CHARS` (40). Below this we skip
|
|
90
|
+
* with `reply-called-no-new-text` — typical for trailing markdown
|
|
91
|
+
* artifacts or a one-word afterthought. */
|
|
92
|
+
replyCalledTailMinChars?: number
|
|
74
93
|
/** Feature flag — defaults to true. Pass `false` to force skip everywhere. */
|
|
75
94
|
flushEnabled?: boolean
|
|
76
95
|
}
|
|
77
96
|
|
|
97
|
+
/** Default minimum trimmed length for the post-reply tail to be flushed
|
|
98
|
+
* as a follow-up message. Below this we treat the tail as noise / artifact
|
|
99
|
+
* and skip silently. */
|
|
100
|
+
export const REPLY_CALLED_TAIL_MIN_CHARS = 40
|
|
101
|
+
|
|
78
102
|
/**
|
|
79
103
|
* Pure decision: should the gateway deterministically send the model's
|
|
80
104
|
* captured assistant text at turn_end? Returns `{kind: 'flush', text}` with
|
|
@@ -82,11 +106,41 @@ export interface FlushDecisionInput {
|
|
|
82
106
|
*
|
|
83
107
|
* Ordering of checks is deliberate: cheapest/strongest first so logs
|
|
84
108
|
* attribute a skip to the most specific cause.
|
|
109
|
+
*
|
|
110
|
+
* #1291 — when `replyCalled` is true we no longer suppress unconditionally.
|
|
111
|
+
* The model may have emitted a soft-commit reply ("on it, back in a few")
|
|
112
|
+
* followed by the real substantive answer in terminal text only. Using
|
|
113
|
+
* `capturedTextLenAtLastReply` we isolate the post-reply tail and flush
|
|
114
|
+
* it if it's substantive enough; otherwise we skip with
|
|
115
|
+
* `reply-called-no-new-text` (logged) or `reply-called` (silent, no tail).
|
|
85
116
|
*/
|
|
86
117
|
export function decideTurnFlush(input: FlushDecisionInput): FlushDecision {
|
|
87
118
|
const flushEnabled = input.flushEnabled !== false
|
|
88
119
|
if (!flushEnabled) return { kind: 'skip', reason: 'flag-disabled' }
|
|
89
|
-
|
|
120
|
+
|
|
121
|
+
if (input.replyCalled) {
|
|
122
|
+
const tailIdx = input.capturedTextLenAtLastReply ?? input.capturedText.length
|
|
123
|
+
const tail = input.capturedText.slice(tailIdx).join('\n').trim()
|
|
124
|
+
const minChars = input.replyCalledTailMinChars ?? REPLY_CALLED_TAIL_MIN_CHARS
|
|
125
|
+
if (tail.length === 0) {
|
|
126
|
+
// The reply tool was called and no non-whitespace text came after —
|
|
127
|
+
// the turn is fully served by the reply. Skip silently (the gateway
|
|
128
|
+
// WARN gate excludes this reason from logs).
|
|
129
|
+
return { kind: 'skip', reason: 'reply-called' }
|
|
130
|
+
}
|
|
131
|
+
if (tail.length < minChars) {
|
|
132
|
+
// Post-reply tail exists but is below the substantive-content
|
|
133
|
+
// threshold — typically trailing markdown artifacts or a one-word
|
|
134
|
+
// afterthought. Skip but with a distinct reason so this case IS
|
|
135
|
+
// logged (auditable for #1291 regressions, vs the silent
|
|
136
|
+
// 'reply-called' which is the expected steady state).
|
|
137
|
+
return { kind: 'skip', reason: 'reply-called-no-new-text' }
|
|
138
|
+
}
|
|
139
|
+
if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
|
|
140
|
+
if (isSilentFlushMarker(tail)) return { kind: 'skip', reason: 'silent-marker' }
|
|
141
|
+
return { kind: 'flush', text: tail }
|
|
142
|
+
}
|
|
143
|
+
|
|
90
144
|
if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
|
|
91
145
|
const joined = input.capturedText.join('\n').trim()
|
|
92
146
|
if (joined.length === 0) return { kind: 'skip', reason: 'empty-text' }
|
|
@@ -297,18 +297,22 @@ as a long-lived secret.
|
|
|
297
297
|
When all three are checked, the env block above + `bun run test:uat`
|
|
298
298
|
is safe to run.
|
|
299
299
|
|
|
300
|
-
## 8. CI gate —
|
|
301
|
-
|
|
302
|
-
Since the
|
|
303
|
-
(`fuzz-random-prompts-dm.test.ts`,
|
|
304
|
-
`fuzz-human-style-dm.test.ts`) runs
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
300
|
+
## 8. CI gate — `ci-uat` GitHub Actions workflow
|
|
301
|
+
|
|
302
|
+
Since the GHA gate landed (replacing the original Buildkite gate),
|
|
303
|
+
the fuzz subset of scenarios (`fuzz-random-prompts-dm.test.ts`,
|
|
304
|
+
`fuzz-extended-dm.test.ts`, `fuzz-human-style-dm.test.ts`) runs
|
|
305
|
+
automatically on every PR that touches `telegram-plugin/`,
|
|
306
|
+
`src/agents/`, or `telegram-plugin/uat/`.
|
|
307
|
+
|
|
308
|
+
The workflow (`.github/workflows/ci-uat.yml`) runs on a self-hosted
|
|
309
|
+
GHA runner labelled `[self-hosted, uat-host]` that lives on the
|
|
310
|
+
same box as the `test-harness` agent. Gating: the `UAT_GATE_ENABLED`
|
|
311
|
+
repository variable must be `true` AND the four Telegram secrets
|
|
312
|
+
(`TELEGRAM_API_ID`, `TELEGRAM_API_HASH`, `TELEGRAM_UAT_DRIVER_SESSION`,
|
|
313
|
+
`TELEGRAM_TEST_BOT_USERNAME`) must be present as GitHub Actions
|
|
314
|
+
secrets. The workflow's header docstring covers agent setup + secret
|
|
315
|
+
rotation.
|
|
312
316
|
|
|
313
317
|
**Scope (CI):**
|
|
314
318
|
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Side-effecting half of the auto-fallback flow (#11 / #420 / #421).
|
|
3
|
-
*
|
|
4
|
-
* `auto-fallback.ts` returns a pure `FallbackPlan`. This module
|
|
5
|
-
* dispatches the user-visible Telegram notification for that plan
|
|
6
|
-
* to the owner chat. Extracted from gateway.ts so the dispatch is
|
|
7
|
-
* testable end-to-end via `tests/fake-bot-api.ts` instead of through
|
|
8
|
-
* the full gateway boot path.
|
|
9
|
-
*
|
|
10
|
-
* Error-handling contract: API failures are reported via `onError`
|
|
11
|
-
* but never throw. The gateway logs to stderr; tests assert via the
|
|
12
|
-
* callback. A failed notification does not block the agent restart
|
|
13
|
-
* downstream — the user being unaware of the swap is a worse failure
|
|
14
|
-
* than burning a slot, but neither failure should kill the gateway.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
import type { FallbackPlan } from './auto-fallback.js';
|
|
18
|
-
|
|
19
|
-
/** Minimal subset of grammy's `bot.api` we depend on. */
|
|
20
|
-
export interface FallbackBotApi {
|
|
21
|
-
sendMessage(
|
|
22
|
-
chat_id: string | number,
|
|
23
|
-
text: string,
|
|
24
|
-
opts?: Record<string, unknown>,
|
|
25
|
-
): Promise<{ message_id: number }>;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
export interface FallbackBot {
|
|
29
|
-
api: FallbackBotApi;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
export interface DispatchFallbackArgs {
|
|
33
|
-
bot: FallbackBot;
|
|
34
|
-
/** Owner chat (`access.allowFrom[0]`). When null/empty, dispatch
|
|
35
|
-
* becomes a noop — no chat to notify. */
|
|
36
|
-
ownerChatId: string | null | undefined;
|
|
37
|
-
plan: FallbackPlan;
|
|
38
|
-
onError?: (err: unknown) => void;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
export type DispatchOutcome =
|
|
42
|
-
| { kind: 'sent'; messageId: number }
|
|
43
|
-
| { kind: 'no-chat' }
|
|
44
|
-
| { kind: 'error' };
|
|
45
|
-
|
|
46
|
-
/**
|
|
47
|
-
* Send the plan's `notificationHtml` to the owner chat. Idempotent
|
|
48
|
-
* within a plan (caller decides when to invoke). Always resolves.
|
|
49
|
-
*/
|
|
50
|
-
export async function dispatchFallbackNotification(
|
|
51
|
-
args: DispatchFallbackArgs,
|
|
52
|
-
): Promise<DispatchOutcome> {
|
|
53
|
-
if (!args.ownerChatId) return { kind: 'no-chat' };
|
|
54
|
-
try {
|
|
55
|
-
const sent = await args.bot.api.sendMessage(
|
|
56
|
-
args.ownerChatId,
|
|
57
|
-
args.plan.notificationHtml,
|
|
58
|
-
{
|
|
59
|
-
parse_mode: 'HTML',
|
|
60
|
-
link_preview_options: { is_disabled: true },
|
|
61
|
-
},
|
|
62
|
-
);
|
|
63
|
-
return { kind: 'sent', messageId: sent.message_id };
|
|
64
|
-
} catch (err) {
|
|
65
|
-
args.onError?.(err);
|
|
66
|
-
return { kind: 'error' };
|
|
67
|
-
}
|
|
68
|
-
}
|