@songsid/agend 2.0.8-beta.2 → 2.0.8-beta.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +8 -3
  2. package/dist/adapter-world.d.ts +1 -1
  3. package/dist/adapter-world.js +2 -2
  4. package/dist/adapter-world.js.map +1 -1
  5. package/dist/backend/antigravity.js +6 -2
  6. package/dist/backend/antigravity.js.map +1 -1
  7. package/dist/backend/claude-code.js +3 -2
  8. package/dist/backend/claude-code.js.map +1 -1
  9. package/dist/backend/codex.js +4 -2
  10. package/dist/backend/codex.js.map +1 -1
  11. package/dist/backend/kiro.js +4 -2
  12. package/dist/backend/kiro.js.map +1 -1
  13. package/dist/backend/opencode.js.map +1 -1
  14. package/dist/backend/types.d.ts +8 -0
  15. package/dist/backend/types.js +12 -0
  16. package/dist/backend/types.js.map +1 -1
  17. package/dist/channel/adapters/discord.d.ts +5 -1
  18. package/dist/channel/adapters/discord.js +68 -5
  19. package/dist/channel/adapters/discord.js.map +1 -1
  20. package/dist/channel/adapters/telegram.d.ts +3 -0
  21. package/dist/channel/adapters/telegram.js +7 -0
  22. package/dist/channel/adapters/telegram.js.map +1 -1
  23. package/dist/channel/tool-router.js +1 -1
  24. package/dist/channel/tool-router.js.map +1 -1
  25. package/dist/channel/tool-tracker.js +2 -2
  26. package/dist/channel/tool-tracker.js.map +1 -1
  27. package/dist/channel/types.d.ts +12 -2
  28. package/dist/cli.js +126 -48
  29. package/dist/cli.js.map +1 -1
  30. package/dist/daemon.d.ts +25 -1
  31. package/dist/daemon.js +179 -57
  32. package/dist/daemon.js.map +1 -1
  33. package/dist/fleet-context.d.ts +2 -0
  34. package/dist/fleet-manager.d.ts +40 -0
  35. package/dist/fleet-manager.js +444 -138
  36. package/dist/fleet-manager.js.map +1 -1
  37. package/dist/general-knowledge/skills/session-management/SKILL.md +56 -1
  38. package/dist/instance-lifecycle.js +9 -0
  39. package/dist/instance-lifecycle.js.map +1 -1
  40. package/dist/outbound-handlers.d.ts +1 -0
  41. package/dist/outbound-handlers.js +3 -0
  42. package/dist/outbound-handlers.js.map +1 -1
  43. package/dist/outbound-schemas.d.ts +1 -1
  44. package/dist/tmux-control.d.ts +10 -0
  45. package/dist/tmux-control.js +29 -0
  46. package/dist/tmux-control.js.map +1 -1
  47. package/dist/tmux-manager.d.ts +7 -1
  48. package/dist/tmux-manager.js +17 -0
  49. package/dist/tmux-manager.js.map +1 -1
  50. package/dist/topic-commands.d.ts +21 -0
  51. package/dist/topic-commands.js +81 -6
  52. package/dist/topic-commands.js.map +1 -1
  53. package/package.json +1 -1
package/dist/daemon.js CHANGED
@@ -385,11 +385,27 @@ export class Daemon extends EventEmitter {
385
385
  scheduleNext();
386
386
  return;
387
387
  }
388
- const paneStatus = await this.tmux.getPaneStatus();
388
+ // Human-readable backend label for logs (e.g. "claude", "kiro-cli")
389
+ const cliLabel = this.backend?.binaryName ?? "CLI";
390
+ let paneStatus = await this.tmux.getPaneStatus();
389
391
  if (paneStatus?.alive) {
390
392
  scheduleNext();
391
393
  return;
392
394
  }
395
+ // A null status is ambiguous: it can be a transient `tmux list-panes`
396
+ // failure (e.g. tmux busy during a fleet-restart storm) rather than a
397
+ // real exit. Re-confirm once after a short delay before treating it as
398
+ // a crash. A non-null {alive:false} is a definite dead pane (real exit)
399
+ // and needs no recheck.
400
+ if (paneStatus === null) {
401
+ await new Promise(r => setTimeout(r, 1500));
402
+ paneStatus = await this.tmux.getPaneStatus();
403
+ if (paneStatus?.alive) {
404
+ this.logger.debug(`[health] ${cliLabel} pane reported gone then alive on recheck — transient query failure, ignoring`);
405
+ scheduleNext();
406
+ return;
407
+ }
408
+ }
393
409
  // paneStatus === null → window gone entirely (e.g. tmux server crash)
394
410
  // paneStatus.alive === false → pane dead, exit code available
395
411
  const exitCode = paneStatus?.exitCode;
@@ -401,13 +417,17 @@ export class Daemon extends EventEmitter {
401
417
  this.healthCheckPaused = true;
402
418
  return;
403
419
  }
404
- // Distinguish tmux server crash from single window crash
420
+ // Distinguish tmux server crash from single window crash.
421
+ // nullReason records *why* getPaneStatus returned null (for diagnosing
422
+ // whether this was a real window loss or a transient query failure).
405
423
  let crashType = "window";
424
+ let nullReason;
406
425
  if (!paneStatus) {
407
426
  const serverAlive = await TmuxManager.sessionExists(this.tmuxSessionName);
408
427
  if (!serverAlive) {
409
428
  crashType = "server";
410
- this.logger.error("tmux server died — all windows lost");
429
+ nullReason = "server_gone";
430
+ this.logger.error(`tmux server died — all ${cliLabel} windows lost`);
411
431
  // Fleet-level circuit breaker: pause all instances on repeated tmux server crashes
412
432
  Daemon.tmuxServerCrashTimestamps.push(Date.now());
413
433
  const cutoff = Date.now() - 5 * 60_000;
@@ -428,23 +448,34 @@ export class Daemon extends EventEmitter {
428
448
  await new Promise(r => setTimeout(r, 2_000)); // let session stabilize
429
449
  }
430
450
  else {
431
- this.logger.warn({ exitCode }, "Claude window died (tmux server alive)");
451
+ // null but server alive: window-level disappearance. Probe whether
452
+ // the window truly no longer exists vs a transient query glitch.
453
+ nullReason = "no_window";
454
+ try {
455
+ const windows = await TmuxManager.listWindows(this.tmuxSessionName);
456
+ if (windows.some(w => w.name === this.name))
457
+ nullReason = "window_present_query_glitch";
458
+ }
459
+ catch {
460
+ nullReason = "query_error";
461
+ }
462
+ this.logger.warn({ exitCode, nullReason }, `${cliLabel} window not found (tmux server alive)`);
432
463
  }
433
464
  }
434
465
  else {
435
- this.logger.warn({ exitCode }, "Claude process exited");
466
+ this.logger.warn({ exitCode }, `${cliLabel} process exited`);
436
467
  }
437
- // Capture last output from dead pane before killing
468
+ // Capture last output before killing. Best-effort even when the pane is
469
+ // gone (paneStatus null) — gives the crash record something to diagnose
470
+ // from instead of an empty lastOutput.
438
471
  let lastOutput;
439
- if (paneStatus) {
440
- try {
441
- const raw = await this.tmux.capturePaneWithHistory(50);
442
- // Strip ANSI escape codes for readability
443
- const cleaned = raw.replace(/\x1b\[[0-9;]*[a-zA-Z]/g, "");
444
- lastOutput = cleaned.trimEnd() || undefined;
445
- }
446
- catch { /* best effort */ }
472
+ try {
473
+ const raw = await this.tmux.capturePaneWithHistory(50);
474
+ // Strip ANSI escape codes for readability
475
+ const cleaned = raw.replace(/\x1b\[[0-9;]*[a-zA-Z]/g, "");
476
+ lastOutput = cleaned.trimEnd() || undefined;
447
477
  }
478
+ catch { /* best effort — pane may already be gone */ }
448
479
  // Kill the dead window (remain-on-exit keeps it around) before respawn
449
480
  if (paneStatus) {
450
481
  await this.tmux.killWindow();
@@ -473,8 +504,21 @@ export class Daemon extends EventEmitter {
473
504
  }
474
505
  // Already attempted recovery — fall through to normal crash handling
475
506
  }
507
+ // Detect a --continue/--resume failure (no conversation to resume). The
508
+ // session-id file persists across the crash, so a blind respawn would add
509
+ // --continue again and crash in the same way → loop. Clear the session id
510
+ // and skip resume so the next spawn starts fresh. (skipResume also stops
511
+ // saveSessionId below from resurrecting the id from statusline.json.)
512
+ if (lastOutput && /no conversation found|no conversation to (continue|resume)|no previous (session|conversation)|--continue/i.test(lastOutput)) {
513
+ this.logger.warn("Detected --continue/resume failure — clearing session-id; next spawn starts fresh");
514
+ try {
515
+ unlinkSync(join(this.instanceDir, "session-id"));
516
+ }
517
+ catch { /* may not exist */ }
518
+ this.skipResume = true;
519
+ }
476
520
  // Append to crash history
477
- this.appendCrashHistory({ exitCode, lastOutput, crashType });
521
+ this.appendCrashHistory({ exitCode, lastOutput, crashType, reason: nullReason });
478
522
  // Detect rapid crash: sliding window — 3+ crashes in 5 minutes
479
523
  this.crashTimestamps.push(Date.now());
480
524
  const crashWindowMs = 5 * 60_000;
@@ -508,7 +552,7 @@ export class Daemon extends EventEmitter {
508
552
  const delay = backoff === "exponential"
509
553
  ? Math.min(1000 * Math.pow(2, this.crashCount - 1), 60_000)
510
554
  : 1000 * this.crashCount;
511
- this.logger.warn({ crashCount: this.crashCount, delay }, "Claude window died — respawning after backoff");
555
+ this.logger.warn({ crashCount: this.crashCount, delay }, `${cliLabel} window died — respawning after backoff`);
512
556
  await new Promise(r => setTimeout(r, delay));
513
557
  try {
514
558
  this.saveSessionId();
@@ -550,11 +594,11 @@ export class Daemon extends EventEmitter {
550
594
  }
551
595
  catch { /* may not exist */ }
552
596
  }
553
- this.logger.info({ resumed }, "Respawned Claude window after crash");
597
+ this.logger.info({ resumed }, `Respawned ${cliLabel} window after crash`);
554
598
  this.emit("crash_respawn", this.name);
555
599
  }
556
600
  catch (err) {
557
- this.logger.error({ err }, "Failed to respawn Claude window");
601
+ this.logger.error({ err }, `Failed to respawn ${cliLabel} window`);
558
602
  }
559
603
  scheduleNext();
560
604
  }, this.config.restart_policy.health_check_interval_ms ?? 30_000);
@@ -669,6 +713,15 @@ export class Daemon extends EventEmitter {
669
713
  }
670
714
  }, 5_000); // Check every 5 seconds (runtime dialogs need fast response)
671
715
  }
716
+ /**
717
+ * Interrupt the CLI's current generation (cancel button / `/cancel`).
718
+ * Direct tmux key event (not a paste) so it registers as the interrupt key.
719
+ * kiro-cli interrupts on Ctrl+C; the others (claude-code, codex, …) on Escape.
720
+ */
721
+ async sendEscape() {
722
+ const cancelKey = this.backend?.binaryName === "kiro-cli" ? "C-c" : "Escape";
723
+ await this.tmux?.sendSpecialKey(cancelKey);
724
+ }
672
725
  async stop() {
673
726
  this.logger.info("Stopping daemon instance");
674
727
  if (this.healthCheckTimer) {
@@ -704,9 +757,10 @@ export class Daemon extends EventEmitter {
704
757
  // instances stop in parallel (same pattern as pasteText).
705
758
  await new Promise(r => setTimeout(r, 150));
706
759
  await this.tmux.sendSpecialKey("Enter");
707
- // Wait up to 10s for graceful exit
708
- for (let i = 0; i < 20; i++) {
709
- await new Promise(r => setTimeout(r, 500));
760
+ // Wait up to 3s for graceful exit, polling every 200ms. A healthy CLI
761
+ // exits within ~1s; a longer wait just delays the force-kill fallback.
762
+ for (let i = 0; i < 15; i++) {
763
+ await new Promise(r => setTimeout(r, 200));
710
764
  const status = await this.tmux.getPaneStatus();
711
765
  if (!status || !status.alive) {
712
766
  killed = true;
@@ -715,7 +769,7 @@ export class Daemon extends EventEmitter {
715
769
  }
716
770
  }
717
771
  if (!killed)
718
- this.logger.warn("CLI did not exit gracefully within 10s, force killing window");
772
+ this.logger.warn("CLI did not exit gracefully within 3s, force killing window");
719
773
  // Always kill window — remain-on-exit keeps dead panes around after CLI exits
720
774
  await this.tmux.killWindow();
721
775
  const windowIdFile = join(this.instanceDir, "window-id");
@@ -857,9 +911,9 @@ export class Daemon extends EventEmitter {
857
911
  formatted += `\n(reply_to: "${meta.reply_to_text}")`;
858
912
  }
859
913
  // Serialize deliveries: each message waits for the previous to complete,
860
- // and each waits for the CLI to be idle before pasting.
861
- const enqueuedAt = Date.now();
862
- const isFromInstance = !!meta.from_instance;
914
+ // and each waits for the CLI to be idle before pasting. Messages are never
915
+ // dropped for age — a long-busy CLI just queues them until it frees up
916
+ // (the user can press Cancel to interrupt and let the queue drain sooner).
863
917
  const chatId = meta.chat_id;
864
918
  const messageId = meta.message_id;
865
919
  const wasQueued = this.pasteQueueDepth > 0;
@@ -872,11 +926,6 @@ export class Daemon extends EventEmitter {
872
926
  }
873
927
  this.pasteLock = this.pasteLock.then(async () => {
874
928
  try {
875
- // Drop stale user messages (>60s in queue), but never drop cross-instance messages
876
- if (!isFromInstance && Date.now() - enqueuedAt > 60_000) {
877
- this.logger.warn({ age: Date.now() - enqueuedAt, user: meta.user }, "Dropping stale message");
878
- return;
879
- }
880
929
  if (this.config.pre_task_command) {
881
930
  await this.deliverMessage(this.config.pre_task_command);
882
931
  }
@@ -884,10 +933,10 @@ export class Daemon extends EventEmitter {
884
933
  this.pendingInstructionsNotice = false;
885
934
  await this.deliverMessage("[system] Your instructions/steering files have been updated. Please re-read them for the latest guidelines.");
886
935
  }
887
- await this.deliverMessage(formatted);
888
- if (chatId && messageId) {
889
- this.emit("message_delivered", { chatId: meta.thread_id || chatId, messageId });
890
- }
936
+ const status = (chatId && messageId)
937
+ ? { chatId: meta.thread_id || chatId, messageId }
938
+ : undefined;
939
+ await this.deliverMessage(formatted, status);
891
940
  }
892
941
  finally {
893
942
  this.pasteQueueDepth--;
@@ -897,41 +946,109 @@ export class Daemon extends EventEmitter {
897
946
  });
898
947
  this.logger.debug({ user: meta.user, text: content.slice(0, 100) }, "Queued channel message for delivery");
899
948
  }
900
- /** Deliver a single message: wait for idle, then paste */
901
- async deliverMessage(formatted) {
949
+ /**
950
+ * Deliver a single message and drive its status reactions:
951
+ * ⏳ message_queued — CLI busy; queued, waiting for idle
952
+ * 👀 message_delivered — pasted + Enter sent; agent now has it
953
+ * ✅ message_confirmed — idle→busy transition observed; agent is processing
954
+ * ❌ message_failed — tmux window gone, paste retries exhausted
955
+ * Returns true once the message is in the CLI, false only on real delivery failure.
956
+ *
957
+ * Bug A (silent message loss): paste failures retry with backoff (window recovery)
958
+ * and emit `message_failed` if all attempts fail.
959
+ * Busy handling (UX): we never force-paste into a busy CLI and never give up on a
960
+ * busy one — we show ⏳ and wait for idle indefinitely (a genuinely hung CLI is the
961
+ * hang detector's job; a user who can't wait presses Cancel → Escape → idle). The
962
+ * pasteLock is serial, so later messages naturally queue behind this wait.
963
+ */
964
+ async deliverMessage(formatted, status) {
902
965
  // Sanitize unclosed code fences — they cause CLI to wait for closure on Enter
903
966
  const fenceCount = (formatted.match(/```/g) || []).length;
904
967
  if (fenceCount % 2 !== 0) {
905
968
  // Odd number of fences = unclosed. Remove all code fences from the message.
906
969
  formatted = formatted.replace(/```/g, "");
907
970
  }
908
- const windowId = this.getWindowId();
909
- if (windowId && this.controlClient) {
910
- const idle = await this.controlClient.waitForIdle(windowId, this.config.lightweight ? 30_000 : 120_000);
911
- if (!idle) {
912
- this.logger.warn("Delivering message after idle timeout (CLI may be busy)");
971
+ let windowId = this.getWindowId();
972
+ // If the CLI is busy, show ⏳ and wait for it to go idle — no timeout, no force.
973
+ if (windowId && this.controlClient && !this.controlClient.isIdle(windowId)) {
974
+ if (status)
975
+ this.emit("message_queued", status);
976
+ this.logger.debug("CLI busy — queuing message until idle");
977
+ await this.controlClient.waitUntilIdle(windowId);
978
+ }
979
+ // Bug A: paste with backoff. Transient failures are usually a stale window id
980
+ // after a crash/respawn — recover by name and retry (max 3 attempts, 2s apart).
981
+ const maxAttempts = 3;
982
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
983
+ const pasted = await this.tmux.pasteBuffer(formatted);
984
+ if (!pasted) {
985
+ this.logger.warn({ attempt }, "pasteBuffer failed — recovering window and backing off");
986
+ windowId = (await this.recoverWindow()) ?? windowId;
987
+ if (attempt < maxAttempts)
988
+ await new Promise(r => setTimeout(r, 2000));
989
+ continue;
913
990
  }
914
- }
915
- const ok = await this.tmux.pasteText(formatted);
916
- if (!ok) {
917
- // Window ID may be stale after crash/respawn — try to find by name
918
- this.logger.warn("pasteText failed, looking up window by name");
919
- try {
920
- const windows = await TmuxManager.listWindows(this.tmuxSessionName);
921
- const match = windows.find(w => w.name === this.name);
922
- if (match) {
923
- this.tmux = new TmuxManager(this.tmuxSessionName, match.id);
924
- writeFileSync(join(this.instanceDir, "window-id"), match.id);
925
- await this.controlClient?.registerWindow(match.id);
926
- await this.tmux.pasteText(formatted);
927
- this.logger.info({ windowId: match.id }, "Recovered window ID and delivered message");
991
+ // Settle the bracketed paste, then submit.
992
+ await new Promise(r => setTimeout(r, 500));
993
+ const enterAt = Date.now();
994
+ await this.tmux.sendSpecialKey("Enter");
995
+ if (status)
996
+ this.emit("message_delivered", status); // 👀
997
+ // Confirm the CLI accepted the message by transitioning idle→busy (new output
998
+ // after Enter). If still idle after ~2s the Enter was likely swallowed while
999
+ // the TUI was redrawing — re-send Enter once and re-check.
1000
+ if (windowId && this.controlClient) {
1001
+ let becameBusy = await this.confirmBusyAfterEnter(windowId, enterAt);
1002
+ if (!becameBusy) {
1003
+ this.logger.warn("No idle→busy transition after Enter — re-sending Enter once");
1004
+ const retryAt = Date.now();
1005
+ await this.tmux.sendSpecialKey("Enter");
1006
+ becameBusy = await this.confirmBusyAfterEnter(windowId, retryAt);
928
1007
  }
1008
+ if (becameBusy && status)
1009
+ this.emit("message_confirmed", status); // ✅
929
1010
  }
930
- catch (retryErr) {
931
- this.logger.error({ err: retryErr }, "Failed to recover window for message delivery");
1011
+ else {
1012
+ // No control client to observe output: fall back to the legacy double-Enter.
1013
+ await new Promise(r => setTimeout(r, 1000));
1014
+ await this.tmux.sendSpecialKey("Enter");
1015
+ if (status)
1016
+ this.emit("message_confirmed", status); // ✅ (best-effort)
932
1017
  }
1018
+ return true;
1019
+ }
1020
+ this.logger.error("Message delivery failed after retries — window not ready");
1021
+ if (status)
1022
+ this.emit("message_failed", status); // ❌
1023
+ return false;
1024
+ }
1025
+ /** Re-resolve this instance's tmux window by name (stale id after crash/respawn). */
1026
+ async recoverWindow() {
1027
+ try {
1028
+ const windows = await TmuxManager.listWindows(this.tmuxSessionName);
1029
+ const match = windows.find(w => w.name === this.name);
1030
+ if (!match)
1031
+ return undefined;
1032
+ this.tmux = new TmuxManager(this.tmuxSessionName, match.id);
1033
+ writeFileSync(join(this.instanceDir, "window-id"), match.id);
1034
+ await this.controlClient?.registerWindow(match.id);
1035
+ this.logger.info({ windowId: match.id }, "Recovered window ID for message delivery");
1036
+ return match.id;
1037
+ }
1038
+ catch (retryErr) {
1039
+ this.logger.error({ err: retryErr }, "Failed to recover window for message delivery");
1040
+ return undefined;
933
1041
  }
934
1042
  }
1043
+ /** Poll up to ~2s (200ms × 10) for the pane to emit output after `since`. */
1044
+ async confirmBusyAfterEnter(windowId, since) {
1045
+ for (let i = 0; i < 10; i++) {
1046
+ await new Promise(r => setTimeout(r, 200));
1047
+ if (this.controlClient.hasOutputSince(windowId, since))
1048
+ return true;
1049
+ }
1050
+ return false;
1051
+ }
935
1052
  getWindowId() {
936
1053
  try {
937
1054
  return readFileSync(join(this.instanceDir, "window-id"), "utf-8").trim() || undefined;
@@ -1504,6 +1621,10 @@ export class Daemon extends EventEmitter {
1504
1621
  return true;
1505
1622
  }
1506
1623
  saveSessionId() {
1624
+ // When a resume failure has forced a fresh start, don't persist the stale id
1625
+ // back from statusline.json — that would re-arm --continue and re-loop.
1626
+ if (this.skipResume)
1627
+ return;
1507
1628
  const sid = this.backend?.getSessionId();
1508
1629
  if (sid) {
1509
1630
  writeFileSync(join(this.instanceDir, "session-id"), sid);
@@ -1642,6 +1763,7 @@ export class Daemon extends EventEmitter {
1642
1763
  instance: this.name,
1643
1764
  crashType: data.crashType,
1644
1765
  exitCode: data.exitCode,
1766
+ reason: data.reason,
1645
1767
  lastOutput: data.lastOutput,
1646
1768
  crashCount: this.crashCount + 1,
1647
1769
  crashesInWindow: this.crashTimestamps.length,