agent-relay-orchestrator 0.119.7 → 0.119.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.119.7",
3
+ "version": "0.119.9",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -17,7 +17,7 @@
17
17
  },
18
18
  "dependencies": {
19
19
  "agent-relay-providers": "0.104.3",
20
- "agent-relay-sdk": "0.2.105",
20
+ "agent-relay-sdk": "0.2.106",
21
21
  "callmux": "0.23.0"
22
22
  },
23
23
  "devDependencies": {
@@ -10,24 +10,53 @@ interface CommandPollerOptions {
10
10
  log?: (message: string) => void;
11
11
  intervalMs?: number;
12
12
  errorBackoffMs?: number;
13
+ watchdogMs?: number;
14
+ watchdogCheckMs?: number;
15
+ now?: () => number;
13
16
  }
14
17
 
15
- export function createCommandPoller({ relay, control, log = console.error, intervalMs = 3_000, errorBackoffMs = 3_000 }: CommandPollerOptions) {
18
+ const DEFAULT_WATCHDOG_MS = 5 * 60 * 1000;
19
+
20
+ export function createCommandPoller({
21
+ relay,
22
+ control,
23
+ log = console.error,
24
+ intervalMs = 3_000,
25
+ errorBackoffMs = 3_000,
26
+ watchdogMs = DEFAULT_WATCHDOG_MS,
27
+ watchdogCheckMs = Math.min(60_000, watchdogMs),
28
+ now = Date.now,
29
+ }: CommandPollerOptions) {
16
30
  let inFlight = false;
17
31
  let stopped = true;
18
32
  let timer: ReturnType<typeof setTimeout> | undefined;
33
+ let watchdogTimer: ReturnType<typeof setInterval> | undefined;
34
+ let activePollController: AbortController | undefined;
35
+ let activePollStartedAt = 0;
36
+ let activePhase: "idle" | "poll" | "handler" = "idle";
19
37
  let lastTickErrored = false;
38
+ let lastCycleCompletedAt = now();
39
+ let generation = 0;
20
40
 
21
41
  async function tick(): Promise<boolean> {
22
42
  if (!relay.connected || inFlight) return false;
43
+ const tickGeneration = generation;
44
+ const controller = new AbortController();
23
45
  inFlight = true;
46
+ activePollController = controller;
47
+ activePollStartedAt = now();
48
+ activePhase = "poll";
24
49
  lastTickErrored = false;
25
50
  try {
26
- const commands = await relay.pollCommands();
51
+ const commands = await relay.pollCommands(controller.signal);
52
+ if (tickGeneration !== generation) return false;
53
+ if (activePollController === controller) activePollController = undefined;
54
+ activePhase = "handler";
27
55
  if (commands.length > 0) {
28
56
  log(`[orchestrator] Received ${commands.length} command(s)`);
29
57
  }
30
58
  for (const command of commands) {
59
+ if (tickGeneration !== generation) return false;
31
60
  log(`[orchestrator] Handling command: ${command.type} ${command.id}`);
32
61
  await control.handleCommand(command);
33
62
  }
@@ -37,7 +66,12 @@ export function createCommandPoller({ relay, control, log = console.error, inter
37
66
  log(`[orchestrator] Poll error: ${err}`);
38
67
  return false;
39
68
  } finally {
40
- inFlight = false;
69
+ if (activePollController === controller) activePollController = undefined;
70
+ if (tickGeneration === generation) {
71
+ inFlight = false;
72
+ activePhase = "idle";
73
+ activePollStartedAt = 0;
74
+ }
41
75
  }
42
76
  }
43
77
 
@@ -45,12 +79,12 @@ export function createCommandPoller({ relay, control, log = console.error, inter
45
79
  if (stopped) return;
46
80
  timer = setTimeout(() => {
47
81
  timer = undefined;
48
- void runCycle();
82
+ void runCycle(generation);
49
83
  }, delayMs);
50
84
  timer.unref?.();
51
85
  }
52
86
 
53
- async function runCycle(): Promise<void> {
87
+ async function runCycle(runGeneration: number): Promise<void> {
54
88
  let errored = false;
55
89
  try {
56
90
  await tick();
@@ -59,20 +93,50 @@ export function createCommandPoller({ relay, control, log = console.error, inter
59
93
  errored = true;
60
94
  log(`[orchestrator] Poll loop error: ${err}`);
61
95
  } finally {
62
- if (!stopped) schedule(errored ? errorBackoffMs : intervalMs);
96
+ if (runGeneration === generation) lastCycleCompletedAt = now();
97
+ if (!stopped && runGeneration === generation) schedule(errored ? errorBackoffMs : intervalMs);
63
98
  }
64
99
  }
65
100
 
101
+ function startWatchdog(): void {
102
+ if (watchdogTimer || watchdogMs <= 0 || watchdogCheckMs <= 0) return;
103
+ watchdogTimer = setInterval(() => {
104
+ if (stopped) return;
105
+ if (activePhase !== "poll" || !activePollController) return;
106
+ const staleForMs = now() - activePollStartedAt;
107
+ if (staleForMs < watchdogMs) return;
108
+ generation += 1;
109
+ activePollController.abort();
110
+ activePollController = undefined;
111
+ activePollStartedAt = 0;
112
+ activePhase = "idle";
113
+ inFlight = false;
114
+ log(`[orchestrator] Poll loop watchdog: command poll stuck for ${staleForMs}ms; aborting stuck poll and restarting`);
115
+ if (!timer) schedule(0);
116
+ lastCycleCompletedAt = now();
117
+ }, watchdogCheckMs);
118
+ watchdogTimer.unref?.();
119
+ }
120
+
66
121
  function start(): void {
67
122
  if (!stopped) return;
68
123
  stopped = false;
124
+ lastCycleCompletedAt = now();
125
+ startWatchdog();
69
126
  schedule(0);
70
127
  }
71
128
 
72
129
  function stop(): void {
73
130
  stopped = true;
131
+ generation += 1;
132
+ activePollController?.abort();
133
+ activePollController = undefined;
134
+ activePollStartedAt = 0;
135
+ activePhase = "idle";
74
136
  if (timer) clearTimeout(timer);
75
137
  timer = undefined;
138
+ if (watchdogTimer) clearInterval(watchdogTimer);
139
+ watchdogTimer = undefined;
76
140
  }
77
141
 
78
142
  return {
package/src/relay.ts CHANGED
@@ -9,7 +9,7 @@ export interface RelayClient {
9
9
  register(): Promise<void>;
10
10
  heartbeat(): Promise<void>;
11
11
  updateManagedAgents(agents: ManagedAgentReport[], exitedAgents?: ManagedSessionExitDiagnostics[]): Promise<void>;
12
- pollCommands(): Promise<RelayCommand[]>;
12
+ pollCommands(signal?: AbortSignal): Promise<RelayCommand[]>;
13
13
  getCommand(commandId: string): Promise<RelayCommand | null>;
14
14
  updateCommand(commandId: string, status: string, result?: Record<string, unknown>, error?: string): Promise<boolean>;
15
15
  acquireProviderQuotaLease(orchestratorId: string, input: ProviderQuotaLeaseAcquireInput): Promise<ProviderQuotaLeaseAcquireResult>;
@@ -213,9 +213,9 @@ export function createRelayClient(config: OrchestratorConfig, probeCache: Provid
213
213
  });
214
214
  }
215
215
 
216
- async function pollCommands(): Promise<RelayCommand[]> {
216
+ async function pollCommands(signal?: AbortSignal): Promise<RelayCommand[]> {
217
217
  const url = `/commands?target=${encodeURIComponent(agentId)}&status=pending&limit=50`;
218
- const res = await apiCall("GET", url);
218
+ const res = await http.request("GET", `/api${url}`, undefined, signal ? { signal } : undefined);
219
219
  if (!res.ok) return [];
220
220
  return await res.json() as RelayCommand[];
221
221
  }
@@ -274,8 +274,7 @@ export { tmuxCommand };
274
274
 
275
275
  // Lightweight liveness for the live terminal stream's backfill metadata — avoids a full
276
276
  // capture-pane just to learn whether the pane/agent are still up.
277
- export function sessionLiveness(name: string): { running: boolean; agentAlive: boolean } {
278
- const socketName = tmuxSocketForSession(name);
277
+ export function sessionLiveness(name: string, socketName = tmuxSocketForSession(name)): { running: boolean; agentAlive: boolean } {
279
278
  return { running: tmuxHasSession(name, socketName), agentAlive: isSessionAlive(name) };
280
279
  }
281
280
 
@@ -368,7 +368,7 @@ class SessionStream {
368
368
  private socket: string | undefined;
369
369
 
370
370
  start(): void {
371
- const socket = tmuxSocketForSession(this.session);
371
+ const socket = this.config.env.AGENT_RELAY_TMUX_SOCKET || tmuxSocketForSession(this.session);
372
372
  this.socket = socket;
373
373
  // Pin the window size before attaching so the control client can't reflow the
374
374
  // pane (default window-size would shrink it to the new client's 80x24). These are
@@ -786,7 +786,7 @@ class SessionStream {
786
786
  const dims = await this.paneDims();
787
787
  if (dims.cols) this.termCols = dims.cols;
788
788
  if (dims.rows) this.termRows = dims.rows;
789
- const live = sessionLiveness(this.session);
789
+ const live = sessionLiveness(this.session, this.config.env.AGENT_RELAY_TMUX_SOCKET || tmuxSocketForSession(this.session));
790
790
  tdbg(`backfill ${this.session} req=${cols}x${rows} term=${this.termCols}x${this.termRows} contentLen=${content.length} viewers=${this.subscribers.size}`);
791
791
  return {
792
792
  session: this.session,
@@ -55,9 +55,10 @@ export async function populateMergeState(cwd: string, targetRef: string, state:
55
55
  // compare against the base branch's upstream when it has one (squash PRs
56
56
  // land on the remote), treat an identical tree as fully landed (covers a
57
57
  // multi-commit squash), else count only commits whose patch isn't already
58
- // present in base (git cherry '+'). Staleness of the compare ref can only
59
- // under-count landings, never invent one so `landed` is safe to act on.
60
- const cherryBase = await upstreamRef(cwd, base) ?? base;
58
+ // present in base (git cherry '+'). When the base tracks an upstream, fetch
59
+ // before comparing: a stale origin/main can both hide a real squash landing
60
+ // and prove a false tree match against old remote state.
61
+ const cherryBase = await syncBaseFromOrigin(cwd, base) ?? base;
61
62
  if ((await git(["diff", "--quiet", cherryBase, targetRef], cwd)).ok) {
62
63
  state.unmergedAhead = 0;
63
64
  } else {
@@ -86,7 +87,7 @@ export async function upstreamRef(worktreePath: string, base: string, signal?: A
86
87
  * base is never advanced, so cutting from local base would recycle onto stale main. Fetching
87
88
  * the upstream and returning `origin/<base>` (when it resolves) yields a branch off CURRENT
88
89
  * base. Falls back to the local base ref when there is no upstream (no remote) or the
89
- * upstream ref can't be resolved — the caller still verifies it contains the merge SHA.
90
+ * upstream ref can't be fetched/resolved — callers must not act on a stale remote-tracking ref.
90
91
  */
91
92
  export async function syncBaseFromOrigin(worktreePath: string, base: string | undefined, signal?: AbortSignal): Promise<string | undefined> {
92
93
  if (!base) return undefined;
@@ -95,7 +96,10 @@ export async function syncBaseFromOrigin(worktreePath: string, base: string | un
95
96
  const slash = upstream.indexOf("/");
96
97
  const remote = slash > 0 ? upstream.slice(0, slash) : undefined;
97
98
  const remoteBranch = slash > 0 ? upstream.slice(slash + 1) : base;
98
- if (remote) await git(["fetch", remote, remoteBranch], worktreePath, { signal }); // best-effort freshness
99
+ if (remote) {
100
+ const fetch = await git(["fetch", remote, `${remoteBranch}:refs/remotes/${remote}/${remoteBranch}`], worktreePath, { signal });
101
+ if (!fetch.ok) return base;
102
+ }
99
103
  return (await git(["rev-parse", "--verify", "--quiet", upstream], worktreePath, { signal })).ok ? upstream : base;
100
104
  }
101
105