@pushpalsdev/cli 1.0.38 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -122,6 +122,7 @@ class ServiceManager {
122
122
  onServiceDegraded;
123
123
  onEvent;
124
124
  timer;
125
+ shutdownBegun = false;
125
126
  stopped = false;
126
127
  constructor(options = {}) {
127
128
  this.pollMs = Math.max(50, Math.floor(options.pollMs ?? DEFAULT_SERVICE_MANAGER_POLL_MS));
@@ -161,10 +162,10 @@ class ServiceManager {
161
162
  action: this.degradedAction
162
163
  };
163
164
  }
164
- stop() {
165
- if (this.stopped)
165
+ beginShutdown() {
166
+ if (this.shutdownBegun || this.stopped)
166
167
  return;
167
- this.stopped = true;
168
+ this.shutdownBegun = true;
168
169
  clearInterval(this.timer);
169
170
  for (const state of this.stateByService.values()) {
170
171
  if (!state.pendingRestartTimer)
@@ -172,6 +173,12 @@ class ServiceManager {
172
173
  clearTimeout(state.pendingRestartTimer);
173
174
  state.pendingRestartTimer = null;
174
175
  }
176
+ }
177
+ stop() {
178
+ if (this.stopped)
179
+ return;
180
+ this.beginShutdown();
181
+ this.stopped = true;
175
182
  for (const service of this.services.values()) {
176
183
  try {
177
184
  const pid = service.proc.pid;
@@ -207,7 +214,7 @@ class ServiceManager {
207
214
  this.onEvent?.(level, line);
208
215
  }
209
216
  tick() {
210
- if (this.stopped)
217
+ if (this.shutdownBegun || this.stopped)
211
218
  return;
212
219
  const now = Date.now();
213
220
  for (const [name, service] of this.services.entries()) {
@@ -250,7 +257,7 @@ class ServiceManager {
250
257
  state.pendingRestartTimer = setTimeout(() => {
251
258
  state.pendingRestartTimer = null;
252
259
  state.nextRestartAtMs = 0;
253
- if (this.stopped)
260
+ if (this.shutdownBegun || this.stopped)
254
261
  return;
255
262
  const current = this.services.get(name);
256
263
  if (!current || !current.exited)
@@ -4792,6 +4799,7 @@ async function main() {
4792
4799
  return;
4793
4800
  const serviceManager = autoStartedServiceManager;
4794
4801
  autoStartedServiceManager = null;
4802
+ serviceManager.beginShutdown();
4795
4803
  const shutdown = await requestLocalRuntimeShutdown(serverUrl, repoRoot, reason);
4796
4804
  if (shutdown.attempted && shutdown.accepted) {
4797
4805
  console.log("[pushpals] Local runtime shutdown accepted; waiting for services to exit...");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pushpalsdev/cli",
3
- "version": "1.0.38",
3
+ "version": "1.0.39",
4
4
  "description": "PushPals terminal CLI for LocalBuddy -> RemoteBuddy orchestration",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -0,0 +1,323 @@
1
+ import type { CommandRequest } from "protocol";
2
+
3
/** Status values a worker may report in a heartbeat. */
export type WorkerHeartbeatStatus =
  | "idle"
  | "busy"
  | "error"
  | "offline";

/** Queue lane for a task; "high" entries are placed ahead of "normal" ones. */
type QueuePriority = "high" | "normal";

/** One unit of queued work held by the transport. */
type TransportTask = {
  /** Short name included in the warning logged when `run` throws. */
  label: string;
  /** Queue lane used when the task is inserted. */
  priority: QueuePriority;
  /** When true, the task is discarded instead of queued once the queue is at its limit. */
  droppable: boolean;
  /** Performs the actual request. */
  run: () => Promise<void>;
  /** Settles the promise handed back to the caller that enqueued the task. */
  resolve: () => void;
};

/** Construction options for `WorkerServerTransport`. */
export type WorkerServerTransportOptions = {
  /** Base URL that request paths are appended to. */
  server: string;
  /** Headers sent with every request. */
  headers: Record<string, string>;
  /** Worker identifier included in heartbeat bodies and log messages. */
  workerId: string;
  /** Poll interval reported to the server in heartbeat bodies. */
  pollMs: number;
  /** Heartbeat interval; the heartbeat and request timeouts are derived from it. */
  heartbeatMs: number;
  /** Stale-claim TTL; bounds the busy-worker recycle threshold. */
  staleClaimTtlMs: number;
  /** Replacement for the global `fetch`. */
  fetchFn?: typeof fetch;
  /** Sink for informational messages; defaults to `console.log`. */
  logInfo?: (message: string) => void;
  /** Sink for warnings; defaults to `console.warn`. */
  logWarn?: (message: string) => void;
  /** Clock source; defaults to `Date.now`. */
  nowFn?: () => number;
};

/** Point-in-time view of the transport's heartbeat and queue counters. */
export type WorkerTransportHealthSnapshot = {
  /** True while a heartbeat request is outstanding. */
  heartbeatInFlight: boolean;
  /** Failed heartbeat attempts since the last success. */
  consecutiveHeartbeatFailures: number;
  /** Clock value at the most recent heartbeat attempt (0 before the first). */
  lastHeartbeatAttemptAt: number;
  /** Clock value at the most recent successful heartbeat (0 before the first). */
  lastHeartbeatSuccessAt: number;
  /** Number of tasks currently waiting in the queue. */
  queuedRequests: number;
  /** Count of droppable tasks discarded because the queue was full. */
  droppedLogRequests: number;
};

/** Caller-supplied portion of a heartbeat request body. */
export type WorkerHeartbeatPayload = {
  status: WorkerHeartbeatStatus;
  currentJobId: string | null;
  /** Sent as `{}` when omitted. */
  capabilities?: Record<string, unknown>;
  /** Sent as `{}` when omitted. */
  details?: Record<string, unknown>;
};
43
+
44
/**
 * Derives the per-request timeout for heartbeat calls.
 *
 * The result is 80% of the heartbeat interval (rounded down), clamped to the
 * range 1500–4000 ms.
 *
 * @param heartbeatMs Heartbeat interval in milliseconds.
 * @returns Timeout in milliseconds.
 */
function computeHeartbeatTimeoutMs(heartbeatMs: number): number {
  const scaled = Math.floor(heartbeatMs * 0.8);
  const capped = Math.min(4_000, scaled);
  return Math.max(1_500, capped);
}
47
+
48
/**
 * Derives the per-request timeout for queued (non-heartbeat) calls.
 *
 * The result is twice the heartbeat interval (rounded down), clamped to the
 * range 4000–10000 ms.
 *
 * @param heartbeatMs Heartbeat interval in milliseconds.
 * @returns Timeout in milliseconds.
 */
function computeRequestTimeoutMs(heartbeatMs: number): number {
  const doubled = Math.floor(heartbeatMs * 2);
  const capped = Math.min(10_000, doubled);
  return Math.max(4_000, capped);
}
51
+
52
/**
 * Reads a response body as text for use in diagnostics.
 *
 * A rejected body read is treated as an empty body rather than an error.
 *
 * @param response Response whose body should be read.
 * @returns The body text with surrounding whitespace removed, or "" when the read rejects.
 */
async function readResponseDetail(response: Response): Promise<string> {
  // Start the read outside the try so only a rejected read is swallowed.
  const pendingBody = response.text();
  let body: string;
  try {
    body = await pendingBody;
  } catch {
    body = "";
  }
  return body.trim();
}
56
+
57
/**
 * HTTP transport between a worker and its server.
 *
 * Heartbeats are sent directly (at most one in flight at a time). Session
 * commands and job logs go through an in-memory queue that is drained one
 * request at a time. Every request is bounded by a timeout derived from the
 * configured heartbeat interval.
 */
export class WorkerServerTransport {
  private readonly server: string;
  private readonly headers: Record<string, string>;
  private readonly workerId: string;
  private readonly pollMs: number;
  private readonly staleClaimTtlMs: number;
  private readonly fetchFn: typeof fetch;
  private readonly logInfo: (message: string) => void;
  private readonly logWarn: (message: string) => void;
  private readonly nowFn: () => number;
  private readonly heartbeatTimeoutMs: number;
  private readonly requestTimeoutMs: number;
  /** Queue length at which droppable tasks start being discarded. */
  private readonly maxQueuedRequests = 256;

  private queuedRequests: TransportTask[] = [];
  private queueDrainInFlight = false;
  private queueFlushWaiters: Array<() => void> = [];
  private droppedLogRequests = 0;
  private heartbeatInFlight = false;
  private lastHeartbeatAttemptAt = 0;
  private lastHeartbeatSuccessAt = 0;
  private consecutiveHeartbeatFailures = 0;
  /** Clock value at the first failure of the current failure streak; -1 when there is none. */
  private firstHeartbeatFailureAt = -1;
  private lastHeartbeatFailureDetail = "";

  /**
   * @param options Connection settings plus optional fetch, logging, and clock overrides.
   */
  constructor(options: WorkerServerTransportOptions) {
    this.server = options.server;
    this.headers = options.headers;
    this.workerId = options.workerId;
    this.pollMs = options.pollMs;
    this.staleClaimTtlMs = options.staleClaimTtlMs;
    this.fetchFn = options.fetchFn ?? fetch;
    this.logInfo = options.logInfo ?? ((message) => console.log(message));
    this.logWarn = options.logWarn ?? ((message) => console.warn(message));
    this.nowFn = options.nowFn ?? (() => Date.now());
    this.heartbeatTimeoutMs = computeHeartbeatTimeoutMs(options.heartbeatMs);
    this.requestTimeoutMs = computeRequestTimeoutMs(options.heartbeatMs);
  }

  /** Returns the current heartbeat and queue counters. */
  getHealthSnapshot(): WorkerTransportHealthSnapshot {
    return {
      heartbeatInFlight: this.heartbeatInFlight,
      consecutiveHeartbeatFailures: this.consecutiveHeartbeatFailures,
      lastHeartbeatAttemptAt: this.lastHeartbeatAttemptAt,
      lastHeartbeatSuccessAt: this.lastHeartbeatSuccessAt,
      queuedRequests: this.queuedRequests.length,
      droppedLogRequests: this.droppedLogRequests,
    };
  }

  /**
   * Milliseconds since the last successful heartbeat.
   *
   * @param nowMs Reference time; defaults to the transport clock.
   * @returns Non-negative age, or `Infinity` when no heartbeat has succeeded yet.
   */
  getHeartbeatStaleAgeMs(nowMs = this.nowFn()): number {
    if (this.lastHeartbeatSuccessAt <= 0) return Number.POSITIVE_INFINITY;
    return Math.max(0, nowMs - this.lastHeartbeatSuccessAt);
  }

  /**
   * Reports whether the current heartbeat failure streak has lasted long
   * enough to reach the recycle threshold.
   *
   * The threshold is the smaller of 75% of the stale-claim TTL and the TTL
   * minus one heartbeat timeout, raised to at least 30 s, and finally capped
   * at the TTL itself.
   *
   * @param nowMs Reference time; defaults to the transport clock.
   * @returns False when there is no ongoing failure streak.
   */
  shouldRecycleBusyWorker(nowMs = this.nowFn()): boolean {
    const failureAgeMs =
      this.firstHeartbeatFailureAt >= 0
        ? Math.max(0, nowMs - this.firstHeartbeatFailureAt)
        : null;
    if (failureAgeMs == null) return false;
    const threshold = Math.min(
      this.staleClaimTtlMs,
      Math.max(
        30_000,
        Math.min(
          this.staleClaimTtlMs - this.heartbeatTimeoutMs,
          Math.floor(this.staleClaimTtlMs * 0.75),
        ),
      ),
    );
    return failureAgeMs >= threshold;
  }

  /**
   * Sends one heartbeat to `/workers/heartbeat`.
   *
   * Never throws: failures are logged, counted, and reported as `false`.
   *
   * @param payload Status, current job, and optional capabilities/details.
   * @returns True when the server answered with an OK status. False on any
   *   failure, and also (without sending) when another heartbeat is in flight.
   */
  async sendHeartbeat(payload: WorkerHeartbeatPayload): Promise<boolean> {
    if (this.heartbeatInFlight) {
      return false;
    }
    this.heartbeatInFlight = true;
    this.lastHeartbeatAttemptAt = this.nowFn();
    try {
      const response = await this.postJson("/workers/heartbeat", {
        workerId: this.workerId,
        status: payload.status,
        currentJobId: payload.currentJobId,
        pollMs: this.pollMs,
        capabilities: payload.capabilities ?? {},
        details: payload.details ?? {},
      }, this.heartbeatTimeoutMs);
      if (!response.ok) {
        const detail = await readResponseDetail(response);
        throw new Error(
          `heartbeat rejected (${response.status})${detail ? `: ${detail}` : ""}`,
        );
      }
      const previousFailures = this.consecutiveHeartbeatFailures;
      this.lastHeartbeatSuccessAt = this.nowFn();
      this.consecutiveHeartbeatFailures = 0;
      this.firstHeartbeatFailureAt = -1;
      this.lastHeartbeatFailureDetail = "";
      if (previousFailures > 0) {
        this.logInfo(
          `[WorkerPals] Heartbeat recovered for ${this.workerId} after ${previousFailures} failed attempt(s).`,
        );
      }
      return true;
    } catch (error) {
      // Only the first failure of a streak stamps the streak start time.
      if (this.consecutiveHeartbeatFailures === 0) {
        this.firstHeartbeatFailureAt = this.nowFn();
      }
      this.consecutiveHeartbeatFailures += 1;
      this.lastHeartbeatFailureDetail = error instanceof Error ? error.message : String(error);
      const staleAgeMs = this.getHeartbeatStaleAgeMs();
      this.logWarn(
        `[WorkerPals] Heartbeat failure ${this.consecutiveHeartbeatFailures} for ${this.workerId}: ${this.lastHeartbeatFailureDetail} (lastSuccessAgeMs=${Number.isFinite(staleAgeMs) ? staleAgeMs : -1}).`,
      );
      return false;
    } finally {
      this.heartbeatInFlight = false;
    }
  }

  /**
   * Queues a POST of `cmd` to `/sessions/{sessionId}/command`.
   *
   * @param sessionId Session the command belongs to.
   * @param cmd Command body.
   * @param options Queue lane (default "normal") and droppable flag (default false).
   * @returns Promise that resolves once the task has run or been dropped; it
   *   never rejects, and a non-OK response is only logged.
   */
  queueSessionCommand(
    sessionId: string,
    cmd: CommandRequest,
    options: { priority?: QueuePriority; droppable?: boolean } = {},
  ): Promise<void> {
    return this.enqueueTask({
      label: `command:${cmd.type}`,
      priority: options.priority ?? "normal",
      droppable: options.droppable ?? false,
      run: async () => {
        const response = await this.postJson(
          `/sessions/${sessionId}/command`,
          cmd,
          this.requestTimeoutMs,
        );
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Command ${cmd.type} failed: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Queues a POST of one log line to `/jobs/{jobId}/log` as a droppable,
   * normal-priority task.
   *
   * @param jobId Job the log line belongs to.
   * @param payload Log line body.
   * @returns Promise that resolves once the task has run or been dropped; it never rejects.
   */
  queueJobLog(
    jobId: string,
    payload: { stream: "stdout" | "stderr"; seq: number; message: string; ts: string },
  ): Promise<void> {
    return this.enqueueTask({
      label: "job_log",
      priority: "normal",
      droppable: true,
      run: async () => {
        const response = await this.postJson(`/jobs/${jobId}/log`, payload, this.requestTimeoutMs);
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Job log delivery failed for ${jobId}: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Waits until the queue is empty and no drain is running, or until
   * `timeoutMs` elapses, whichever comes first.
   *
   * Never rejects. On timeout a warning is logged and this call's waiter is
   * removed from the pending list so it does not linger until the next drain.
   *
   * @param timeoutMs Maximum time to wait in milliseconds.
   */
  async flush(timeoutMs = 15_000): Promise<void> {
    if (this.queuedRequests.length === 0 && !this.queueDrainInFlight) return;
    await new Promise<void>((resolve) => {
      let settled = false;
      const waiter = (): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        resolve();
      };
      const timer = setTimeout(() => {
        if (settled) return;
        settled = true;
        // Unregister the waiter: without this, every timed-out flush would
        // leave a dead closure in the list until the queue finally drains.
        const waiterIndex = this.queueFlushWaiters.indexOf(waiter);
        if (waiterIndex !== -1) this.queueFlushWaiters.splice(waiterIndex, 1);
        this.logWarn(
          `[WorkerPals] Timed out flushing queued server transport requests after ${timeoutMs}ms (queued=${this.queuedRequests.length}).`,
        );
        resolve();
      }, timeoutMs);
      this.queueFlushWaiters.push(waiter);
      // Covers the case where the queue went idle between the check above and registration.
      this.maybeResolveFlushWaiters();
    });
  }

  /**
   * Adds a task to the queue and starts draining.
   *
   * Droppable tasks are discarded (and counted) when the queue is already at
   * its limit. High-priority tasks are inserted ahead of the first
   * non-high-priority entry; everything else is appended.
   */
  private enqueueTask(task: Omit<TransportTask, "resolve">): Promise<void> {
    if (task.droppable && this.queuedRequests.length >= this.maxQueuedRequests) {
      this.droppedLogRequests += 1;
      // Warn on the first drop and then every 25th to avoid flooding the log.
      if (this.droppedLogRequests === 1 || this.droppedLogRequests % 25 === 0) {
        this.logWarn(
          `[WorkerPals] Dropped ${this.droppedLogRequests} queued low-priority transport request(s) because the queue is saturated (limit=${this.maxQueuedRequests}).`,
        );
      }
      return Promise.resolve();
    }

    return new Promise((resolve) => {
      const queued: TransportTask = { ...task, resolve };
      if (queued.priority === "high") {
        const firstNormalIndex = this.queuedRequests.findIndex((entry) => entry.priority !== "high");
        if (firstNormalIndex === -1) {
          this.queuedRequests.push(queued);
        } else {
          this.queuedRequests.splice(firstNormalIndex, 0, queued);
        }
      } else {
        this.queuedRequests.push(queued);
      }
      void this.drainQueue();
    });
  }

  /**
   * Runs queued tasks one at a time until the queue is empty.
   *
   * A task that throws is logged and still resolved, so callers awaiting a
   * queued request are never left hanging. Returns immediately if a drain is
   * already running.
   */
  private async drainQueue(): Promise<void> {
    if (this.queueDrainInFlight) return;
    this.queueDrainInFlight = true;
    try {
      while (this.queuedRequests.length > 0) {
        const task = this.queuedRequests.shift();
        if (!task) break;
        try {
          await task.run();
        } catch (error) {
          this.logWarn(
            `[WorkerPals] Transport request ${task.label} failed: ${
              error instanceof Error ? error.message : String(error)
            }`,
          );
        } finally {
          task.resolve();
        }
      }
    } finally {
      this.queueDrainInFlight = false;
      this.maybeResolveFlushWaiters();
    }
  }

  /** Releases every pending `flush()` caller once the queue is empty and idle. */
  private maybeResolveFlushWaiters(): void {
    if (this.queuedRequests.length > 0 || this.queueDrainInFlight) return;
    const waiters = this.queueFlushWaiters.splice(0);
    for (const waiter of waiters) waiter();
  }

  /**
   * POSTs `payload` as JSON to `server + path`, aborting after `timeoutMs`.
   *
   * @throws Error with a "request timed out" message when the abort fired;
   *   otherwise rethrows whatever the fetch implementation threw.
   */
  private async postJson(path: string, payload: unknown, timeoutMs: number): Promise<Response> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort("timeout"), timeoutMs);
    try {
      return await this.fetchFn(`${this.server}${path}`, {
        method: "POST",
        headers: this.headers,
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
    } catch (error) {
      if (controller.signal.aborted) {
        throw new Error(`request timed out after ${timeoutMs}ms (${path})`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }
}
@@ -20,7 +20,6 @@
20
20
  * JobRunner executes a single job, streams logs, and outputs a final result with a sentinel line.
21
21
  */
22
22
 
23
- import type { CommandRequest } from "protocol";
24
23
  import { randomUUID } from "crypto";
25
24
  import { mkdirSync } from "fs";
26
25
  import { resolve } from "path";
@@ -44,6 +43,7 @@ import {
44
43
  } from "./execute_job.js";
45
44
  import { DockerExecutionExhaustedError, DockerExecutor } from "./docker_executor.js";
46
45
  import { forceDeleteWorktreePath } from "./common/worktree_cleanup.js";
46
+ import { WorkerServerTransport, type WorkerHeartbeatPayload } from "./common/server_transport.js";
47
47
  import { DEFAULT_DOCKER_TIMEOUT_MS, parseDockerTimeoutMs } from "./timeout_policy.js";
48
48
 
49
49
  type CommitRef = {
@@ -95,6 +95,31 @@ function estimateTokensFromText(text: string): number {
95
95
  return Math.max(0, Math.ceil(String(text ?? "").length / 3));
96
96
  }
97
97
 
98
/**
 * POSTs `body` as JSON to `url` using the global `fetch`, aborting the
 * request once `timeoutMs` has elapsed.
 *
 * @param url Absolute request URL.
 * @param headers Headers sent with the request.
 * @param body Value serialized with `JSON.stringify` as the request body.
 * @param timeoutMs Abort deadline in milliseconds (default 10 000).
 * @returns The response as returned by `fetch` (status is not inspected).
 * @throws Error with a "request timed out" message when the abort fired;
 *   otherwise rethrows the original failure.
 */
async function postJsonWithTimeout(
  url: string,
  headers: Record<string, string>,
  body: unknown,
  timeoutMs = 10_000,
): Promise<Response> {
  const abortController = new AbortController();
  const deadline = setTimeout(() => {
    abortController.abort("timeout");
  }, timeoutMs);
  try {
    const response = await fetch(url, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
      signal: abortController.signal,
    });
    return response;
  } catch (cause) {
    // An aborted signal means the deadline fired; surface that as a timeout.
    throw abortController.signal.aborted
      ? new Error(`request timed out after ${timeoutMs}ms: ${url}`)
      : cause;
  } finally {
    clearTimeout(deadline);
  }
}
122
+
98
123
  function buildWorkerLlmUsageEvent(
99
124
  job: {
100
125
  kind: string;
@@ -172,11 +197,7 @@ async function reportWorkerLlmUsage(
172
197
  ): Promise<void> {
173
198
  const payload = buildWorkerLlmUsageEvent(job, result);
174
199
  if (!payload) return;
175
- const response = await fetch(`${server}/telemetry/llm-usage`, {
176
- method: "POST",
177
- headers,
178
- body: JSON.stringify(payload),
179
- });
200
+ const response = await postJsonWithTimeout(`${server}/telemetry/llm-usage`, headers, payload);
180
201
  if (!response.ok) {
181
202
  const detail = await response.text().catch(() => "");
182
203
  throw new Error(
@@ -223,6 +244,16 @@ export function shouldEmitDirectSessionJobEvent(options: {
223
244
  return !options.statusPersistedToServer;
224
245
  }
225
246
 
247
/**
 * Decides whether a worker should be recycled because its heartbeats are
 * no longer getting through.
 *
 * @param options.heartbeatDelivered Whether the latest heartbeat succeeded.
 * @param options.allowHeartbeatRecycle Whether recycling is currently permitted.
 * @param options.transportStale Whether the transport considers itself stale.
 * @returns `transportStale` when the heartbeat was not delivered and recycling
 *   is allowed; otherwise false.
 */
export function shouldRecycleWorkerForHeartbeatDegradation(options: {
  heartbeatDelivered: boolean;
  allowHeartbeatRecycle: boolean;
  transportStale: boolean;
}): boolean {
  const { heartbeatDelivered, allowHeartbeatRecycle, transportStale } = options;
  const recycleRuledOut = heartbeatDelivered || !allowHeartbeatRecycle;
  return recycleRuledOut ? false : transportStale;
}
256
+
226
257
  function shouldRecycleWorkerForCodexUnavailableFailure(
227
258
  summary: string,
228
259
  stderr?: string | null,
@@ -853,19 +884,15 @@ async function enqueueCompletion(
853
884
  resultSummary,
854
885
  });
855
886
 
856
- const response = await fetch(`${server}/completions/enqueue`, {
857
- method: "POST",
858
- headers,
859
- body: JSON.stringify({
860
- jobId: job.id,
861
- sessionId: job.sessionId,
862
- commitSha: commit.sha,
863
- branch: commit.branch,
864
- message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
865
- prUrl,
866
- prTitle: pr.title,
867
- prBody: pr.body,
868
- }),
887
+ const response = await postJsonWithTimeout(`${server}/completions/enqueue`, headers, {
888
+ jobId: job.id,
889
+ sessionId: job.sessionId,
890
+ commitSha: commit.sha,
891
+ branch: commit.branch,
892
+ message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
893
+ prUrl,
894
+ prTitle: pr.title,
895
+ prBody: pr.body,
869
896
  });
870
897
 
871
898
  if (response.ok) {
@@ -883,24 +910,6 @@ async function enqueueCompletion(
883
910
  }
884
911
  }
885
912
 
886
- function sendCommand(
887
- server: string,
888
- sessionId: string,
889
- headers: Record<string, string>,
890
- cmd: CommandRequest,
891
- ): Promise<void> {
892
- return fetch(`${server}/sessions/${sessionId}/command`, {
893
- method: "POST",
894
- headers,
895
- body: JSON.stringify(cmd),
896
- })
897
- .then((res) => {
898
- if (!res.ok) console.error(`[WorkerPals] Command ${cmd.type} failed: ${res.status}`);
899
- })
900
- .catch((err) => console.error(`[WorkerPals] Command ${cmd.type} error:`, err));
901
- }
902
-
903
- type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";
904
913
  type WorkerRuntimeState = {
905
914
  currentJobId: string | null;
906
915
  currentSessionId: string | null;
@@ -913,44 +922,11 @@ function buildWorkerHeaders(authToken: string | null): Record<string, string> {
913
922
  return headers;
914
923
  }
915
924
 
916
- async function sendWorkerHeartbeat(
917
- opts: ReturnType<typeof parseArgs>,
918
- headers: Record<string, string>,
919
- status: WorkerHeartbeatStatus,
920
- currentJobId: string | null = null,
921
- ): Promise<void> {
922
- try {
923
- await fetch(`${opts.server}/workers/heartbeat`, {
924
- method: "POST",
925
- headers,
926
- body: JSON.stringify({
927
- workerId: opts.workerId,
928
- status,
929
- currentJobId,
930
- pollMs: opts.pollMs,
931
- capabilities: {
932
- docker: opts.docker,
933
- labels: opts.labels,
934
- executor: resolveExecutor(CONFIG),
935
- requireDocker: opts.requireDocker,
936
- },
937
- details: {
938
- repo: opts.repo,
939
- baseRef: opts.worktreeBaseRef,
940
- dockerImage: opts.docker ? opts.dockerImage : null,
941
- dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
942
- },
943
- }),
944
- });
945
- } catch (err) {
946
- console.error(`[WorkerPals] Heartbeat error:`, err);
947
- }
948
- }
949
-
950
925
  async function failActiveJobOnShutdown(
951
926
  opts: ReturnType<typeof parseArgs>,
952
927
  headers: Record<string, string>,
953
928
  runtimeState: WorkerRuntimeState,
929
+ transport: WorkerServerTransport,
954
930
  signalName: string,
955
931
  ): Promise<void> {
956
932
  const activeJobId = runtimeState.currentJobId;
@@ -961,10 +937,9 @@ async function failActiveJobOnShutdown(
961
937
  let statusPersistedToServer = false;
962
938
 
963
939
  try {
964
- const response = await fetch(`${opts.server}/jobs/${activeJobId}/fail`, {
965
- method: "POST",
966
- headers,
967
- body: JSON.stringify({ message, detail }),
940
+ const response = await postJsonWithTimeout(`${opts.server}/jobs/${activeJobId}/fail`, headers, {
941
+ message,
942
+ detail,
968
943
  });
969
944
  statusPersistedToServer = response.ok;
970
945
  } catch (err) {
@@ -978,7 +953,7 @@ async function failActiveJobOnShutdown(
978
953
  runtimeState.currentSessionId &&
979
954
  shouldEmitDirectSessionJobEvent({ ok: false, statusPersistedToServer })
980
955
  ) {
981
- await sendCommand(opts.server, runtimeState.currentSessionId, headers, {
956
+ await transport.queueSessionCommand(runtimeState.currentSessionId, {
982
957
  type: "job_failed",
983
958
  payload: {
984
959
  jobId: activeJobId,
@@ -986,7 +961,7 @@ async function failActiveJobOnShutdown(
986
961
  detail,
987
962
  },
988
963
  from: `worker:${opts.workerId}`,
989
- });
964
+ }, { priority: "high" });
990
965
  }
991
966
  }
992
967
 
@@ -994,6 +969,8 @@ async function workerLoop(
994
969
  opts: ReturnType<typeof parseArgs>,
995
970
  dockerExecutor: DockerExecutor | null,
996
971
  runtimeState: WorkerRuntimeState,
972
+ transport: WorkerServerTransport,
973
+ requestWorkerRestart: (reason: string) => void,
997
974
  ): Promise<void> {
998
975
  const headers = buildWorkerHeaders(opts.authToken);
999
976
 
@@ -1007,17 +984,37 @@ async function workerLoop(
1007
984
  }
1008
985
  console.log(`[WorkerPals ${opts.workerId}] Executor backend: ${resolveExecutor(CONFIG)}`);
1009
986
  const heartbeatEveryMs = Math.max(1000, opts.heartbeatMs);
987
+ const claimTimeoutMs = Math.max(4_000, Math.min(15_000, opts.pollMs * 3));
1010
988
  let lastHeartbeatAt = 0;
989
+ const buildHeartbeatPayload = (
990
+ status: WorkerHeartbeatPayload["status"],
991
+ currentJobId: string | null,
992
+ ): WorkerHeartbeatPayload => ({
993
+ status,
994
+ currentJobId,
995
+ capabilities: {
996
+ docker: opts.docker,
997
+ labels: opts.labels,
998
+ executor: resolveExecutor(CONFIG),
999
+ requireDocker: opts.requireDocker,
1000
+ },
1001
+ details: {
1002
+ repo: opts.repo,
1003
+ baseRef: opts.worktreeBaseRef,
1004
+ dockerImage: opts.docker ? opts.dockerImage : null,
1005
+ dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
1006
+ },
1007
+ });
1011
1008
 
1012
1009
  const maybeHeartbeat = async (
1013
- status: WorkerHeartbeatStatus,
1010
+ status: WorkerHeartbeatPayload["status"],
1014
1011
  currentJobId: string | null = null,
1015
1012
  force = false,
1016
1013
  ) => {
1017
1014
  const now = Date.now();
1018
1015
  if (!force && now - lastHeartbeatAt < heartbeatEveryMs) return;
1019
- await sendWorkerHeartbeat(opts, headers, status, currentJobId);
1020
- lastHeartbeatAt = now;
1016
+ const ok = await transport.sendHeartbeat(buildHeartbeatPayload(status, currentJobId));
1017
+ if (ok) lastHeartbeatAt = now;
1021
1018
  };
1022
1019
 
1023
1020
  await maybeHeartbeat("idle", null, true);
@@ -1025,11 +1022,12 @@ async function workerLoop(
1025
1022
  while (!runtimeState.shutdownRequested) {
1026
1023
  try {
1027
1024
  await maybeHeartbeat("idle");
1028
- const claimRes = await fetch(`${opts.server}/jobs/claim`, {
1029
- method: "POST",
1025
+ const claimRes = await postJsonWithTimeout(
1026
+ `${opts.server}/jobs/claim`,
1030
1027
  headers,
1031
- body: JSON.stringify({ workerId: opts.workerId }),
1032
- });
1028
+ { workerId: opts.workerId },
1029
+ claimTimeoutMs,
1030
+ );
1033
1031
 
1034
1032
  if (claimRes.ok) {
1035
1033
  const data = (await claimRes.json()) as any;
@@ -1040,22 +1038,35 @@ async function workerLoop(
1040
1038
  runtimeState.currentSessionId = job.sessionId ?? null;
1041
1039
  console.log(`[WorkerPals] Claimed job ${job.id} (${job.kind})`);
1042
1040
  await maybeHeartbeat("busy", job.id, true);
1041
+ let allowHeartbeatRecycle = true;
1043
1042
 
1044
1043
  const busyHeartbeat = setInterval(() => {
1045
- void sendWorkerHeartbeat(opts, headers, "busy", job.id);
1044
+ void transport.sendHeartbeat(buildHeartbeatPayload("busy", job.id)).then((ok) => {
1045
+ if (
1046
+ !shouldRecycleWorkerForHeartbeatDegradation({
1047
+ heartbeatDelivered: ok,
1048
+ allowHeartbeatRecycle,
1049
+ transportStale: transport.shouldRecycleBusyWorker(),
1050
+ })
1051
+ ) {
1052
+ return;
1053
+ }
1054
+ requestWorkerRestart(
1055
+ `heartbeat transport stale while claimed job ${job.id} is still running`,
1056
+ );
1057
+ });
1046
1058
  }, heartbeatEveryMs);
1047
1059
 
1048
1060
  if (job.sessionId) {
1049
- await sendCommand(opts.server, job.sessionId, headers, {
1061
+ await transport.queueSessionCommand(job.sessionId, {
1050
1062
  type: "job_claimed",
1051
1063
  payload: { jobId: job.id, workerId: opts.workerId },
1052
1064
  from: `worker:${opts.workerId}`,
1053
- });
1065
+ }, { priority: "high" });
1054
1066
  }
1055
1067
 
1056
1068
  let stdoutSeq = 0;
1057
1069
  let stderrSeq = 0;
1058
- let logChain: Promise<void> = Promise.resolve();
1059
1070
  let lastCleanLog = "";
1060
1071
  let lastCleanLogAt = 0;
1061
1072
 
@@ -1077,20 +1088,17 @@ async function workerLoop(
1077
1088
  const logTs = new Date(now).toISOString();
1078
1089
 
1079
1090
  const seq = stream === "stdout" ? ++stdoutSeq : ++stderrSeq;
1080
- logChain = logChain.then(() =>
1081
- Promise.allSettled([
1082
- sendCommand(opts.server, job.sessionId, headers, {
1083
- type: "job_log",
1084
- payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
1085
- from: `worker:${opts.workerId}`,
1086
- }),
1087
- fetch(`${opts.server}/jobs/${job.id}/log`, {
1088
- method: "POST",
1089
- headers,
1090
- body: JSON.stringify({ stream, seq, message: cleaned, ts: logTs }),
1091
- }),
1092
- ]).then(() => undefined),
1093
- );
1091
+ void transport.queueSessionCommand(job.sessionId, {
1092
+ type: "job_log",
1093
+ payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
1094
+ from: `worker:${opts.workerId}`,
1095
+ }, { droppable: true });
1096
+ void transport.queueJobLog(job.id, {
1097
+ stream,
1098
+ seq,
1099
+ message: cleaned,
1100
+ ts: logTs,
1101
+ });
1094
1102
  }
1095
1103
  : undefined;
1096
1104
 
@@ -1153,7 +1161,8 @@ async function workerLoop(
1153
1161
  }
1154
1162
  const jobDurationMs = Math.max(0, Date.now() - jobStartedAtMs);
1155
1163
 
1156
- await logChain;
1164
+ allowHeartbeatRecycle = false;
1165
+ await transport.flush();
1157
1166
  try {
1158
1167
  await reportWorkerLlmUsage(opts.server, headers, jobData, result);
1159
1168
  } catch (err) {
@@ -1267,10 +1276,10 @@ async function workerLoop(
1267
1276
  reviewAgent.prUrl.trim().length > 0
1268
1277
  ? reviewAgent.prUrl.trim()
1269
1278
  : null;
1270
- const response = await fetch(`${opts.server}/jobs/${job.id}/complete`, {
1271
- method: "POST",
1279
+ const response = await postJsonWithTimeout(
1280
+ `${opts.server}/jobs/${job.id}/complete`,
1272
1281
  headers,
1273
- body: JSON.stringify({
1282
+ {
1274
1283
  summary: result.summary,
1275
1284
  durationMs: jobDurationMs,
1276
1285
  prUrl: jobPrUrl,
@@ -1278,21 +1287,17 @@ async function workerLoop(
1278
1287
  ...(result.stdout ? [{ kind: "stdout", text: result.stdout }] : []),
1279
1288
  ...(result.stderr ? [{ kind: "stderr", text: result.stderr }] : []),
1280
1289
  ],
1281
- }),
1282
- });
1290
+ },
1291
+ );
1283
1292
  statusPersistedToServer = response.ok;
1284
1293
  console.log(
1285
1294
  `[WorkerPals] Job ${job.id} completed in ${formatDurationMs(jobDurationMs)}: ${result.summary}`,
1286
1295
  );
1287
1296
  } else {
1288
- const response = await fetch(`${opts.server}/jobs/${job.id}/fail`, {
1289
- method: "POST",
1290
- headers,
1291
- body: JSON.stringify({
1292
- message: result.summary,
1293
- detail: redactSensitiveText(result.stderr ?? ""),
1294
- durationMs: jobDurationMs,
1295
- }),
1297
+ const response = await postJsonWithTimeout(`${opts.server}/jobs/${job.id}/fail`, headers, {
1298
+ message: result.summary,
1299
+ detail: redactSensitiveText(result.stderr ?? ""),
1300
+ durationMs: jobDurationMs,
1296
1301
  });
1297
1302
  statusPersistedToServer = response.ok;
1298
1303
  console.log(
@@ -1327,11 +1332,11 @@ async function workerLoop(
1327
1332
  ? `${rawText.slice(0, maxResponseChars - 3)}...`
1328
1333
  : rawText;
1329
1334
  if (assistantText) {
1330
- await sendCommand(opts.server, job.sessionId, headers, {
1335
+ await transport.queueSessionCommand(job.sessionId, {
1331
1336
  type: "assistant_message",
1332
1337
  payload: { text: assistantText },
1333
1338
  from: `worker:${opts.workerId}`,
1334
- });
1339
+ }, { priority: "high" });
1335
1340
  }
1336
1341
  }
1337
1342
 
@@ -1358,7 +1363,9 @@ async function workerLoop(
1358
1363
  from: `worker:${opts.workerId}`,
1359
1364
  };
1360
1365
 
1361
- await sendCommand(opts.server, job.sessionId, headers, eventCmd);
1366
+ await transport.queueSessionCommand(job.sessionId, eventCmd, {
1367
+ priority: "high",
1368
+ });
1362
1369
  }
1363
1370
  }
1364
1371
  } finally {
@@ -1369,13 +1376,13 @@ async function workerLoop(
1369
1376
  result?.cooldownMs &&
1370
1377
  result.cooldownMs > 0
1371
1378
  ) {
1372
- await sendCommand(opts.server, job.sessionId, headers, {
1379
+ await transport.queueSessionCommand(job.sessionId, {
1373
1380
  type: "assistant_message",
1374
1381
  payload: {
1375
1382
  text: `WorkerPal is cooling down for ${formatDurationMs(result.cooldownMs)} after transient infrastructure failures.`,
1376
1383
  },
1377
1384
  from: `worker:${opts.workerId}`,
1378
- });
1385
+ }, { priority: "high" });
1379
1386
  }
1380
1387
  if (!recycleWorkerAfterJob && result?.cooldownMs && result.cooldownMs > 0) {
1381
1388
  const cooldownMs = Math.max(0, Math.floor(result.cooldownMs));
@@ -1501,6 +1508,14 @@ async function main(): Promise<void> {
1501
1508
  shutdownRequested: false,
1502
1509
  };
1503
1510
  const headers = buildWorkerHeaders(opts.authToken);
1511
+ const transport = new WorkerServerTransport({
1512
+ server: opts.server,
1513
+ headers,
1514
+ workerId: opts.workerId,
1515
+ pollMs: opts.pollMs,
1516
+ heartbeatMs: opts.heartbeatMs,
1517
+ staleClaimTtlMs: CONFIG.server.staleClaimTtlMs,
1518
+ });
1504
1519
  let shutdownTriggered = false;
1505
1520
  const shutdownAndExit = (signalName: string, code: number) => {
1506
1521
  if (shutdownTriggered) return;
@@ -1517,9 +1532,25 @@ async function main(): Promise<void> {
1517
1532
 
1518
1533
  void (async () => {
1519
1534
  await withTimeout(
1520
- sendWorkerHeartbeat(opts, headers, "offline", runtimeState.currentJobId ?? null),
1535
+ transport.sendHeartbeat({
1536
+ status: "offline",
1537
+ currentJobId: runtimeState.currentJobId ?? null,
1538
+ capabilities: {
1539
+ docker: opts.docker,
1540
+ labels: opts.labels,
1541
+ executor: resolveExecutor(CONFIG),
1542
+ requireDocker: opts.requireDocker,
1543
+ },
1544
+ details: {
1545
+ repo: opts.repo,
1546
+ baseRef: opts.worktreeBaseRef,
1547
+ dockerImage: opts.docker ? opts.dockerImage : null,
1548
+ dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
1549
+ },
1550
+ }),
1521
1551
  );
1522
- await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, signalName));
1552
+ await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, transport, signalName));
1553
+ await withTimeout(transport.flush());
1523
1554
  if (dockerExecutor) {
1524
1555
  await withTimeout(
1525
1556
  dockerExecutor.shutdown().catch((err) => {
@@ -1548,7 +1579,13 @@ async function main(): Promise<void> {
1548
1579
  }
1549
1580
  });
1550
1581
 
1551
- workerLoop(opts, dockerExecutor, runtimeState).catch((err) => {
1582
+ const requestWorkerRestart = (reason: string) => {
1583
+ if (shutdownTriggered) return;
1584
+ console.error(`[WorkerPals] Control plane unhealthy: ${reason}. Recycling worker.`);
1585
+ shutdownAndExit("CONTROL_PLANE_UNHEALTHY", 91);
1586
+ };
1587
+
1588
+ workerLoop(opts, dockerExecutor, runtimeState, transport, requestWorkerRestart).catch((err) => {
1552
1589
  console.error("[WorkerPals] Fatal:", err);
1553
1590
  process.exit(1);
1554
1591
  });