@pushpalsdev/cli 1.0.38 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -122,6 +122,7 @@ class ServiceManager {
122
122
  onServiceDegraded;
123
123
  onEvent;
124
124
  timer;
125
+ shutdownBegun = false;
125
126
  stopped = false;
126
127
  constructor(options = {}) {
127
128
  this.pollMs = Math.max(50, Math.floor(options.pollMs ?? DEFAULT_SERVICE_MANAGER_POLL_MS));
@@ -161,10 +162,10 @@ class ServiceManager {
161
162
  action: this.degradedAction
162
163
  };
163
164
  }
164
- stop() {
165
- if (this.stopped)
165
+ beginShutdown() {
166
+ if (this.shutdownBegun || this.stopped)
166
167
  return;
167
- this.stopped = true;
168
+ this.shutdownBegun = true;
168
169
  clearInterval(this.timer);
169
170
  for (const state of this.stateByService.values()) {
170
171
  if (!state.pendingRestartTimer)
@@ -172,6 +173,12 @@ class ServiceManager {
172
173
  clearTimeout(state.pendingRestartTimer);
173
174
  state.pendingRestartTimer = null;
174
175
  }
176
+ }
177
+ stop() {
178
+ if (this.stopped)
179
+ return;
180
+ this.beginShutdown();
181
+ this.stopped = true;
175
182
  for (const service of this.services.values()) {
176
183
  try {
177
184
  const pid = service.proc.pid;
@@ -207,7 +214,7 @@ class ServiceManager {
207
214
  this.onEvent?.(level, line);
208
215
  }
209
216
  tick() {
210
- if (this.stopped)
217
+ if (this.shutdownBegun || this.stopped)
211
218
  return;
212
219
  const now = Date.now();
213
220
  for (const [name, service] of this.services.entries()) {
@@ -250,7 +257,7 @@ class ServiceManager {
250
257
  state.pendingRestartTimer = setTimeout(() => {
251
258
  state.pendingRestartTimer = null;
252
259
  state.nextRestartAtMs = 0;
253
- if (this.stopped)
260
+ if (this.shutdownBegun || this.stopped)
254
261
  return;
255
262
  const current = this.services.get(name);
256
263
  if (!current || !current.exited)
@@ -4792,6 +4799,7 @@ async function main() {
4792
4799
  return;
4793
4800
  const serviceManager = autoStartedServiceManager;
4794
4801
  autoStartedServiceManager = null;
4802
+ serviceManager.beginShutdown();
4795
4803
  const shutdown = await requestLocalRuntimeShutdown(serverUrl, repoRoot, reason);
4796
4804
  if (shutdown.attempted && shutdown.accepted) {
4797
4805
  console.log("[pushpals] Local runtime shutdown accepted; waiting for services to exit...");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pushpalsdev/cli",
3
- "version": "1.0.38",
3
+ "version": "1.0.40",
4
4
  "description": "PushPals terminal CLI for LocalBuddy -> RemoteBuddy orchestration",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -0,0 +1,323 @@
1
+ import type { CommandRequest } from "protocol";
2
+
3
+ export type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";
4
+
5
+ type QueuePriority = "high" | "normal";
6
+
7
+ type TransportTask = {
8
+ label: string;
9
+ priority: QueuePriority;
10
+ droppable: boolean;
11
+ run: () => Promise<void>;
12
+ resolve: () => void;
13
+ };
14
+
15
+ export type WorkerServerTransportOptions = {
16
+ server: string;
17
+ headers: Record<string, string>;
18
+ workerId: string;
19
+ pollMs: number;
20
+ heartbeatMs: number;
21
+ staleClaimTtlMs: number;
22
+ fetchFn?: typeof fetch;
23
+ logInfo?: (message: string) => void;
24
+ logWarn?: (message: string) => void;
25
+ nowFn?: () => number;
26
+ };
27
+
28
+ export type WorkerTransportHealthSnapshot = {
29
+ heartbeatInFlight: boolean;
30
+ consecutiveHeartbeatFailures: number;
31
+ lastHeartbeatAttemptAt: number;
32
+ lastHeartbeatSuccessAt: number;
33
+ queuedRequests: number;
34
+ droppedLogRequests: number;
35
+ };
36
+
37
+ export type WorkerHeartbeatPayload = {
38
+ status: WorkerHeartbeatStatus;
39
+ currentJobId: string | null;
40
+ capabilities?: Record<string, unknown>;
41
+ details?: Record<string, unknown>;
42
+ };
43
+
44
/**
 * Derives the abort timeout for a single heartbeat POST from the heartbeat interval.
 *
 * The timeout is 80% of the interval (rounded down), clamped to [1500, 4000] ms.
 *
 * @param heartbeatMs - Interval between heartbeats, in milliseconds.
 * @returns Timeout in milliseconds, always a finite number in [1500, 4000].
 */
function computeHeartbeatTimeoutMs(heartbeatMs: number): number {
  const minTimeoutMs = 1_500;
  const maxTimeoutMs = 4_000;
  const scaled = Math.floor(heartbeatMs * 0.8);
  // Math.min/Math.max propagate NaN, and a NaN delay makes the abort timer fire
  // almost immediately; fall back to the floor instead.
  if (Number.isNaN(scaled)) return minTimeoutMs;
  return Math.max(minTimeoutMs, Math.min(maxTimeoutMs, scaled));
}
47
+
48
/**
 * Derives the abort timeout for queued (non-heartbeat) POSTs from the heartbeat interval.
 *
 * The timeout is twice the interval (rounded down), clamped to [4000, 10000] ms.
 *
 * @param heartbeatMs - Interval between heartbeats, in milliseconds.
 * @returns Timeout in milliseconds, always a finite number in [4000, 10000].
 */
function computeRequestTimeoutMs(heartbeatMs: number): number {
  const minTimeoutMs = 4_000;
  const maxTimeoutMs = 10_000;
  const scaled = Math.floor(heartbeatMs * 2);
  // Math.min/Math.max propagate NaN, and a NaN delay makes the abort timer fire
  // almost immediately; fall back to the floor instead.
  if (Number.isNaN(scaled)) return minTimeoutMs;
  return Math.max(minTimeoutMs, Math.min(maxTimeoutMs, scaled));
}
51
+
52
/**
 * Best-effort read of a response body for use in log and error messages.
 *
 * Never throws: any failure to read the body (a rejected promise or a
 * synchronous throw from `text()`) yields an empty string.
 *
 * @param response - Response whose body should be read as text.
 * @returns The body text with surrounding whitespace removed, or "" on failure.
 */
async function readResponseDetail(response: Response): Promise<string> {
  try {
    // `await` inside `try` covers both a synchronous throw and a rejection,
    // whereas `.catch()` alone only covers the rejection.
    const text = await response.text();
    return text.trim();
  } catch {
    return "";
  }
}
56
+
57
/**
 * Worker-to-server HTTP transport.
 *
 * Heartbeats are posted directly, with at most one in flight at a time, and
 * their success/failure history is tracked so callers can decide when the
 * connection is stale. Session commands and job logs go through a single queue
 * that is drained one request at a time; high-priority tasks are placed ahead
 * of normal ones, and droppable tasks are discarded once the queue holds
 * `maxQueuedRequests` entries.
 */
export class WorkerServerTransport {
  private readonly server: string;
  private readonly headers: Record<string, string>;
  private readonly workerId: string;
  private readonly pollMs: number;
  private readonly staleClaimTtlMs: number;
  private readonly fetchFn: typeof fetch;
  private readonly logInfo: (message: string) => void;
  private readonly logWarn: (message: string) => void;
  private readonly nowFn: () => number;
  private readonly heartbeatTimeoutMs: number;
  private readonly requestTimeoutMs: number;
  private readonly maxQueuedRequests = 256;

  private queuedRequests: TransportTask[] = [];
  private queueDrainInFlight = false;
  private queueFlushWaiters: Array<() => void> = [];
  private droppedLogRequests = 0;
  private heartbeatInFlight = false;
  private lastHeartbeatAttemptAt = 0;
  private lastHeartbeatSuccessAt = 0;
  private consecutiveHeartbeatFailures = 0;
  // -1 means "no failure streak in progress".
  private firstHeartbeatFailureAt = -1;
  private lastHeartbeatFailureDetail = "";

  /**
   * @param options - Server location, identity, timing and optional injection
   *   points (`fetchFn`, `logInfo`, `logWarn`, `nowFn`) which default to the
   *   global `fetch`, `console.log`, `console.warn` and `Date.now`.
   */
  constructor(options: WorkerServerTransportOptions) {
    this.server = options.server;
    this.headers = options.headers;
    this.workerId = options.workerId;
    this.pollMs = options.pollMs;
    this.staleClaimTtlMs = options.staleClaimTtlMs;
    this.fetchFn = options.fetchFn ?? fetch;
    this.logInfo = options.logInfo ?? ((message) => console.log(message));
    this.logWarn = options.logWarn ?? ((message) => console.warn(message));
    this.nowFn = options.nowFn ?? (() => Date.now());
    this.heartbeatTimeoutMs = computeHeartbeatTimeoutMs(options.heartbeatMs);
    this.requestTimeoutMs = computeRequestTimeoutMs(options.heartbeatMs);
  }

  /** Returns a point-in-time copy of the heartbeat and queue counters. */
  getHealthSnapshot(): WorkerTransportHealthSnapshot {
    return {
      heartbeatInFlight: this.heartbeatInFlight,
      consecutiveHeartbeatFailures: this.consecutiveHeartbeatFailures,
      lastHeartbeatAttemptAt: this.lastHeartbeatAttemptAt,
      lastHeartbeatSuccessAt: this.lastHeartbeatSuccessAt,
      queuedRequests: this.queuedRequests.length,
      droppedLogRequests: this.droppedLogRequests,
    };
  }

  /**
   * Milliseconds since the last successful heartbeat.
   *
   * @param nowMs - Current time; defaults to `nowFn()`.
   * @returns A non-negative age, or `Infinity` if no heartbeat has succeeded yet.
   */
  getHeartbeatStaleAgeMs(nowMs = this.nowFn()): number {
    if (this.lastHeartbeatSuccessAt <= 0) return Number.POSITIVE_INFINITY;
    return Math.max(0, nowMs - this.lastHeartbeatSuccessAt);
  }

  /**
   * Reports whether the current heartbeat failure streak has lasted long enough
   * to cross the recycle threshold.
   *
   * The threshold is the smaller of `staleClaimTtlMs - heartbeatTimeoutMs` and
   * 75% of `staleClaimTtlMs`, raised to at least 30 s and then capped at
   * `staleClaimTtlMs`.
   *
   * @param nowMs - Current time; defaults to `nowFn()`.
   * @returns `false` when no failure streak is in progress.
   */
  shouldRecycleBusyWorker(nowMs = this.nowFn()): boolean {
    const failureAgeMs =
      this.firstHeartbeatFailureAt >= 0
        ? Math.max(0, nowMs - this.firstHeartbeatFailureAt)
        : null;
    if (failureAgeMs == null) return false;
    const threshold = Math.min(
      this.staleClaimTtlMs,
      Math.max(
        30_000,
        Math.min(
          this.staleClaimTtlMs - this.heartbeatTimeoutMs,
          Math.floor(this.staleClaimTtlMs * 0.75),
        ),
      ),
    );
    return failureAgeMs >= threshold;
  }

  /**
   * Posts one heartbeat to `/workers/heartbeat` and updates the failure tracking.
   *
   * @param payload - Status and optional capabilities/details (both default to `{}`).
   * @returns `true` when the server accepted the heartbeat; `false` when it was
   *   rejected, failed, timed out, or skipped because another heartbeat is
   *   still in flight. Never throws.
   */
  async sendHeartbeat(payload: WorkerHeartbeatPayload): Promise<boolean> {
    if (this.heartbeatInFlight) {
      return false;
    }
    this.heartbeatInFlight = true;
    this.lastHeartbeatAttemptAt = this.nowFn();
    try {
      const result = await this.postJson(
        "/workers/heartbeat",
        {
          workerId: this.workerId,
          status: payload.status,
          currentJobId: payload.currentJobId,
          pollMs: this.pollMs,
          capabilities: payload.capabilities ?? {},
          details: payload.details ?? {},
        },
        this.heartbeatTimeoutMs,
      );
      if (!result.ok) {
        throw new Error(
          `heartbeat rejected (${result.status})${result.detail ? `: ${result.detail}` : ""}`,
        );
      }
      const previousFailures = this.consecutiveHeartbeatFailures;
      this.lastHeartbeatSuccessAt = this.nowFn();
      this.consecutiveHeartbeatFailures = 0;
      this.firstHeartbeatFailureAt = -1;
      this.lastHeartbeatFailureDetail = "";
      if (previousFailures > 0) {
        this.logInfo(
          `[WorkerPals] Heartbeat recovered for ${this.workerId} after ${previousFailures} failed attempt(s).`,
        );
      }
      return true;
    } catch (error) {
      // Only the first failure of a streak records the streak start time.
      if (this.consecutiveHeartbeatFailures === 0) {
        this.firstHeartbeatFailureAt = this.nowFn();
      }
      this.consecutiveHeartbeatFailures += 1;
      this.lastHeartbeatFailureDetail = error instanceof Error ? error.message : String(error);
      const staleAgeMs = this.getHeartbeatStaleAgeMs();
      this.logWarn(
        `[WorkerPals] Heartbeat failure ${this.consecutiveHeartbeatFailures} for ${this.workerId}: ${this.lastHeartbeatFailureDetail} (lastSuccessAgeMs=${Number.isFinite(staleAgeMs) ? staleAgeMs : -1}).`,
      );
      return false;
    } finally {
      this.heartbeatInFlight = false;
    }
  }

  /**
   * Queues a POST of `cmd` to `/sessions/{sessionId}/command`.
   *
   * @param sessionId - Session the command belongs to.
   * @param cmd - Command body, sent as JSON.
   * @param options - `priority` defaults to "normal"; `droppable` defaults to false.
   * @returns A promise that resolves once the request has been attempted (or
   *   dropped); delivery failures are logged, not thrown.
   */
  queueSessionCommand(
    sessionId: string,
    cmd: CommandRequest,
    options: { priority?: QueuePriority; droppable?: boolean } = {},
  ): Promise<void> {
    return this.enqueueTask({
      label: `command:${cmd.type}`,
      priority: options.priority ?? "normal",
      droppable: options.droppable ?? false,
      run: async () => {
        const result = await this.postJson(
          `/sessions/${sessionId}/command`,
          cmd,
          this.requestTimeoutMs,
        );
        if (!result.ok) {
          this.logWarn(
            `[WorkerPals] Command ${cmd.type} failed: ${result.status}${result.detail ? ` ${result.detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Queues a droppable, normal-priority POST of a log line to `/jobs/{jobId}/log`.
   *
   * @param jobId - Job the log line belongs to.
   * @param payload - Log line body, sent as JSON.
   * @returns A promise that resolves once the request has been attempted (or
   *   dropped); delivery failures are logged, not thrown.
   */
  queueJobLog(
    jobId: string,
    payload: { stream: "stdout" | "stderr"; seq: number; message: string; ts: string },
  ): Promise<void> {
    return this.enqueueTask({
      label: "job_log",
      priority: "normal",
      droppable: true,
      run: async () => {
        const result = await this.postJson(`/jobs/${jobId}/log`, payload, this.requestTimeoutMs);
        if (!result.ok) {
          this.logWarn(
            `[WorkerPals] Job log delivery failed for ${jobId}: ${result.status}${result.detail ? ` ${result.detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Waits until the queue is empty and no drain is running, or until
   * `timeoutMs` elapses (in which case a warning is logged).
   *
   * @param timeoutMs - Maximum time to wait, in milliseconds.
   */
  async flush(timeoutMs = 15_000): Promise<void> {
    if (this.queuedRequests.length === 0 && !this.queueDrainInFlight) return;
    await new Promise<void>((resolve) => {
      const waiter = (): void => {
        clearTimeout(timer);
        resolve();
      };
      const timer = setTimeout(() => {
        // Unregister so a timed-out waiter does not linger in the list until
        // the next drain completes.
        const index = this.queueFlushWaiters.indexOf(waiter);
        if (index !== -1) this.queueFlushWaiters.splice(index, 1);
        this.logWarn(
          `[WorkerPals] Timed out flushing queued server transport requests after ${timeoutMs}ms (queued=${this.queuedRequests.length}).`,
        );
        resolve();
      }, timeoutMs);
      this.queueFlushWaiters.push(waiter);
      // The queue may have emptied between the guard above and registration.
      this.maybeResolveFlushWaiters();
    });
  }

  /**
   * Adds a task to the queue and starts draining if no drain is running.
   * Droppable tasks are discarded (and counted) when the queue is saturated.
   */
  private enqueueTask(task: Omit<TransportTask, "resolve">): Promise<void> {
    if (task.droppable && this.queuedRequests.length >= this.maxQueuedRequests) {
      this.droppedLogRequests += 1;
      // Log the first drop and then every 25th to avoid flooding the log.
      if (this.droppedLogRequests === 1 || this.droppedLogRequests % 25 === 0) {
        this.logWarn(
          `[WorkerPals] Dropped ${this.droppedLogRequests} queued low-priority transport request(s) because the queue is saturated (limit=${this.maxQueuedRequests}).`,
        );
      }
      return Promise.resolve();
    }

    return new Promise<void>((resolve) => {
      const queued: TransportTask = { ...task, resolve };
      if (queued.priority === "high") {
        // Insert after any already-queued high-priority tasks, before the first normal one.
        const firstNormalIndex = this.queuedRequests.findIndex((entry) => entry.priority !== "high");
        if (firstNormalIndex === -1) {
          this.queuedRequests.push(queued);
        } else {
          this.queuedRequests.splice(firstNormalIndex, 0, queued);
        }
      } else {
        this.queuedRequests.push(queued);
      }
      void this.drainQueue();
    });
  }

  /**
   * Runs queued tasks one at a time until the queue is empty. A task failure is
   * logged and does not stop the drain; each task's promise resolves either way.
   */
  private async drainQueue(): Promise<void> {
    if (this.queueDrainInFlight) return;
    this.queueDrainInFlight = true;
    try {
      while (this.queuedRequests.length > 0) {
        const task = this.queuedRequests.shift();
        if (!task) break;
        try {
          await task.run();
        } catch (error) {
          this.logWarn(
            `[WorkerPals] Transport request ${task.label} failed: ${
              error instanceof Error ? error.message : String(error)
            }`,
          );
        } finally {
          task.resolve();
        }
      }
    } finally {
      this.queueDrainInFlight = false;
      this.maybeResolveFlushWaiters();
    }
  }

  /** Releases every pending `flush()` caller once the queue is idle. */
  private maybeResolveFlushWaiters(): void {
    if (this.queuedRequests.length > 0 || this.queueDrainInFlight) return;
    const waiters = this.queueFlushWaiters.splice(0);
    for (const waiter of waiters) waiter();
  }

  /**
   * POSTs `payload` as JSON to `server + path`, aborting after `timeoutMs`.
   *
   * For non-OK responses the body is read while the abort timer is still
   * armed, so a stalled error body cannot block the caller indefinitely.
   *
   * @returns The response's `ok` flag and `status`, plus the trimmed body text
   *   for non-OK responses ("" otherwise or when the body could not be read).
   * @throws Error with a "request timed out" message when the request was
   *   aborted by the timer; otherwise rethrows the underlying fetch error.
   */
  private async postJson(
    path: string,
    payload: unknown,
    timeoutMs: number,
  ): Promise<{ ok: boolean; status: number; detail: string }> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort("timeout"), timeoutMs);
    // Call through a local so fetch is not invoked with this transport as `this`.
    const fetchFn = this.fetchFn;
    try {
      const response = await fetchFn(`${this.server}${path}`, {
        method: "POST",
        headers: this.headers,
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
      const detail = response.ok ? "" : await readResponseDetail(response);
      return { ok: response.ok, status: response.status, detail };
    } catch (error) {
      if (controller.signal.aborted) {
        throw new Error(`request timed out after ${timeoutMs}ms (${path})`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }
}
@@ -240,6 +240,7 @@ export class DockerExecutor {
240
240
  private lastLoggedExecutionConfig = "";
241
241
  private lastLoggedEndpointRewrite = "";
242
242
  private warmedBackends = new Set<string>();
243
+ private preparedMergeConflictJobs = new Set<string>();
243
244
  private mergeConflictRefreshPromise: Promise<void> | null = null;
244
245
  private readonly config: WorkerpalsRuntimeConfig;
245
246
 
@@ -318,7 +319,6 @@ export class DockerExecutor {
318
319
  const worktreePath = resolve(this.worktreeDir, worktreeName);
319
320
 
320
321
  try {
321
- await this.ensureFreshImageForMergeConflictJob(job, onLog);
322
322
  const worktreeBaseRef = await this.resolveWorktreeBaseRefForJob(job, onLog);
323
323
  // Step 1: Create isolated git worktree
324
324
  await this.createWorktree(worktreePath, worktreeBaseRef);
@@ -398,6 +398,7 @@ export class DockerExecutor {
398
398
  stderr: `Retries exhausted after ${this.jobRetryMaxAttempts} attempts`,
399
399
  };
400
400
  } finally {
401
+ this.preparedMergeConflictJobs.delete(job.id);
401
402
  this.activeJobs = Math.max(0, this.activeJobs - 1);
402
403
  // Step 4: Clean up worktree (always cleanup)
403
404
  await this.removeWorktree(worktreePath).catch((err) => {
@@ -1690,6 +1691,22 @@ export class DockerExecutor {
1690
1691
  return resolutionType === "merge_conflict";
1691
1692
  }
1692
1693
 
1694
/**
 * Reports whether `job` is a merge-conflict resolution job that has not yet
 * been recorded in `preparedMergeConflictJobs`.
 */
shouldPrepareMergeConflictJobBeforeExecution(job: Job): boolean {
  if (!this.isMergeConflictResolutionJob(job)) {
    return false;
  }
  const alreadyPrepared = this.preparedMergeConflictJobs.has(job.id);
  return !alreadyPrepared;
}
1697
+
1698
/**
 * Runs `ensureFreshImageForMergeConflictJob` for `job` and, once it succeeds,
 * records the job id in `preparedMergeConflictJobs`.
 *
 * @param job - Job to prepare.
 * @param onLog - Optional log sink, forwarded unchanged.
 */
async prepareMergeConflictJobEnvironment(
  job: Job,
  onLog?: (stream: "stdout" | "stderr", line: string) => void,
): Promise<void> {
  // A failed refresh propagates and leaves the job unmarked.
  await this.ensureFreshImageForMergeConflictJob(job, onLog);

  this.preparedMergeConflictJobs.add(job.id);
}
1705
+
1706
/**
 * Suggested deferral, in milliseconds: `options.timeoutMs` capped at five
 * minutes and raised to at least one minute.
 */
recommendedMergeConflictDeferMs(): number {
  const cappedAtFiveMinutes = Math.min(this.options.timeoutMs, 5 * 60_000);
  return Math.max(60_000, cappedAtFiveMinutes);
}
1709
+
1693
1710
  private async ensureFreshImageForMergeConflictJob(
1694
1711
  job: Job,
1695
1712
  onLog?: (stream: "stdout" | "stderr", line: string) => void,
@@ -20,7 +20,6 @@
20
20
  * JobRunner executes a single job, streams logs, and outputs a final result with a sentinel line.
21
21
  */
22
22
 
23
- import type { CommandRequest } from "protocol";
24
23
  import { randomUUID } from "crypto";
25
24
  import { mkdirSync } from "fs";
26
25
  import { resolve } from "path";
@@ -44,6 +43,7 @@ import {
44
43
  } from "./execute_job.js";
45
44
  import { DockerExecutionExhaustedError, DockerExecutor } from "./docker_executor.js";
46
45
  import { forceDeleteWorktreePath } from "./common/worktree_cleanup.js";
46
+ import { WorkerServerTransport, type WorkerHeartbeatPayload } from "./common/server_transport.js";
47
47
  import { DEFAULT_DOCKER_TIMEOUT_MS, parseDockerTimeoutMs } from "./timeout_policy.js";
48
48
 
49
49
  type CommitRef = {
@@ -95,6 +95,31 @@ function estimateTokensFromText(text: string): number {
95
95
  return Math.max(0, Math.ceil(String(text ?? "").length / 3));
96
96
  }
97
97
 
98
/**
 * POSTs `body` as JSON to `url` and aborts the request after `timeoutMs`.
 *
 * @param url - Absolute request URL.
 * @param headers - Headers sent with the request.
 * @param body - Value serialized with `JSON.stringify`.
 * @param timeoutMs - Abort deadline in milliseconds (default 10 s).
 * @returns The `Response` as returned by `fetch`; the status is not inspected.
 * @throws Error with a "request timed out" message when the abort timer fired;
 *   otherwise rethrows whatever `fetch` (or serialization) threw.
 */
async function postJsonWithTimeout(
  url: string,
  headers: Record<string, string>,
  body: unknown,
  timeoutMs = 10_000,
): Promise<Response> {
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => {
    abortController.abort("timeout");
  }, timeoutMs);

  try {
    const response = await fetch(url, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
      signal: abortController.signal,
    });
    return response;
  } catch (failure) {
    // Distinguish our own deadline from any other failure.
    const timedOut = abortController.signal.aborted;
    if (!timedOut) {
      throw failure;
    }
    throw new Error(`request timed out after ${timeoutMs}ms: ${url}`);
  } finally {
    clearTimeout(abortTimer);
  }
}
122
+
98
123
  function buildWorkerLlmUsageEvent(
99
124
  job: {
100
125
  kind: string;
@@ -172,11 +197,7 @@ async function reportWorkerLlmUsage(
172
197
  ): Promise<void> {
173
198
  const payload = buildWorkerLlmUsageEvent(job, result);
174
199
  if (!payload) return;
175
- const response = await fetch(`${server}/telemetry/llm-usage`, {
176
- method: "POST",
177
- headers,
178
- body: JSON.stringify(payload),
179
- });
200
+ const response = await postJsonWithTimeout(`${server}/telemetry/llm-usage`, headers, payload);
180
201
  if (!response.ok) {
181
202
  const detail = await response.text().catch(() => "");
182
203
  throw new Error(
@@ -223,6 +244,16 @@ export function shouldEmitDirectSessionJobEvent(options: {
223
244
  return !options.statusPersistedToServer;
224
245
  }
225
246
 
247
/**
 * Decides whether a worker restart should be requested after a heartbeat attempt.
 *
 * A restart is only considered when the heartbeat was not delivered and
 * recycling is currently allowed; in that case the result is `transportStale`.
 *
 * @param options.heartbeatDelivered - Whether the latest heartbeat succeeded.
 * @param options.allowHeartbeatRecycle - Whether recycling is permitted right now.
 * @param options.transportStale - Whether the transport reports itself as stale.
 */
export function shouldRecycleWorkerForHeartbeatDegradation(options: {
  heartbeatDelivered: boolean;
  allowHeartbeatRecycle: boolean;
  transportStale: boolean;
}): boolean {
  const { heartbeatDelivered, allowHeartbeatRecycle, transportStale } = options;
  const recycleConsidered = !heartbeatDelivered && allowHeartbeatRecycle;
  return recycleConsidered ? transportStale : false;
}
256
+
226
257
  function shouldRecycleWorkerForCodexUnavailableFailure(
227
258
  summary: string,
228
259
  stderr?: string | null,
@@ -853,19 +884,15 @@ async function enqueueCompletion(
853
884
  resultSummary,
854
885
  });
855
886
 
856
- const response = await fetch(`${server}/completions/enqueue`, {
857
- method: "POST",
858
- headers,
859
- body: JSON.stringify({
860
- jobId: job.id,
861
- sessionId: job.sessionId,
862
- commitSha: commit.sha,
863
- branch: commit.branch,
864
- message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
865
- prUrl,
866
- prTitle: pr.title,
867
- prBody: pr.body,
868
- }),
887
+ const response = await postJsonWithTimeout(`${server}/completions/enqueue`, headers, {
888
+ jobId: job.id,
889
+ sessionId: job.sessionId,
890
+ commitSha: commit.sha,
891
+ branch: commit.branch,
892
+ message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
893
+ prUrl,
894
+ prTitle: pr.title,
895
+ prBody: pr.body,
869
896
  });
870
897
 
871
898
  if (response.ok) {
@@ -883,24 +910,6 @@ async function enqueueCompletion(
883
910
  }
884
911
  }
885
912
 
886
- function sendCommand(
887
- server: string,
888
- sessionId: string,
889
- headers: Record<string, string>,
890
- cmd: CommandRequest,
891
- ): Promise<void> {
892
- return fetch(`${server}/sessions/${sessionId}/command`, {
893
- method: "POST",
894
- headers,
895
- body: JSON.stringify(cmd),
896
- })
897
- .then((res) => {
898
- if (!res.ok) console.error(`[WorkerPals] Command ${cmd.type} failed: ${res.status}`);
899
- })
900
- .catch((err) => console.error(`[WorkerPals] Command ${cmd.type} error:`, err));
901
- }
902
-
903
- type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";
904
913
  type WorkerRuntimeState = {
905
914
  currentJobId: string | null;
906
915
  currentSessionId: string | null;
@@ -913,44 +922,11 @@ function buildWorkerHeaders(authToken: string | null): Record<string, string> {
913
922
  return headers;
914
923
  }
915
924
 
916
- async function sendWorkerHeartbeat(
917
- opts: ReturnType<typeof parseArgs>,
918
- headers: Record<string, string>,
919
- status: WorkerHeartbeatStatus,
920
- currentJobId: string | null = null,
921
- ): Promise<void> {
922
- try {
923
- await fetch(`${opts.server}/workers/heartbeat`, {
924
- method: "POST",
925
- headers,
926
- body: JSON.stringify({
927
- workerId: opts.workerId,
928
- status,
929
- currentJobId,
930
- pollMs: opts.pollMs,
931
- capabilities: {
932
- docker: opts.docker,
933
- labels: opts.labels,
934
- executor: resolveExecutor(CONFIG),
935
- requireDocker: opts.requireDocker,
936
- },
937
- details: {
938
- repo: opts.repo,
939
- baseRef: opts.worktreeBaseRef,
940
- dockerImage: opts.docker ? opts.dockerImage : null,
941
- dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
942
- },
943
- }),
944
- });
945
- } catch (err) {
946
- console.error(`[WorkerPals] Heartbeat error:`, err);
947
- }
948
- }
949
-
950
925
  async function failActiveJobOnShutdown(
951
926
  opts: ReturnType<typeof parseArgs>,
952
927
  headers: Record<string, string>,
953
928
  runtimeState: WorkerRuntimeState,
929
+ transport: WorkerServerTransport,
954
930
  signalName: string,
955
931
  ): Promise<void> {
956
932
  const activeJobId = runtimeState.currentJobId;
@@ -961,10 +937,9 @@ async function failActiveJobOnShutdown(
961
937
  let statusPersistedToServer = false;
962
938
 
963
939
  try {
964
- const response = await fetch(`${opts.server}/jobs/${activeJobId}/fail`, {
965
- method: "POST",
966
- headers,
967
- body: JSON.stringify({ message, detail }),
940
+ const response = await postJsonWithTimeout(`${opts.server}/jobs/${activeJobId}/fail`, headers, {
941
+ message,
942
+ detail,
968
943
  });
969
944
  statusPersistedToServer = response.ok;
970
945
  } catch (err) {
@@ -978,7 +953,7 @@ async function failActiveJobOnShutdown(
978
953
  runtimeState.currentSessionId &&
979
954
  shouldEmitDirectSessionJobEvent({ ok: false, statusPersistedToServer })
980
955
  ) {
981
- await sendCommand(opts.server, runtimeState.currentSessionId, headers, {
956
+ await transport.queueSessionCommand(runtimeState.currentSessionId, {
982
957
  type: "job_failed",
983
958
  payload: {
984
959
  jobId: activeJobId,
@@ -986,7 +961,41 @@ async function failActiveJobOnShutdown(
986
961
  detail,
987
962
  },
988
963
  from: `worker:${opts.workerId}`,
964
+ }, { priority: "high" });
965
+ }
966
+ }
967
+
968
/**
 * Asks the server to defer an already-claimed job by POSTing to
 * `/jobs/{jobId}/defer`.
 *
 * Never throws: transport errors, non-OK statuses, and responses whose JSON
 * body lacks a truthy `ok` are all reported as `{ ok: false, message }`.
 *
 * @param opts - Parsed worker options; `server` and `workerId` are used.
 * @param headers - Headers sent with the request.
 * @param jobId - Id of the claimed job to defer.
 * @param deferMs - Requested deferral in milliseconds, sent to the server as-is.
 * @returns `{ ok: true, availableAt }` on success, where `availableAt` is
 *   whatever the server returned (possibly undefined).
 */
async function deferClaimedJobForMaintenance(
  opts: ReturnType<typeof parseArgs>,
  headers: Record<string, string>,
  jobId: string,
  deferMs: number,
): Promise<{ ok: boolean; availableAt?: string; message?: string }> {
  try {
    const response = await postJsonWithTimeout(`${opts.server}/jobs/${jobId}/defer`, headers, {
      workerId: opts.workerId,
      deferMs,
    });
    // An unparseable body is treated as an empty object.
    const payload = (await response.json().catch(() => ({}))) as {
      ok?: boolean;
      availableAt?: string;
      message?: string;
    };
    if (response.ok && payload.ok) {
      return { ok: true, availableAt: payload.availableAt };
    }
    return { ok: false, message: payload.message || `HTTP ${response.status}` };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { ok: false, message };
  }
}
992
1001
 
@@ -994,6 +1003,8 @@ async function workerLoop(
994
1003
  opts: ReturnType<typeof parseArgs>,
995
1004
  dockerExecutor: DockerExecutor | null,
996
1005
  runtimeState: WorkerRuntimeState,
1006
+ transport: WorkerServerTransport,
1007
+ requestWorkerRestart: (reason: string) => void,
997
1008
  ): Promise<void> {
998
1009
  const headers = buildWorkerHeaders(opts.authToken);
999
1010
 
@@ -1007,17 +1018,37 @@ async function workerLoop(
1007
1018
  }
1008
1019
  console.log(`[WorkerPals ${opts.workerId}] Executor backend: ${resolveExecutor(CONFIG)}`);
1009
1020
  const heartbeatEveryMs = Math.max(1000, opts.heartbeatMs);
1021
+ const claimTimeoutMs = Math.max(4_000, Math.min(15_000, opts.pollMs * 3));
1010
1022
  let lastHeartbeatAt = 0;
1023
+ const buildHeartbeatPayload = (
1024
+ status: WorkerHeartbeatPayload["status"],
1025
+ currentJobId: string | null,
1026
+ ): WorkerHeartbeatPayload => ({
1027
+ status,
1028
+ currentJobId,
1029
+ capabilities: {
1030
+ docker: opts.docker,
1031
+ labels: opts.labels,
1032
+ executor: resolveExecutor(CONFIG),
1033
+ requireDocker: opts.requireDocker,
1034
+ },
1035
+ details: {
1036
+ repo: opts.repo,
1037
+ baseRef: opts.worktreeBaseRef,
1038
+ dockerImage: opts.docker ? opts.dockerImage : null,
1039
+ dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
1040
+ },
1041
+ });
1011
1042
 
1012
1043
  const maybeHeartbeat = async (
1013
- status: WorkerHeartbeatStatus,
1044
+ status: WorkerHeartbeatPayload["status"],
1014
1045
  currentJobId: string | null = null,
1015
1046
  force = false,
1016
1047
  ) => {
1017
1048
  const now = Date.now();
1018
1049
  if (!force && now - lastHeartbeatAt < heartbeatEveryMs) return;
1019
- await sendWorkerHeartbeat(opts, headers, status, currentJobId);
1020
- lastHeartbeatAt = now;
1050
+ const ok = await transport.sendHeartbeat(buildHeartbeatPayload(status, currentJobId));
1051
+ if (ok) lastHeartbeatAt = now;
1021
1052
  };
1022
1053
 
1023
1054
  await maybeHeartbeat("idle", null, true);
@@ -1025,37 +1056,128 @@ async function workerLoop(
1025
1056
  while (!runtimeState.shutdownRequested) {
1026
1057
  try {
1027
1058
  await maybeHeartbeat("idle");
1028
- const claimRes = await fetch(`${opts.server}/jobs/claim`, {
1029
- method: "POST",
1059
+ const claimRes = await postJsonWithTimeout(
1060
+ `${opts.server}/jobs/claim`,
1030
1061
  headers,
1031
- body: JSON.stringify({ workerId: opts.workerId }),
1032
- });
1062
+ { workerId: opts.workerId },
1063
+ claimTimeoutMs,
1064
+ );
1033
1065
 
1034
1066
  if (claimRes.ok) {
1035
1067
  const data = (await claimRes.json()) as any;
1036
1068
  const job = data.job;
1037
1069
 
1038
1070
  if (job) {
1071
+ if (
1072
+ dockerExecutor &&
1073
+ dockerExecutor.shouldPrepareMergeConflictJobBeforeExecution(job)
1074
+ ) {
1075
+ const deferMs = dockerExecutor.recommendedMergeConflictDeferMs();
1076
+ const deferred = await deferClaimedJobForMaintenance(opts, headers, job.id, deferMs);
1077
+ if (!deferred.ok) {
1078
+ console.warn(
1079
+ `[WorkerPals] Failed to defer merge-conflict job ${job.id} for image refresh; falling back to claimed execution path: ${
1080
+ deferred.message || "unknown error"
1081
+ }`,
1082
+ );
1083
+ } else {
1084
+ console.log(
1085
+ `[WorkerPals] Deferred merge-conflict job ${job.id} until ${
1086
+ deferred.availableAt ?? "maintenance complete"
1087
+ } while refreshing Docker image outside claimed-job lifetime.`,
1088
+ );
1089
+ const maintenanceHeartbeat = setInterval(() => {
1090
+ void transport.sendHeartbeat({
1091
+ ...buildHeartbeatPayload("idle", null),
1092
+ details: {
1093
+ repo: opts.repo,
1094
+ baseRef: opts.worktreeBaseRef,
1095
+ dockerImage: opts.docker ? opts.dockerImage : null,
1096
+ dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
1097
+ maintenance: "merge_conflict_image_refresh",
1098
+ deferredJobId: job.id,
1099
+ },
1100
+ });
1101
+ }, heartbeatEveryMs);
1102
+ try {
1103
+ await maybeHeartbeat("idle", null, true);
1104
+ await dockerExecutor.prepareMergeConflictJobEnvironment(job);
1105
+ } catch (error) {
1106
+ const detail = redactSensitiveText(
1107
+ error instanceof Error ? error.stack || error.message : String(error),
1108
+ );
1109
+ console.error(
1110
+ `[WorkerPals] Merge-conflict environment preparation failed for ${job.id}: ${detail}`,
1111
+ );
1112
+ try {
1113
+ const failResponse = await postJsonWithTimeout(
1114
+ `${opts.server}/jobs/${job.id}/fail-deferred`,
1115
+ headers,
1116
+ {
1117
+ workerId: opts.workerId,
1118
+ message: "Merge-conflict environment preparation failed",
1119
+ detail,
1120
+ },
1121
+ );
1122
+ const failPayload = (await failResponse.json().catch(() => ({}))) as {
1123
+ ok?: boolean;
1124
+ message?: string;
1125
+ };
1126
+ if (!failResponse.ok || !failPayload.ok) {
1127
+ console.error(
1128
+ `[WorkerPals] Failed to mark deferred job ${job.id} as failed: ${
1129
+ failPayload.message || `HTTP ${failResponse.status}`
1130
+ }`,
1131
+ );
1132
+ }
1133
+ } catch (failErr) {
1134
+ console.error(
1135
+ `[WorkerPals] Failed to mark deferred job ${job.id} as failed: ${
1136
+ failErr instanceof Error ? failErr.message : String(failErr)
1137
+ }`,
1138
+ );
1139
+ }
1140
+ } finally {
1141
+ clearInterval(maintenanceHeartbeat);
1142
+ }
1143
+ await maybeHeartbeat("idle", null, true);
1144
+ continue;
1145
+ }
1146
+ }
1147
+
1039
1148
  runtimeState.currentJobId = job.id;
1040
1149
  runtimeState.currentSessionId = job.sessionId ?? null;
1041
1150
  console.log(`[WorkerPals] Claimed job ${job.id} (${job.kind})`);
1042
1151
  await maybeHeartbeat("busy", job.id, true);
1152
+ let allowHeartbeatRecycle = true;
1043
1153
 
1044
1154
  const busyHeartbeat = setInterval(() => {
1045
- void sendWorkerHeartbeat(opts, headers, "busy", job.id);
1155
+ void transport.sendHeartbeat(buildHeartbeatPayload("busy", job.id)).then((ok) => {
1156
+ if (
1157
+ !shouldRecycleWorkerForHeartbeatDegradation({
1158
+ heartbeatDelivered: ok,
1159
+ allowHeartbeatRecycle,
1160
+ transportStale: transport.shouldRecycleBusyWorker(),
1161
+ })
1162
+ ) {
1163
+ return;
1164
+ }
1165
+ requestWorkerRestart(
1166
+ `heartbeat transport stale while claimed job ${job.id} is still running`,
1167
+ );
1168
+ });
1046
1169
  }, heartbeatEveryMs);
1047
1170
 
1048
1171
  if (job.sessionId) {
1049
- await sendCommand(opts.server, job.sessionId, headers, {
1172
+ await transport.queueSessionCommand(job.sessionId, {
1050
1173
  type: "job_claimed",
1051
1174
  payload: { jobId: job.id, workerId: opts.workerId },
1052
1175
  from: `worker:${opts.workerId}`,
1053
- });
1176
+ }, { priority: "high" });
1054
1177
  }
1055
1178
 
1056
1179
  let stdoutSeq = 0;
1057
1180
  let stderrSeq = 0;
1058
- let logChain: Promise<void> = Promise.resolve();
1059
1181
  let lastCleanLog = "";
1060
1182
  let lastCleanLogAt = 0;
1061
1183
 
@@ -1077,20 +1199,17 @@ async function workerLoop(
1077
1199
  const logTs = new Date(now).toISOString();
1078
1200
 
1079
1201
  const seq = stream === "stdout" ? ++stdoutSeq : ++stderrSeq;
1080
- logChain = logChain.then(() =>
1081
- Promise.allSettled([
1082
- sendCommand(opts.server, job.sessionId, headers, {
1083
- type: "job_log",
1084
- payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
1085
- from: `worker:${opts.workerId}`,
1086
- }),
1087
- fetch(`${opts.server}/jobs/${job.id}/log`, {
1088
- method: "POST",
1089
- headers,
1090
- body: JSON.stringify({ stream, seq, message: cleaned, ts: logTs }),
1091
- }),
1092
- ]).then(() => undefined),
1093
- );
1202
+ void transport.queueSessionCommand(job.sessionId, {
1203
+ type: "job_log",
1204
+ payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
1205
+ from: `worker:${opts.workerId}`,
1206
+ }, { droppable: true });
1207
+ void transport.queueJobLog(job.id, {
1208
+ stream,
1209
+ seq,
1210
+ message: cleaned,
1211
+ ts: logTs,
1212
+ });
1094
1213
  }
1095
1214
  : undefined;
1096
1215
 
@@ -1153,7 +1272,8 @@ async function workerLoop(
1153
1272
  }
1154
1273
  const jobDurationMs = Math.max(0, Date.now() - jobStartedAtMs);
1155
1274
 
1156
- await logChain;
1275
+ allowHeartbeatRecycle = false;
1276
+ await transport.flush();
1157
1277
  try {
1158
1278
  await reportWorkerLlmUsage(opts.server, headers, jobData, result);
1159
1279
  } catch (err) {
@@ -1267,10 +1387,10 @@ async function workerLoop(
1267
1387
  reviewAgent.prUrl.trim().length > 0
1268
1388
  ? reviewAgent.prUrl.trim()
1269
1389
  : null;
1270
- const response = await fetch(`${opts.server}/jobs/${job.id}/complete`, {
1271
- method: "POST",
1390
+ const response = await postJsonWithTimeout(
1391
+ `${opts.server}/jobs/${job.id}/complete`,
1272
1392
  headers,
1273
- body: JSON.stringify({
1393
+ {
1274
1394
  summary: result.summary,
1275
1395
  durationMs: jobDurationMs,
1276
1396
  prUrl: jobPrUrl,
@@ -1278,21 +1398,17 @@ async function workerLoop(
1278
1398
  ...(result.stdout ? [{ kind: "stdout", text: result.stdout }] : []),
1279
1399
  ...(result.stderr ? [{ kind: "stderr", text: result.stderr }] : []),
1280
1400
  ],
1281
- }),
1282
- });
1401
+ },
1402
+ );
1283
1403
  statusPersistedToServer = response.ok;
1284
1404
  console.log(
1285
1405
  `[WorkerPals] Job ${job.id} completed in ${formatDurationMs(jobDurationMs)}: ${result.summary}`,
1286
1406
  );
1287
1407
  } else {
1288
- const response = await fetch(`${opts.server}/jobs/${job.id}/fail`, {
1289
- method: "POST",
1290
- headers,
1291
- body: JSON.stringify({
1292
- message: result.summary,
1293
- detail: redactSensitiveText(result.stderr ?? ""),
1294
- durationMs: jobDurationMs,
1295
- }),
1408
+ const response = await postJsonWithTimeout(`${opts.server}/jobs/${job.id}/fail`, headers, {
1409
+ message: result.summary,
1410
+ detail: redactSensitiveText(result.stderr ?? ""),
1411
+ durationMs: jobDurationMs,
1296
1412
  });
1297
1413
  statusPersistedToServer = response.ok;
1298
1414
  console.log(
@@ -1327,11 +1443,11 @@ async function workerLoop(
1327
1443
  ? `${rawText.slice(0, maxResponseChars - 3)}...`
1328
1444
  : rawText;
1329
1445
  if (assistantText) {
1330
- await sendCommand(opts.server, job.sessionId, headers, {
1446
+ await transport.queueSessionCommand(job.sessionId, {
1331
1447
  type: "assistant_message",
1332
1448
  payload: { text: assistantText },
1333
1449
  from: `worker:${opts.workerId}`,
1334
- });
1450
+ }, { priority: "high" });
1335
1451
  }
1336
1452
  }
1337
1453
 
@@ -1358,7 +1474,9 @@ async function workerLoop(
1358
1474
  from: `worker:${opts.workerId}`,
1359
1475
  };
1360
1476
 
1361
- await sendCommand(opts.server, job.sessionId, headers, eventCmd);
1477
+ await transport.queueSessionCommand(job.sessionId, eventCmd, {
1478
+ priority: "high",
1479
+ });
1362
1480
  }
1363
1481
  }
1364
1482
  } finally {
@@ -1369,13 +1487,13 @@ async function workerLoop(
1369
1487
  result?.cooldownMs &&
1370
1488
  result.cooldownMs > 0
1371
1489
  ) {
1372
- await sendCommand(opts.server, job.sessionId, headers, {
1490
+ await transport.queueSessionCommand(job.sessionId, {
1373
1491
  type: "assistant_message",
1374
1492
  payload: {
1375
1493
  text: `WorkerPal is cooling down for ${formatDurationMs(result.cooldownMs)} after transient infrastructure failures.`,
1376
1494
  },
1377
1495
  from: `worker:${opts.workerId}`,
1378
- });
1496
+ }, { priority: "high" });
1379
1497
  }
1380
1498
  if (!recycleWorkerAfterJob && result?.cooldownMs && result.cooldownMs > 0) {
1381
1499
  const cooldownMs = Math.max(0, Math.floor(result.cooldownMs));
@@ -1501,6 +1619,14 @@ async function main(): Promise<void> {
1501
1619
  shutdownRequested: false,
1502
1620
  };
1503
1621
  const headers = buildWorkerHeaders(opts.authToken);
1622
+ const transport = new WorkerServerTransport({
1623
+ server: opts.server,
1624
+ headers,
1625
+ workerId: opts.workerId,
1626
+ pollMs: opts.pollMs,
1627
+ heartbeatMs: opts.heartbeatMs,
1628
+ staleClaimTtlMs: CONFIG.server.staleClaimTtlMs,
1629
+ });
1504
1630
  let shutdownTriggered = false;
1505
1631
  const shutdownAndExit = (signalName: string, code: number) => {
1506
1632
  if (shutdownTriggered) return;
@@ -1517,9 +1643,25 @@ async function main(): Promise<void> {
1517
1643
 
1518
1644
  void (async () => {
1519
1645
  await withTimeout(
1520
- sendWorkerHeartbeat(opts, headers, "offline", runtimeState.currentJobId ?? null),
1646
+ transport.sendHeartbeat({
1647
+ status: "offline",
1648
+ currentJobId: runtimeState.currentJobId ?? null,
1649
+ capabilities: {
1650
+ docker: opts.docker,
1651
+ labels: opts.labels,
1652
+ executor: resolveExecutor(CONFIG),
1653
+ requireDocker: opts.requireDocker,
1654
+ },
1655
+ details: {
1656
+ repo: opts.repo,
1657
+ baseRef: opts.worktreeBaseRef,
1658
+ dockerImage: opts.docker ? opts.dockerImage : null,
1659
+ dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
1660
+ },
1661
+ }),
1521
1662
  );
1522
- await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, signalName));
1663
+ await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, transport, signalName));
1664
+ await withTimeout(transport.flush());
1523
1665
  if (dockerExecutor) {
1524
1666
  await withTimeout(
1525
1667
  dockerExecutor.shutdown().catch((err) => {
@@ -1548,7 +1690,13 @@ async function main(): Promise<void> {
1548
1690
  }
1549
1691
  });
1550
1692
 
1551
- workerLoop(opts, dockerExecutor, runtimeState).catch((err) => {
1693
+ const requestWorkerRestart = (reason: string) => {
1694
+ if (shutdownTriggered) return;
1695
+ console.error(`[WorkerPals] Control plane unhealthy: ${reason}. Recycling worker.`);
1696
+ shutdownAndExit("CONTROL_PLANE_UNHEALTHY", 91);
1697
+ };
1698
+
1699
+ workerLoop(opts, dockerExecutor, runtimeState, transport, requestWorkerRestart).catch((err) => {
1552
1700
  console.error("[WorkerPals] Fatal:", err);
1553
1701
  process.exit(1);
1554
1702
  });