@pushpalsdev/cli 1.0.38 → 1.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/pushpals-cli.js
CHANGED
|
@@ -122,6 +122,7 @@ class ServiceManager {
|
|
|
122
122
|
onServiceDegraded;
|
|
123
123
|
onEvent;
|
|
124
124
|
timer;
|
|
125
|
+
shutdownBegun = false;
|
|
125
126
|
stopped = false;
|
|
126
127
|
constructor(options = {}) {
|
|
127
128
|
this.pollMs = Math.max(50, Math.floor(options.pollMs ?? DEFAULT_SERVICE_MANAGER_POLL_MS));
|
|
@@ -161,10 +162,10 @@ class ServiceManager {
|
|
|
161
162
|
action: this.degradedAction
|
|
162
163
|
};
|
|
163
164
|
}
|
|
164
|
-
|
|
165
|
-
if (this.stopped)
|
|
165
|
+
beginShutdown() {
|
|
166
|
+
if (this.shutdownBegun || this.stopped)
|
|
166
167
|
return;
|
|
167
|
-
this.
|
|
168
|
+
this.shutdownBegun = true;
|
|
168
169
|
clearInterval(this.timer);
|
|
169
170
|
for (const state of this.stateByService.values()) {
|
|
170
171
|
if (!state.pendingRestartTimer)
|
|
@@ -172,6 +173,12 @@ class ServiceManager {
|
|
|
172
173
|
clearTimeout(state.pendingRestartTimer);
|
|
173
174
|
state.pendingRestartTimer = null;
|
|
174
175
|
}
|
|
176
|
+
}
|
|
177
|
+
stop() {
|
|
178
|
+
if (this.stopped)
|
|
179
|
+
return;
|
|
180
|
+
this.beginShutdown();
|
|
181
|
+
this.stopped = true;
|
|
175
182
|
for (const service of this.services.values()) {
|
|
176
183
|
try {
|
|
177
184
|
const pid = service.proc.pid;
|
|
@@ -207,7 +214,7 @@ class ServiceManager {
|
|
|
207
214
|
this.onEvent?.(level, line);
|
|
208
215
|
}
|
|
209
216
|
tick() {
|
|
210
|
-
if (this.stopped)
|
|
217
|
+
if (this.shutdownBegun || this.stopped)
|
|
211
218
|
return;
|
|
212
219
|
const now = Date.now();
|
|
213
220
|
for (const [name, service] of this.services.entries()) {
|
|
@@ -250,7 +257,7 @@ class ServiceManager {
|
|
|
250
257
|
state.pendingRestartTimer = setTimeout(() => {
|
|
251
258
|
state.pendingRestartTimer = null;
|
|
252
259
|
state.nextRestartAtMs = 0;
|
|
253
|
-
if (this.stopped)
|
|
260
|
+
if (this.shutdownBegun || this.stopped)
|
|
254
261
|
return;
|
|
255
262
|
const current = this.services.get(name);
|
|
256
263
|
if (!current || !current.exited)
|
|
@@ -4792,6 +4799,7 @@ async function main() {
|
|
|
4792
4799
|
return;
|
|
4793
4800
|
const serviceManager = autoStartedServiceManager;
|
|
4794
4801
|
autoStartedServiceManager = null;
|
|
4802
|
+
serviceManager.beginShutdown();
|
|
4795
4803
|
const shutdown = await requestLocalRuntimeShutdown(serverUrl, repoRoot, reason);
|
|
4796
4804
|
if (shutdown.attempted && shutdown.accepted) {
|
|
4797
4805
|
console.log("[pushpals] Local runtime shutdown accepted; waiting for services to exit...");
|
package/package.json
CHANGED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
import type { CommandRequest } from "protocol";
|
|
2
|
+
|
|
3
|
+
/** Status a worker reports about itself in a heartbeat. */
export type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";

/** Queue lane of an outbound request; "high" entries are inserted ahead of "normal" ones. */
type QueuePriority = "high" | "normal";

/** One queued outbound request plus the callback that settles the caller's promise. */
type TransportTask = {
  /** Short name used in the failure log line when `run` throws. */
  label: string;
  priority: QueuePriority;
  /** When true, the request is discarded instead of queued once the queue is full. */
  droppable: boolean;
  /** Performs the request. */
  run: () => Promise<void>;
  /** Called once the request has been attempted, whether it succeeded or failed. */
  resolve: () => void;
};

/** Construction options for `WorkerServerTransport`. */
export type WorkerServerTransportOptions = {
  /** Base URL; request paths are appended to it verbatim. */
  server: string;
  /** Headers sent with every POST. */
  headers: Record<string, string>;
  /** Identifier sent in heartbeat bodies and used in log lines. */
  workerId: string;
  /** Poll interval, reported to the server in heartbeat bodies. */
  pollMs: number;
  /** Heartbeat interval; the heartbeat and request timeouts are derived from it. */
  heartbeatMs: number;
  /** Upper bound used when deciding whether a busy worker should be recycled. */
  staleClaimTtlMs: number;
  /** Fetch implementation; defaults to the global `fetch`. */
  fetchFn?: typeof fetch;
  /** Info logger; defaults to `console.log`. */
  logInfo?: (message: string) => void;
  /** Warning logger; defaults to `console.warn`. */
  logWarn?: (message: string) => void;
  /** Clock; defaults to `Date.now`. */
  nowFn?: () => number;
};

/** Point-in-time view of the transport's heartbeat and queue counters. */
export type WorkerTransportHealthSnapshot = {
  heartbeatInFlight: boolean;
  consecutiveHeartbeatFailures: number;
  /** Clock value of the latest heartbeat attempt; 0 before the first attempt. */
  lastHeartbeatAttemptAt: number;
  /** Clock value of the latest successful heartbeat; 0 before the first success. */
  lastHeartbeatSuccessAt: number;
  /** Number of requests currently waiting in the queue. */
  queuedRequests: number;
  /** Number of droppable requests discarded because the queue was full. */
  droppedLogRequests: number;
};

/** Caller-supplied part of a heartbeat body. */
export type WorkerHeartbeatPayload = {
  status: WorkerHeartbeatStatus;
  currentJobId: string | null;
  /** Sent as `{}` when omitted. */
  capabilities?: Record<string, unknown>;
  /** Sent as `{}` when omitted. */
  details?: Record<string, unknown>;
};
|
|
43
|
+
|
|
44
|
+
/**
 * Derives the HTTP timeout for a single heartbeat request.
 *
 * @param heartbeatMs Heartbeat interval in milliseconds.
 * @returns 80% of the interval (floored), clamped to the range 1500–4000 ms.
 */
function computeHeartbeatTimeoutMs(heartbeatMs: number): number {
  const scaled = Math.floor(heartbeatMs * 0.8);
  const capped = Math.min(4_000, scaled);
  return Math.max(1_500, capped);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Derives the HTTP timeout for queued (non-heartbeat) requests.
 *
 * @param heartbeatMs Heartbeat interval in milliseconds.
 * @returns Twice the interval (floored), clamped to the range 4000–10000 ms.
 */
function computeRequestTimeoutMs(heartbeatMs: number): number {
  const doubled = Math.floor(heartbeatMs * 2);
  const capped = Math.min(10_000, doubled);
  return Math.max(4_000, capped);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Reads a response body as trimmed text for use in log and error messages.
 *
 * @param response Response whose body should be read.
 * @returns The trimmed body, or an empty string when reading the body rejects.
 */
async function readResponseDetail(response: Response): Promise<string> {
  const raw = await response.text().then(
    (body) => body,
    () => "",
  );
  return raw.trim();
}
|
|
56
|
+
|
|
57
|
+
/**
 * HTTP transport between a worker and the server.
 *
 * Heartbeats are posted directly (at most one in flight at a time). Session
 * commands and job logs go through an in-memory queue that is drained one
 * request at a time; "high" priority entries are placed ahead of "normal" ones.
 * Every request is a JSON POST guarded by an abort timeout.
 */
export class WorkerServerTransport {
  private readonly server: string;
  private readonly headers: Record<string, string>;
  private readonly workerId: string;
  private readonly pollMs: number;
  private readonly staleClaimTtlMs: number;
  private readonly fetchFn: typeof fetch;
  private readonly logInfo: (message: string) => void;
  private readonly logWarn: (message: string) => void;
  private readonly nowFn: () => number;
  private readonly heartbeatTimeoutMs: number;
  private readonly requestTimeoutMs: number;
  // Queue length at which droppable requests start being discarded.
  private readonly maxQueuedRequests = 256;

  private queuedRequests: TransportTask[] = [];
  private queueDrainInFlight = false;
  private queueFlushWaiters: Array<() => void> = [];
  private droppedLogRequests = 0;
  private heartbeatInFlight = false;
  private lastHeartbeatAttemptAt = 0;
  private lastHeartbeatSuccessAt = 0;
  private consecutiveHeartbeatFailures = 0;
  // -1 means "no failure streak in progress".
  private firstHeartbeatFailureAt = -1;
  private lastHeartbeatFailureDetail = "";

  /**
   * @param options Connection settings plus optional fetch/log/clock overrides;
   *   omitted overrides fall back to the global `fetch`, `console` and `Date.now`.
   */
  constructor(options: WorkerServerTransportOptions) {
    this.server = options.server;
    this.headers = options.headers;
    this.workerId = options.workerId;
    this.pollMs = options.pollMs;
    this.staleClaimTtlMs = options.staleClaimTtlMs;
    this.fetchFn = options.fetchFn ?? fetch;
    this.logInfo = options.logInfo ?? ((message) => console.log(message));
    this.logWarn = options.logWarn ?? ((message) => console.warn(message));
    this.nowFn = options.nowFn ?? (() => Date.now());
    this.heartbeatTimeoutMs = computeHeartbeatTimeoutMs(options.heartbeatMs);
    this.requestTimeoutMs = computeRequestTimeoutMs(options.heartbeatMs);
  }

  /** Returns the current heartbeat and queue counters. */
  getHealthSnapshot(): WorkerTransportHealthSnapshot {
    return {
      heartbeatInFlight: this.heartbeatInFlight,
      consecutiveHeartbeatFailures: this.consecutiveHeartbeatFailures,
      lastHeartbeatAttemptAt: this.lastHeartbeatAttemptAt,
      lastHeartbeatSuccessAt: this.lastHeartbeatSuccessAt,
      queuedRequests: this.queuedRequests.length,
      droppedLogRequests: this.droppedLogRequests,
    };
  }

  /**
   * Milliseconds since the last successful heartbeat.
   *
   * @param nowMs Clock value to measure against; defaults to the transport clock.
   * @returns The non-negative age, or `Infinity` when no heartbeat has succeeded yet.
   */
  getHeartbeatStaleAgeMs(nowMs = this.nowFn()): number {
    if (this.lastHeartbeatSuccessAt <= 0) return Number.POSITIVE_INFINITY;
    return Math.max(0, nowMs - this.lastHeartbeatSuccessAt);
  }

  /**
   * Reports whether the current heartbeat failure streak has lasted long enough
   * that a busy worker should be recycled.
   *
   * The threshold is the smaller of `staleClaimTtlMs - heartbeatTimeoutMs` and 75% of
   * `staleClaimTtlMs`, raised to at least 30 s and capped at `staleClaimTtlMs`.
   *
   * @param nowMs Clock value to measure against; defaults to the transport clock.
   * @returns `false` when no failure streak is in progress.
   */
  shouldRecycleBusyWorker(nowMs = this.nowFn()): boolean {
    const failureAgeMs =
      this.firstHeartbeatFailureAt >= 0
        ? Math.max(0, nowMs - this.firstHeartbeatFailureAt)
        : null;
    if (failureAgeMs == null) return false;
    const threshold = Math.min(
      this.staleClaimTtlMs,
      Math.max(
        30_000,
        Math.min(
          this.staleClaimTtlMs - this.heartbeatTimeoutMs,
          Math.floor(this.staleClaimTtlMs * 0.75),
        ),
      ),
    );
    return failureAgeMs >= threshold;
  }

  /**
   * Posts one heartbeat to `/workers/heartbeat`.
   *
   * Never throws: transport errors and non-2xx responses are logged and counted
   * as a failure.
   *
   * @param payload Status, current job and optional capabilities/details.
   * @returns `true` when the server accepted the heartbeat; `false` on failure or
   *   when another heartbeat is still in flight (in which case nothing is sent).
   */
  async sendHeartbeat(payload: WorkerHeartbeatPayload): Promise<boolean> {
    if (this.heartbeatInFlight) {
      return false;
    }
    this.heartbeatInFlight = true;
    this.lastHeartbeatAttemptAt = this.nowFn();
    try {
      const response = await this.postJson("/workers/heartbeat", {
        workerId: this.workerId,
        status: payload.status,
        currentJobId: payload.currentJobId,
        pollMs: this.pollMs,
        capabilities: payload.capabilities ?? {},
        details: payload.details ?? {},
      }, this.heartbeatTimeoutMs);
      if (!response.ok) {
        const detail = await readResponseDetail(response);
        throw new Error(
          `heartbeat rejected (${response.status})${detail ? `: ${detail}` : ""}`,
        );
      }
      const previousFailures = this.consecutiveHeartbeatFailures;
      this.lastHeartbeatSuccessAt = this.nowFn();
      this.consecutiveHeartbeatFailures = 0;
      this.firstHeartbeatFailureAt = -1;
      this.lastHeartbeatFailureDetail = "";
      if (previousFailures > 0) {
        this.logInfo(
          `[WorkerPals] Heartbeat recovered for ${this.workerId} after ${previousFailures} failed attempt(s).`,
        );
      }
      return true;
    } catch (error) {
      // Remember when the streak began so shouldRecycleBusyWorker can age it.
      if (this.consecutiveHeartbeatFailures === 0) {
        this.firstHeartbeatFailureAt = this.nowFn();
      }
      this.consecutiveHeartbeatFailures += 1;
      this.lastHeartbeatFailureDetail = error instanceof Error ? error.message : String(error);
      const staleAgeMs = this.getHeartbeatStaleAgeMs();
      this.logWarn(
        `[WorkerPals] Heartbeat failure ${this.consecutiveHeartbeatFailures} for ${this.workerId}: ${this.lastHeartbeatFailureDetail} (lastSuccessAgeMs=${Number.isFinite(staleAgeMs) ? staleAgeMs : -1}).`,
      );
      return false;
    } finally {
      this.heartbeatInFlight = false;
    }
  }

  /**
   * Queues a POST of `cmd` to `/sessions/{sessionId}/command`.
   *
   * @param sessionId Target session.
   * @param cmd Command body.
   * @param options Queue lane (default "normal") and whether the request may be
   *   dropped when the queue is full (default `false`).
   * @returns A promise that resolves once the request has been attempted or
   *   dropped; delivery failures are logged, not thrown.
   */
  queueSessionCommand(
    sessionId: string,
    cmd: CommandRequest,
    options: { priority?: QueuePriority; droppable?: boolean } = {},
  ): Promise<void> {
    return this.enqueueTask({
      label: `command:${cmd.type}`,
      priority: options.priority ?? "normal",
      droppable: options.droppable ?? false,
      run: async () => {
        const response = await this.postJson(
          `/sessions/${sessionId}/command`,
          cmd,
          this.requestTimeoutMs,
        );
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Command ${cmd.type} failed: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Queues a droppable, normal-priority POST of a log line to `/jobs/{jobId}/log`.
   *
   * @param jobId Job the log line belongs to.
   * @param payload Log line body.
   * @returns A promise that resolves once the request has been attempted or
   *   dropped; delivery failures are logged, not thrown.
   */
  queueJobLog(
    jobId: string,
    payload: { stream: "stdout" | "stderr"; seq: number; message: string; ts: string },
  ): Promise<void> {
    return this.enqueueTask({
      label: "job_log",
      priority: "normal",
      droppable: true,
      run: async () => {
        const response = await this.postJson(`/jobs/${jobId}/log`, payload, this.requestTimeoutMs);
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Job log delivery failed for ${jobId}: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
        }
      },
    });
  }

  /**
   * Waits until the queue is empty and no drain is running, or until the
   * timeout elapses (a warning is logged in that case). Never rejects.
   *
   * @param timeoutMs Maximum time to wait, in milliseconds.
   */
  async flush(timeoutMs = 15_000): Promise<void> {
    if (this.queuedRequests.length === 0 && !this.queueDrainInFlight) return;
    await new Promise<void>((resolve) => {
      let settled = false;
      const waiter = (): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        resolve();
      };
      const timer = setTimeout(() => {
        if (settled) return;
        settled = true;
        // Unregister the waiter so timed-out flushes do not accumulate dead
        // callbacks while the queue stays busy.
        const index = this.queueFlushWaiters.indexOf(waiter);
        if (index !== -1) this.queueFlushWaiters.splice(index, 1);
        this.logWarn(
          `[WorkerPals] Timed out flushing queued server transport requests after ${timeoutMs}ms (queued=${this.queuedRequests.length}).`,
        );
        resolve();
      }, timeoutMs);
      this.queueFlushWaiters.push(waiter);
      this.maybeResolveFlushWaiters();
    });
  }

  /**
   * Adds a task to the queue and starts draining if no drain is running.
   * A droppable task is discarded (and counted) when the queue is already at
   * its limit; its promise resolves immediately.
   */
  private enqueueTask(task: Omit<TransportTask, "resolve">): Promise<void> {
    if (task.droppable && this.queuedRequests.length >= this.maxQueuedRequests) {
      this.droppedLogRequests += 1;
      // Log the first drop and then every 25th to avoid flooding the log.
      if (this.droppedLogRequests === 1 || this.droppedLogRequests % 25 === 0) {
        this.logWarn(
          `[WorkerPals] Dropped ${this.droppedLogRequests} queued low-priority transport request(s) because the queue is saturated (limit=${this.maxQueuedRequests}).`,
        );
      }
      return Promise.resolve();
    }

    return new Promise((resolve) => {
      const queued: TransportTask = { ...task, resolve };
      if (queued.priority === "high") {
        // Insert after the existing high-priority entries, before the first normal one.
        const firstNormalIndex = this.queuedRequests.findIndex((entry) => entry.priority !== "high");
        if (firstNormalIndex === -1) {
          this.queuedRequests.push(queued);
        } else {
          this.queuedRequests.splice(firstNormalIndex, 0, queued);
        }
      } else {
        this.queuedRequests.push(queued);
      }
      void this.drainQueue();
    });
  }

  /**
   * Runs queued tasks one after another until the queue is empty. A failing
   * task is logged and does not stop the drain; each task's promise is resolved
   * whether it succeeded or failed.
   */
  private async drainQueue(): Promise<void> {
    if (this.queueDrainInFlight) return;
    this.queueDrainInFlight = true;
    try {
      while (this.queuedRequests.length > 0) {
        const task = this.queuedRequests.shift();
        if (!task) break;
        try {
          await task.run();
        } catch (error) {
          this.logWarn(
            `[WorkerPals] Transport request ${task.label} failed: ${
              error instanceof Error ? error.message : String(error)
            }`,
          );
        } finally {
          task.resolve();
        }
      }
    } finally {
      this.queueDrainInFlight = false;
      this.maybeResolveFlushWaiters();
    }
  }

  /** Releases all pending flush() callers once the queue is empty and idle. */
  private maybeResolveFlushWaiters(): void {
    if (this.queuedRequests.length > 0 || this.queueDrainInFlight) return;
    const waiters = this.queueFlushWaiters.splice(0);
    for (const waiter of waiters) waiter();
  }

  /**
   * POSTs `payload` as JSON to `server + path`, aborting after `timeoutMs`.
   *
   * @throws Error with a "request timed out" message when the abort fired;
   *   otherwise rethrows whatever the fetch implementation threw.
   */
  private async postJson(path: string, payload: unknown, timeoutMs: number): Promise<Response> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort("timeout"), timeoutMs);
    try {
      return await this.fetchFn(`${this.server}${path}`, {
        method: "POST",
        headers: this.headers,
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
    } catch (error) {
      if (controller.signal.aborted) {
        throw new Error(`request timed out after ${timeoutMs}ms (${path})`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }
}
|
|
@@ -240,6 +240,7 @@ export class DockerExecutor {
|
|
|
240
240
|
private lastLoggedExecutionConfig = "";
|
|
241
241
|
private lastLoggedEndpointRewrite = "";
|
|
242
242
|
private warmedBackends = new Set<string>();
|
|
243
|
+
private preparedMergeConflictJobs = new Set<string>();
|
|
243
244
|
private mergeConflictRefreshPromise: Promise<void> | null = null;
|
|
244
245
|
private readonly config: WorkerpalsRuntimeConfig;
|
|
245
246
|
|
|
@@ -318,7 +319,6 @@ export class DockerExecutor {
|
|
|
318
319
|
const worktreePath = resolve(this.worktreeDir, worktreeName);
|
|
319
320
|
|
|
320
321
|
try {
|
|
321
|
-
await this.ensureFreshImageForMergeConflictJob(job, onLog);
|
|
322
322
|
const worktreeBaseRef = await this.resolveWorktreeBaseRefForJob(job, onLog);
|
|
323
323
|
// Step 1: Create isolated git worktree
|
|
324
324
|
await this.createWorktree(worktreePath, worktreeBaseRef);
|
|
@@ -398,6 +398,7 @@ export class DockerExecutor {
|
|
|
398
398
|
stderr: `Retries exhausted after ${this.jobRetryMaxAttempts} attempts`,
|
|
399
399
|
};
|
|
400
400
|
} finally {
|
|
401
|
+
this.preparedMergeConflictJobs.delete(job.id);
|
|
401
402
|
this.activeJobs = Math.max(0, this.activeJobs - 1);
|
|
402
403
|
// Step 4: Clean up worktree (always cleanup)
|
|
403
404
|
await this.removeWorktree(worktreePath).catch((err) => {
|
|
@@ -1690,6 +1691,22 @@ export class DockerExecutor {
|
|
|
1690
1691
|
return resolutionType === "merge_conflict";
|
|
1691
1692
|
}
|
|
1692
1693
|
|
|
1694
|
+
/**
 * Tells whether `job` is a merge-conflict resolution job whose environment has
 * not yet been prepared via `prepareMergeConflictJobEnvironment`.
 *
 * @param job Job to inspect.
 * @returns `true` only for merge-conflict jobs whose id is not in the prepared set.
 */
shouldPrepareMergeConflictJobBeforeExecution(job: Job): boolean {
  if (!this.isMergeConflictResolutionJob(job)) return false;
  const alreadyPrepared = this.preparedMergeConflictJobs.has(job.id);
  return !alreadyPrepared;
}
|
|
1697
|
+
|
|
1698
|
+
/**
 * Refreshes the image for a merge-conflict job and, once that succeeds, records
 * the job id as prepared.
 *
 * If the refresh rejects, the job is not marked as prepared and the rejection
 * propagates to the caller.
 *
 * @param job Job whose environment should be prepared.
 * @param onLog Optional sink for log lines, forwarded to the image refresh.
 */
async prepareMergeConflictJobEnvironment(
  job: Job,
  onLog?: (stream: "stdout" | "stderr", line: string) => void,
): Promise<void> {
  const refresh = this.ensureFreshImageForMergeConflictJob(job, onLog);
  await refresh;
  this.preparedMergeConflictJobs.add(job.id);
}
|
|
1705
|
+
|
|
1706
|
+
/**
 * Suggests how long a merge-conflict job should be deferred.
 *
 * @returns `options.timeoutMs` clamped to the range one minute – five minutes, in ms.
 */
recommendedMergeConflictDeferMs(): number {
  const upperBoundMs = 5 * 60_000;
  const lowerBoundMs = 60_000;
  const capped = Math.min(this.options.timeoutMs, upperBoundMs);
  return Math.max(lowerBoundMs, capped);
}
|
|
1709
|
+
|
|
1693
1710
|
private async ensureFreshImageForMergeConflictJob(
|
|
1694
1711
|
job: Job,
|
|
1695
1712
|
onLog?: (stream: "stdout" | "stderr", line: string) => void,
|
|
@@ -20,7 +20,6 @@
|
|
|
20
20
|
* JobRunner executes a single job, streams logs, and outputs a final result with a sentinel line.
|
|
21
21
|
*/
|
|
22
22
|
|
|
23
|
-
import type { CommandRequest } from "protocol";
|
|
24
23
|
import { randomUUID } from "crypto";
|
|
25
24
|
import { mkdirSync } from "fs";
|
|
26
25
|
import { resolve } from "path";
|
|
@@ -44,6 +43,7 @@ import {
|
|
|
44
43
|
} from "./execute_job.js";
|
|
45
44
|
import { DockerExecutionExhaustedError, DockerExecutor } from "./docker_executor.js";
|
|
46
45
|
import { forceDeleteWorktreePath } from "./common/worktree_cleanup.js";
|
|
46
|
+
import { WorkerServerTransport, type WorkerHeartbeatPayload } from "./common/server_transport.js";
|
|
47
47
|
import { DEFAULT_DOCKER_TIMEOUT_MS, parseDockerTimeoutMs } from "./timeout_policy.js";
|
|
48
48
|
|
|
49
49
|
type CommitRef = {
|
|
@@ -95,6 +95,31 @@ function estimateTokensFromText(text: string): number {
|
|
|
95
95
|
return Math.max(0, Math.ceil(String(text ?? "").length / 3));
|
|
96
96
|
}
|
|
97
97
|
|
|
98
|
+
/**
 * POSTs `body` as JSON to `url` using the global `fetch`, aborting the request
 * once `timeoutMs` has elapsed.
 *
 * @param url Absolute request URL.
 * @param headers Headers sent with the request.
 * @param body Value serialised with `JSON.stringify`.
 * @param timeoutMs Abort deadline in milliseconds (default 10 s).
 * @returns The response, whatever its status code.
 * @throws Error with a "request timed out" message when the deadline fired;
 *   any other failure is rethrown unchanged.
 */
async function postJsonWithTimeout(
  url: string,
  headers: Record<string, string>,
  body: unknown,
  timeoutMs = 10_000,
): Promise<Response> {
  const abortController = new AbortController();
  const { signal } = abortController;
  const deadline = setTimeout(() => abortController.abort("timeout"), timeoutMs);
  try {
    const init = { method: "POST", headers, body: JSON.stringify(body), signal };
    return await fetch(url, init);
  } catch (error) {
    if (!signal.aborted) {
      throw error;
    }
    throw new Error(`request timed out after ${timeoutMs}ms: ${url}`);
  } finally {
    clearTimeout(deadline);
  }
}
|
|
122
|
+
|
|
98
123
|
function buildWorkerLlmUsageEvent(
|
|
99
124
|
job: {
|
|
100
125
|
kind: string;
|
|
@@ -172,11 +197,7 @@ async function reportWorkerLlmUsage(
|
|
|
172
197
|
): Promise<void> {
|
|
173
198
|
const payload = buildWorkerLlmUsageEvent(job, result);
|
|
174
199
|
if (!payload) return;
|
|
175
|
-
const response = await
|
|
176
|
-
method: "POST",
|
|
177
|
-
headers,
|
|
178
|
-
body: JSON.stringify(payload),
|
|
179
|
-
});
|
|
200
|
+
const response = await postJsonWithTimeout(`${server}/telemetry/llm-usage`, headers, payload);
|
|
180
201
|
if (!response.ok) {
|
|
181
202
|
const detail = await response.text().catch(() => "");
|
|
182
203
|
throw new Error(
|
|
@@ -223,6 +244,16 @@ export function shouldEmitDirectSessionJobEvent(options: {
|
|
|
223
244
|
return !options.statusPersistedToServer;
|
|
224
245
|
}
|
|
225
246
|
|
|
247
|
+
/**
 * Decides whether a worker should be recycled because its heartbeats are not
 * getting through.
 *
 * @param options.heartbeatDelivered Whether the latest heartbeat was delivered.
 * @param options.allowHeartbeatRecycle Whether heartbeat-driven recycling is permitted.
 * @param options.transportStale Whether the transport is considered stale.
 * @returns `transportStale` when the heartbeat was not delivered and recycling is
 *   allowed; `false` otherwise.
 */
export function shouldRecycleWorkerForHeartbeatDegradation(options: {
  heartbeatDelivered: boolean;
  allowHeartbeatRecycle: boolean;
  transportStale: boolean;
}): boolean {
  const vetoed = options.heartbeatDelivered || !options.allowHeartbeatRecycle;
  return vetoed ? false : options.transportStale;
}
|
|
256
|
+
|
|
226
257
|
function shouldRecycleWorkerForCodexUnavailableFailure(
|
|
227
258
|
summary: string,
|
|
228
259
|
stderr?: string | null,
|
|
@@ -853,19 +884,15 @@ async function enqueueCompletion(
|
|
|
853
884
|
resultSummary,
|
|
854
885
|
});
|
|
855
886
|
|
|
856
|
-
const response = await
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
prUrl,
|
|
866
|
-
prTitle: pr.title,
|
|
867
|
-
prBody: pr.body,
|
|
868
|
-
}),
|
|
887
|
+
const response = await postJsonWithTimeout(`${server}/completions/enqueue`, headers, {
|
|
888
|
+
jobId: job.id,
|
|
889
|
+
sessionId: job.sessionId,
|
|
890
|
+
commitSha: commit.sha,
|
|
891
|
+
branch: commit.branch,
|
|
892
|
+
message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
|
|
893
|
+
prUrl,
|
|
894
|
+
prTitle: pr.title,
|
|
895
|
+
prBody: pr.body,
|
|
869
896
|
});
|
|
870
897
|
|
|
871
898
|
if (response.ok) {
|
|
@@ -883,24 +910,6 @@ async function enqueueCompletion(
|
|
|
883
910
|
}
|
|
884
911
|
}
|
|
885
912
|
|
|
886
|
-
function sendCommand(
|
|
887
|
-
server: string,
|
|
888
|
-
sessionId: string,
|
|
889
|
-
headers: Record<string, string>,
|
|
890
|
-
cmd: CommandRequest,
|
|
891
|
-
): Promise<void> {
|
|
892
|
-
return fetch(`${server}/sessions/${sessionId}/command`, {
|
|
893
|
-
method: "POST",
|
|
894
|
-
headers,
|
|
895
|
-
body: JSON.stringify(cmd),
|
|
896
|
-
})
|
|
897
|
-
.then((res) => {
|
|
898
|
-
if (!res.ok) console.error(`[WorkerPals] Command ${cmd.type} failed: ${res.status}`);
|
|
899
|
-
})
|
|
900
|
-
.catch((err) => console.error(`[WorkerPals] Command ${cmd.type} error:`, err));
|
|
901
|
-
}
|
|
902
|
-
|
|
903
|
-
type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";
|
|
904
913
|
type WorkerRuntimeState = {
|
|
905
914
|
currentJobId: string | null;
|
|
906
915
|
currentSessionId: string | null;
|
|
@@ -913,44 +922,11 @@ function buildWorkerHeaders(authToken: string | null): Record<string, string> {
|
|
|
913
922
|
return headers;
|
|
914
923
|
}
|
|
915
924
|
|
|
916
|
-
async function sendWorkerHeartbeat(
|
|
917
|
-
opts: ReturnType<typeof parseArgs>,
|
|
918
|
-
headers: Record<string, string>,
|
|
919
|
-
status: WorkerHeartbeatStatus,
|
|
920
|
-
currentJobId: string | null = null,
|
|
921
|
-
): Promise<void> {
|
|
922
|
-
try {
|
|
923
|
-
await fetch(`${opts.server}/workers/heartbeat`, {
|
|
924
|
-
method: "POST",
|
|
925
|
-
headers,
|
|
926
|
-
body: JSON.stringify({
|
|
927
|
-
workerId: opts.workerId,
|
|
928
|
-
status,
|
|
929
|
-
currentJobId,
|
|
930
|
-
pollMs: opts.pollMs,
|
|
931
|
-
capabilities: {
|
|
932
|
-
docker: opts.docker,
|
|
933
|
-
labels: opts.labels,
|
|
934
|
-
executor: resolveExecutor(CONFIG),
|
|
935
|
-
requireDocker: opts.requireDocker,
|
|
936
|
-
},
|
|
937
|
-
details: {
|
|
938
|
-
repo: opts.repo,
|
|
939
|
-
baseRef: opts.worktreeBaseRef,
|
|
940
|
-
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
941
|
-
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
942
|
-
},
|
|
943
|
-
}),
|
|
944
|
-
});
|
|
945
|
-
} catch (err) {
|
|
946
|
-
console.error(`[WorkerPals] Heartbeat error:`, err);
|
|
947
|
-
}
|
|
948
|
-
}
|
|
949
|
-
|
|
950
925
|
async function failActiveJobOnShutdown(
|
|
951
926
|
opts: ReturnType<typeof parseArgs>,
|
|
952
927
|
headers: Record<string, string>,
|
|
953
928
|
runtimeState: WorkerRuntimeState,
|
|
929
|
+
transport: WorkerServerTransport,
|
|
954
930
|
signalName: string,
|
|
955
931
|
): Promise<void> {
|
|
956
932
|
const activeJobId = runtimeState.currentJobId;
|
|
@@ -961,10 +937,9 @@ async function failActiveJobOnShutdown(
|
|
|
961
937
|
let statusPersistedToServer = false;
|
|
962
938
|
|
|
963
939
|
try {
|
|
964
|
-
const response = await
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
body: JSON.stringify({ message, detail }),
|
|
940
|
+
const response = await postJsonWithTimeout(`${opts.server}/jobs/${activeJobId}/fail`, headers, {
|
|
941
|
+
message,
|
|
942
|
+
detail,
|
|
968
943
|
});
|
|
969
944
|
statusPersistedToServer = response.ok;
|
|
970
945
|
} catch (err) {
|
|
@@ -978,7 +953,7 @@ async function failActiveJobOnShutdown(
|
|
|
978
953
|
runtimeState.currentSessionId &&
|
|
979
954
|
shouldEmitDirectSessionJobEvent({ ok: false, statusPersistedToServer })
|
|
980
955
|
) {
|
|
981
|
-
await
|
|
956
|
+
await transport.queueSessionCommand(runtimeState.currentSessionId, {
|
|
982
957
|
type: "job_failed",
|
|
983
958
|
payload: {
|
|
984
959
|
jobId: activeJobId,
|
|
@@ -986,7 +961,41 @@ async function failActiveJobOnShutdown(
|
|
|
986
961
|
detail,
|
|
987
962
|
},
|
|
988
963
|
from: `worker:${opts.workerId}`,
|
|
964
|
+
}, { priority: "high" });
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
|
|
968
|
+
/**
 * Asks the server to defer a claimed job by POSTing to `/jobs/{jobId}/defer`.
 *
 * Never throws: transport errors are reported through the returned `message`.
 *
 * @param opts Parsed worker options; `server` and `workerId` are used.
 * @param headers Headers sent with the request.
 * @param jobId Job to defer.
 * @param deferMs Requested deferral in milliseconds.
 * @returns `{ ok: true, availableAt }` when both the HTTP status and the body's
 *   `ok` flag indicate success; otherwise `{ ok: false, message }`, where the
 *   message is the body's `message`, `HTTP <status>`, or the thrown error text.
 */
async function deferClaimedJobForMaintenance(
  opts: ReturnType<typeof parseArgs>,
  headers: Record<string, string>,
  jobId: string,
  deferMs: number,
): Promise<{ ok: boolean; availableAt?: string; message?: string }> {
  try {
    const endpoint = `${opts.server}/jobs/${jobId}/defer`;
    const response = await postJsonWithTimeout(endpoint, headers, {
      workerId: opts.workerId,
      deferMs,
    });
    // A body that is not valid JSON is treated as an empty object.
    const parsed = (await response.json().catch(() => ({}))) as {
      ok?: boolean;
      availableAt?: string;
      message?: string;
    };
    if (response.ok && parsed.ok) {
      return { ok: true, availableAt: parsed.availableAt };
    }
    return { ok: false, message: parsed.message || `HTTP ${response.status}` };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { ok: false, message };
  }
}
|
|
992
1001
|
|
|
@@ -994,6 +1003,8 @@ async function workerLoop(
|
|
|
994
1003
|
opts: ReturnType<typeof parseArgs>,
|
|
995
1004
|
dockerExecutor: DockerExecutor | null,
|
|
996
1005
|
runtimeState: WorkerRuntimeState,
|
|
1006
|
+
transport: WorkerServerTransport,
|
|
1007
|
+
requestWorkerRestart: (reason: string) => void,
|
|
997
1008
|
): Promise<void> {
|
|
998
1009
|
const headers = buildWorkerHeaders(opts.authToken);
|
|
999
1010
|
|
|
@@ -1007,17 +1018,37 @@ async function workerLoop(
|
|
|
1007
1018
|
}
|
|
1008
1019
|
console.log(`[WorkerPals ${opts.workerId}] Executor backend: ${resolveExecutor(CONFIG)}`);
|
|
1009
1020
|
const heartbeatEveryMs = Math.max(1000, opts.heartbeatMs);
|
|
1021
|
+
const claimTimeoutMs = Math.max(4_000, Math.min(15_000, opts.pollMs * 3));
|
|
1010
1022
|
let lastHeartbeatAt = 0;
|
|
1023
|
+
const buildHeartbeatPayload = (
|
|
1024
|
+
status: WorkerHeartbeatPayload["status"],
|
|
1025
|
+
currentJobId: string | null,
|
|
1026
|
+
): WorkerHeartbeatPayload => ({
|
|
1027
|
+
status,
|
|
1028
|
+
currentJobId,
|
|
1029
|
+
capabilities: {
|
|
1030
|
+
docker: opts.docker,
|
|
1031
|
+
labels: opts.labels,
|
|
1032
|
+
executor: resolveExecutor(CONFIG),
|
|
1033
|
+
requireDocker: opts.requireDocker,
|
|
1034
|
+
},
|
|
1035
|
+
details: {
|
|
1036
|
+
repo: opts.repo,
|
|
1037
|
+
baseRef: opts.worktreeBaseRef,
|
|
1038
|
+
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
1039
|
+
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
1040
|
+
},
|
|
1041
|
+
});
|
|
1011
1042
|
|
|
1012
1043
|
const maybeHeartbeat = async (
|
|
1013
|
-
status:
|
|
1044
|
+
status: WorkerHeartbeatPayload["status"],
|
|
1014
1045
|
currentJobId: string | null = null,
|
|
1015
1046
|
force = false,
|
|
1016
1047
|
) => {
|
|
1017
1048
|
const now = Date.now();
|
|
1018
1049
|
if (!force && now - lastHeartbeatAt < heartbeatEveryMs) return;
|
|
1019
|
-
await
|
|
1020
|
-
lastHeartbeatAt = now;
|
|
1050
|
+
const ok = await transport.sendHeartbeat(buildHeartbeatPayload(status, currentJobId));
|
|
1051
|
+
if (ok) lastHeartbeatAt = now;
|
|
1021
1052
|
};
|
|
1022
1053
|
|
|
1023
1054
|
await maybeHeartbeat("idle", null, true);
|
|
@@ -1025,37 +1056,128 @@ async function workerLoop(
|
|
|
1025
1056
|
while (!runtimeState.shutdownRequested) {
|
|
1026
1057
|
try {
|
|
1027
1058
|
await maybeHeartbeat("idle");
|
|
1028
|
-
const claimRes = await
|
|
1029
|
-
|
|
1059
|
+
const claimRes = await postJsonWithTimeout(
|
|
1060
|
+
`${opts.server}/jobs/claim`,
|
|
1030
1061
|
headers,
|
|
1031
|
-
|
|
1032
|
-
|
|
1062
|
+
{ workerId: opts.workerId },
|
|
1063
|
+
claimTimeoutMs,
|
|
1064
|
+
);
|
|
1033
1065
|
|
|
1034
1066
|
if (claimRes.ok) {
|
|
1035
1067
|
const data = (await claimRes.json()) as any;
|
|
1036
1068
|
const job = data.job;
|
|
1037
1069
|
|
|
1038
1070
|
if (job) {
|
|
1071
|
+
if (
|
|
1072
|
+
dockerExecutor &&
|
|
1073
|
+
dockerExecutor.shouldPrepareMergeConflictJobBeforeExecution(job)
|
|
1074
|
+
) {
|
|
1075
|
+
const deferMs = dockerExecutor.recommendedMergeConflictDeferMs();
|
|
1076
|
+
const deferred = await deferClaimedJobForMaintenance(opts, headers, job.id, deferMs);
|
|
1077
|
+
if (!deferred.ok) {
|
|
1078
|
+
console.warn(
|
|
1079
|
+
`[WorkerPals] Failed to defer merge-conflict job ${job.id} for image refresh; falling back to claimed execution path: ${
|
|
1080
|
+
deferred.message || "unknown error"
|
|
1081
|
+
}`,
|
|
1082
|
+
);
|
|
1083
|
+
} else {
|
|
1084
|
+
console.log(
|
|
1085
|
+
`[WorkerPals] Deferred merge-conflict job ${job.id} until ${
|
|
1086
|
+
deferred.availableAt ?? "maintenance complete"
|
|
1087
|
+
} while refreshing Docker image outside claimed-job lifetime.`,
|
|
1088
|
+
);
|
|
1089
|
+
const maintenanceHeartbeat = setInterval(() => {
|
|
1090
|
+
void transport.sendHeartbeat({
|
|
1091
|
+
...buildHeartbeatPayload("idle", null),
|
|
1092
|
+
details: {
|
|
1093
|
+
repo: opts.repo,
|
|
1094
|
+
baseRef: opts.worktreeBaseRef,
|
|
1095
|
+
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
1096
|
+
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
1097
|
+
maintenance: "merge_conflict_image_refresh",
|
|
1098
|
+
deferredJobId: job.id,
|
|
1099
|
+
},
|
|
1100
|
+
});
|
|
1101
|
+
}, heartbeatEveryMs);
|
|
1102
|
+
try {
|
|
1103
|
+
await maybeHeartbeat("idle", null, true);
|
|
1104
|
+
await dockerExecutor.prepareMergeConflictJobEnvironment(job);
|
|
1105
|
+
} catch (error) {
|
|
1106
|
+
const detail = redactSensitiveText(
|
|
1107
|
+
error instanceof Error ? error.stack || error.message : String(error),
|
|
1108
|
+
);
|
|
1109
|
+
console.error(
|
|
1110
|
+
`[WorkerPals] Merge-conflict environment preparation failed for ${job.id}: ${detail}`,
|
|
1111
|
+
);
|
|
1112
|
+
try {
|
|
1113
|
+
const failResponse = await postJsonWithTimeout(
|
|
1114
|
+
`${opts.server}/jobs/${job.id}/fail-deferred`,
|
|
1115
|
+
headers,
|
|
1116
|
+
{
|
|
1117
|
+
workerId: opts.workerId,
|
|
1118
|
+
message: "Merge-conflict environment preparation failed",
|
|
1119
|
+
detail,
|
|
1120
|
+
},
|
|
1121
|
+
);
|
|
1122
|
+
const failPayload = (await failResponse.json().catch(() => ({}))) as {
|
|
1123
|
+
ok?: boolean;
|
|
1124
|
+
message?: string;
|
|
1125
|
+
};
|
|
1126
|
+
if (!failResponse.ok || !failPayload.ok) {
|
|
1127
|
+
console.error(
|
|
1128
|
+
`[WorkerPals] Failed to mark deferred job ${job.id} as failed: ${
|
|
1129
|
+
failPayload.message || `HTTP ${failResponse.status}`
|
|
1130
|
+
}`,
|
|
1131
|
+
);
|
|
1132
|
+
}
|
|
1133
|
+
} catch (failErr) {
|
|
1134
|
+
console.error(
|
|
1135
|
+
`[WorkerPals] Failed to mark deferred job ${job.id} as failed: ${
|
|
1136
|
+
failErr instanceof Error ? failErr.message : String(failErr)
|
|
1137
|
+
}`,
|
|
1138
|
+
);
|
|
1139
|
+
}
|
|
1140
|
+
} finally {
|
|
1141
|
+
clearInterval(maintenanceHeartbeat);
|
|
1142
|
+
}
|
|
1143
|
+
await maybeHeartbeat("idle", null, true);
|
|
1144
|
+
continue;
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
|
|
1039
1148
|
runtimeState.currentJobId = job.id;
|
|
1040
1149
|
runtimeState.currentSessionId = job.sessionId ?? null;
|
|
1041
1150
|
console.log(`[WorkerPals] Claimed job ${job.id} (${job.kind})`);
|
|
1042
1151
|
await maybeHeartbeat("busy", job.id, true);
|
|
1152
|
+
let allowHeartbeatRecycle = true;
|
|
1043
1153
|
|
|
1044
1154
|
const busyHeartbeat = setInterval(() => {
|
|
1045
|
-
void
|
|
1155
|
+
void transport.sendHeartbeat(buildHeartbeatPayload("busy", job.id)).then((ok) => {
|
|
1156
|
+
if (
|
|
1157
|
+
!shouldRecycleWorkerForHeartbeatDegradation({
|
|
1158
|
+
heartbeatDelivered: ok,
|
|
1159
|
+
allowHeartbeatRecycle,
|
|
1160
|
+
transportStale: transport.shouldRecycleBusyWorker(),
|
|
1161
|
+
})
|
|
1162
|
+
) {
|
|
1163
|
+
return;
|
|
1164
|
+
}
|
|
1165
|
+
requestWorkerRestart(
|
|
1166
|
+
`heartbeat transport stale while claimed job ${job.id} is still running`,
|
|
1167
|
+
);
|
|
1168
|
+
});
|
|
1046
1169
|
}, heartbeatEveryMs);
|
|
1047
1170
|
|
|
1048
1171
|
if (job.sessionId) {
|
|
1049
|
-
await
|
|
1172
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1050
1173
|
type: "job_claimed",
|
|
1051
1174
|
payload: { jobId: job.id, workerId: opts.workerId },
|
|
1052
1175
|
from: `worker:${opts.workerId}`,
|
|
1053
|
-
});
|
|
1176
|
+
}, { priority: "high" });
|
|
1054
1177
|
}
|
|
1055
1178
|
|
|
1056
1179
|
let stdoutSeq = 0;
|
|
1057
1180
|
let stderrSeq = 0;
|
|
1058
|
-
let logChain: Promise<void> = Promise.resolve();
|
|
1059
1181
|
let lastCleanLog = "";
|
|
1060
1182
|
let lastCleanLogAt = 0;
|
|
1061
1183
|
|
|
@@ -1077,20 +1199,17 @@ async function workerLoop(
|
|
|
1077
1199
|
const logTs = new Date(now).toISOString();
|
|
1078
1200
|
|
|
1079
1201
|
const seq = stream === "stdout" ? ++stdoutSeq : ++stderrSeq;
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
}),
|
|
1092
|
-
]).then(() => undefined),
|
|
1093
|
-
);
|
|
1202
|
+
void transport.queueSessionCommand(job.sessionId, {
|
|
1203
|
+
type: "job_log",
|
|
1204
|
+
payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
|
|
1205
|
+
from: `worker:${opts.workerId}`,
|
|
1206
|
+
}, { droppable: true });
|
|
1207
|
+
void transport.queueJobLog(job.id, {
|
|
1208
|
+
stream,
|
|
1209
|
+
seq,
|
|
1210
|
+
message: cleaned,
|
|
1211
|
+
ts: logTs,
|
|
1212
|
+
});
|
|
1094
1213
|
}
|
|
1095
1214
|
: undefined;
|
|
1096
1215
|
|
|
@@ -1153,7 +1272,8 @@ async function workerLoop(
|
|
|
1153
1272
|
}
|
|
1154
1273
|
const jobDurationMs = Math.max(0, Date.now() - jobStartedAtMs);
|
|
1155
1274
|
|
|
1156
|
-
|
|
1275
|
+
allowHeartbeatRecycle = false;
|
|
1276
|
+
await transport.flush();
|
|
1157
1277
|
try {
|
|
1158
1278
|
await reportWorkerLlmUsage(opts.server, headers, jobData, result);
|
|
1159
1279
|
} catch (err) {
|
|
@@ -1267,10 +1387,10 @@ async function workerLoop(
|
|
|
1267
1387
|
reviewAgent.prUrl.trim().length > 0
|
|
1268
1388
|
? reviewAgent.prUrl.trim()
|
|
1269
1389
|
: null;
|
|
1270
|
-
const response = await
|
|
1271
|
-
|
|
1390
|
+
const response = await postJsonWithTimeout(
|
|
1391
|
+
`${opts.server}/jobs/${job.id}/complete`,
|
|
1272
1392
|
headers,
|
|
1273
|
-
|
|
1393
|
+
{
|
|
1274
1394
|
summary: result.summary,
|
|
1275
1395
|
durationMs: jobDurationMs,
|
|
1276
1396
|
prUrl: jobPrUrl,
|
|
@@ -1278,21 +1398,17 @@ async function workerLoop(
|
|
|
1278
1398
|
...(result.stdout ? [{ kind: "stdout", text: result.stdout }] : []),
|
|
1279
1399
|
...(result.stderr ? [{ kind: "stderr", text: result.stderr }] : []),
|
|
1280
1400
|
],
|
|
1281
|
-
}
|
|
1282
|
-
|
|
1401
|
+
},
|
|
1402
|
+
);
|
|
1283
1403
|
statusPersistedToServer = response.ok;
|
|
1284
1404
|
console.log(
|
|
1285
1405
|
`[WorkerPals] Job ${job.id} completed in ${formatDurationMs(jobDurationMs)}: ${result.summary}`,
|
|
1286
1406
|
);
|
|
1287
1407
|
} else {
|
|
1288
|
-
const response = await
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
message: result.summary,
|
|
1293
|
-
detail: redactSensitiveText(result.stderr ?? ""),
|
|
1294
|
-
durationMs: jobDurationMs,
|
|
1295
|
-
}),
|
|
1408
|
+
const response = await postJsonWithTimeout(`${opts.server}/jobs/${job.id}/fail`, headers, {
|
|
1409
|
+
message: result.summary,
|
|
1410
|
+
detail: redactSensitiveText(result.stderr ?? ""),
|
|
1411
|
+
durationMs: jobDurationMs,
|
|
1296
1412
|
});
|
|
1297
1413
|
statusPersistedToServer = response.ok;
|
|
1298
1414
|
console.log(
|
|
@@ -1327,11 +1443,11 @@ async function workerLoop(
|
|
|
1327
1443
|
? `${rawText.slice(0, maxResponseChars - 3)}...`
|
|
1328
1444
|
: rawText;
|
|
1329
1445
|
if (assistantText) {
|
|
1330
|
-
await
|
|
1446
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1331
1447
|
type: "assistant_message",
|
|
1332
1448
|
payload: { text: assistantText },
|
|
1333
1449
|
from: `worker:${opts.workerId}`,
|
|
1334
|
-
});
|
|
1450
|
+
}, { priority: "high" });
|
|
1335
1451
|
}
|
|
1336
1452
|
}
|
|
1337
1453
|
|
|
@@ -1358,7 +1474,9 @@ async function workerLoop(
|
|
|
1358
1474
|
from: `worker:${opts.workerId}`,
|
|
1359
1475
|
};
|
|
1360
1476
|
|
|
1361
|
-
await
|
|
1477
|
+
await transport.queueSessionCommand(job.sessionId, eventCmd, {
|
|
1478
|
+
priority: "high",
|
|
1479
|
+
});
|
|
1362
1480
|
}
|
|
1363
1481
|
}
|
|
1364
1482
|
} finally {
|
|
@@ -1369,13 +1487,13 @@ async function workerLoop(
|
|
|
1369
1487
|
result?.cooldownMs &&
|
|
1370
1488
|
result.cooldownMs > 0
|
|
1371
1489
|
) {
|
|
1372
|
-
await
|
|
1490
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1373
1491
|
type: "assistant_message",
|
|
1374
1492
|
payload: {
|
|
1375
1493
|
text: `WorkerPal is cooling down for ${formatDurationMs(result.cooldownMs)} after transient infrastructure failures.`,
|
|
1376
1494
|
},
|
|
1377
1495
|
from: `worker:${opts.workerId}`,
|
|
1378
|
-
});
|
|
1496
|
+
}, { priority: "high" });
|
|
1379
1497
|
}
|
|
1380
1498
|
if (!recycleWorkerAfterJob && result?.cooldownMs && result.cooldownMs > 0) {
|
|
1381
1499
|
const cooldownMs = Math.max(0, Math.floor(result.cooldownMs));
|
|
@@ -1501,6 +1619,14 @@ async function main(): Promise<void> {
|
|
|
1501
1619
|
shutdownRequested: false,
|
|
1502
1620
|
};
|
|
1503
1621
|
const headers = buildWorkerHeaders(opts.authToken);
|
|
1622
|
+
const transport = new WorkerServerTransport({
|
|
1623
|
+
server: opts.server,
|
|
1624
|
+
headers,
|
|
1625
|
+
workerId: opts.workerId,
|
|
1626
|
+
pollMs: opts.pollMs,
|
|
1627
|
+
heartbeatMs: opts.heartbeatMs,
|
|
1628
|
+
staleClaimTtlMs: CONFIG.server.staleClaimTtlMs,
|
|
1629
|
+
});
|
|
1504
1630
|
let shutdownTriggered = false;
|
|
1505
1631
|
const shutdownAndExit = (signalName: string, code: number) => {
|
|
1506
1632
|
if (shutdownTriggered) return;
|
|
@@ -1517,9 +1643,25 @@ async function main(): Promise<void> {
|
|
|
1517
1643
|
|
|
1518
1644
|
void (async () => {
|
|
1519
1645
|
await withTimeout(
|
|
1520
|
-
|
|
1646
|
+
transport.sendHeartbeat({
|
|
1647
|
+
status: "offline",
|
|
1648
|
+
currentJobId: runtimeState.currentJobId ?? null,
|
|
1649
|
+
capabilities: {
|
|
1650
|
+
docker: opts.docker,
|
|
1651
|
+
labels: opts.labels,
|
|
1652
|
+
executor: resolveExecutor(CONFIG),
|
|
1653
|
+
requireDocker: opts.requireDocker,
|
|
1654
|
+
},
|
|
1655
|
+
details: {
|
|
1656
|
+
repo: opts.repo,
|
|
1657
|
+
baseRef: opts.worktreeBaseRef,
|
|
1658
|
+
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
1659
|
+
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
1660
|
+
},
|
|
1661
|
+
}),
|
|
1521
1662
|
);
|
|
1522
|
-
await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, signalName));
|
|
1663
|
+
await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, transport, signalName));
|
|
1664
|
+
await withTimeout(transport.flush());
|
|
1523
1665
|
if (dockerExecutor) {
|
|
1524
1666
|
await withTimeout(
|
|
1525
1667
|
dockerExecutor.shutdown().catch((err) => {
|
|
@@ -1548,7 +1690,13 @@ async function main(): Promise<void> {
|
|
|
1548
1690
|
}
|
|
1549
1691
|
});
|
|
1550
1692
|
|
|
1551
|
-
|
|
1693
|
+
const requestWorkerRestart = (reason: string) => {
|
|
1694
|
+
if (shutdownTriggered) return;
|
|
1695
|
+
console.error(`[WorkerPals] Control plane unhealthy: ${reason}. Recycling worker.`);
|
|
1696
|
+
shutdownAndExit("CONTROL_PLANE_UNHEALTHY", 91);
|
|
1697
|
+
};
|
|
1698
|
+
|
|
1699
|
+
workerLoop(opts, dockerExecutor, runtimeState, transport, requestWorkerRestart).catch((err) => {
|
|
1552
1700
|
console.error("[WorkerPals] Fatal:", err);
|
|
1553
1701
|
process.exit(1);
|
|
1554
1702
|
});
|