@pushpalsdev/cli 1.0.38 → 1.0.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/pushpals-cli.js
CHANGED
|
@@ -122,6 +122,7 @@ class ServiceManager {
|
|
|
122
122
|
onServiceDegraded;
|
|
123
123
|
onEvent;
|
|
124
124
|
timer;
|
|
125
|
+
shutdownBegun = false;
|
|
125
126
|
stopped = false;
|
|
126
127
|
constructor(options = {}) {
|
|
127
128
|
this.pollMs = Math.max(50, Math.floor(options.pollMs ?? DEFAULT_SERVICE_MANAGER_POLL_MS));
|
|
@@ -161,10 +162,10 @@ class ServiceManager {
|
|
|
161
162
|
action: this.degradedAction
|
|
162
163
|
};
|
|
163
164
|
}
|
|
164
|
-
|
|
165
|
-
if (this.stopped)
|
|
165
|
+
beginShutdown() {
|
|
166
|
+
if (this.shutdownBegun || this.stopped)
|
|
166
167
|
return;
|
|
167
|
-
this.
|
|
168
|
+
this.shutdownBegun = true;
|
|
168
169
|
clearInterval(this.timer);
|
|
169
170
|
for (const state of this.stateByService.values()) {
|
|
170
171
|
if (!state.pendingRestartTimer)
|
|
@@ -172,6 +173,12 @@ class ServiceManager {
|
|
|
172
173
|
clearTimeout(state.pendingRestartTimer);
|
|
173
174
|
state.pendingRestartTimer = null;
|
|
174
175
|
}
|
|
176
|
+
}
|
|
177
|
+
stop() {
|
|
178
|
+
if (this.stopped)
|
|
179
|
+
return;
|
|
180
|
+
this.beginShutdown();
|
|
181
|
+
this.stopped = true;
|
|
175
182
|
for (const service of this.services.values()) {
|
|
176
183
|
try {
|
|
177
184
|
const pid = service.proc.pid;
|
|
@@ -207,7 +214,7 @@ class ServiceManager {
|
|
|
207
214
|
this.onEvent?.(level, line);
|
|
208
215
|
}
|
|
209
216
|
tick() {
|
|
210
|
-
if (this.stopped)
|
|
217
|
+
if (this.shutdownBegun || this.stopped)
|
|
211
218
|
return;
|
|
212
219
|
const now = Date.now();
|
|
213
220
|
for (const [name, service] of this.services.entries()) {
|
|
@@ -250,7 +257,7 @@ class ServiceManager {
|
|
|
250
257
|
state.pendingRestartTimer = setTimeout(() => {
|
|
251
258
|
state.pendingRestartTimer = null;
|
|
252
259
|
state.nextRestartAtMs = 0;
|
|
253
|
-
if (this.stopped)
|
|
260
|
+
if (this.shutdownBegun || this.stopped)
|
|
254
261
|
return;
|
|
255
262
|
const current = this.services.get(name);
|
|
256
263
|
if (!current || !current.exited)
|
|
@@ -4792,6 +4799,7 @@ async function main() {
|
|
|
4792
4799
|
return;
|
|
4793
4800
|
const serviceManager = autoStartedServiceManager;
|
|
4794
4801
|
autoStartedServiceManager = null;
|
|
4802
|
+
serviceManager.beginShutdown();
|
|
4795
4803
|
const shutdown = await requestLocalRuntimeShutdown(serverUrl, repoRoot, reason);
|
|
4796
4804
|
if (shutdown.attempted && shutdown.accepted) {
|
|
4797
4805
|
console.log("[pushpals] Local runtime shutdown accepted; waiting for services to exit...");
|
package/package.json
CHANGED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
import type { CommandRequest } from "protocol";
|
|
2
|
+
|
|
3
|
+
/** Lifecycle state a worker reports in each heartbeat. */
export type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";

/** Queue lane of an outbound request; "high" entries are inserted ahead of every "normal" entry. */
type QueuePriority = "high" | "normal";

/** One queued outbound request together with the callback that settles its caller's promise. */
type TransportTask = {
  /** Short name included in the warning that is logged when `run` rejects. */
  label: string;
  /** Lane used to position the task inside the queue. */
  priority: QueuePriority;
  /** When true, the task is discarded instead of queued once the queue is at its limit. */
  droppable: boolean;
  /** Performs the request; awaited by the queue drain loop. */
  run: () => Promise<void>;
  /** Invoked after `run` settles, whether it succeeded or failed. */
  resolve: () => void;
};

/** Construction options for `WorkerServerTransport`. */
export type WorkerServerTransportOptions = {
  /** Base URL that request paths are appended to. */
  server: string;
  /** Headers sent with every POST. */
  headers: Record<string, string>;
  /** Worker identity included in heartbeat bodies and log messages. */
  workerId: string;
  /** Poll interval reported to the server in each heartbeat body. */
  pollMs: number;
  /** Heartbeat interval; the heartbeat and request timeouts are derived from it. */
  heartbeatMs: number;
  /** Claim TTL that bounds how long heartbeat failures are tolerated before a recycle is advised. */
  staleClaimTtlMs: number;
  /** Fetch implementation; the global `fetch` is used when omitted. */
  fetchFn?: typeof fetch;
  /** Informational log sink; `console.log` is used when omitted. */
  logInfo?: (message: string) => void;
  /** Warning log sink; `console.warn` is used when omitted. */
  logWarn?: (message: string) => void;
  /** Clock returning epoch milliseconds; `Date.now` is used when omitted. */
  nowFn?: () => number;
};

/** Point-in-time view of the transport's heartbeat and queue counters. */
export type WorkerTransportHealthSnapshot = {
  /** True while a heartbeat request is outstanding. */
  heartbeatInFlight: boolean;
  /** Heartbeat failures since the last successful heartbeat. */
  consecutiveHeartbeatFailures: number;
  /** Clock value of the most recent heartbeat attempt (0 before the first attempt). */
  lastHeartbeatAttemptAt: number;
  /** Clock value of the most recent successful heartbeat (0 before the first success). */
  lastHeartbeatSuccessAt: number;
  /** Number of requests currently waiting in the queue. */
  queuedRequests: number;
  /** Number of droppable requests discarded because the queue was full. */
  droppedLogRequests: number;
};

/** Caller-supplied portion of a heartbeat body. */
export type WorkerHeartbeatPayload = {
  /** State the worker is reporting. */
  status: WorkerHeartbeatStatus;
  /** Identifier of the job being executed, or null when there is none. */
  currentJobId: string | null;
  /** Optional capability map; sent as `{}` when omitted. */
  capabilities?: Record<string, unknown>;
  /** Optional detail map; sent as `{}` when omitted. */
  details?: Record<string, unknown>;
};
|
|
43
|
+
|
|
44
|
+
/**
 * Derives the abort timeout for a single heartbeat request from the heartbeat interval.
 *
 * The timeout is 80% of the interval, rounded down, and clamped to the range
 * 1 500 ms – 4 000 ms.
 *
 * @param heartbeatMs - Interval between heartbeats, in milliseconds.
 * @returns Timeout in milliseconds, always within [1 500, 4 000].
 */
function computeHeartbeatTimeoutMs(heartbeatMs: number): number {
  const minTimeoutMs = 1_500;
  const maxTimeoutMs = 4_000;
  // Math.max/Math.min propagate NaN, and a NaN delay makes the abort timer fire
  // almost immediately; fall back to the most conservative bound instead.
  if (Number.isNaN(heartbeatMs)) return minTimeoutMs;
  return Math.max(minTimeoutMs, Math.min(maxTimeoutMs, Math.floor(heartbeatMs * 0.8)));
}
|
|
47
|
+
|
|
48
|
+
/**
 * Derives the abort timeout for a queued (non-heartbeat) request from the heartbeat interval.
 *
 * The timeout is twice the interval, rounded down, and clamped to the range
 * 4 000 ms – 10 000 ms.
 *
 * @param heartbeatMs - Interval between heartbeats, in milliseconds.
 * @returns Timeout in milliseconds, always within [4 000, 10 000].
 */
function computeRequestTimeoutMs(heartbeatMs: number): number {
  const minTimeoutMs = 4_000;
  const maxTimeoutMs = 10_000;
  // Math.max/Math.min propagate NaN, and a NaN delay makes the abort timer fire
  // almost immediately; fall back to the most conservative bound instead.
  if (Number.isNaN(heartbeatMs)) return minTimeoutMs;
  return Math.max(minTimeoutMs, Math.min(maxTimeoutMs, Math.floor(heartbeatMs * 2)));
}
|
|
51
|
+
|
|
52
|
+
/**
 * Reads a response body as text for inclusion in a diagnostic message.
 *
 * @param response - Response whose body should be read.
 * @returns The body with surrounding whitespace removed, or an empty string
 *          when reading the body rejects.
 */
async function readResponseDetail(response: Response): Promise<string> {
  // A body that cannot be read is treated as "no detail" rather than an error.
  const pendingBody = response.text().catch((): string => "");
  const rawBody = await pendingBody;
  return rawBody.trim();
}
|
|
56
|
+
|
|
57
|
+
/**
 * Outbound HTTP channel from a worker to its server.
 *
 * Heartbeats are sent directly (at most one at a time). Session commands and job
 * logs go through an in-memory queue that is drained one request at a time, with
 * "high" priority entries placed ahead of "normal" ones. Queued requests never
 * reject their caller: failures are reported through `logWarn`.
 */
export class WorkerServerTransport {
  private readonly server: string;
  private readonly headers: Record<string, string>;
  private readonly workerId: string;
  private readonly pollMs: number;
  private readonly staleClaimTtlMs: number;
  private readonly fetchFn: typeof fetch;
  private readonly logInfo: (message: string) => void;
  private readonly logWarn: (message: string) => void;
  private readonly nowFn: () => number;
  private readonly heartbeatTimeoutMs: number;
  private readonly requestTimeoutMs: number;
  /** Queue length at which droppable requests start being discarded. */
  private readonly maxQueuedRequests = 256;

  private queuedRequests: TransportTask[] = [];
  private queueDrainInFlight = false;
  private queueFlushWaiters: Array<() => void> = [];
  private droppedLogRequests = 0;
  private heartbeatInFlight = false;
  private lastHeartbeatAttemptAt = 0;
  private lastHeartbeatSuccessAt = 0;
  private consecutiveHeartbeatFailures = 0;
  /** Clock value of the first failure in the current failure streak; -1 when there is none. */
  private firstHeartbeatFailureAt = -1;
  private lastHeartbeatFailureDetail = "";

  /**
   * @param options - Server location, identity, timing inputs and optional
   *                  fetch/log/clock overrides.
   */
  constructor(options: WorkerServerTransportOptions) {
    this.server = options.server;
    this.headers = options.headers;
    this.workerId = options.workerId;
    this.pollMs = options.pollMs;
    this.staleClaimTtlMs = options.staleClaimTtlMs;
    // Bind the default so it is never invoked with this transport as its receiver;
    // some runtimes reject an unbound global fetch with "Illegal invocation".
    // The assertion only restores the declared type of the bound function.
    this.fetchFn = options.fetchFn ?? (fetch.bind(globalThis) as typeof fetch);
    this.logInfo = options.logInfo ?? ((message) => console.log(message));
    this.logWarn = options.logWarn ?? ((message) => console.warn(message));
    this.nowFn = options.nowFn ?? (() => Date.now());
    this.heartbeatTimeoutMs = computeHeartbeatTimeoutMs(options.heartbeatMs);
    this.requestTimeoutMs = computeRequestTimeoutMs(options.heartbeatMs);
  }

  /** Returns the current heartbeat and queue counters. */
  getHealthSnapshot(): WorkerTransportHealthSnapshot {
    return {
      heartbeatInFlight: this.heartbeatInFlight,
      consecutiveHeartbeatFailures: this.consecutiveHeartbeatFailures,
      lastHeartbeatAttemptAt: this.lastHeartbeatAttemptAt,
      lastHeartbeatSuccessAt: this.lastHeartbeatSuccessAt,
      queuedRequests: this.queuedRequests.length,
      droppedLogRequests: this.droppedLogRequests,
    };
  }

  /**
   * Milliseconds elapsed since the last successful heartbeat.
   *
   * @param nowMs - Clock value to measure against; defaults to `nowFn()`.
   * @returns The non-negative age, or `Infinity` when no heartbeat has succeeded yet.
   */
  getHeartbeatStaleAgeMs(nowMs = this.nowFn()): number {
    if (this.lastHeartbeatSuccessAt <= 0) return Number.POSITIVE_INFINITY;
    return Math.max(0, nowMs - this.lastHeartbeatSuccessAt);
  }

  /**
   * Reports whether the current heartbeat failure streak has lasted long enough
   * that a busy worker should be recycled.
   *
   * The threshold is the smaller of `staleClaimTtlMs - heartbeatTimeoutMs` and 75%
   * of `staleClaimTtlMs`, raised to at least 30 000 ms and capped at `staleClaimTtlMs`.
   *
   * @param nowMs - Clock value to measure against; defaults to `nowFn()`.
   * @returns False when there is no ongoing failure streak.
   */
  shouldRecycleBusyWorker(nowMs = this.nowFn()): boolean {
    if (this.firstHeartbeatFailureAt < 0) return false;
    const failureAgeMs = Math.max(0, nowMs - this.firstHeartbeatFailureAt);
    const threshold = Math.min(
      this.staleClaimTtlMs,
      Math.max(
        30_000,
        Math.min(
          this.staleClaimTtlMs - this.heartbeatTimeoutMs,
          Math.floor(this.staleClaimTtlMs * 0.75),
        ),
      ),
    );
    return failureAgeMs >= threshold;
  }

  /**
   * Posts one heartbeat to `/workers/heartbeat` and updates the failure bookkeeping.
   *
   * @param payload - Status, current job and optional capability/detail maps.
   * @returns True when the server accepted the heartbeat; false when it failed,
   *          was rejected, timed out, or another heartbeat was already in flight.
   */
  async sendHeartbeat(payload: WorkerHeartbeatPayload): Promise<boolean> {
    if (this.heartbeatInFlight) {
      return false;
    }
    this.heartbeatInFlight = true;
    this.lastHeartbeatAttemptAt = this.nowFn();
    try {
      const response = await this.postJson(
        "/workers/heartbeat",
        {
          workerId: this.workerId,
          status: payload.status,
          currentJobId: payload.currentJobId,
          pollMs: this.pollMs,
          capabilities: payload.capabilities ?? {},
          details: payload.details ?? {},
        },
        this.heartbeatTimeoutMs,
      );
      if (!response.ok) {
        const detail = await readResponseDetail(response);
        throw new Error(
          `heartbeat rejected (${response.status})${detail ? `: ${detail}` : ""}`,
        );
      }
      this.releaseBody(response);
      const previousFailures = this.consecutiveHeartbeatFailures;
      this.lastHeartbeatSuccessAt = this.nowFn();
      this.consecutiveHeartbeatFailures = 0;
      this.firstHeartbeatFailureAt = -1;
      this.lastHeartbeatFailureDetail = "";
      if (previousFailures > 0) {
        this.logInfo(
          `[WorkerPals] Heartbeat recovered for ${this.workerId} after ${previousFailures} failed attempt(s).`,
        );
      }
      return true;
    } catch (error) {
      // Only the first failure of a streak stamps the streak's start time.
      if (this.consecutiveHeartbeatFailures === 0) {
        this.firstHeartbeatFailureAt = this.nowFn();
      }
      this.consecutiveHeartbeatFailures += 1;
      this.lastHeartbeatFailureDetail = error instanceof Error ? error.message : String(error);
      const staleAgeMs = this.getHeartbeatStaleAgeMs();
      this.logWarn(
        `[WorkerPals] Heartbeat failure ${this.consecutiveHeartbeatFailures} for ${this.workerId}: ${this.lastHeartbeatFailureDetail} (lastSuccessAgeMs=${Number.isFinite(staleAgeMs) ? staleAgeMs : -1}).`,
      );
      return false;
    } finally {
      this.heartbeatInFlight = false;
    }
  }

  /**
   * Queues a POST of `cmd` to `/sessions/{sessionId}/command`.
   *
   * @param sessionId - Session the command is addressed to.
   * @param cmd - Command body, sent as JSON.
   * @param options - `priority` (default "normal") and `droppable` (default false).
   * @returns A promise that resolves once the request has been attempted, or
   *          immediately when the request was dropped. It does not reject on failure.
   */
  queueSessionCommand(
    sessionId: string,
    cmd: CommandRequest,
    options: { priority?: QueuePriority; droppable?: boolean } = {},
  ): Promise<void> {
    return this.enqueueTask({
      label: `command:${cmd.type}`,
      priority: options.priority ?? "normal",
      droppable: options.droppable ?? false,
      run: async () => {
        const response = await this.postJson(
          `/sessions/${sessionId}/command`,
          cmd,
          this.requestTimeoutMs,
        );
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Command ${cmd.type} failed: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
          return;
        }
        this.releaseBody(response);
      },
    });
  }

  /**
   * Queues a droppable, normal-priority POST of one log line to `/jobs/{jobId}/log`.
   *
   * @param jobId - Job the log line belongs to.
   * @param payload - Stream name, sequence number, message text and timestamp.
   * @returns A promise that resolves once the request has been attempted, or
   *          immediately when the request was dropped. It does not reject on failure.
   */
  queueJobLog(
    jobId: string,
    payload: { stream: "stdout" | "stderr"; seq: number; message: string; ts: string },
  ): Promise<void> {
    return this.enqueueTask({
      label: "job_log",
      priority: "normal",
      droppable: true,
      run: async () => {
        const response = await this.postJson(`/jobs/${jobId}/log`, payload, this.requestTimeoutMs);
        if (!response.ok) {
          const detail = await readResponseDetail(response);
          this.logWarn(
            `[WorkerPals] Job log delivery failed for ${jobId}: ${response.status}${detail ? ` ${detail}` : ""}`,
          );
          return;
        }
        this.releaseBody(response);
      },
    });
  }

  /**
   * Waits until the queue is empty and no drain is running, or until the timeout elapses.
   *
   * @param timeoutMs - Maximum time to wait; a warning is logged when it is exceeded.
   * @returns A promise that always resolves (never rejects).
   */
  async flush(timeoutMs = 15_000): Promise<void> {
    if (this.queuedRequests.length === 0 && !this.queueDrainInFlight) return;
    await new Promise<void>((resolve) => {
      let settled = false;
      const timer = setTimeout(() => {
        if (settled) return;
        settled = true;
        // Drop the waiter so a timed-out flush does not leave a dead closure
        // in the list until the next drain completes.
        const waiterIndex = this.queueFlushWaiters.indexOf(onDrained);
        if (waiterIndex !== -1) this.queueFlushWaiters.splice(waiterIndex, 1);
        this.logWarn(
          `[WorkerPals] Timed out flushing queued server transport requests after ${timeoutMs}ms (queued=${this.queuedRequests.length}).`,
        );
        resolve();
      }, timeoutMs);
      const onDrained = (): void => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        resolve();
      };
      this.queueFlushWaiters.push(onDrained);
      // The queue may have emptied between the guard above and registration.
      this.maybeResolveFlushWaiters();
    });
  }

  /**
   * Adds a task to the queue (or drops it when it is droppable and the queue is
   * full) and starts draining.
   */
  private enqueueTask(task: Omit<TransportTask, "resolve">): Promise<void> {
    if (task.droppable && this.queuedRequests.length >= this.maxQueuedRequests) {
      this.droppedLogRequests += 1;
      // Log the first drop and then every 25th to avoid flooding the log.
      if (this.droppedLogRequests === 1 || this.droppedLogRequests % 25 === 0) {
        this.logWarn(
          `[WorkerPals] Dropped ${this.droppedLogRequests} queued low-priority transport request(s) because the queue is saturated (limit=${this.maxQueuedRequests}).`,
        );
      }
      return Promise.resolve();
    }

    return new Promise((resolve) => {
      const queued: TransportTask = { ...task, resolve };
      // High-priority tasks go after existing high-priority tasks but before
      // the first normal one; everything else is appended.
      const firstNormalIndex =
        queued.priority === "high"
          ? this.queuedRequests.findIndex((entry) => entry.priority !== "high")
          : -1;
      if (firstNormalIndex === -1) {
        this.queuedRequests.push(queued);
      } else {
        this.queuedRequests.splice(firstNormalIndex, 0, queued);
      }
      void this.drainQueue();
    });
  }

  /** Runs queued tasks one at a time until the queue is empty; re-entrant calls return at once. */
  private async drainQueue(): Promise<void> {
    if (this.queueDrainInFlight) return;
    this.queueDrainInFlight = true;
    try {
      while (this.queuedRequests.length > 0) {
        const task = this.queuedRequests.shift();
        if (!task) break;
        try {
          await task.run();
        } catch (error) {
          this.logWarn(
            `[WorkerPals] Transport request ${task.label} failed: ${
              error instanceof Error ? error.message : String(error)
            }`,
          );
        } finally {
          // The caller's promise settles whether or not the request succeeded.
          task.resolve();
        }
      }
    } finally {
      this.queueDrainInFlight = false;
      this.maybeResolveFlushWaiters();
    }
  }

  /** Releases every pending `flush()` caller once the queue is empty and idle. */
  private maybeResolveFlushWaiters(): void {
    if (this.queuedRequests.length > 0 || this.queueDrainInFlight) return;
    const waiters = this.queueFlushWaiters.splice(0);
    for (const waiter of waiters) waiter();
  }

  /**
   * Cancels the unread body of a response whose content is not needed, so the
   * underlying resources are not held until garbage collection.
   * Best effort: responses without a standard body are ignored.
   */
  private releaseBody(response: Response): void {
    try {
      void response.body?.cancel().catch(() => undefined);
    } catch {
      // Non-standard response objects (e.g. test doubles) may not expose a cancellable body.
    }
  }

  /**
   * POSTs `payload` as JSON to `server + path`, aborting after `timeoutMs`.
   *
   * @throws Error with a "request timed out" message when the abort fired;
   *         otherwise rethrows the underlying fetch error.
   */
  private async postJson(path: string, payload: unknown, timeoutMs: number): Promise<Response> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort("timeout"), timeoutMs);
    // Detach before calling so a custom fetch is not invoked with this transport as receiver.
    const { fetchFn } = this;
    try {
      return await fetchFn(`${this.server}${path}`, {
        method: "POST",
        headers: this.headers,
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
    } catch (error) {
      if (controller.signal.aborted) {
        throw new Error(`request timed out after ${timeoutMs}ms (${path})`);
      }
      throw error;
    } finally {
      clearTimeout(timer);
    }
  }
}
|
|
@@ -20,7 +20,6 @@
|
|
|
20
20
|
* JobRunner executes a single job, streams logs, and outputs a final result with a sentinel line.
|
|
21
21
|
*/
|
|
22
22
|
|
|
23
|
-
import type { CommandRequest } from "protocol";
|
|
24
23
|
import { randomUUID } from "crypto";
|
|
25
24
|
import { mkdirSync } from "fs";
|
|
26
25
|
import { resolve } from "path";
|
|
@@ -44,6 +43,7 @@ import {
|
|
|
44
43
|
} from "./execute_job.js";
|
|
45
44
|
import { DockerExecutionExhaustedError, DockerExecutor } from "./docker_executor.js";
|
|
46
45
|
import { forceDeleteWorktreePath } from "./common/worktree_cleanup.js";
|
|
46
|
+
import { WorkerServerTransport, type WorkerHeartbeatPayload } from "./common/server_transport.js";
|
|
47
47
|
import { DEFAULT_DOCKER_TIMEOUT_MS, parseDockerTimeoutMs } from "./timeout_policy.js";
|
|
48
48
|
|
|
49
49
|
type CommitRef = {
|
|
@@ -95,6 +95,31 @@ function estimateTokensFromText(text: string): number {
|
|
|
95
95
|
return Math.max(0, Math.ceil(String(text ?? "").length / 3));
|
|
96
96
|
}
|
|
97
97
|
|
|
98
|
+
/**
 * POSTs `body` as JSON to `url` using the global `fetch`, aborting the request
 * if no response has arrived within `timeoutMs`.
 *
 * @param url - Absolute URL to post to.
 * @param headers - Headers sent with the request.
 * @param body - Value serialised with `JSON.stringify`.
 * @param timeoutMs - Abort deadline in milliseconds (default 10 000).
 * @returns The response as returned by `fetch`; non-2xx statuses are not treated as errors here.
 * @throws Error with a "request timed out" message when the deadline fired;
 *         otherwise the original error is rethrown.
 */
async function postJsonWithTimeout(
  url: string,
  headers: Record<string, string>,
  body: unknown,
  timeoutMs = 10_000,
): Promise<Response> {
  const abortController = new AbortController();
  const { signal } = abortController;
  const deadline = setTimeout(() => {
    abortController.abort("timeout");
  }, timeoutMs);
  try {
    // Serialisation stays inside the try so the deadline timer is cleared even if it throws.
    const init = {
      method: "POST",
      headers,
      body: JSON.stringify(body),
      signal,
    };
    return await fetch(url, init);
  } catch (cause) {
    // An aborted signal means our own deadline fired; report that instead of the raw abort error.
    throw signal.aborted ? new Error(`request timed out after ${timeoutMs}ms: ${url}`) : cause;
  } finally {
    clearTimeout(deadline);
  }
}
|
|
122
|
+
|
|
98
123
|
function buildWorkerLlmUsageEvent(
|
|
99
124
|
job: {
|
|
100
125
|
kind: string;
|
|
@@ -172,11 +197,7 @@ async function reportWorkerLlmUsage(
|
|
|
172
197
|
): Promise<void> {
|
|
173
198
|
const payload = buildWorkerLlmUsageEvent(job, result);
|
|
174
199
|
if (!payload) return;
|
|
175
|
-
const response = await
|
|
176
|
-
method: "POST",
|
|
177
|
-
headers,
|
|
178
|
-
body: JSON.stringify(payload),
|
|
179
|
-
});
|
|
200
|
+
const response = await postJsonWithTimeout(`${server}/telemetry/llm-usage`, headers, payload);
|
|
180
201
|
if (!response.ok) {
|
|
181
202
|
const detail = await response.text().catch(() => "");
|
|
182
203
|
throw new Error(
|
|
@@ -223,6 +244,16 @@ export function shouldEmitDirectSessionJobEvent(options: {
|
|
|
223
244
|
return !options.statusPersistedToServer;
|
|
224
245
|
}
|
|
225
246
|
|
|
247
|
+
/**
 * Decides whether a worker should be recycled because its heartbeats are no longer getting through.
 *
 * @param options.heartbeatDelivered - Whether the latest heartbeat was delivered.
 * @param options.allowHeartbeatRecycle - Whether heartbeat-driven recycling is currently permitted.
 * @param options.transportStale - Whether the transport reports itself as stale.
 * @returns `transportStale` when the heartbeat was not delivered and recycling is
 *          permitted; otherwise false.
 */
export function shouldRecycleWorkerForHeartbeatDegradation(options: {
  heartbeatDelivered: boolean;
  allowHeartbeatRecycle: boolean;
  transportStale: boolean;
}): boolean {
  const { heartbeatDelivered, allowHeartbeatRecycle, transportStale } = options;
  // A delivered heartbeat or a disabled recycle switch vetoes recycling outright.
  if (heartbeatDelivered || !allowHeartbeatRecycle) {
    return false;
  }
  return transportStale;
}
|
|
256
|
+
|
|
226
257
|
function shouldRecycleWorkerForCodexUnavailableFailure(
|
|
227
258
|
summary: string,
|
|
228
259
|
stderr?: string | null,
|
|
@@ -853,19 +884,15 @@ async function enqueueCompletion(
|
|
|
853
884
|
resultSummary,
|
|
854
885
|
});
|
|
855
886
|
|
|
856
|
-
const response = await
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
prUrl,
|
|
866
|
-
prTitle: pr.title,
|
|
867
|
-
prBody: pr.body,
|
|
868
|
-
}),
|
|
887
|
+
const response = await postJsonWithTimeout(`${server}/completions/enqueue`, headers, {
|
|
888
|
+
jobId: job.id,
|
|
889
|
+
sessionId: job.sessionId,
|
|
890
|
+
commitSha: commit.sha,
|
|
891
|
+
branch: commit.branch,
|
|
892
|
+
message: `${job.kind}: ${job.taskId} (worker PR metadata attached)`,
|
|
893
|
+
prUrl,
|
|
894
|
+
prTitle: pr.title,
|
|
895
|
+
prBody: pr.body,
|
|
869
896
|
});
|
|
870
897
|
|
|
871
898
|
if (response.ok) {
|
|
@@ -883,24 +910,6 @@ async function enqueueCompletion(
|
|
|
883
910
|
}
|
|
884
911
|
}
|
|
885
912
|
|
|
886
|
-
function sendCommand(
|
|
887
|
-
server: string,
|
|
888
|
-
sessionId: string,
|
|
889
|
-
headers: Record<string, string>,
|
|
890
|
-
cmd: CommandRequest,
|
|
891
|
-
): Promise<void> {
|
|
892
|
-
return fetch(`${server}/sessions/${sessionId}/command`, {
|
|
893
|
-
method: "POST",
|
|
894
|
-
headers,
|
|
895
|
-
body: JSON.stringify(cmd),
|
|
896
|
-
})
|
|
897
|
-
.then((res) => {
|
|
898
|
-
if (!res.ok) console.error(`[WorkerPals] Command ${cmd.type} failed: ${res.status}`);
|
|
899
|
-
})
|
|
900
|
-
.catch((err) => console.error(`[WorkerPals] Command ${cmd.type} error:`, err));
|
|
901
|
-
}
|
|
902
|
-
|
|
903
|
-
type WorkerHeartbeatStatus = "idle" | "busy" | "error" | "offline";
|
|
904
913
|
type WorkerRuntimeState = {
|
|
905
914
|
currentJobId: string | null;
|
|
906
915
|
currentSessionId: string | null;
|
|
@@ -913,44 +922,11 @@ function buildWorkerHeaders(authToken: string | null): Record<string, string> {
|
|
|
913
922
|
return headers;
|
|
914
923
|
}
|
|
915
924
|
|
|
916
|
-
async function sendWorkerHeartbeat(
|
|
917
|
-
opts: ReturnType<typeof parseArgs>,
|
|
918
|
-
headers: Record<string, string>,
|
|
919
|
-
status: WorkerHeartbeatStatus,
|
|
920
|
-
currentJobId: string | null = null,
|
|
921
|
-
): Promise<void> {
|
|
922
|
-
try {
|
|
923
|
-
await fetch(`${opts.server}/workers/heartbeat`, {
|
|
924
|
-
method: "POST",
|
|
925
|
-
headers,
|
|
926
|
-
body: JSON.stringify({
|
|
927
|
-
workerId: opts.workerId,
|
|
928
|
-
status,
|
|
929
|
-
currentJobId,
|
|
930
|
-
pollMs: opts.pollMs,
|
|
931
|
-
capabilities: {
|
|
932
|
-
docker: opts.docker,
|
|
933
|
-
labels: opts.labels,
|
|
934
|
-
executor: resolveExecutor(CONFIG),
|
|
935
|
-
requireDocker: opts.requireDocker,
|
|
936
|
-
},
|
|
937
|
-
details: {
|
|
938
|
-
repo: opts.repo,
|
|
939
|
-
baseRef: opts.worktreeBaseRef,
|
|
940
|
-
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
941
|
-
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
942
|
-
},
|
|
943
|
-
}),
|
|
944
|
-
});
|
|
945
|
-
} catch (err) {
|
|
946
|
-
console.error(`[WorkerPals] Heartbeat error:`, err);
|
|
947
|
-
}
|
|
948
|
-
}
|
|
949
|
-
|
|
950
925
|
async function failActiveJobOnShutdown(
|
|
951
926
|
opts: ReturnType<typeof parseArgs>,
|
|
952
927
|
headers: Record<string, string>,
|
|
953
928
|
runtimeState: WorkerRuntimeState,
|
|
929
|
+
transport: WorkerServerTransport,
|
|
954
930
|
signalName: string,
|
|
955
931
|
): Promise<void> {
|
|
956
932
|
const activeJobId = runtimeState.currentJobId;
|
|
@@ -961,10 +937,9 @@ async function failActiveJobOnShutdown(
|
|
|
961
937
|
let statusPersistedToServer = false;
|
|
962
938
|
|
|
963
939
|
try {
|
|
964
|
-
const response = await
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
body: JSON.stringify({ message, detail }),
|
|
940
|
+
const response = await postJsonWithTimeout(`${opts.server}/jobs/${activeJobId}/fail`, headers, {
|
|
941
|
+
message,
|
|
942
|
+
detail,
|
|
968
943
|
});
|
|
969
944
|
statusPersistedToServer = response.ok;
|
|
970
945
|
} catch (err) {
|
|
@@ -978,7 +953,7 @@ async function failActiveJobOnShutdown(
|
|
|
978
953
|
runtimeState.currentSessionId &&
|
|
979
954
|
shouldEmitDirectSessionJobEvent({ ok: false, statusPersistedToServer })
|
|
980
955
|
) {
|
|
981
|
-
await
|
|
956
|
+
await transport.queueSessionCommand(runtimeState.currentSessionId, {
|
|
982
957
|
type: "job_failed",
|
|
983
958
|
payload: {
|
|
984
959
|
jobId: activeJobId,
|
|
@@ -986,7 +961,7 @@ async function failActiveJobOnShutdown(
|
|
|
986
961
|
detail,
|
|
987
962
|
},
|
|
988
963
|
from: `worker:${opts.workerId}`,
|
|
989
|
-
});
|
|
964
|
+
}, { priority: "high" });
|
|
990
965
|
}
|
|
991
966
|
}
|
|
992
967
|
|
|
@@ -994,6 +969,8 @@ async function workerLoop(
|
|
|
994
969
|
opts: ReturnType<typeof parseArgs>,
|
|
995
970
|
dockerExecutor: DockerExecutor | null,
|
|
996
971
|
runtimeState: WorkerRuntimeState,
|
|
972
|
+
transport: WorkerServerTransport,
|
|
973
|
+
requestWorkerRestart: (reason: string) => void,
|
|
997
974
|
): Promise<void> {
|
|
998
975
|
const headers = buildWorkerHeaders(opts.authToken);
|
|
999
976
|
|
|
@@ -1007,17 +984,37 @@ async function workerLoop(
|
|
|
1007
984
|
}
|
|
1008
985
|
console.log(`[WorkerPals ${opts.workerId}] Executor backend: ${resolveExecutor(CONFIG)}`);
|
|
1009
986
|
const heartbeatEveryMs = Math.max(1000, opts.heartbeatMs);
|
|
987
|
+
const claimTimeoutMs = Math.max(4_000, Math.min(15_000, opts.pollMs * 3));
|
|
1010
988
|
let lastHeartbeatAt = 0;
|
|
989
|
+
const buildHeartbeatPayload = (
|
|
990
|
+
status: WorkerHeartbeatPayload["status"],
|
|
991
|
+
currentJobId: string | null,
|
|
992
|
+
): WorkerHeartbeatPayload => ({
|
|
993
|
+
status,
|
|
994
|
+
currentJobId,
|
|
995
|
+
capabilities: {
|
|
996
|
+
docker: opts.docker,
|
|
997
|
+
labels: opts.labels,
|
|
998
|
+
executor: resolveExecutor(CONFIG),
|
|
999
|
+
requireDocker: opts.requireDocker,
|
|
1000
|
+
},
|
|
1001
|
+
details: {
|
|
1002
|
+
repo: opts.repo,
|
|
1003
|
+
baseRef: opts.worktreeBaseRef,
|
|
1004
|
+
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
1005
|
+
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
1006
|
+
},
|
|
1007
|
+
});
|
|
1011
1008
|
|
|
1012
1009
|
const maybeHeartbeat = async (
|
|
1013
|
-
status:
|
|
1010
|
+
status: WorkerHeartbeatPayload["status"],
|
|
1014
1011
|
currentJobId: string | null = null,
|
|
1015
1012
|
force = false,
|
|
1016
1013
|
) => {
|
|
1017
1014
|
const now = Date.now();
|
|
1018
1015
|
if (!force && now - lastHeartbeatAt < heartbeatEveryMs) return;
|
|
1019
|
-
await
|
|
1020
|
-
lastHeartbeatAt = now;
|
|
1016
|
+
const ok = await transport.sendHeartbeat(buildHeartbeatPayload(status, currentJobId));
|
|
1017
|
+
if (ok) lastHeartbeatAt = now;
|
|
1021
1018
|
};
|
|
1022
1019
|
|
|
1023
1020
|
await maybeHeartbeat("idle", null, true);
|
|
@@ -1025,11 +1022,12 @@ async function workerLoop(
|
|
|
1025
1022
|
while (!runtimeState.shutdownRequested) {
|
|
1026
1023
|
try {
|
|
1027
1024
|
await maybeHeartbeat("idle");
|
|
1028
|
-
const claimRes = await
|
|
1029
|
-
|
|
1025
|
+
const claimRes = await postJsonWithTimeout(
|
|
1026
|
+
`${opts.server}/jobs/claim`,
|
|
1030
1027
|
headers,
|
|
1031
|
-
|
|
1032
|
-
|
|
1028
|
+
{ workerId: opts.workerId },
|
|
1029
|
+
claimTimeoutMs,
|
|
1030
|
+
);
|
|
1033
1031
|
|
|
1034
1032
|
if (claimRes.ok) {
|
|
1035
1033
|
const data = (await claimRes.json()) as any;
|
|
@@ -1040,22 +1038,35 @@ async function workerLoop(
|
|
|
1040
1038
|
runtimeState.currentSessionId = job.sessionId ?? null;
|
|
1041
1039
|
console.log(`[WorkerPals] Claimed job ${job.id} (${job.kind})`);
|
|
1042
1040
|
await maybeHeartbeat("busy", job.id, true);
|
|
1041
|
+
let allowHeartbeatRecycle = true;
|
|
1043
1042
|
|
|
1044
1043
|
const busyHeartbeat = setInterval(() => {
|
|
1045
|
-
void
|
|
1044
|
+
void transport.sendHeartbeat(buildHeartbeatPayload("busy", job.id)).then((ok) => {
|
|
1045
|
+
if (
|
|
1046
|
+
!shouldRecycleWorkerForHeartbeatDegradation({
|
|
1047
|
+
heartbeatDelivered: ok,
|
|
1048
|
+
allowHeartbeatRecycle,
|
|
1049
|
+
transportStale: transport.shouldRecycleBusyWorker(),
|
|
1050
|
+
})
|
|
1051
|
+
) {
|
|
1052
|
+
return;
|
|
1053
|
+
}
|
|
1054
|
+
requestWorkerRestart(
|
|
1055
|
+
`heartbeat transport stale while claimed job ${job.id} is still running`,
|
|
1056
|
+
);
|
|
1057
|
+
});
|
|
1046
1058
|
}, heartbeatEveryMs);
|
|
1047
1059
|
|
|
1048
1060
|
if (job.sessionId) {
|
|
1049
|
-
await
|
|
1061
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1050
1062
|
type: "job_claimed",
|
|
1051
1063
|
payload: { jobId: job.id, workerId: opts.workerId },
|
|
1052
1064
|
from: `worker:${opts.workerId}`,
|
|
1053
|
-
});
|
|
1065
|
+
}, { priority: "high" });
|
|
1054
1066
|
}
|
|
1055
1067
|
|
|
1056
1068
|
let stdoutSeq = 0;
|
|
1057
1069
|
let stderrSeq = 0;
|
|
1058
|
-
let logChain: Promise<void> = Promise.resolve();
|
|
1059
1070
|
let lastCleanLog = "";
|
|
1060
1071
|
let lastCleanLogAt = 0;
|
|
1061
1072
|
|
|
@@ -1077,20 +1088,17 @@ async function workerLoop(
|
|
|
1077
1088
|
const logTs = new Date(now).toISOString();
|
|
1078
1089
|
|
|
1079
1090
|
const seq = stream === "stdout" ? ++stdoutSeq : ++stderrSeq;
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
}),
|
|
1092
|
-
]).then(() => undefined),
|
|
1093
|
-
);
|
|
1091
|
+
void transport.queueSessionCommand(job.sessionId, {
|
|
1092
|
+
type: "job_log",
|
|
1093
|
+
payload: { jobId: job.id, stream, seq, line: cleaned, ts: logTs },
|
|
1094
|
+
from: `worker:${opts.workerId}`,
|
|
1095
|
+
}, { droppable: true });
|
|
1096
|
+
void transport.queueJobLog(job.id, {
|
|
1097
|
+
stream,
|
|
1098
|
+
seq,
|
|
1099
|
+
message: cleaned,
|
|
1100
|
+
ts: logTs,
|
|
1101
|
+
});
|
|
1094
1102
|
}
|
|
1095
1103
|
: undefined;
|
|
1096
1104
|
|
|
@@ -1153,7 +1161,8 @@ async function workerLoop(
|
|
|
1153
1161
|
}
|
|
1154
1162
|
const jobDurationMs = Math.max(0, Date.now() - jobStartedAtMs);
|
|
1155
1163
|
|
|
1156
|
-
|
|
1164
|
+
allowHeartbeatRecycle = false;
|
|
1165
|
+
await transport.flush();
|
|
1157
1166
|
try {
|
|
1158
1167
|
await reportWorkerLlmUsage(opts.server, headers, jobData, result);
|
|
1159
1168
|
} catch (err) {
|
|
@@ -1267,10 +1276,10 @@ async function workerLoop(
|
|
|
1267
1276
|
reviewAgent.prUrl.trim().length > 0
|
|
1268
1277
|
? reviewAgent.prUrl.trim()
|
|
1269
1278
|
: null;
|
|
1270
|
-
const response = await
|
|
1271
|
-
|
|
1279
|
+
const response = await postJsonWithTimeout(
|
|
1280
|
+
`${opts.server}/jobs/${job.id}/complete`,
|
|
1272
1281
|
headers,
|
|
1273
|
-
|
|
1282
|
+
{
|
|
1274
1283
|
summary: result.summary,
|
|
1275
1284
|
durationMs: jobDurationMs,
|
|
1276
1285
|
prUrl: jobPrUrl,
|
|
@@ -1278,21 +1287,17 @@ async function workerLoop(
|
|
|
1278
1287
|
...(result.stdout ? [{ kind: "stdout", text: result.stdout }] : []),
|
|
1279
1288
|
...(result.stderr ? [{ kind: "stderr", text: result.stderr }] : []),
|
|
1280
1289
|
],
|
|
1281
|
-
}
|
|
1282
|
-
|
|
1290
|
+
},
|
|
1291
|
+
);
|
|
1283
1292
|
statusPersistedToServer = response.ok;
|
|
1284
1293
|
console.log(
|
|
1285
1294
|
`[WorkerPals] Job ${job.id} completed in ${formatDurationMs(jobDurationMs)}: ${result.summary}`,
|
|
1286
1295
|
);
|
|
1287
1296
|
} else {
|
|
1288
|
-
const response = await
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
message: result.summary,
|
|
1293
|
-
detail: redactSensitiveText(result.stderr ?? ""),
|
|
1294
|
-
durationMs: jobDurationMs,
|
|
1295
|
-
}),
|
|
1297
|
+
const response = await postJsonWithTimeout(`${opts.server}/jobs/${job.id}/fail`, headers, {
|
|
1298
|
+
message: result.summary,
|
|
1299
|
+
detail: redactSensitiveText(result.stderr ?? ""),
|
|
1300
|
+
durationMs: jobDurationMs,
|
|
1296
1301
|
});
|
|
1297
1302
|
statusPersistedToServer = response.ok;
|
|
1298
1303
|
console.log(
|
|
@@ -1327,11 +1332,11 @@ async function workerLoop(
|
|
|
1327
1332
|
? `${rawText.slice(0, maxResponseChars - 3)}...`
|
|
1328
1333
|
: rawText;
|
|
1329
1334
|
if (assistantText) {
|
|
1330
|
-
await
|
|
1335
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1331
1336
|
type: "assistant_message",
|
|
1332
1337
|
payload: { text: assistantText },
|
|
1333
1338
|
from: `worker:${opts.workerId}`,
|
|
1334
|
-
});
|
|
1339
|
+
}, { priority: "high" });
|
|
1335
1340
|
}
|
|
1336
1341
|
}
|
|
1337
1342
|
|
|
@@ -1358,7 +1363,9 @@ async function workerLoop(
|
|
|
1358
1363
|
from: `worker:${opts.workerId}`,
|
|
1359
1364
|
};
|
|
1360
1365
|
|
|
1361
|
-
await
|
|
1366
|
+
await transport.queueSessionCommand(job.sessionId, eventCmd, {
|
|
1367
|
+
priority: "high",
|
|
1368
|
+
});
|
|
1362
1369
|
}
|
|
1363
1370
|
}
|
|
1364
1371
|
} finally {
|
|
@@ -1369,13 +1376,13 @@ async function workerLoop(
|
|
|
1369
1376
|
result?.cooldownMs &&
|
|
1370
1377
|
result.cooldownMs > 0
|
|
1371
1378
|
) {
|
|
1372
|
-
await
|
|
1379
|
+
await transport.queueSessionCommand(job.sessionId, {
|
|
1373
1380
|
type: "assistant_message",
|
|
1374
1381
|
payload: {
|
|
1375
1382
|
text: `WorkerPal is cooling down for ${formatDurationMs(result.cooldownMs)} after transient infrastructure failures.`,
|
|
1376
1383
|
},
|
|
1377
1384
|
from: `worker:${opts.workerId}`,
|
|
1378
|
-
});
|
|
1385
|
+
}, { priority: "high" });
|
|
1379
1386
|
}
|
|
1380
1387
|
if (!recycleWorkerAfterJob && result?.cooldownMs && result.cooldownMs > 0) {
|
|
1381
1388
|
const cooldownMs = Math.max(0, Math.floor(result.cooldownMs));
|
|
@@ -1501,6 +1508,14 @@ async function main(): Promise<void> {
|
|
|
1501
1508
|
shutdownRequested: false,
|
|
1502
1509
|
};
|
|
1503
1510
|
const headers = buildWorkerHeaders(opts.authToken);
|
|
1511
|
+
const transport = new WorkerServerTransport({
|
|
1512
|
+
server: opts.server,
|
|
1513
|
+
headers,
|
|
1514
|
+
workerId: opts.workerId,
|
|
1515
|
+
pollMs: opts.pollMs,
|
|
1516
|
+
heartbeatMs: opts.heartbeatMs,
|
|
1517
|
+
staleClaimTtlMs: CONFIG.server.staleClaimTtlMs,
|
|
1518
|
+
});
|
|
1504
1519
|
let shutdownTriggered = false;
|
|
1505
1520
|
const shutdownAndExit = (signalName: string, code: number) => {
|
|
1506
1521
|
if (shutdownTriggered) return;
|
|
@@ -1517,9 +1532,25 @@ async function main(): Promise<void> {
|
|
|
1517
1532
|
|
|
1518
1533
|
void (async () => {
|
|
1519
1534
|
await withTimeout(
|
|
1520
|
-
|
|
1535
|
+
transport.sendHeartbeat({
|
|
1536
|
+
status: "offline",
|
|
1537
|
+
currentJobId: runtimeState.currentJobId ?? null,
|
|
1538
|
+
capabilities: {
|
|
1539
|
+
docker: opts.docker,
|
|
1540
|
+
labels: opts.labels,
|
|
1541
|
+
executor: resolveExecutor(CONFIG),
|
|
1542
|
+
requireDocker: opts.requireDocker,
|
|
1543
|
+
},
|
|
1544
|
+
details: {
|
|
1545
|
+
repo: opts.repo,
|
|
1546
|
+
baseRef: opts.worktreeBaseRef,
|
|
1547
|
+
dockerImage: opts.docker ? opts.dockerImage : null,
|
|
1548
|
+
dockerNetworkMode: opts.docker ? opts.dockerNetworkMode : null,
|
|
1549
|
+
},
|
|
1550
|
+
}),
|
|
1521
1551
|
);
|
|
1522
|
-
await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, signalName));
|
|
1552
|
+
await withTimeout(failActiveJobOnShutdown(opts, headers, runtimeState, transport, signalName));
|
|
1553
|
+
await withTimeout(transport.flush());
|
|
1523
1554
|
if (dockerExecutor) {
|
|
1524
1555
|
await withTimeout(
|
|
1525
1556
|
dockerExecutor.shutdown().catch((err) => {
|
|
@@ -1548,7 +1579,13 @@ async function main(): Promise<void> {
|
|
|
1548
1579
|
}
|
|
1549
1580
|
});
|
|
1550
1581
|
|
|
1551
|
-
|
|
1582
|
+
const requestWorkerRestart = (reason: string) => {
|
|
1583
|
+
if (shutdownTriggered) return;
|
|
1584
|
+
console.error(`[WorkerPals] Control plane unhealthy: ${reason}. Recycling worker.`);
|
|
1585
|
+
shutdownAndExit("CONTROL_PLANE_UNHEALTHY", 91);
|
|
1586
|
+
};
|
|
1587
|
+
|
|
1588
|
+
workerLoop(opts, dockerExecutor, runtimeState, transport, requestWorkerRestart).catch((err) => {
|
|
1552
1589
|
console.error("[WorkerPals] Fatal:", err);
|
|
1553
1590
|
process.exit(1);
|
|
1554
1591
|
});
|