@shogo-ai/worker 1.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,645 @@
1
+ // SPDX-License-Identifier: MIT
2
+ // Copyright (C) 2026 Shogo Technologies, Inc.
3
+ /**
4
+ * WorkerRuntimeManager — multi-project agent-runtime spawner.
5
+ *
6
+ * License boundary: this module locates the agent-runtime via
7
+ * `runtime-resolver` and invokes it with `spawn` from `node:child_process`
8
+ * (this file never calls `Bun.spawn` directly). It does NOT import `@shogo/agent-runtime` — the AGPL
9
+ * runtime runs as a separate OS process, communicating with the worker
10
+ * over HTTP-on-localhost only. See packages/shogo-worker/README.md
11
+ * "Process boundary" section for the full rationale.
12
+ *
13
+ * Responsibilities:
14
+ * - Allocate a free localhost port per projectId (range 37100-37900,
15
+ * mirroring the desktop runtime so port conflicts surface the same
16
+ * way for users running both locally).
17
+ * - Spawn the runtime binary with the env it expects (PROJECT_ID,
18
+ * PORT, API_SERVER_PORT, SKILL_SERVER_PORT, RUNTIME_AUTH_SECRET,
19
+ * WEBHOOK_TOKEN, SHOGO_API_URL, SHOGO_API_KEY, AI_PROXY_URL,
20
+ * AI_PROXY_TOKEN, NODE_ENV).
21
+ * - Wait for /health to respond before declaring `running`.
22
+ * - Restart with exponential backoff on unexpected exits.
23
+ * - Idle-evict per-project runtimes after RUNTIME_IDLE_MS of inactivity.
24
+ * - Stop everything on SIGINT/SIGTERM via `stopAll()`.
25
+ *
26
+ * What this manager does NOT do (vs the desktop one in apps/api):
27
+ * - No Vite spawning. The cloud-attached worker only serves agent
28
+ * traffic; previews are owned by the cloud preview path.
29
+ * - No workspace template seeding. The runtime assumes the workspace
30
+ * is already at PROJECT_DIR (cloud sets this before invoking).
31
+ * - No Prisma reads. All policy comes from the spawn config.
32
+ * - No security policy build. Cloud signs the policy and sends it.
33
+ */
34
+ import { type ChildProcess, spawn } from 'node:child_process';
35
+ import { createHmac, randomBytes } from 'node:crypto';
36
+ import { existsSync, mkdirSync } from 'node:fs';
37
+ import { tmpdir } from 'node:os';
38
+ import { join } from 'node:path';
39
+ import { resolveRuntime, type ResolvedRuntime } from './runtime-resolver.ts';
40
+ import type { RuntimeResolver } from './tunnel.ts';
41
+
42
/**
 * Port range for random allocation (mirrors apps/api desktop manager).
 * Agent-port candidates are drawn from `[PORT_RANGE_START, PORT_RANGE_END)`;
 * the paired API port is the candidate plus `API_PORT_OFFSET`.
 */
const PORT_RANGE_START = 37100;
const PORT_RANGE_END = 37900;
/** API server port = agentPort + API_PORT_OFFSET. */
const API_PORT_OFFSET = 1;

/** Default idle eviction window — unused runtimes get killed after this. */
const RUNTIME_IDLE_MS = 15 * 60 * 1000;

/** Restart backoff bounds: first retry delay and the cap it doubles up to. */
const RESTART_BACKOFF_BASE_MS = 1_000;
const RESTART_BACKOFF_MAX_MS = 60_000;

/** Health check poll interval while waiting for /health. */
const HEALTH_POLL_MS = 500;
/** Total timeout waiting for first /health success after spawn. */
const HEALTH_BOOT_TIMEOUT_MS = 30_000;
58
+
59
/** Lifecycle state of a single per-project runtime. */
export type RuntimeStatus = 'starting' | 'running' | 'restarting' | 'stopping' | 'stopped' | 'error';

/**
 * Point-in-time, read-only view of one runtime as handed to callers.
 * Optional fields are omitted (undefined) while the corresponding value
 * has not been assigned yet — e.g. no port before allocation, no pid
 * while no child process is attached.
 */
export interface RuntimeStatusInfo {
  projectId: string;
  status: RuntimeStatus;
  /** Localhost port the agent runtime listens on. */
  agentPort?: number;
  /** Localhost port handed to the runtime for its API/skill server. */
  apiServerPort?: number;
  /** OS pid of the child process, when one is attached. */
  pid?: number;
  /** Epoch ms of the most recent spawn. */
  startedAt?: number;
  /** Epoch ms of the last recorded use (drives idle eviction). */
  lastUsedAt?: number;
  /** Number of restarts scheduled after unexpected exits. */
  restarts: number;
  /** Most recent failure description, if any. */
  lastError?: string;
}
72
+
73
/**
 * Per-project inputs used to build the environment of a spawned runtime.
 * Only `cloudUrl` and `apiKey` are required; every optional field is
 * forwarded to the child process only when it is set.
 */
export interface ProjectSpawnConfig {
  /** Cloud URL the runtime should hit for backend services (SHOGO_CLOUD_URL and SHOGO_API_URL). */
  cloudUrl: string;
  /** Worker's API key, forwarded as SHOGO_API_KEY so the runtime can authenticate to the cloud. */
  apiKey: string;
  /**
   * Workspace dir on disk for this project. Cloud sets this before invoking.
   * Exported as PROJECT_DIR and WORKSPACE_DIR, and used as the child's
   * working directory when the directory exists.
   */
  projectDir?: string;
  /** Optional AI proxy URL the runtime should use (cloud-managed); exported as AI_PROXY_URL. */
  aiProxyUrl?: string;
  /** Optional AI proxy token (per-project, short-lived); exported as AI_PROXY_TOKEN. */
  aiProxyToken?: string;
  /** Tech-stack id (for runtime to seed correct template if PROJECT_DIR is empty); exported as TECH_STACK_ID. */
  techStackId?: string;
  /** Template id passed through to the runtime as TEMPLATE_ID. */
  templateId?: string;
  /** Friendly project name; exported as AGENT_NAME. */
  name?: string;
  /** Workspace id for this project; exported as WORKSPACE_ID. */
  workspaceId?: string;
  /** Extra env to merge in last, overriding anything above (advanced; usually unused). */
  extraEnv?: Record<string, string>;
}
95
+
96
/**
 * Translate a resolved runtime binary path into the actual spawn
 * command. Default returns `{ command: bin, args: [] }` (compiled
 * binary). The desktop AGPL adapter overrides this to spawn
 * `bun run packages/agent-runtime/src/server.ts` from source so it
 * doesn't have to download a prebuilt binary in dev.
 */
export type SpawnCommandFactory = (binPath: string) => { command: string; args: string[] };

/**
 * Default factory: execute the resolved binary directly with no
 * arguments. A fresh `args` array is returned on every call, so callers
 * may mutate it safely.
 */
export const defaultSpawnCommand: SpawnCommandFactory = (bin) => {
  return { command: bin, args: [] };
};
106
+
107
/**
 * Override how the runtime binary is located. Default uses the
 * `runtime-resolver` priority chain (--runtime-bin > env > ~/.shogo >
 * PATH). The desktop AGPL adapter overrides this to point at the
 * monorepo source (`packages/agent-runtime/src/server.ts`).
 *
 * Returning `null` means "no runtime available".
 */
export type RuntimeBinResolver = () => ResolvedRuntime | null;
114
+
115
/** Construction-time options for `WorkerRuntimeManager`. Every field is optional. */
export interface WorkerRuntimeManagerOptions {
  /** `--runtime-bin <path>` flag value if any (forwarded to resolveRuntime). */
  runtimeBin?: string;
  /** Idle window in ms before evicting an unused runtime (default 15min). */
  idleMs?: number;
  /** Optional logger. Defaults to console. */
  logger?: Pick<Console, 'log' | 'warn' | 'error'>;
  /**
   * Working directory for spawned runtimes whose `projectDir` is unset or
   * missing on disk. When provided it is used as-is for every such
   * project; when omitted the manager falls back to
   * `<OS tmpdir>/shogo-runtime/<projectId>`. The directory is created if
   * needed.
   */
  runtimeWorkDir?: string;
  /** Override env (for tests). Defaults to process.env. */
  env?: NodeJS.ProcessEnv;
  /**
   * How to translate the resolved binary path into spawn argv. Default
   * spawns the binary directly. The desktop AGPL adapter overrides this
   * to wrap it in `bun run`.
   */
  spawnCommand?: SpawnCommandFactory;
  /**
   * Override binary resolution. Default uses the `runtime-resolver`
   * priority chain. The desktop AGPL adapter passes a resolver that
   * points at the monorepo source so dev builds don't need a prebuilt
   * AGPL binary on disk.
   */
  resolveBin?: RuntimeBinResolver;
  /**
   * Default spawn config used when the manager is acting as a
   * `RuntimeResolver` (i.e. asked to ensureRunning by the tunnel,
   * which doesn't carry per-project config). The runtime can fetch
   * everything else it needs from the cloud using its api key.
   * Without it, tunneled /agent requests cannot trigger a spawn.
   */
  defaultSpawnConfig?: ProjectSpawnConfig;
  /**
   * Optional callback to enrich the spawn config for a given projectId
   * just before spawning. Lets the desktop adapter inject per-project
   * Prisma-derived secrets (AI proxy token, security policy, etc.).
   * If the callback rejects, the failure is logged and the unenriched
   * `defaultSpawnConfig` is used instead.
   */
  enrichSpawnConfig?: (projectId: string, base: ProjectSpawnConfig) => Promise<ProjectSpawnConfig>;
}
153
+
154
/** Internal per-project runtime record. Mutated in place by the manager. */
interface InternalRuntime {
  projectId: string;
  /** Allocated agent port; `0` means no port is currently reserved. */
  agentPort: number;
  /** Paired API server port; `0` while `agentPort` is unallocated. */
  apiServerPort: number;
  status: RuntimeStatus;
  /** Attached child process, or `null` when none is running. */
  proc: ChildProcess | null;
  /** Epoch ms of the latest spawn; `0` before the first spawn. */
  startedAt: number;
  /** Epoch ms of the last use; compared against the idle window. */
  lastUsedAt: number;
  /** Count of restarts scheduled after unexpected exits. */
  restarts: number;
  /** Pending backoff restart, if one is scheduled. */
  restartTimer: ReturnType<typeof setTimeout> | null;
  /** Pending idle-eviction check, if one is armed. */
  idleTimer: ReturnType<typeof setTimeout> | null;
  lastError?: string;
  /** Config used for the next (re)spawn of this project. */
  spawnConfig: ProjectSpawnConfig;
  /**
   * Promise of an in-flight start so concurrent ensureRunning() calls
   * dedupe instead of double-spawning.
   */
  startPromise: Promise<InternalRuntime> | null;
}
174
+
175
/**
 * Per-worker signing secret used to derive `RUNTIME_AUTH_SECRET` and
 * `WEBHOOK_TOKEN` for each project. Generated lazily on first need and
 * persisted nowhere — secrets exist for the worker's process lifetime.
 *
 * The cloud already authenticated the tunneled request; the runtime
 * token only protects the localhost surface from co-tenants on shared
 * dev machines, so a per-process random is sufficient.
 */
let workerSigningSecret: string | null = null;

/** Return the process-wide secret, creating it (32 random bytes, hex) on first call. */
function getWorkerSigningSecret(): string {
  if (workerSigningSecret === null) {
    workerSigningSecret = randomBytes(32).toString('hex');
  }
  return workerSigningSecret;
}

/**
 * HMAC-SHA256 of `<scope>:<projectId>` keyed with the worker secret,
 * hex-encoded. The scope prefix keeps tokens for different purposes
 * distinct even for the same project.
 */
function deriveScopedToken(scope: 'runtime' | 'webhook', projectId: string): string {
  return createHmac('sha256', getWorkerSigningSecret()).update(`${scope}:${projectId}`).digest('hex');
}

/** Token exported to the runtime as `RUNTIME_AUTH_SECRET`; stable per project for this process. */
function deriveRuntimeToken(projectId: string): string {
  return deriveScopedToken('runtime', projectId);
}

/** Token exported to the runtime as `WEBHOOK_TOKEN`; stable per project for this process. */
function deriveWebhookToken(projectId: string): string {
  return deriveScopedToken('webhook', projectId);
}
199
+
200
/**
 * Split a request target at its first `?`.
 *
 * @param pathWithQuery - Path optionally followed by a query string.
 * @returns `pathname` (everything before the first `?`) and `search`
 *   (the `?` and everything after it, or `''` when there is no query).
 */
function splitPathAndQuery(pathWithQuery: string): { pathname: string; search: string } {
  const queryStart = pathWithQuery.indexOf('?');
  if (queryStart < 0) {
    return { pathname: pathWithQuery, search: '' };
  }
  return {
    pathname: pathWithQuery.slice(0, queryStart),
    search: pathWithQuery.slice(queryStart),
  };
}
205
+
206
/**
 * Spawns and supervises one agent-runtime child process per project.
 *
 * Each project gets a slot holding its reserved ports, child process and
 * timers. Slots are created on demand by `ensureRunning()`, restarted
 * with exponential backoff after unexpected exits, evicted after an idle
 * window, and torn down by `stop()` / `stopAll()`.
 */
export class WorkerRuntimeManager implements RuntimeResolver {
  private readonly opts: WorkerRuntimeManagerOptions;
  private readonly log: Pick<Console, 'log' | 'warn' | 'error'>;
  /** Runtime slots keyed by projectId (includes slots in `error` state). */
  private readonly runtimes = new Map<string, InternalRuntime>();
  /** Agent and API ports currently reserved by this manager. */
  private readonly usedPorts = new Set<number>();
  private readonly spawnCommand: SpawnCommandFactory;
  /** Cached result of the first successful binary resolution. */
  private resolved: ResolvedRuntime | null = null;
  /** Set by `stopAll()`; afterwards `ensureRunning()` refuses to spawn. */
  private stopped = false;

  constructor(opts: WorkerRuntimeManagerOptions = {}) {
    this.opts = opts;
    this.log = opts.logger ?? console;
    this.spawnCommand = opts.spawnCommand ?? defaultSpawnCommand;
  }

  /**
   * Resolve and cache the runtime binary path. Called eagerly by
   * `shogo worker start` so the user sees the missing-binary error
   * immediately rather than on the first inbound request.
   *
   * Only a successful resolution is cached; a `null` result is retried
   * on the next call.
   */
  resolveBinary(): ResolvedRuntime | null {
    if (this.resolved) return this.resolved;
    const { resolveBin, runtimeBin, env } = this.opts;
    this.resolved = resolveBin ? resolveBin() : resolveRuntime({ flag: runtimeBin, env });
    return this.resolved;
  }

  // ─── RuntimeResolver implementation (used by WorkerTunnel) ──────

  /**
   * Resolve a tunneled path to a local URL. /agent/* paths trigger an
   * on-demand spawn for the projectId; non-agent paths return null
   * (the worker doesn't host an apps/api locally — the desktop adapter
   * subclasses this to add that fallback).
   *
   * @param pathWithQuery - Request path, optionally with a query string.
   * @param projectId - Target project. When omitted, the single active
   *   project is used; with zero or several active projects the result
   *   is `null`.
   * @returns `http://127.0.0.1:<agentPort><path><query>`, or `null` when
   *   the path is not an agent path, no project/config can be determined,
   *   or the runtime has no port.
   * @throws Whatever `ensureRunning()` throws when the spawn fails.
   */
  async resolveLocalUrl(pathWithQuery: string, projectId?: string): Promise<string | null> {
    const { pathname, search } = splitPathAndQuery(pathWithQuery);
    const isAgentPath = pathname === '/agent' || pathname.startsWith('/agent/');
    if (!isAgentPath) return null;

    let targetId = projectId;
    if (!targetId) {
      // Without a projectId we can't pick a runtime; pick the first active
      // one if there's exactly one — matches the desktop's permissive fallback.
      const active = this.getActiveProjects();
      const [only] = active;
      if (active.length !== 1 || only === undefined) return null;
      targetId = only;
    }

    const config = await this.spawnConfigFor(targetId);
    if (!config) {
      this.log.warn(`[WorkerRuntimeManager] No spawn config for ${targetId} — set defaultSpawnConfig or enrichSpawnConfig`);
      return null;
    }

    const status = await this.ensureRunning(targetId, config);
    if (!status.agentPort) return null;
    this.touch(targetId);
    return `http://127.0.0.1:${status.agentPort}${pathname}${search}`;
  }

  /** Per-project token matching the `RUNTIME_AUTH_SECRET` given to that project's runtime. */
  deriveRuntimeToken(projectId: string): string | null {
    return deriveRuntimeToken(projectId);
  }

  /**
   * Build the spawn config for a tunneled request: `defaultSpawnConfig`,
   * optionally passed through `enrichSpawnConfig`. Returns `null` when no
   * default is configured. An enrichment failure is logged and the
   * unenriched default is used.
   */
  private async spawnConfigFor(projectId: string): Promise<ProjectSpawnConfig | null> {
    const base = this.opts.defaultSpawnConfig;
    if (!base) return null;
    const enrich = this.opts.enrichSpawnConfig;
    if (!enrich) return base;
    try {
      return await enrich(projectId, base);
    } catch (err: unknown) {
      this.log.warn(
        `[WorkerRuntimeManager] enrichSpawnConfig failed for ${projectId}: ${WorkerRuntimeManager.describeError(err)}`,
      );
      return base;
    }
  }

  /**
   * Idempotently ensure a runtime exists for this projectId. Concurrent
   * callers share the in-flight spawn promise.
   *
   * @returns A snapshot of the runtime. After joining an in-flight
   *   backoff restart that failed, the snapshot may have status `error`.
   * @throws If the manager has been stopped, the binary cannot be
   *   resolved, no port can be allocated, or the runtime does not become
   *   healthy in time.
   */
  async ensureRunning(projectId: string, config: ProjectSpawnConfig): Promise<RuntimeStatusInfo> {
    if (this.stopped) throw new Error('WorkerRuntimeManager is stopped');

    const registered = this.runtimes.get(projectId);
    // A slot that stop() is tearing down is about to be discarded; reusing
    // it would attach a fresh process to a record nobody tracks any more.
    const existing = registered && registered.status !== 'stopping' ? registered : undefined;

    if (existing?.status === 'running') {
      this.touch(projectId);
      return this.snapshot(existing);
    }
    if (existing?.startPromise) {
      return this.snapshot(await existing.startPromise);
    }

    const slot = existing ?? this.makeSlot(projectId, config);
    if (!existing) this.runtimes.set(projectId, slot);

    // We are starting right now, so a pending backoff restart must not
    // fire later and spawn a second process for the same slot.
    if (slot.restartTimer) {
      clearTimeout(slot.restartTimer);
      slot.restartTimer = null;
    }

    slot.spawnConfig = config;
    const start = this.doStart(slot);
    slot.startPromise = start;
    try {
      return this.snapshot(await start);
    } finally {
      if (slot.startPromise === start) slot.startPromise = null;
    }
  }

  /** Snapshot of the runtime for `projectId`, or `null` if none is tracked. */
  status(projectId: string): RuntimeStatusInfo | null {
    const slot = this.runtimes.get(projectId);
    return slot ? this.snapshot(slot) : null;
  }

  /** Ids of projects whose runtime is running, starting or restarting. */
  getActiveProjects(): string[] {
    const active: string[] = [];
    for (const [id, slot] of this.runtimes) {
      if (slot.status === 'running' || slot.status === 'starting' || slot.status === 'restarting') {
        active.push(id);
      }
    }
    return active;
  }

  /** Mark the runtime as recently used. Resets the idle eviction timer. */
  touch(projectId: string): void {
    const slot = this.runtimes.get(projectId);
    if (!slot) return;
    slot.lastUsedAt = Date.now();
    this.armIdleTimer(slot);
  }

  /**
   * Stop the runtime for `projectId` and forget it. Sends `signal`, waits
   * up to 5s for the process to exit, then escalates to SIGKILL. A no-op
   * for unknown projects.
   */
  async stop(projectId: string, signal: NodeJS.Signals = 'SIGTERM'): Promise<void> {
    const slot = this.runtimes.get(projectId);
    if (!slot) return;

    slot.status = 'stopping';
    if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
    if (slot.idleTimer) { clearTimeout(slot.idleTimer); slot.idleTimer = null; }

    const proc = slot.proc;
    if (proc) {
      try { proc.kill(signal); } catch { /* already gone */ }
      await this.waitForExit(proc, 5000);
    }

    this.releasePort(slot.agentPort);
    // Zero the ports so a late 'exit' event cannot release them a second
    // time after another project has reserved the same numbers.
    slot.agentPort = 0;
    slot.apiServerPort = 0;
    // ensureRunning() may have registered a replacement slot while we
    // were waiting; only remove the record we actually stopped.
    if (this.runtimes.get(projectId) === slot) this.runtimes.delete(projectId);
  }

  /**
   * Permanently shut the manager down: refuse further spawns and stop
   * every tracked runtime in parallel. Individual stop failures are
   * logged, never thrown.
   */
  async stopAll(signal: NodeJS.Signals = 'SIGTERM'): Promise<void> {
    this.stopped = true;
    const ids = Array.from(this.runtimes.keys());
    await Promise.all(
      ids.map((id) =>
        this.stop(id, signal).catch((err: unknown) => {
          this.log.error(`[WorkerRuntimeManager] Failed to stop ${id}: ${WorkerRuntimeManager.describeError(err)}`);
        }),
      ),
    );
  }

  // ─── Internals ──────────────────────────────────────────────────

  /** Render a caught value for log lines: its `message` when it has one, else `String(value)`. */
  private static describeError(err: unknown): string {
    if (typeof err === 'object' && err !== null && 'message' in err) {
      const message = (err as { message?: unknown }).message;
      if (message != null) return String(message);
    }
    return String(err);
  }

  /** True while the slot (or the whole manager) is being shut down on purpose. */
  private isTearingDown(slot: InternalRuntime): boolean {
    return this.stopped || slot.status === 'stopping' || slot.status === 'stopped';
  }

  /** Create an empty slot in `starting` state with no port or process yet. */
  private makeSlot(projectId: string, config: ProjectSpawnConfig): InternalRuntime {
    return {
      projectId,
      agentPort: 0,
      apiServerPort: 0,
      status: 'starting',
      proc: null,
      startedAt: 0,
      lastUsedAt: Date.now(),
      restarts: 0,
      restartTimer: null,
      idleTimer: null,
      spawnConfig: config,
      startPromise: null,
    };
  }

  /**
   * Spawn the runtime for `slot` and wait until it answers /health.
   * Reuses the slot's port when it still holds one (restart), otherwise
   * reserves a new pair.
   *
   * @throws If the binary is missing, no port is free, the slot was
   *   stopped meanwhile, or the process fails to become healthy. On a
   *   health failure the process is sent SIGTERM and the ports released.
   */
  private async doStart(slot: InternalRuntime): Promise<InternalRuntime> {
    const resolved = this.resolveBinary();
    if (!resolved) {
      slot.status = 'error';
      slot.lastError = 'agent-runtime binary not found (run `shogo runtime install`)';
      throw new Error(slot.lastError);
    }

    if (!slot.agentPort) {
      let port: number;
      try {
        port = await this.allocatePort();
      } catch (err: unknown) {
        // Don't leave the slot looking "starting" when nothing is in flight.
        if (!this.isTearingDown(slot)) slot.status = 'error';
        slot.lastError = WorkerRuntimeManager.describeError(err);
        throw err;
      }
      // stop()/stopAll() may have run while the port probes were awaited;
      // spawning now would create a process nobody tracks.
      if (this.isTearingDown(slot)) {
        this.releasePort(port);
        throw new Error(`runtime for ${slot.projectId} was stopped before it could be spawned`);
      }
      slot.agentPort = port;
      slot.apiServerPort = port + API_PORT_OFFSET;
    }

    const env = this.buildEnv(slot);
    const cwd = this.resolveCwd(slot);
    const { command, args } = this.spawnCommand(resolved.path);

    this.log.log(
      `[WorkerRuntimeManager] Spawning agent-runtime for ${slot.projectId} ` +
        `via ${command} ${args.join(' ')} (port=${slot.agentPort}, source=${resolved.source})`,
    );

    const proc = spawn(command, args, {
      cwd,
      env,
      detached: false,
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    slot.proc = proc;
    slot.status = 'starting';
    slot.startedAt = Date.now();

    proc.on('error', (err) => {
      slot.lastError = err?.message ?? String(err);
      this.log.error(`[WorkerRuntimeManager] spawn error for ${slot.projectId}: ${slot.lastError}`);
    });

    proc.on('exit', (code, signal) => {
      this.handleExit(slot, code, signal, proc);
    });

    // Relay child output line by line, tagged with a short project prefix.
    const prefix = `[runtime:${slot.projectId.slice(0, 8)}]`;
    const relay = (sink: (line: string) => void) => (chunk: Buffer | string): void => {
      for (const line of chunk.toString().trimEnd().split('\n')) {
        if (line) sink(`${prefix} ${line}`);
      }
    };
    proc.stdout?.on('data', relay((line) => this.log.log(line)));
    proc.stderr?.on('data', relay((line) => this.log.error(line)));

    try {
      await this.waitForHealth(slot.agentPort, proc, HEALTH_BOOT_TIMEOUT_MS);
      slot.status = 'running';
      slot.lastUsedAt = Date.now();
      this.armIdleTimer(slot);
      return slot;
    } catch (err: unknown) {
      // When stop() caused the failure, keep its 'stopping'/'stopped'
      // status: overwriting it with 'error' would make the upcoming exit
      // event look like a crash and schedule a restart.
      if (!this.isTearingDown(slot)) slot.status = 'error';
      slot.lastError = WorkerRuntimeManager.describeError(err);
      try { proc.kill('SIGTERM'); } catch { /* nothing */ }
      this.releasePort(slot.agentPort);
      slot.agentPort = 0;
      slot.apiServerPort = 0;
      throw err;
    }
  }

  /**
   * Assemble the child environment: the base env (override or
   * `process.env`), then the fixed runtime variables, then optional
   * per-project values, then `extraEnv` last so it wins.
   */
  private buildEnv(slot: InternalRuntime): NodeJS.ProcessEnv {
    const cfg = slot.spawnConfig;
    const env: NodeJS.ProcessEnv = {
      ...(this.opts.env ?? process.env),
      PROJECT_ID: slot.projectId,
      PORT: String(slot.agentPort),
      API_SERVER_PORT: String(slot.apiServerPort),
      SKILL_SERVER_PORT: String(slot.apiServerPort),
      NODE_ENV: 'production',
      SHOGO_CLOUD_URL: cfg.cloudUrl,
      SHOGO_API_URL: cfg.cloudUrl,
      SHOGO_API_KEY: cfg.apiKey,
      RUNTIME_AUTH_SECRET: deriveRuntimeToken(slot.projectId),
      WEBHOOK_TOKEN: deriveWebhookToken(slot.projectId),
    };

    if (cfg.projectDir) {
      env.PROJECT_DIR = cfg.projectDir;
      env.WORKSPACE_DIR = cfg.projectDir;
    }
    if (cfg.aiProxyUrl) env.AI_PROXY_URL = cfg.aiProxyUrl;
    if (cfg.aiProxyToken) env.AI_PROXY_TOKEN = cfg.aiProxyToken;
    if (cfg.techStackId) env.TECH_STACK_ID = cfg.techStackId;
    if (cfg.templateId) env.TEMPLATE_ID = cfg.templateId;
    if (cfg.name) env.AGENT_NAME = cfg.name;
    if (cfg.workspaceId) env.WORKSPACE_ID = cfg.workspaceId;

    if (cfg.extraEnv) Object.assign(env, cfg.extraEnv);
    return env;
  }

  /**
   * Working directory for the child: the project dir when it exists,
   * otherwise `runtimeWorkDir` or `<tmpdir>/shogo-runtime/<projectId>`
   * (created on demand).
   */
  private resolveCwd(slot: InternalRuntime): string {
    const { projectDir } = slot.spawnConfig;
    if (projectDir && existsSync(projectDir)) return projectDir;
    const fallback = this.opts.runtimeWorkDir ?? join(tmpdir(), 'shogo-runtime', slot.projectId);
    mkdirSync(fallback, { recursive: true });
    return fallback;
  }

  /**
   * React to a child's `exit` event.
   *
   * - Exit of a process the slot no longer owns: ignored.
   * - Exit during stop/shutdown: mark stopped and release the ports.
   * - Clean exit (code 0, no signal): release the ports and drop the slot.
   * - Anything else: schedule a restart with exponential backoff.
   *
   * @param proc - The process that exited; when given, it is compared
   *   with `slot.proc` to detect exits of superseded processes.
   */
  private handleExit(
    slot: InternalRuntime,
    code: number | null,
    signal: NodeJS.Signals | null,
    proc?: ChildProcess,
  ): void {
    this.log.log(
      `[WorkerRuntimeManager] runtime ${slot.projectId} exited (code=${code}, signal=${signal})`,
    );

    // A newer process has replaced this one (e.g. a retry after a health
    // timeout). Its bookkeeping must not be disturbed by the old exit.
    if (proc && slot.proc !== proc) return;
    slot.proc = null;

    const releaseAndReset = (): void => {
      slot.status = 'stopped';
      this.releasePort(slot.agentPort);
      slot.agentPort = 0;
      slot.apiServerPort = 0;
    };

    if (slot.status === 'stopping' || this.stopped) {
      releaseAndReset();
      return;
    }

    const exitedClean = signal === null && code === 0;
    if (exitedClean) {
      releaseAndReset();
      this.runtimes.delete(slot.projectId);
      return;
    }

    slot.restarts += 1;
    slot.lastError = `exited code=${code} signal=${signal}`;
    const delay = this.restartBackoffMs(slot.restarts);
    slot.status = 'restarting';
    this.log.warn(
      `[WorkerRuntimeManager] restarting ${slot.projectId} in ${Math.round(delay / 1000)}s ` +
        `(restart #${slot.restarts})`,
    );

    if (slot.restartTimer) clearTimeout(slot.restartTimer);
    slot.restartTimer = setTimeout(() => {
      slot.restartTimer = null;
      // Shutdown began, or another start is already in flight.
      if (this.stopped || slot.startPromise) return;
      const attempt: Promise<InternalRuntime> = this.doStart(slot)
        .catch((err: unknown) => {
          this.log.error(
            `[WorkerRuntimeManager] restart of ${slot.projectId} failed: ${WorkerRuntimeManager.describeError(err)}`,
          );
          return slot;
        })
        .finally(() => {
          if (slot.startPromise === attempt) slot.startPromise = null;
        });
      slot.startPromise = attempt;
    }, delay);
  }

  /**
   * Delay before restart number `restarts` (1-based): the base delay
   * doubled per prior restart, capped at the maximum, plus up to 20%
   * random jitter.
   */
  private restartBackoffMs(restarts: number): number {
    const exponent = Math.max(0, restarts - 1);
    const base = Math.min(RESTART_BACKOFF_BASE_MS * 2 ** exponent, RESTART_BACKOFF_MAX_MS);
    return base + base * 0.2 * Math.random();
  }

  /**
   * (Re)arm the idle-eviction timer. When it fires, the runtime is
   * stopped if it has been unused for the whole idle window; otherwise
   * the timer is armed again.
   */
  private armIdleTimer(slot: InternalRuntime): void {
    if (slot.idleTimer) clearTimeout(slot.idleTimer);
    const idleMs = this.opts.idleMs ?? RUNTIME_IDLE_MS;
    slot.idleTimer = setTimeout(() => {
      const since = Date.now() - slot.lastUsedAt;
      if (since < idleMs) {
        // Got touched between scheduling and firing — re-arm.
        this.armIdleTimer(slot);
        return;
      }
      this.log.log(`[WorkerRuntimeManager] idle-evicting ${slot.projectId} after ${Math.round(since / 1000)}s`);
      void this.stop(slot.projectId).catch((err: unknown) => {
        this.log.warn(`[WorkerRuntimeManager] idle stop failed: ${WorkerRuntimeManager.describeError(err)}`);
      });
    }, idleMs);
  }

  /**
   * Reserve a random agent port (and the adjacent API port) that is
   * neither reserved by this manager nor answering HTTP on localhost.
   *
   * @returns The agent port; the API port is the returned value plus
   *   `API_PORT_OFFSET`.
   * @throws After 50 unsuccessful candidates.
   */
  private async allocatePort(): Promise<number> {
    const span = PORT_RANGE_END - PORT_RANGE_START;
    const maxAttempts = Math.min(span, 50);
    const isReserved = (agent: number, api: number): boolean =>
      this.usedPorts.has(agent) || this.usedPorts.has(api);

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      const agentPort = PORT_RANGE_START + Math.floor(Math.random() * span);
      const apiPort = agentPort + API_PORT_OFFSET;
      if (isReserved(agentPort, apiPort)) continue;

      const [agentInUse, apiInUse] = await Promise.all([
        this.isPortListening(agentPort),
        this.isPortListening(apiPort),
      ]);
      if (agentInUse || apiInUse) continue;

      // Another allocation may have reserved the pair while we probed.
      if (isReserved(agentPort, apiPort)) continue;

      this.usedPorts.add(agentPort);
      this.usedPorts.add(apiPort);
      return agentPort;
    }
    throw new Error(
      `Cannot allocate port in range ${PORT_RANGE_START}-${PORT_RANGE_END} after ${maxAttempts} attempts`,
    );
  }

  /** Release an agent port and its paired API port. `0` (no port) is ignored. */
  private releasePort(port: number): void {
    if (!port) return;
    this.usedPorts.delete(port);
    this.usedPorts.delete(port + API_PORT_OFFSET);
  }

  /**
   * Probe a localhost port with an HTTP HEAD request (250ms budget).
   * Any HTTP response counts as "listening"; any failure, including the
   * timeout, counts as free.
   */
  private async isPortListening(port: number): Promise<boolean> {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 250);
    try {
      await fetch(`http://127.0.0.1:${port}/`, { method: 'HEAD', signal: controller.signal });
      return true;
    } catch {
      return false;
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Poll `GET /health` on `port` until it returns a 2xx response.
   *
   * @throws If the process exits or is signalled first, or if
   *   `timeoutMs` elapses without a successful response.
   */
  private async waitForHealth(
    port: number,
    proc: ChildProcess,
    timeoutMs: number,
  ): Promise<void> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      if (proc.exitCode !== null || proc.signalCode != null || proc.killed) {
        throw new Error(
          `agent-runtime exited (code=${proc.exitCode}, signal=${proc.signalCode}) before becoming healthy on port ${port}`,
        );
      }

      const controller = new AbortController();
      const requestTimer = setTimeout(() => controller.abort(), 1500);
      try {
        const resp = await fetch(`http://127.0.0.1:${port}/health`, {
          method: 'GET',
          signal: controller.signal,
        });
        if (resp.ok) return;
      } catch {
        // Not accepting connections yet; keep polling.
      } finally {
        clearTimeout(requestTimer);
      }

      await new Promise((resolve) => setTimeout(resolve, HEALTH_POLL_MS));
    }
    throw new Error(`Timeout waiting for agent-runtime /health on port ${port}`);
  }

  /**
   * Wait for `proc` to exit. If it has not exited after `timeoutMs`,
   * send SIGKILL and return without waiting further.
   */
  private async waitForExit(proc: ChildProcess, timeoutMs: number): Promise<void> {
    // `proc.killed` is deliberately not consulted: it only records that a
    // signal was sent, and stop() always sends one before calling us.
    if (proc.exitCode !== null || proc.signalCode != null) return;
    await new Promise<void>((resolve) => {
      const onExit = (): void => {
        clearTimeout(killTimer);
        resolve();
      };
      const killTimer = setTimeout(() => {
        proc.off('exit', onExit);
        try { proc.kill('SIGKILL'); } catch { /* already gone */ }
        resolve();
      }, timeoutMs);
      proc.once('exit', onExit);
    });
  }

  /** Copy the externally visible fields of a slot; `0` ports and timestamps become `undefined`. */
  private snapshot(r: InternalRuntime): RuntimeStatusInfo {
    return {
      projectId: r.projectId,
      status: r.status,
      agentPort: r.agentPort || undefined,
      apiServerPort: r.apiServerPort || undefined,
      pid: r.proc?.pid,
      startedAt: r.startedAt || undefined,
      lastUsedAt: r.lastUsedAt,
      restarts: r.restarts,
      lastError: r.lastError,
    };
  }
}