@shogo-ai/worker 1.9.9 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -0
- package/dist/cli.mjs +63 -0
- package/package.json +5 -3
- package/src/cli.ts +37 -0
- package/src/commands/agent.ts +123 -0
- package/src/commands/doctor.ts +128 -0
- package/src/commands/start.ts +15 -2
- package/src/lib/__tests__/agent-launch.test.ts +92 -0
- package/src/lib/__tests__/db-doctor.test.ts +204 -0
- package/src/lib/__tests__/runtime-manager-tree-sitter-env.test.ts +65 -0
- package/src/lib/db-doctor.ts +355 -0
- package/src/lib/process-manager.ts +1 -1
- package/src/lib/runtime-install.ts +1 -1
- package/src/lib/runtime-manager.ts +124 -5
- package/src/lib/tunnel.ts +12 -4
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
// Copyright (C) 2026 Shogo Technologies, Inc.
|
|
3
|
+
//
|
|
4
|
+
// Local SQLite migration "doctor" for the Shogo desktop app.
|
|
5
|
+
//
|
|
6
|
+
// The packaged desktop app stores its database at
|
|
7
|
+
// `<userData>/data/shogo.db` and applies schema with
|
|
8
|
+
// `prisma migrate deploy` on every launch. When a migration throws
|
|
9
|
+
// mid-way it leaves a row in `_prisma_migrations` with
|
|
10
|
+
// `finished_at = NULL`, and Prisma's P3009 check then refuses to run
|
|
11
|
+
// ANY further migrations — the app is wedged until that ledger row is
|
|
12
|
+
// cleared. The desktop app surfaces a recovery dialog when this happens
|
|
13
|
+
// on boot (see apps/desktop/src/db-recovery.ts), but a user whose app
|
|
14
|
+
// won't open, or who is being walked through a fix by support, has no
|
|
15
|
+
// way to trigger that repair from a terminal.
|
|
16
|
+
//
|
|
17
|
+
// `shogo doctor` (this module) is that terminal entry point. It performs
|
|
18
|
+
// the SAME safe, dependency-free repair the desktop dialog does:
|
|
19
|
+
//
|
|
20
|
+
// 1. detectFailedMigrations() — find stuck `_prisma_migrations` rows.
|
|
21
|
+
// 2. backupDatabase() — snapshot `shogo.db` (+ -wal/-shm)
|
|
22
|
+
// before touching anything.
|
|
23
|
+
// 3. repairFailedMigrations() — delete the stuck rows (equivalent to
|
|
24
|
+
// `prisma migrate resolve --rolled-back`).
|
|
25
|
+
//
|
|
26
|
+
// It deliberately does NOT re-run `prisma migrate deploy` itself — the
|
|
27
|
+
// CLI doesn't ship the Prisma schema/migration history that lives inside
|
|
28
|
+
// the desktop app bundle. Instead it clears the wedge and tells the user
|
|
29
|
+
// to relaunch Shogo, which re-applies migrations on its next boot.
|
|
30
|
+
//
|
|
31
|
+
// Why shell out to bun rather than link a SQLite driver
|
|
32
|
+
// -----------------------------------------------------
|
|
33
|
+
// The worker CLI runs under Node (npm install) OR Bun (tarball release),
|
|
34
|
+
// and we don't want a native `better-sqlite3` build step. The desktop
|
|
35
|
+
// app already ships a `bun` binary with `bun:sqlite` statically linked,
|
|
36
|
+
// and that's the exact SQLite version Prisma's bun-sqlite adapter uses at
|
|
37
|
+
// runtime — so running `bun -e "<small script>"` against it is both
|
|
38
|
+
// dependency-free and driver-version-matched. This mirrors
|
|
39
|
+
// apps/desktop/src/db-recovery.ts (kept as a separate copy there because
|
|
40
|
+
// the Electron main process is bundled in a different runtime context).
|
|
41
|
+
|
|
42
|
+
import { execFileSync } from 'node:child_process';
|
|
43
|
+
import { existsSync, copyFileSync } from 'node:fs';
|
|
44
|
+
import path from 'node:path';
|
|
45
|
+
import { homedir, platform } from 'node:os';
|
|
46
|
+
|
|
47
|
+
/**
 * One `_prisma_migrations` ledger row that is still in a failed state
 * (`finished_at` and `rolled_back_at` both NULL), as reported by
 * `detectFailedMigrations()`.
 */
export interface FailedMigration {
  /** The row's `migration_name` value. */
  name: string;
  /** Epoch milliseconds the migration was attempted. */
  startedAt: number;
  /** First 600 chars of the Prisma error log row, for display. */
  errorExcerpt: string;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Run a one-shot script via `<bunPath> -e <script>` and return its stdout
 * parsed as JSON.
 *
 * The DB path is handed to the script through the `DBP` environment
 * variable (read as `process.env.DBP`), so paths containing spaces never
 * have to survive shell quoting. The child gets no stdin, and is killed
 * after 5 seconds.
 *
 * Failures are deliberately NOT swallowed: a broken recovery layer should
 * surface its own defect rather than a misleading "database looks fine".
 *
 * @param bunPath Executable used to run the script.
 * @param dbPath  Database path exported to the script as `DBP`.
 * @param script  Source text passed to `-e`; must print one JSON document.
 * @returns The parsed stdout, cast (unchecked) to `T`.
 * @throws Whatever `execFileSync` throws when the binary cannot be run,
 *   exits non-zero, or times out.
 * @throws Error when the script prints nothing.
 * @throws SyntaxError when stdout is not valid JSON; the message names the
 *   database and quotes the start of the offending output.
 */
function runBunScript<T>(bunPath: string, dbPath: string, script: string): T {
  const stdout = execFileSync(bunPath, ['-e', script], {
    env: { ...process.env, DBP: dbPath },
    encoding: 'utf-8',
    stdio: ['ignore', 'pipe', 'pipe'],
    timeout: 5000,
  });

  const payload = stdout.trim();
  if (!payload) {
    throw new Error(`bun script returned empty output for ${dbPath}`);
  }

  try {
    return JSON.parse(payload) as T;
  } catch (err: unknown) {
    // A bare JSON.parse error ("Unexpected token ...") says nothing about
    // which database or which output was involved; rethrow with context
    // while keeping the SyntaxError type.
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(
      `bun script returned non-JSON output for ${dbPath}: ${reason} ` +
        `(output began: ${JSON.stringify(payload.slice(0, 200))})`,
    );
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * List the `_prisma_migrations` rows that are still failed (`finished_at`
 * NULL) and not yet recovered (`rolled_back_at` NULL), ordered by
 * `started_at`.
 *
 * The query runs in a child process (see `runBunScript`) that opens the
 * database read-only. An empty array is returned when the DB file doesn't
 * exist, when the ledger table hasn't been created, or when nothing is
 * stuck.
 *
 * @param bunPath Binary used to execute the embedded `bun:sqlite` script.
 * @param dbPath  Path of the SQLite database to inspect.
 * @throws If the child process fails (the script exits with code 2 on any
 *   SQLite error) or prints something that isn't JSON.
 */
export function detectFailedMigrations(bunPath: string, dbPath: string): FailedMigration[] {
  const listStuckRows = `
    import { Database } from 'bun:sqlite';
    try {
      const db = new Database(process.env.DBP, { readonly: true });
      const hasTable = db.query("SELECT name FROM sqlite_master WHERE type='table' AND name='_prisma_migrations'").get();
      if (!hasTable) { console.log('[]'); process.exit(0); }
      const rows = db
        .query("SELECT migration_name as name, started_at as startedAt, substr(coalesce(logs, ''), 1, 600) as errorExcerpt FROM _prisma_migrations WHERE finished_at IS NULL AND rolled_back_at IS NULL ORDER BY started_at")
        .all();
      console.log(JSON.stringify(rows));
    } catch (e) {
      console.error(String(e?.stack || e));
      process.exit(2);
    }
  `;

  // No file means nothing to inspect; never spawn a child for that.
  return existsSync(dbPath) ? runBunScript<FailedMigration[]>(bunPath, dbPath, listStuckRows) : [];
}
|
|
104
|
+
|
|
105
|
+
/**
 * Copy the database file, plus its `-wal` / `-shm` sidecars when they
 * exist, to timestamped siblings in the same directory.
 *
 * The backup is named `<basename>.bak-<ISO timestamp>` with `:` and `.`
 * replaced by `-`; sidecar copies append the same suffix to that name.
 * Plain file copies are used rather than SQLite's online backup because
 * the DB is not open at this point, and `copyFileSync` works even on a DB
 * SQLite would refuse to open.
 *
 * @param dbPath Path of the main database file.
 * @returns Path of the backup of the main database file.
 * @throws Error when `dbPath` does not exist, and any I/O error raised by
 *   the copies. The caller MUST treat a throw as "do not proceed with
 *   repair".
 */
export function backupDatabase(dbPath: string): string {
  if (!existsSync(dbPath)) {
    throw new Error(`Database does not exist at ${dbPath} — refusing to back up nothing`);
  }

  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const target = path.join(path.dirname(dbPath), `${path.basename(dbPath)}.bak-${timestamp}`);
  copyFileSync(dbPath, target);

  ['-wal', '-shm'].forEach((suffix) => {
    const sidecar = `${dbPath}${suffix}`;
    if (!existsSync(sidecar)) return;
    copyFileSync(sidecar, `${target}${suffix}`);
  });

  return target;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Remove the named failed-migration rows from `_prisma_migrations` — the
 * equivalent of `prisma migrate resolve --rolled-back <name>` for each
 * one, without needing the schema-engine binary.
 *
 * The DELETE is additionally restricted to rows that are still failed
 * (`finished_at` and `rolled_back_at` both NULL), which guards against a
 * stale name list. The names are embedded in the child script as a JSON
 * literal and bound as SQL parameters.
 *
 * The caller is expected to have run `backupDatabase()` first. After this,
 * the next `prisma migrate deploy` (on the desktop app's next launch)
 * re-attempts the migration.
 *
 * @param bunPath        Binary used to execute the embedded script.
 * @param dbPath         Path of the SQLite database to modify.
 * @param migrationNames Names of the ledger rows to delete.
 * @returns Number of rows deleted; 0 without touching anything when the
 *   name list is empty.
 * @throws Error when names were given but `dbPath` does not exist, and
 *   whatever `runBunScript` throws when the child fails.
 */
export function repairFailedMigrations(
  bunPath: string,
  dbPath: string,
  migrationNames: string[],
): number {
  if (!migrationNames.length) return 0;
  if (!existsSync(dbPath)) {
    throw new Error(`Cannot repair: database does not exist at ${dbPath}`);
  }

  const serializedNames = JSON.stringify(migrationNames);
  const deleteStuckRows = `
    import { Database } from 'bun:sqlite';
    const names = ${serializedNames};
    if (!Array.isArray(names) || names.some(n => typeof n !== 'string')) {
      console.error('Invalid migration name list');
      process.exit(2);
    }
    try {
      const db = new Database(process.env.DBP, { create: false, readwrite: true });
      const placeholders = names.map(() => '?').join(',');
      const stmt = db.prepare(
        \`DELETE FROM _prisma_migrations WHERE migration_name IN (\${placeholders}) AND finished_at IS NULL AND rolled_back_at IS NULL\`
      );
      const result = stmt.run(...names);
      console.log(JSON.stringify({ deleted: Number(result.changes) }));
    } catch (e) {
      console.error(String(e?.stack || e));
      process.exit(3);
    }
  `;

  const { deleted } = runBunScript<{ deleted: number }>(bunPath, dbPath, deleteStuckRows);
  return deleted;
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Orchestrator
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Outcome of `runDatabaseDoctor()`:
 *  - `no-database`: the database file does not exist.
 *  - `healthy`: no failed migrations were detected.
 *  - `repaired`: failed rows were found and none remain after the repair.
 *  - `failed`: failed rows were found and some remain after the repair.
 */
export type DoctorStatus =
  | 'healthy'
  | 'no-database'
  | 'repaired'
  | 'failed';
|
|
184
|
+
|
|
185
|
+
/** Structured report returned by `runDatabaseDoctor()`. */
export interface DoctorResult {
  /** Overall outcome of the run. */
  status: DoctorStatus;
  /** Migrations found in a failed state before repair. */
  detected: FailedMigration[];
  /** Path of the backup written before repair, if any. */
  backupPath?: string;
  /** Names of migration rows actually cleared. */
  cleared: string[];
  /** Migrations still failed after the repair attempt (should be empty on success). */
  remaining: FailedMigration[];
  /** Human-readable summary of what happened. */
  message: string;
}
|
|
198
|
+
|
|
199
|
+
/** Inputs accepted by `runDatabaseDoctor()`. */
export interface DoctorOptions {
  /** Binary used to run the embedded SQLite scripts. */
  bunPath: string;
  /** Path of the SQLite database to inspect and repair. */
  dbPath: string;
  /** Skip the pre-repair backup (default: false). Discouraged. */
  skipBackup?: boolean;
  /** Logger for progress lines (default: no-op). */
  log?: (line: string) => void;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Run the full safe repair sequence against a local SQLite DB:
 * detect → backup → clear stuck ledger rows → re-detect.
 *
 * Idempotent: a healthy DB is a no-op. Never re-runs `migrate deploy`
 * (that's the desktop app's job on next launch).
 *
 * @param opts Binary/DB paths, plus optional `skipBackup` and `log`.
 * @returns A report whose `status` is:
 *   - `no-database` when `dbPath` does not exist,
 *   - `healthy` when no failed rows were detected,
 *   - `repaired` when failed rows were found and the re-detect finds none,
 *   - `failed` when the re-detect still finds failed rows.
 *   `cleared` lists the detected names that the re-detect no longer
 *   reports as failed.
 * @throws Anything thrown by detection, backup or repair. A failed backup
 *   therefore aborts the run before the database is modified.
 */
export function runDatabaseDoctor(opts: DoctorOptions): DoctorResult {
  const log = opts.log ?? (() => {});
  const { bunPath, dbPath } = opts;

  if (!existsSync(dbPath)) {
    return {
      status: 'no-database',
      detected: [],
      cleared: [],
      remaining: [],
      message: `No database found at ${dbPath}. Nothing to repair — launch Shogo once to create it.`,
    };
  }

  const detected = detectFailedMigrations(bunPath, dbPath);
  if (detected.length === 0) {
    return {
      status: 'healthy',
      detected: [],
      cleared: [],
      remaining: [],
      message: 'No failed migrations detected — the local database looks healthy.',
    };
  }

  log(`Found ${detected.length} failed migration(s): ${detected.map((m) => m.name).join(', ')}`);

  // Snapshot BEFORE touching the ledger; a throw here stops the repair.
  let backupPath: string | undefined;
  if (!opts.skipBackup) {
    backupPath = backupDatabase(dbPath);
    log(`Backed up database to ${backupPath}`);
  }

  const names = detected.map((m) => m.name);
  const deleted = repairFailedMigrations(bunPath, dbPath, names);
  log(`Cleared ${deleted} failed migration row(s).`);

  const remaining = detectFailedMigrations(bunPath, dbPath);

  // Work out which rows went away from the re-detect result. The repair
  // step only returns a count, so slicing the first `deleted` names would
  // mislabel a still-failed migration as cleared after a partial repair.
  const stillFailed = new Set(remaining.map((m) => m.name));
  const cleared = names.filter((name) => !stillFailed.has(name));

  const status: DoctorStatus = remaining.length === 0 ? 'repaired' : 'failed';
  const message =
    status === 'repaired'
      ? 'Cleared the failed migration record. Relaunch Shogo to re-apply migrations cleanly.'
      : `Repair incomplete — ${remaining.length} migration(s) still failed: ${remaining
          .map((m) => m.name)
          .join(', ')}.`;

  return {
    status,
    detected,
    backupPath,
    cleared,
    remaining,
    message,
  };
}
|
|
269
|
+
|
|
270
|
+
// ---------------------------------------------------------------------------
|
|
271
|
+
// Path / binary resolution (for the standalone CLI)
|
|
272
|
+
// ---------------------------------------------------------------------------
|
|
273
|
+
|
|
274
|
+
/**
 * Resolve the desktop app's per-user data directory, mirroring
 * Electron's `app.getPath('userData')` + the `data/` subdir used by
 * apps/desktop/src/paths.ts. `productName` is "Shogo".
 *
 *   macOS:   ~/Library/Application Support/Shogo/data
 *   Windows: %APPDATA%/Shogo/data            (Roaming)
 *   Linux:   $XDG_CONFIG_HOME/Shogo/data     (or ~/.config/Shogo/data)
 *
 * An environment variable that is set but EMPTY is treated like an unset
 * one (as the XDG Base Directory spec requires for `XDG_CONFIG_HOME`), so
 * the result never degrades to the relative path `Shogo/data`.
 *
 * @returns The data directory path; nothing is created or checked on disk.
 */
export function resolveDesktopDataDir(): string {
  const home = homedir();

  let appData: string;
  switch (platform()) {
    case 'darwin':
      appData = path.join(home, 'Library', 'Application Support');
      break;
    case 'win32':
      // `||` rather than `??`: an empty string must fall back too.
      appData = process.env.APPDATA || path.join(home, 'AppData', 'Roaming');
      break;
    default:
      // `||` rather than `??`: an empty string must fall back too.
      appData = process.env.XDG_CONFIG_HOME || path.join(home, '.config');
      break;
  }

  return path.join(appData, 'Shogo', 'data');
}
|
|
296
|
+
|
|
297
|
+
/**
 * Default location of the desktop app's local SQLite database:
 * `shogo.db` inside the directory returned by `resolveDesktopDataDir()`.
 */
export function resolveDesktopDbPath(): string {
  const dataDir = resolveDesktopDataDir();
  return path.join(dataDir, 'shogo.db');
}
|
|
301
|
+
|
|
302
|
+
/**
 * List the places where an installed desktop app may have put its `bun`
 * binary (its `resources/bun/` directory). Best-effort and
 * platform-specific; entries are NOT checked for existence here — the
 * caller filters out the missing ones.
 *
 * @returns Two candidate paths on macOS and on Linux; an empty list on
 *   every other platform.
 */
function bundledBunCandidates(): string[] {
  const currentPlatform = platform();
  const binaryName = currentPlatform === 'win32' ? 'bun.exe' : 'bun';
  const home = homedir();

  switch (currentPlatform) {
    case 'darwin':
      return [
        path.join('/Applications', 'Shogo.app', 'Contents', 'Resources', 'bun', binaryName),
        path.join(home, 'Applications', 'Shogo.app', 'Contents', 'Resources', 'bun', binaryName),
      ];
    case 'linux':
      return [
        path.join('/opt', 'Shogo', 'resources', 'bun', binaryName),
        path.join('/usr', 'lib', 'shogo', 'resources', 'bun', binaryName),
      ];
    default:
      // Windows installs under a version-stamped Squirrel dir
      // (%LOCALAPPDATA%/shogo/app-<ver>/resources/bun/bun.exe) which we can't
      // resolve without globbing; rely on --bun / PATH there.
      return [];
  }
}
|
|
328
|
+
|
|
329
|
+
/**
 * Probe a binary by running `<bunPath> --version` (output discarded,
 * 5 second timeout).
 *
 * @returns `true` when the command completes without `execFileSync`
 *   throwing, `false` on any failure.
 */
function bunIsUsable(bunPath: string): boolean {
  let usable = false;
  try {
    execFileSync(bunPath, ['--version'], { stdio: 'ignore', timeout: 5000 });
    usable = true;
  } catch {
    // Any failure to run the probe means "not usable"; the reason is not needed.
  }
  return usable;
}
|
|
338
|
+
|
|
339
|
+
/**
 * Pick a `bun` binary for the repair scripts. Sources are tried in this
 * order and the first hit wins:
 *   1. explicit `override` (the `--bun` flag) — if it is given but not
 *      usable the result is `null`; later sources are NOT consulted
 *   2. the bun currently running this CLI (tarball release), taken from
 *      `process.execPath` when `process.versions.bun` is set
 *   3. the desktop app's bundled bun (must exist and pass the probe)
 *   4. `bun` on PATH
 *
 * @param override Optional explicit path to a bun binary.
 * @returns The chosen binary path (or the bare name `bun`), or `null` if
 *   none are usable.
 */
export function resolveBunBinary(override?: string): string | null {
  if (override) {
    return bunIsUsable(override) ? override : null;
  }

  if (process.versions.bun && process.execPath) {
    return process.execPath;
  }

  const bundled = bundledBunCandidates().find(
    (candidate) => existsSync(candidate) && bunIsUsable(candidate),
  );
  if (bundled !== undefined) {
    return bundled;
  }

  return bunIsUsable('bun') ? 'bun' : null;
}
|
|
@@ -60,7 +60,7 @@ export type Channel = 'stable' | 'beta' | 'nightly';
|
|
|
60
60
|
* Layout assumed by `buildAssetUrls()`:
|
|
61
61
|
* ${baseUrl}/v${version}/${assetName}
|
|
62
62
|
*/
|
|
63
|
-
export const DEFAULT_RELEASES_BASE_URL = 'https://github.com/shogo-
|
|
63
|
+
export const DEFAULT_RELEASES_BASE_URL = 'https://github.com/shogo-labs/shogo-ai/releases/download';
|
|
64
64
|
|
|
65
65
|
export interface InstallOptions {
|
|
66
66
|
/** Specific version to install (e.g. "0.1.0"). Default: latest in channel. */
|
|
@@ -48,9 +48,43 @@ const PORT_RANGE_START = 37100;
|
|
|
48
48
|
const PORT_RANGE_END = 37900;
|
|
49
49
|
const API_PORT_OFFSET = 1; // API server port = agentPort + 1.
|
|
50
50
|
|
|
51
|
+
/**
|
|
52
|
+
* Offset (from the agent port) of the workspace preview sidecar base.
|
|
53
|
+
*
|
|
54
|
+
* A workspace runtime serves N attached projects, each with its own preview
|
|
55
|
+
* sidecar (`server.tsx`) on `WORKSPACE_API_PORT_BASE + projectIndex`. We anchor
|
|
56
|
+
* that base at `agentPort + 2` (agentPort=+0, its API/skill server=+1) so every
|
|
57
|
+
* runtime gets a DISTINCT sidecar range. Before warm-multiple, only one runtime
|
|
58
|
+
* ran at a time and all of them could share the fixed default base (3101); now
|
|
59
|
+
* that several runtimes stay warm concurrently they were all binding 3101 and
|
|
60
|
+
* crash-looping (force-killing each other's leaked sidecars), which SIGKILLed
|
|
61
|
+
* the agent-runtime and restart-looped it.
|
|
62
|
+
*/
|
|
63
|
+
const PREVIEW_API_BASE_OFFSET = 2;
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Contiguous ports reserved per runtime: agent(+0), API/skill server(+1) and
|
|
67
|
+
* the preview sidecars (+2 … +RUNTIME_PORT_BLOCK-1). Reserving the whole block
|
|
68
|
+
* (rather than just the two eagerly-bound ports) guarantees no OTHER runtime's
|
|
69
|
+
* block overlaps this one's sidecar range. 16 supports up to 14 attached
|
|
70
|
+
* projects per workspace; the 800-port range still fits 50 such blocks (>> the
|
|
71
|
+
* default maxRuntimes of 10).
|
|
72
|
+
*/
|
|
73
|
+
const RUNTIME_PORT_BLOCK = 16;
|
|
74
|
+
|
|
51
75
|
/** Default idle eviction window — unused runtimes get killed after this. */
|
|
52
76
|
const RUNTIME_IDLE_MS = 15 * 60 * 1000;
|
|
53
77
|
|
|
78
|
+
/**
|
|
79
|
+
* A runtime touched within this window is treated as "actively in use"
|
|
80
|
+
* (likely mid-stream — the agent-proxy/ai-proxy refresh `lastUsedAt` on
|
|
81
|
+
* every forwarded chunk) and is never picked as an LRU eviction victim
|
|
82
|
+
* by {@link WorkerRuntimeManager.enforceMaxRuntimes}, even when the cap
|
|
83
|
+
* is exceeded. Better to briefly run one over the cap than to SIGKILL a
|
|
84
|
+
* live chat stream out from under the user.
|
|
85
|
+
*/
|
|
86
|
+
const STREAM_ACTIVE_WINDOW_MS = 30 * 1000;
|
|
87
|
+
|
|
54
88
|
/** Restart backoff bounds. */
|
|
55
89
|
const RESTART_BACKOFF_BASE_MS = 1_000;
|
|
56
90
|
const RESTART_BACKOFF_MAX_MS = 60_000;
|
|
@@ -222,6 +256,16 @@ export interface WorkerRuntimeManagerOptions {
|
|
|
222
256
|
* Cloud workers leave this unset so the default still fires.
|
|
223
257
|
*/
|
|
224
258
|
idleMs?: number;
|
|
259
|
+
/**
|
|
260
|
+
* Hard ceiling on the number of concurrently-running runtimes. Once
|
|
261
|
+
* exceeded, `ensureRunning` LRU-evicts the least-recently-used slot
|
|
262
|
+
* that has not been touched within {@link STREAM_ACTIVE_WINDOW_MS}
|
|
263
|
+
* (i.e. is not mid-stream). Pass `0`, a negative number, or a
|
|
264
|
+
* non-finite value to disable the cap (the historical behaviour —
|
|
265
|
+
* runtimes were then bounded only by idle eviction). Defaults to
|
|
266
|
+
* disabled when unset.
|
|
267
|
+
*/
|
|
268
|
+
maxRuntimes?: number;
|
|
225
269
|
/** Optional logger. Defaults to console. */
|
|
226
270
|
logger?: Pick<Console, 'log' | 'warn' | 'error'>;
|
|
227
271
|
/** Working directory for spawned runtimes. Defaults to OS tmpdir/shogo-runtime. */
|
|
@@ -629,12 +673,71 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
629
673
|
slot.startPromise = this.doStart(slot);
|
|
630
674
|
try {
|
|
631
675
|
const r = await slot.startPromise;
|
|
676
|
+
// We just brought a (possibly new) runtime up — enforce the hard
|
|
677
|
+
// ceiling now so a busy multi-project session can't accumulate
|
|
678
|
+
// runtimes without bound. Never evicts the one we just started.
|
|
679
|
+
this.enforceMaxRuntimes(projectId);
|
|
632
680
|
return this.snapshot(r);
|
|
633
681
|
} finally {
|
|
634
682
|
slot.startPromise = null;
|
|
635
683
|
}
|
|
636
684
|
}
|
|
637
685
|
|
|
686
|
+
/**
 * Bring the number of running runtimes back to
 * {@link WorkerRuntimeManagerOptions.maxRuntimes} by stopping the
 * least-recently-used ones first.
 *
 * A cap that is unset, non-finite, zero or negative disables the check.
 * Only slots whose status is `running` are counted. Never chosen as a
 * victim:
 *   - the slot we just started (`keepProjectId`),
 *   - any slot touched within {@link STREAM_ACTIVE_WINDOW_MS} (treated as
 *     mid-stream — we never cut a live chat).
 *
 * If there are not enough eligible victims the count is left over the cap
 * and a log line says so; a later call can reclaim the slot once it goes
 * quiet.
 *
 * Fire-and-forget: each eviction `stop()` is started but not awaited, so
 * the caller doesn't block its own spawn on tearing down someone else's
 * idle runtime. A rejected `stop()` is logged as a warning.
 *
 * @param keepProjectId Project whose runtime must not be evicted.
 */
private enforceMaxRuntimes(keepProjectId: string): void {
  const cap = this.opts.maxRuntimes;
  if (cap == null || !Number.isFinite(cap) || cap <= 0) return;

  const now = Date.now();
  const running = Array.from(this.runtimes.values()).filter((r) => r.status === 'running');
  let overBy = running.length - cap;
  if (overBy <= 0) return;

  // Eligible victims, oldest `lastUsedAt` first (LRU order).
  const evictionQueue = running
    .filter((r) => r.projectId !== keepProjectId && now - r.lastUsedAt >= STREAM_ACTIVE_WINDOW_MS)
    .sort((a, b) => a.lastUsedAt - b.lastUsedAt);

  while (overBy > 0) {
    const victim = evictionQueue.shift();
    if (!victim) break;

    const idleMs = now - victim.lastUsedAt;
    this.log.log(
      `[WorkerRuntimeManager] maxRuntimes=${cap} exceeded (${running.length} running) — ` +
        `LRU-evicting ${victim.projectId} (idle ${Math.round(idleMs / 1000)}s)`,
    );
    void this.stop(victim.projectId).catch((err: any) => {
      this.log.warn(
        `[WorkerRuntimeManager] maxRuntimes eviction of ${victim.projectId} failed: ${err?.message ?? err}`,
      );
    });
    overBy--;
  }

  if (overBy > 0) {
    this.log.log(
      `[WorkerRuntimeManager] maxRuntimes=${cap} still exceeded by ${overBy} after eviction pass — ` +
        `remaining over-cap slots are mid-stream; will retry on next spawn / idle reap`,
    );
  }
}
|
|
740
|
+
|
|
638
741
|
/**
|
|
639
742
|
* Public entry point for tests + the `worker start` command to pre-warm
|
|
640
743
|
* a project's workspace without spawning anything. Internally idempotent.
|
|
@@ -1170,6 +1273,12 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
1170
1273
|
PORT: String(slot.agentPort),
|
|
1171
1274
|
API_SERVER_PORT: String(slot.apiServerPort),
|
|
1172
1275
|
SKILL_SERVER_PORT: String(slot.apiServerPort),
|
|
1276
|
+
// Per-runtime base for workspace preview sidecars (server.tsx). Anchored
|
|
1277
|
+
// at agentPort+2 so each warm runtime owns a distinct sidecar range and
|
|
1278
|
+
// they can't all collide on the fixed default (3101) — the cause of the
|
|
1279
|
+
// preview-manager crash-loop / agent-runtime SIGKILL restart storm when
|
|
1280
|
+
// multiple projects are kept warm at once.
|
|
1281
|
+
WORKSPACE_API_PORT_BASE: String(slot.agentPort + PREVIEW_API_BASE_OFFSET),
|
|
1173
1282
|
NODE_ENV: 'production',
|
|
1174
1283
|
SHOGO_CLOUD_URL: cfg.cloudUrl,
|
|
1175
1284
|
SHOGO_API_URL: cfg.cloudUrl,
|
|
@@ -1355,12 +1464,23 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
1355
1464
|
const maxAttempts = Math.min(range, 50);
|
|
1356
1465
|
for (let i = 0; i < maxAttempts; i++) {
|
|
1357
1466
|
const candidate = PORT_RANGE_START + Math.floor(Math.random() * range);
|
|
1358
|
-
|
|
1467
|
+
// Reserve a contiguous per-runtime block so the agent port, its API
|
|
1468
|
+
// server AND every preview sidecar (WORKSPACE_API_PORT_BASE + idx) live
|
|
1469
|
+
// in a range that no other warm runtime can overlap.
|
|
1470
|
+
if (candidate + RUNTIME_PORT_BLOCK - 1 > PORT_RANGE_END) continue;
|
|
1471
|
+
let blockFree = true;
|
|
1472
|
+
for (let off = 0; off < RUNTIME_PORT_BLOCK; off++) {
|
|
1473
|
+
if (this.usedPorts.has(candidate + off)) { blockFree = false; break; }
|
|
1474
|
+
}
|
|
1475
|
+
if (!blockFree) continue;
|
|
1476
|
+
// Liveness-probe only the two ports we bind eagerly (agent + its API
|
|
1477
|
+
// server). The sidecar ports are bound lazily by the agent-runtime and
|
|
1478
|
+
// guarded by its own leaked-process force-kill, so probing the whole
|
|
1479
|
+
// block here would just slow allocation down.
|
|
1359
1480
|
const agentInUse = await this.isPortListening(candidate);
|
|
1360
1481
|
const apiInUse = await this.isPortListening(candidate + API_PORT_OFFSET);
|
|
1361
1482
|
if (agentInUse || apiInUse) continue;
|
|
1362
|
-
this.usedPorts.add(candidate);
|
|
1363
|
-
this.usedPorts.add(candidate + API_PORT_OFFSET);
|
|
1483
|
+
for (let off = 0; off < RUNTIME_PORT_BLOCK; off++) this.usedPorts.add(candidate + off);
|
|
1364
1484
|
return candidate;
|
|
1365
1485
|
}
|
|
1366
1486
|
throw new Error(
|
|
@@ -1370,8 +1490,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
1370
1490
|
|
|
1371
1491
|
private releasePort(port: number): void {
|
|
1372
1492
|
if (!port) return;
|
|
1373
|
-
this.usedPorts.delete(port);
|
|
1374
|
-
this.usedPorts.delete(port + API_PORT_OFFSET);
|
|
1493
|
+
for (let off = 0; off < RUNTIME_PORT_BLOCK; off++) this.usedPorts.delete(port + off);
|
|
1375
1494
|
}
|
|
1376
1495
|
|
|
1377
1496
|
private async isPortListening(port: number): Promise<boolean> {
|
package/src/lib/tunnel.ts
CHANGED
|
@@ -102,7 +102,7 @@ type TunnelWebSocketConstructor = new (url: string, init: TunnelWebSocketInit) =
|
|
|
102
102
|
|
|
103
103
|
type RuntimeWithBunWebSocketHeaders = typeof globalThis & {
|
|
104
104
|
Bun?: unknown;
|
|
105
|
-
process?: { versions?: { bun?: string } };
|
|
105
|
+
process?: { versions?: { bun?: string; node?: string } };
|
|
106
106
|
};
|
|
107
107
|
|
|
108
108
|
interface HeartbeatResponse {
|
|
@@ -116,8 +116,8 @@ export class TunnelWebSocketHeaderSupportError extends Error {
|
|
|
116
116
|
code = 'TUNNEL_WS_HEADERS_UNSUPPORTED' as const;
|
|
117
117
|
constructor() {
|
|
118
118
|
super(
|
|
119
|
-
'Tunnel WebSocket auth requires
|
|
120
|
-
'
|
|
119
|
+
'Tunnel WebSocket auth requires a runtime with WebSocket header support (Bun or Node >= 21). ' +
|
|
120
|
+
'Current runtime does not support WebSocket constructor headers.',
|
|
121
121
|
);
|
|
122
122
|
this.name = 'TunnelWebSocketHeaderSupportError';
|
|
123
123
|
}
|
|
@@ -231,7 +231,15 @@ export class WorkerTunnel {
|
|
|
231
231
|
/**
 * Decide from runtime markers whether the WebSocket constructor can be
 * given request headers.
 *
 * Returns `true` for Bun (a `Bun` global, or a string
 * `process.versions.bun`) and for Node whose major version, parsed from
 * `process.versions.node`, is 21 or higher. Returns `false` otherwise.
 *
 * NOTE(review): this is a version check only — it does not verify that a
 * WebSocket global is actually present; confirm Node 21 exposes one
 * without extra flags.
 *
 * @param runtime Global object to inspect; defaults to `globalThis`.
 */
private supportsWebSocketConstructorHeaders(
  runtime: RuntimeWithBunWebSocketHeaders = globalThis as RuntimeWithBunWebSocketHeaders,
): boolean {
  const runsOnBun =
    typeof runtime.Bun !== 'undefined' || typeof runtime.process?.versions?.bun === 'string';
  if (runsOnBun) return true;

  // Node 21+ ships WebSocket (via undici) with header support in the constructor.
  // Detect by checking for Node >= 21 (the version that made WebSocket a global).
  const nodeVersion = runtime.process?.versions?.node;
  if (!nodeVersion) return false;

  const major = parseInt(nodeVersion.split('.')[0] ?? '0', 10);
  return major >= 21;
}
|
|
236
244
|
|
|
237
245
|
private createTunnelWebSocket(
|