claude-code-session-manager 0.8.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -65
- package/dist/assets/cssMode-DWlBzlpW.js +1 -0
- package/dist/assets/{freemarker2-dhfKZR7u.js → freemarker2-Cgg83m-Z.js} +2 -2
- package/dist/assets/handlebars-C4r4LOI9.js +1 -0
- package/dist/assets/html-DaxRI5sW.js +1 -0
- package/dist/assets/htmlMode-Bu_8jtXo.js +1 -0
- package/dist/assets/index-C_tgFedf.js +3986 -0
- package/dist/assets/{editor-BTnBOi8r.css → index-Dj3Db4OA.css} +32 -1
- package/dist/assets/javascript-D5Ztx-Ej.js +1 -0
- package/dist/assets/{jsonMode-BtjA-2w_.js → jsonMode-tfsgezVc.js} +4 -4
- package/dist/assets/liquid-F2cD9OL0.js +1 -0
- package/dist/assets/lspLanguageFeatures-Bz_Eih8F.js +4 -0
- package/dist/assets/mdx-BPlD1clX.js +1 -0
- package/dist/assets/{ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm → ort-wasm-simd-threaded.asyncify-DMmc6YqF.wasm} +0 -0
- package/dist/assets/python-B4gUOWNI.js +1 -0
- package/dist/assets/razor-B6pMxVp1.js +1 -0
- package/dist/assets/{tsMode-hUkEyjsH.js → tsMode-C9nq6cHi.js} +2 -2
- package/dist/assets/typescript-Do5Vtwxu.js +1 -0
- package/dist/assets/{whisperWorker-QfIS0sPF.js → whisperWorker-CcsPqZUS.js} +19 -19
- package/dist/assets/xml-C0mTbVRp.js +1 -0
- package/dist/assets/yaml-D3sePJfA.js +1 -0
- package/dist/index.html +2 -2
- package/package.json +18 -9
- package/screenshots/.gitkeep +0 -0
- package/screenshots/README-screenshots.md +13 -0
- package/src/main/config.cjs +47 -9
- package/src/main/historyAggregator.cjs +10 -5
- package/src/main/index.cjs +85 -14
- package/src/main/ipcSchemas.cjs +165 -3
- package/src/main/lib/claudeBin.cjs +39 -0
- package/src/main/lib/encodeCwd.cjs +19 -0
- package/src/main/lib/fileTail.cjs +35 -0
- package/src/main/lib/insideHome.cjs +38 -0
- package/src/main/lib/prdFrontmatter.cjs +51 -0
- package/src/main/lib/sendToRenderer.cjs +21 -0
- package/src/main/memoryTool.cjs +203 -0
- package/src/main/otelSettings.cjs +2 -7
- package/src/main/pluginInstall.cjs +129 -0
- package/src/main/pty.cjs +13 -29
- package/src/main/queueOps.cjs +404 -0
- package/src/main/scheduler/prdParser.cjs +135 -0
- package/src/main/scheduler.cjs +291 -250
- package/src/main/sessionsStore.cjs +2 -6
- package/src/main/supervisor.cjs +3 -35
- package/src/main/teams.cjs +95 -0
- package/src/main/transcripts.cjs +5 -7
- package/src/main/usage.cjs +8 -0
- package/src/main/voiceHotkey.cjs +13 -9
- package/src/main/voiceSettings.cjs +2 -9
- package/src/main/voiceWizard.cjs +4 -11
- package/src/main/watchers.cjs +18 -42
- package/src/preload/api.d.ts +153 -1
- package/src/preload/index.cjs +29 -0
- package/dist/assets/cssMode-BCLoTYI0.js +0 -1
- package/dist/assets/editor.main-UoasbVGy.js +0 -908
- package/dist/assets/handlebars-DdpqwFuV.js +0 -1
- package/dist/assets/html-1oTJClkg.js +0 -1
- package/dist/assets/htmlMode-CF1QbIg-.js +0 -1
- package/dist/assets/index-DWDcKbgI.js +0 -3046
- package/dist/assets/index-eqxng9X2.css +0 -32
- package/dist/assets/javascript-BP_Q5MFx.js +0 -1
- package/dist/assets/liquid-DstuL8vm.js +0 -1
- package/dist/assets/lspLanguageFeatures-DvSiaY4f.js +0 -4
- package/dist/assets/mdx-qO-uvsJd.js +0 -1
- package/dist/assets/python-CCPz_1cy.js +0 -1
- package/dist/assets/razor-B7tCzkdh.js +0 -1
- package/dist/assets/typescript-BeXECzAk.js +0 -1
- package/dist/assets/xml-MRJd4GHf.js +0 -1
- package/dist/assets/yaml-CzGliMNL.js +0 -1
package/src/main/scheduler.cjs
CHANGED
|
@@ -45,10 +45,17 @@ const fsp = require('node:fs/promises');
|
|
|
45
45
|
const path = require('node:path');
|
|
46
46
|
const os = require('node:os');
|
|
47
47
|
const { spawn } = require('node:child_process');
|
|
48
|
+
const { randomUUID } = require('node:crypto');
|
|
48
49
|
const { ipcMain } = require('electron');
|
|
49
50
|
const billing = require('./usage.cjs');
|
|
50
51
|
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
52
|
const supervisor = require('./supervisor.cjs');
|
|
53
|
+
const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
|
|
54
|
+
const { readTail } = require('./lib/fileTail.cjs');
|
|
55
|
+
const { sendIfAlive } = require('./lib/sendToRenderer.cjs');
|
|
56
|
+
const prdParser = require('./scheduler/prdParser.cjs');
|
|
57
|
+
const logs = require('./logs.cjs');
|
|
58
|
+
const { schemas } = require('./ipcSchemas.cjs');
|
|
52
59
|
const {
|
|
53
60
|
POLL_INTERVAL_MS,
|
|
54
61
|
USAGE_REFRESH_INTERVAL_MS,
|
|
@@ -95,8 +102,6 @@ const ENV_CAP = process.env.SM_SCHEDULER_MAX_CONCURRENCY
|
|
|
95
102
|
: null;
|
|
96
103
|
|
|
97
104
|
const DEFAULT_CONFIG = {
|
|
98
|
-
// Legacy on/off retained for backwards compat; v0.5+ uses firePolicy.
|
|
99
|
-
enabled: false,
|
|
100
105
|
offsetMinutes: 15,
|
|
101
106
|
concurrencyCap: ENV_CAP ?? 4,
|
|
102
107
|
defaultCwd: DEFAULT_PROJECT_CWD,
|
|
@@ -117,16 +122,31 @@ const DEFAULT_CONFIG = {
|
|
|
117
122
|
|
|
118
123
|
// ---------- fs helpers ----------
|
|
119
124
|
|
|
125
|
+
/**
 * Resolve `PRDS_DIR/<slug>.md` and enforce path containment.
 *
 * Slug validation upstream is expected to reject path separators already;
 * this is a defense-in-depth check performed after path resolution so that a
 * laxer validator cannot turn into a directory escape.
 *
 * The PRD root is resolved before comparing, so the check stays correct even
 * when PRDS_DIR is relative or carries a trailing separator (a raw
 * `startsWith(PRDS_DIR + path.sep)` would reject every slug in those cases).
 *
 * @param {string} slug - PRD slug, without the `.md` extension.
 * @returns {string|null} Absolute path of the PRD file, or null when the slug
 *   is not a non-empty string or resolves outside PRDS_DIR.
 */
function safeSlugPath(slug) {
  // Template interpolation would silently turn undefined/null into
  // "undefined.md"/"null.md"; an empty slug would yield the hidden file ".md".
  if (typeof slug !== 'string' || slug.length === 0) return null;

  const root = path.resolve(PRDS_DIR);
  const resolved = path.resolve(path.join(root, `${slug}.md`));

  // A contained path has a non-empty relative form that neither climbs out of
  // the root nor lands on a different root/drive.
  const rel = path.relative(root, resolved);
  const escapes =
    rel === '' ||
    rel === '..' ||
    rel.startsWith(`..${path.sep}`) ||
    path.isAbsolute(rel);

  return escapes ? null : resolved;
}
|
|
137
|
+
|
|
120
138
|
function ensureDirs() {
|
|
121
139
|
fs.mkdirSync(PRDS_DIR, { recursive: true });
|
|
122
140
|
fs.mkdirSync(RUNS_DIR, { recursive: true });
|
|
123
141
|
}
|
|
124
142
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
143
|
+
// Atomic JSON write helpers delegate to config.cjs's shared implementation.
|
|
144
|
+
// Sync variant is required for the executeJob exit handler (Promise resolver
|
|
145
|
+
// callback that must flush meta.json before resolving) — replacing with async
|
|
146
|
+
// would deadlock the exit path.
|
|
147
|
+
const config = require('./config.cjs');
|
|
148
|
+
const atomicWriteJsonSync = (p, data) => config.writeJsonSync(p, data);
|
|
149
|
+
const atomicWriteJson = (p, data) => config.writeJson(p, data);
|
|
130
150
|
|
|
131
151
|
// ---------- scheduler-state.json (sidecar) ----------
|
|
132
152
|
|
|
@@ -143,8 +163,12 @@ function loadSchedulerState() {
|
|
|
143
163
|
}
|
|
144
164
|
|
|
145
165
|
function persistSchedulerState() {
|
|
166
|
+
// Sync write: called from many sync hot paths (clearPause, pollLoop catch
|
|
167
|
+
// block) and the sidecar is tiny (<1 KB). Converting to async here would
|
|
168
|
+
// require threading awaits through pause/resume bookkeeping for negligible
|
|
169
|
+
// benefit — the file is well under one page.
|
|
146
170
|
try {
|
|
147
|
-
|
|
171
|
+
atomicWriteJsonSync(SCHEDULER_STATE_PATH, {
|
|
148
172
|
version: 1,
|
|
149
173
|
lastObservedReset: cachedNextReset,
|
|
150
174
|
lastResetObservedAt: cachedNextReset ? Date.now() : null,
|
|
@@ -178,7 +202,10 @@ function appendHeartbeat(entry) {
|
|
|
178
202
|
}
|
|
179
203
|
}
|
|
180
204
|
|
|
181
|
-
|
|
205
|
+
// Sync queue read — passed to the supervisor module (which calls it from
|
|
206
|
+
// supervisorTick / applyAction with no await) and the heartbeat interval.
|
|
207
|
+
// IPC handlers and mutate() use readQueue (async) below.
|
|
208
|
+
function readQueueSync() {
|
|
182
209
|
try {
|
|
183
210
|
const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
|
|
184
211
|
const data = JSON.parse(raw);
|
|
@@ -194,9 +221,28 @@ function readQueue() {
|
|
|
194
221
|
}
|
|
195
222
|
}
|
|
196
223
|
|
|
197
|
-
|
|
224
|
+
/**
 * Async queue read, used on the IPC paths and by mutate(). Reading queue.json
 * with the sync API blocked the main thread inside ipcMain.handle callbacks;
 * awaiting fsp.readFile lets the event loop keep servicing other work while
 * the read is in flight.
 *
 * Never rejects. A missing, unreadable, or malformed queue.json yields the
 * empty default state. Anything other than "file does not exist" is logged,
 * because the next write will replace the damaged file with that default
 * state and the operator should be able to see why the queue was reset.
 *
 * @returns {Promise<{config: object, jobs: Array, scheduledFor: (string|null),
 *   lastRunAt: (string|null), paused: (object|null)}>} Queue state with
 *   DEFAULT_CONFIG merged underneath the persisted config.
 */
async function readQueue() {
  const emptyState = () => ({
    config: { ...DEFAULT_CONFIG },
    jobs: [],
    scheduledFor: null,
    lastRunAt: null,
    paused: null,
  });

  try {
    const raw = await fsp.readFile(QUEUE_PATH, 'utf8');
    const data = JSON.parse(raw);
    if (data === null || typeof data !== 'object') {
      // Valid JSON but not a queue document (e.g. `null` or a bare number).
      console.warn('[scheduler] queue.json does not contain an object; using empty queue');
      return emptyState();
    }
    return {
      config: { ...DEFAULT_CONFIG, ...(data.config || {}) },
      jobs: Array.isArray(data.jobs) ? data.jobs : [],
      scheduledFor: data.scheduledFor ?? null,
      lastRunAt: data.lastRunAt ?? null,
      paused: data.paused ?? null,
    };
  } catch (e) {
    // ENOENT is the normal first-run case; everything else is worth surfacing.
    if (e?.code !== 'ENOENT') {
      console.warn('[scheduler] failed to read queue.json; using empty queue:', e?.message);
    }
    return emptyState();
  }
}
|
|
242
|
+
|
|
243
|
+
async function writeQueue(state) {
|
|
198
244
|
ensureDirs();
|
|
199
|
-
atomicWriteJson(QUEUE_PATH, state);
|
|
245
|
+
await atomicWriteJson(QUEUE_PATH, state);
|
|
200
246
|
}
|
|
201
247
|
|
|
202
248
|
// ---------- serialized mutation queue ----------
|
|
@@ -209,9 +255,9 @@ let mutateTail = Promise.resolve();
|
|
|
209
255
|
|
|
210
256
|
function mutate(fn) {
|
|
211
257
|
const next = mutateTail.then(async () => {
|
|
212
|
-
const state = readQueue();
|
|
258
|
+
const state = await readQueue();
|
|
213
259
|
const ret = await fn(state);
|
|
214
|
-
writeQueue(state);
|
|
260
|
+
await writeQueue(state);
|
|
215
261
|
return ret;
|
|
216
262
|
});
|
|
217
263
|
mutateTail = next.catch(() => {}); // keep chain alive on errors
|
|
@@ -225,55 +271,13 @@ function mutate(fn) {
|
|
|
225
271
|
* yaml dep; the schema is small (title, cwd, estimateMinutes, parallelGroup)
|
|
226
272
|
* and the format is documented in the user-facing README.
|
|
227
273
|
*/
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
if (text.startsWith('---\n')) {
|
|
234
|
-
const end = text.indexOf('\n---', 4);
|
|
235
|
-
if (end !== -1) {
|
|
236
|
-
const fm = text.slice(4, end);
|
|
237
|
-
body = text.slice(end + 4).replace(/^\n/, '');
|
|
238
|
-
for (const line of fm.split('\n')) {
|
|
239
|
-
const m = line.match(/^([a-zA-Z]+):\s*(.+?)\s*$/);
|
|
240
|
-
if (!m) continue;
|
|
241
|
-
const k = m[1];
|
|
242
|
-
let v = m[2];
|
|
243
|
-
if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) {
|
|
244
|
-
v = v.slice(1, -1);
|
|
245
|
-
}
|
|
246
|
-
if (k === 'title') meta.title = v;
|
|
247
|
-
else if (k === 'cwd') meta.cwd = v;
|
|
248
|
-
else if (k === 'estimateMinutes') meta.estimateMinutes = Number(v) || null;
|
|
249
|
-
else if (k === 'parallelGroup') meta.parallelGroup = Number(v) || null;
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
const base = path.basename(filePath, '.md');
|
|
255
|
-
const groupFromName = (() => {
|
|
256
|
-
const m = base.match(/^(\d+)-/);
|
|
257
|
-
return m ? Number(m[1]) : null;
|
|
258
|
-
})();
|
|
259
|
-
|
|
260
|
-
return {
|
|
261
|
-
slug: base,
|
|
262
|
-
path: filePath,
|
|
263
|
-
title: meta.title || base,
|
|
264
|
-
cwd: meta.cwd || null,
|
|
265
|
-
estimateMinutes: meta.estimateMinutes,
|
|
266
|
-
parallelGroup: meta.parallelGroup ?? groupFromName ?? 99,
|
|
267
|
-
body: body.trim(),
|
|
268
|
-
};
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
function listPrdFiles() {
|
|
274
|
+
// PRD parsing + dir-mtime cache live in scheduler/prdParser.cjs. Local wrappers
|
|
275
|
+
// preserve the existing call shape (callers don't need to thread PRDS_DIR).
|
|
276
|
+
const parsePrdRaw = prdParser.parsePrdRaw;
|
|
277
|
+
const parsePrd = prdParser.parsePrd;
|
|
278
|
+
async function listPrdFiles() {
|
|
272
279
|
ensureDirs();
|
|
273
|
-
return
|
|
274
|
-
.filter((f) => f.endsWith('.md') && !f.startsWith('.'))
|
|
275
|
-
.map((f) => path.join(PRDS_DIR, f))
|
|
276
|
-
.sort();
|
|
280
|
+
return prdParser.listPrdFiles(PRDS_DIR);
|
|
277
281
|
}
|
|
278
282
|
|
|
279
283
|
// ---------- queue reconciliation ----------
|
|
@@ -286,12 +290,14 @@ function listPrdFiles() {
|
|
|
286
290
|
* Status is preserved: pending stays pending, completed stays completed.
|
|
287
291
|
* Newly-discovered PRDs land as `pending`.
|
|
288
292
|
*/
|
|
289
|
-
function reconcile(state) {
|
|
290
|
-
const files = listPrdFiles();
|
|
293
|
+
async function reconcile(state) {
|
|
294
|
+
const files = await listPrdFiles();
|
|
291
295
|
const onDisk = new Map();
|
|
292
296
|
for (const f of files) {
|
|
293
297
|
try {
|
|
294
|
-
|
|
298
|
+
// Per-file await: parsing is mtime-cached so steady-state hits zero
|
|
299
|
+
// disk reads; on cold cache the awaits keep the main thread responsive.
|
|
300
|
+
const p = await parsePrd(f);
|
|
295
301
|
onDisk.set(p.slug, p);
|
|
296
302
|
} catch (e) {
|
|
297
303
|
console.warn('[scheduler] failed to parse', f, e?.message);
|
|
@@ -377,16 +383,17 @@ let heartbeatInterval = null;
|
|
|
377
383
|
// double-spawn when runDueJobs() is called while jobs are in flight.
|
|
378
384
|
const runningSet = new Set();
|
|
379
385
|
let cancelToken = { cancelled: false };
|
|
380
|
-
let claudeBinPathCached = null;
|
|
381
386
|
|
|
382
387
|
function attachWindow(w) { mainWindow = w; }
|
|
383
388
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
389
|
+
/**
|
|
390
|
+
* Build the snapshot payload consumed by both the `schedule:state` IPC
|
|
391
|
+
* handler and the `schedule:state` broadcast event. The IPC return adds a
|
|
392
|
+
* `paths` map (renderer uses it for "open folder" actions); broadcast omits
|
|
393
|
+
* it because subscribers don't need to re-derive paths on every tick.
|
|
394
|
+
*/
|
|
395
|
+
function buildScheduleStatePayload(state, { withPaths = false } = {}) {
|
|
396
|
+
const payload = {
|
|
390
397
|
config: state.config,
|
|
391
398
|
jobs: state.jobs,
|
|
392
399
|
scheduledFor: state.scheduledFor,
|
|
@@ -394,7 +401,19 @@ function broadcast() {
|
|
|
394
401
|
nextReset: getNextResetCached(),
|
|
395
402
|
paused: state.paused,
|
|
396
403
|
utilization: cachedUtilization,
|
|
397
|
-
}
|
|
404
|
+
};
|
|
405
|
+
if (withPaths) {
|
|
406
|
+
payload.paths = { root: ROOT, prds: PRDS_DIR, runs: RUNS_DIR, queue: QUEUE_PATH };
|
|
407
|
+
}
|
|
408
|
+
return payload;
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
/**
 * Reconcile the queue with the PRDs on disk, persist the result, and push a
 * `schedule:state` snapshot to the renderer window. Does nothing when no live
 * window is attached.
 *
 * The read -> reconcile -> write cycle goes through mutate() (which reads the
 * queue, runs the callback, then writes the state back) so it is serialized
 * with every other queue mutation. Doing the cycle with bare
 * readQueue()/writeQueue() calls lets an interleaved mutate() be overwritten
 * with the stale state this function read before awaiting.
 *
 * NOTE: must not be awaited from inside a mutate() callback — it would wait
 * on the very chain link it is running in.
 *
 * @returns {Promise<void>} Settles after the snapshot has been handed to the
 *   renderer; rejects if reconciling or persisting the queue fails.
 */
async function broadcast() {
  if (!mainWindow || mainWindow.isDestroyed()) return;
  const payload = await mutate(async (state) => {
    await reconcile(state);
    return buildScheduleStatePayload(state);
  });
  // The window may have gone away while we were awaiting; sendIfAlive is
  // handed the window so it can decide whether delivery is still possible.
  sendIfAlive(mainWindow, 'schedule:state', payload);
}
|
|
399
418
|
|
|
400
419
|
function clearFireTimer() {
|
|
@@ -425,13 +444,13 @@ async function rescheduleTimer() {
|
|
|
425
444
|
} catch {
|
|
426
445
|
nextResetIso = cachedNextReset;
|
|
427
446
|
}
|
|
428
|
-
const fireAt = await mutate((state) => {
|
|
429
|
-
reconcile(state);
|
|
447
|
+
const fireAt = await mutate(async (state) => {
|
|
448
|
+
await reconcile(state);
|
|
430
449
|
const fa = computeFireAt(state, nextResetIso);
|
|
431
450
|
state.scheduledFor = fa ? new Date(fa).toISOString() : null;
|
|
432
451
|
return fa;
|
|
433
452
|
});
|
|
434
|
-
broadcast();
|
|
453
|
+
await broadcast();
|
|
435
454
|
if (!fireAt) return;
|
|
436
455
|
|
|
437
456
|
const delay = Math.max(1000, fireAt - Date.now());
|
|
@@ -462,7 +481,7 @@ async function setPaused(reason, resumeAtIso) {
|
|
|
462
481
|
s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
|
|
463
482
|
}
|
|
464
483
|
});
|
|
465
|
-
broadcast();
|
|
484
|
+
await broadcast();
|
|
466
485
|
cancelToken.cancelled = true;
|
|
467
486
|
if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
|
|
468
487
|
if (!effectiveResumeAt) return;
|
|
@@ -491,9 +510,18 @@ async function clearPause(source) {
|
|
|
491
510
|
// Track manual clears for the auto-pause cooldown.
|
|
492
511
|
if (source === 'manual' || source === 'run-now') {
|
|
493
512
|
pauseClearedManuallyAt = Date.now();
|
|
513
|
+
// The user has just affirmed the queue should run — clear the failure
|
|
514
|
+
// counters so the renderer doesn't keep nagging about stale poll fails.
|
|
515
|
+
// The next poll will set them again if the condition still applies.
|
|
516
|
+
consecutiveFailures = 0;
|
|
517
|
+
backoffMs = 0;
|
|
518
|
+
backoffNextAt = null;
|
|
519
|
+
firstFailureAt = null;
|
|
520
|
+
firstNon429FailureAt = null;
|
|
521
|
+
lastFailureKind = null;
|
|
494
522
|
persistSchedulerState();
|
|
495
523
|
}
|
|
496
|
-
if (wasPaused) broadcast();
|
|
524
|
+
if (wasPaused) await broadcast();
|
|
497
525
|
}
|
|
498
526
|
|
|
499
527
|
/** Mutate a job in place to "pending" with cleared run metadata. */
|
|
@@ -528,24 +556,6 @@ function detectRateLimitInLog(logPath) {
|
|
|
528
556
|
}
|
|
529
557
|
}
|
|
530
558
|
|
|
531
|
-
// ---------- claude binary ----------
|
|
532
|
-
|
|
533
|
-
function resolveClaudeBin() {
|
|
534
|
-
if (claudeBinPathCached) return claudeBinPathCached;
|
|
535
|
-
const candidates = [
|
|
536
|
-
path.join(os.homedir(), '.claude', 'local', 'claude'),
|
|
537
|
-
'/usr/local/bin/claude',
|
|
538
|
-
'/opt/homebrew/bin/claude',
|
|
539
|
-
'/usr/bin/claude',
|
|
540
|
-
];
|
|
541
|
-
for (const c of candidates) {
|
|
542
|
-
try { fs.accessSync(c, fs.constants.X_OK); claudeBinPathCached = c; return c; } catch { /* */ }
|
|
543
|
-
}
|
|
544
|
-
// Last resort: rely on PATH lookup at spawn time.
|
|
545
|
-
claudeBinPathCached = 'claude';
|
|
546
|
-
return claudeBinPathCached;
|
|
547
|
-
}
|
|
548
|
-
|
|
549
559
|
// ---------- execution ----------
|
|
550
560
|
|
|
551
561
|
function pickRunDir() {
|
|
@@ -565,31 +575,43 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
565
575
|
const metaPath = path.join(runDir, `${job.slug}.meta.json`);
|
|
566
576
|
const cwd = job.cwd || defaultCwd;
|
|
567
577
|
const startedAt = Date.now();
|
|
578
|
+
const sessionId = randomUUID();
|
|
568
579
|
|
|
569
580
|
const fd = fs.openSync(logPath, 'a');
|
|
570
581
|
let fdClosed = false;
|
|
571
|
-
const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
|
|
582
|
+
const closeFd = () => { if (fdClosed) return; fdClosed = true; try { fs.closeSync(fd); } catch { /* */ } };
|
|
583
|
+
// safeLog: no-op once the fd is closed, never throws on the watchdog timer
|
|
584
|
+
// path. Pre-fix, a post-result/idle watchdog firing AFTER closeFd would
|
|
585
|
+
// throw EBADF and crash the host. Every fs.writeSync(fd, …) below goes
|
|
586
|
+
// through this helper.
|
|
587
|
+
const safeLog = (msg) => {
|
|
588
|
+
if (fdClosed) return;
|
|
589
|
+
try { fs.writeSync(fd, msg); } catch { /* fd vanished mid-write */ }
|
|
590
|
+
};
|
|
572
591
|
|
|
573
|
-
|
|
592
|
+
safeLog(`[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
|
|
574
593
|
|
|
575
594
|
// Dead-cwd guard: verify the target directory exists and is traversable
|
|
576
595
|
// before handing it to the child process.
|
|
577
596
|
try { fs.accessSync(cwd, fs.constants.X_OK); }
|
|
578
597
|
catch {
|
|
579
598
|
const errMsg = `cwd no longer exists: ${cwd}`;
|
|
580
|
-
|
|
599
|
+
safeLog(`[scheduler] ${errMsg}\n`);
|
|
581
600
|
closeFd();
|
|
582
|
-
|
|
583
|
-
|
|
601
|
+
// Sync write: this is an early-exit error path inside an async function,
|
|
602
|
+
// so we could await, but using the sync variant keeps the error path
|
|
603
|
+
// ordering identical to the spawn-failed branch below (also sync).
|
|
604
|
+
atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
|
|
605
|
+
return { exitCode: -1, durationMs: 0, error: errMsg, sessionId };
|
|
584
606
|
}
|
|
585
607
|
|
|
586
608
|
// Read full PRD body fresh from disk (queue stored only the preview).
|
|
587
609
|
let prompt;
|
|
588
610
|
try {
|
|
589
|
-
const parsed = parsePrd(path.join(PRDS_DIR, `${job.slug}.md`));
|
|
611
|
+
const parsed = await parsePrd(path.join(PRDS_DIR, `${job.slug}.md`));
|
|
590
612
|
prompt = parsed.body;
|
|
591
613
|
} catch (e) {
|
|
592
|
-
|
|
614
|
+
safeLog(`[scheduler] failed to read PRD: ${e?.message}\n`);
|
|
593
615
|
closeFd();
|
|
594
616
|
return { exitCode: -1, durationMs: 0, error: e?.message };
|
|
595
617
|
}
|
|
@@ -600,28 +622,46 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
600
622
|
// launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
|
|
601
623
|
// overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
|
|
602
624
|
const childEnv = cleanChildEnv();
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
625
|
+
// Guard against synchronous spawn failures (EAGAIN, ENOMEM on fork).
|
|
626
|
+
// Without this, the throw bubbles out of the Promise executor and the
|
|
627
|
+
// outer await rejects — but the open fd is leaked.
|
|
628
|
+
let child;
|
|
629
|
+
try {
|
|
630
|
+
child = spawn(claudeBin, [
|
|
631
|
+
'-p', prompt,
|
|
632
|
+
'--model', 'sonnet',
|
|
633
|
+
'--dangerously-skip-permissions',
|
|
634
|
+
'--output-format', 'stream-json',
|
|
635
|
+
'--verbose',
|
|
636
|
+
'--session-id', sessionId,
|
|
637
|
+
], {
|
|
638
|
+
cwd,
|
|
639
|
+
env: childEnv,
|
|
640
|
+
stdio: ['ignore', fd, fd],
|
|
641
|
+
// detached:true puts the child in its own process group so we can kill
|
|
642
|
+
// the entire descendant tree (including any stray background bashes the
|
|
643
|
+
// agent spawned) with `process.kill(-pid)`. Without this, child.kill()
|
|
644
|
+
// only kills the immediate `claude` process, leaving orphaned subprocs
|
|
645
|
+
// that keep the parent alive (the 2026-05-10 cellar-publish hang).
|
|
646
|
+
detached: true,
|
|
647
|
+
});
|
|
648
|
+
} catch (e) {
|
|
649
|
+
const errMsg = `spawn failed: ${e?.message ?? String(e)}`;
|
|
650
|
+
safeLog(`[scheduler] ${errMsg}\n`);
|
|
651
|
+
closeFd();
|
|
652
|
+
const durationMs = Date.now() - startedAt;
|
|
653
|
+
// Sync write: inside the Promise executor, before resolve(). Awaiting
|
|
654
|
+
// here would require restructuring the executor; the meta file is tiny
|
|
655
|
+
// and this is an error path, not the IPC hot path.
|
|
656
|
+
atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs });
|
|
657
|
+
resolve({ exitCode: -1, durationMs, error: errMsg, sessionId });
|
|
658
|
+
return;
|
|
659
|
+
}
|
|
620
660
|
|
|
621
|
-
|
|
661
|
+
safeLog(`[scheduler] spawned pid=${child.pid} sessionId=${sessionId} (process group)\n\n`);
|
|
622
662
|
|
|
623
663
|
// Fire-and-forget pid persistence — best effort.
|
|
624
|
-
if (onPid) onPid(child.pid).catch(() => {});
|
|
664
|
+
if (onPid) onPid(child.pid, sessionId, cwd).catch(() => {});
|
|
625
665
|
|
|
626
666
|
// Track whether the agent has emitted a `result` event in its JSONL stream.
|
|
627
667
|
// null until seen; then one of "success" | "error_max_turns" | ... per the
|
|
@@ -657,15 +697,15 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
657
697
|
const m = buf.toString('utf8').match(/\{"type":"result","subtype":"([a-z_]+)"/);
|
|
658
698
|
if (!m) return;
|
|
659
699
|
agentResultSubtype = m[1];
|
|
660
|
-
|
|
700
|
+
safeLog(`\n[scheduler] result event detected (subtype=${agentResultSubtype}); ` +
|
|
661
701
|
`starting ${Math.round(POST_RESULT_GRACE_MS/1000)}s exit-grace timer\n`);
|
|
662
702
|
clearInterval(resultTailer);
|
|
663
703
|
postResultTimer = setTimeout(() => {
|
|
664
|
-
|
|
704
|
+
safeLog(`\n[scheduler] post-result grace expired (${Math.round(POST_RESULT_GRACE_MS/1000)}s); ` +
|
|
665
705
|
`child still alive — SIGTERM process group\n`);
|
|
666
706
|
killTree('SIGTERM');
|
|
667
707
|
postResultKillTimer = setTimeout(() => {
|
|
668
|
-
|
|
708
|
+
safeLog(`\n[scheduler] still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
669
709
|
killTree('SIGKILL');
|
|
670
710
|
}, POST_RESULT_KILL_MS);
|
|
671
711
|
if (postResultKillTimer.unref) postResultKillTimer.unref();
|
|
@@ -677,7 +717,7 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
677
717
|
|
|
678
718
|
// Kill the child if it runs past the maximum allowed duration.
|
|
679
719
|
const watchdog = setTimeout(() => {
|
|
680
|
-
|
|
720
|
+
safeLog(`\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
|
|
681
721
|
killTree('SIGKILL');
|
|
682
722
|
}, MAX_JOB_DURATION_MS);
|
|
683
723
|
if (watchdog.unref) watchdog.unref();
|
|
@@ -691,12 +731,12 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
691
731
|
const stat = fs.statSync(logPath);
|
|
692
732
|
const idleMs = Date.now() - stat.mtimeMs;
|
|
693
733
|
if (idleMs > IDLE_OUTPUT_KILL_MS) {
|
|
694
|
-
|
|
734
|
+
safeLog(`\n[scheduler] idle-output watchdog: log mtime stalled ` +
|
|
695
735
|
`${Math.round(idleMs/1000)}s (> ${Math.round(IDLE_OUTPUT_KILL_MS/1000)}s threshold) — SIGTERM process group\n`);
|
|
696
736
|
clearInterval(idleChecker);
|
|
697
737
|
killTree('SIGTERM');
|
|
698
738
|
idleKillTimer = setTimeout(() => {
|
|
699
|
-
|
|
739
|
+
safeLog(`\n[scheduler] idle watchdog: still alive ${Math.round(POST_RESULT_KILL_MS/1000)}s after SIGTERM — SIGKILL\n`);
|
|
700
740
|
killTree('SIGKILL');
|
|
701
741
|
}, POST_RESULT_KILL_MS);
|
|
702
742
|
if (idleKillTimer.unref) idleKillTimer.unref();
|
|
@@ -717,10 +757,11 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
717
757
|
child.on('error', (err) => {
|
|
718
758
|
clearAllTimers();
|
|
719
759
|
const durationMs = Date.now() - startedAt;
|
|
720
|
-
|
|
760
|
+
safeLog(`\n[scheduler] error: ${err.message}\n`);
|
|
721
761
|
closeFd();
|
|
722
|
-
|
|
723
|
-
|
|
762
|
+
// Sync write: child event handler must flush meta before resolve().
|
|
763
|
+
atomicWriteJsonSync(metaPath, { slug: job.slug, cwd, sessionId, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
|
|
764
|
+
resolve({ exitCode: -1, durationMs, error: err.message, sessionId });
|
|
724
765
|
});
|
|
725
766
|
|
|
726
767
|
child.on('exit', (code, signal) => {
|
|
@@ -737,19 +778,21 @@ async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
|
737
778
|
const mappedToSuccess = agentResultSubtype === 'success' && killedBySignal;
|
|
738
779
|
if (mappedToSuccess) {
|
|
739
780
|
effectiveCode = 0;
|
|
740
|
-
|
|
781
|
+
safeLog(`\n[scheduler] mapping exit code=${code} signal=${signal} → 0 ` +
|
|
741
782
|
`(result=success was emitted before kill)\n`);
|
|
742
783
|
}
|
|
743
|
-
|
|
784
|
+
safeLog(`\n[scheduler] exit code=${effectiveCode} (raw code=${code} signal=${signal}) ` +
|
|
744
785
|
`duration=${Math.round(durationMs / 1000)}s\n`);
|
|
745
786
|
closeFd();
|
|
746
787
|
const rateLimited = effectiveCode !== 0 && detectRateLimitInLog(logPath);
|
|
747
|
-
|
|
748
|
-
|
|
788
|
+
// Sync write: child 'exit' handler must flush meta before resolve()
|
|
789
|
+
// so the spawnJob mutate() that follows sees the persisted exit code.
|
|
790
|
+
atomicWriteJsonSync(metaPath, {
|
|
791
|
+
slug: job.slug, cwd, sessionId, exitCode: effectiveCode, rateLimited,
|
|
749
792
|
startedAt, finishedAt: Date.now(), durationMs,
|
|
750
793
|
agentResultSubtype, mappedFromSignal: mappedToSuccess ? signal || `code=${code}` : null,
|
|
751
794
|
});
|
|
752
|
-
resolve({ exitCode: effectiveCode, durationMs, rateLimited });
|
|
795
|
+
resolve({ exitCode: effectiveCode, durationMs, rateLimited, sessionId });
|
|
753
796
|
});
|
|
754
797
|
});
|
|
755
798
|
}
|
|
@@ -859,23 +902,6 @@ function isFixPlanSlug(slug) {
|
|
|
859
902
|
return /^\d+-fix-/.test(slug);
|
|
860
903
|
}
|
|
861
904
|
|
|
862
|
-
/**
|
|
863
|
-
* Read the last `bytes` of a file as utf8. Returns '' on error.
|
|
864
|
-
*/
|
|
865
|
-
function readTail(filePath, bytes) {
|
|
866
|
-
try {
|
|
867
|
-
const stat = fs.statSync(filePath);
|
|
868
|
-
const n = Math.min(stat.size, bytes);
|
|
869
|
-
const fd = fs.openSync(filePath, 'r');
|
|
870
|
-
const buf = Buffer.alloc(n);
|
|
871
|
-
fs.readSync(fd, buf, 0, n, stat.size - n);
|
|
872
|
-
fs.closeSync(fd);
|
|
873
|
-
return buf.toString('utf8');
|
|
874
|
-
} catch {
|
|
875
|
-
return '';
|
|
876
|
-
}
|
|
877
|
-
}
|
|
878
|
-
|
|
879
905
|
/**
|
|
880
906
|
* Spawn an Opus investigation session for a failed job. The investigator's job
|
|
881
907
|
* is to read the failure log + original PRD, identify the root cause, and write
|
|
@@ -897,7 +923,7 @@ async function spawnInvestigation(failedJob, runDir) {
|
|
|
897
923
|
|
|
898
924
|
let originalBody = '';
|
|
899
925
|
try {
|
|
900
|
-
originalBody = parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`)).body;
|
|
926
|
+
originalBody = (await parsePrd(path.join(PRDS_DIR, `${failedJob.slug}.md`))).body;
|
|
901
927
|
} catch {
|
|
902
928
|
originalBody = failedJob.bodyPreview || '(original PRD missing from disk)';
|
|
903
929
|
}
|
|
@@ -914,7 +940,16 @@ async function spawnInvestigation(failedJob, runDir) {
|
|
|
914
940
|
return;
|
|
915
941
|
}
|
|
916
942
|
|
|
917
|
-
|
|
943
|
+
// cwd fallback: if the failed job's cwd is missing on disk, the investigator
|
|
944
|
+
// child would itself fail to spawn (ENOENT). Fall back to DEFAULT_PROJECT_CWD
|
|
945
|
+
// so the investigation can still write a fix plan that updates the cwd or
|
|
946
|
+
// re-creates the missing project directory.
|
|
947
|
+
let cwd = failedJob.cwd || DEFAULT_PROJECT_CWD;
|
|
948
|
+
try { fs.accessSync(cwd, fs.constants.X_OK); }
|
|
949
|
+
catch {
|
|
950
|
+
console.warn(`[scheduler] investigation cwd missing (${cwd}); falling back to ${DEFAULT_PROJECT_CWD}`);
|
|
951
|
+
cwd = DEFAULT_PROJECT_CWD;
|
|
952
|
+
}
|
|
918
953
|
const prompt = `You are investigating a failed scheduled job in the session-manager queue. Your ONLY job is to write a fix-plan PRD file. Do NOT attempt the fix yourself.
|
|
919
954
|
|
|
920
955
|
# Failed job
|
|
@@ -960,26 +995,37 @@ ${logTail}
|
|
|
960
995
|
DO NOT attempt the fix. ONLY write the file. When the file exists, exit immediately.`;
|
|
961
996
|
|
|
962
997
|
const fd = fs.openSync(investigationLogPath, 'a');
|
|
963
|
-
|
|
998
|
+
const sessionId = randomUUID();
|
|
999
|
+
try {
|
|
1000
|
+
fs.writeSync(fd, `[scheduler] investigation starting for ${failedJob.slug} at ${new Date().toISOString()}\n[scheduler] target fix PRD: ${fixPath}\n[scheduler] sessionId=${sessionId}\n\n`);
|
|
1001
|
+
} catch { /* */ }
|
|
964
1002
|
|
|
965
1003
|
const claudeBin = resolveClaudeBin();
|
|
966
1004
|
const childEnv = cleanChildEnv();
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
1005
|
+
let child;
|
|
1006
|
+
try {
|
|
1007
|
+
child = spawn(claudeBin, [
|
|
1008
|
+
'-p', prompt,
|
|
1009
|
+
'--model', 'opus',
|
|
1010
|
+
'--dangerously-skip-permissions',
|
|
1011
|
+
'--output-format', 'stream-json',
|
|
1012
|
+
'--verbose',
|
|
1013
|
+
'--session-id', sessionId,
|
|
1014
|
+
], {
|
|
1015
|
+
cwd,
|
|
1016
|
+
env: childEnv,
|
|
1017
|
+
stdio: ['ignore', fd, fd],
|
|
1018
|
+
});
|
|
1019
|
+
} catch (e) {
|
|
1020
|
+
try { fs.writeSync(fd, `\n[scheduler] investigation spawn failed: ${e?.message ?? e}\n`); } catch { /* */ }
|
|
1021
|
+
try { fs.closeSync(fd); } catch { /* */ }
|
|
1022
|
+
return;
|
|
1023
|
+
}
|
|
978
1024
|
|
|
979
|
-
fs.writeSync(fd, `[scheduler] investigation pid=${child.pid}\n\n`);
|
|
1025
|
+
try { fs.writeSync(fd, `[scheduler] investigation pid=${child.pid}\n\n`); } catch { /* */ }
|
|
980
1026
|
|
|
981
1027
|
const watchdog = setTimeout(() => {
|
|
982
|
-
fs.writeSync(fd, `\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`);
|
|
1028
|
+
try { fs.writeSync(fd, `\n[scheduler] investigation watchdog SIGKILL after ${MAX_INVESTIGATION_DURATION_MS}ms\n`); } catch { /* */ }
|
|
983
1029
|
try { child.kill('SIGKILL'); } catch { /* already dead */ }
|
|
984
1030
|
}, MAX_INVESTIGATION_DURATION_MS);
|
|
985
1031
|
if (watchdog.unref) watchdog.unref();
|
|
@@ -1015,15 +1061,17 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
|
|
|
1015
1061
|
s.jobs[idx].startedAt = new Date().toISOString();
|
|
1016
1062
|
}
|
|
1017
1063
|
});
|
|
1018
|
-
broadcast();
|
|
1064
|
+
await broadcast();
|
|
1019
1065
|
|
|
1020
|
-
const res = await executeJob(job, runDir, defaultCwd, async (pid) => {
|
|
1066
|
+
const res = await executeJob(job, runDir, defaultCwd, async (pid, sessionId, cwd) => {
|
|
1021
1067
|
await mutate((s) => {
|
|
1022
1068
|
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
1023
1069
|
if (idx >= 0) {
|
|
1024
|
-
s.jobs[idx].
|
|
1070
|
+
s.jobs[idx].sessionId = sessionId;
|
|
1071
|
+
s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt, sessionId, cwd };
|
|
1025
1072
|
}
|
|
1026
1073
|
});
|
|
1074
|
+
await broadcast();
|
|
1027
1075
|
});
|
|
1028
1076
|
|
|
1029
1077
|
if (res.rateLimited) {
|
|
@@ -1052,7 +1100,7 @@ async function spawnJob(job, runId, runDir, defaultCwd) {
|
|
|
1052
1100
|
}
|
|
1053
1101
|
}
|
|
1054
1102
|
});
|
|
1055
|
-
broadcast();
|
|
1103
|
+
await broadcast();
|
|
1056
1104
|
|
|
1057
1105
|
if (actuallyFailed && failedJobSnapshot) {
|
|
1058
1106
|
spawnInvestigation(failedJobSnapshot, runDir).catch((e) => {
|
|
@@ -1075,20 +1123,20 @@ let tickTail = Promise.resolve();
|
|
|
1075
1123
|
|
|
1076
1124
|
function tickQueue() {
|
|
1077
1125
|
const next = tickTail.then(async () => {
|
|
1078
|
-
const state = readQueue();
|
|
1126
|
+
const state = await readQueue();
|
|
1079
1127
|
if (state.paused) {
|
|
1080
1128
|
console.log('[scheduler] tickQueue skipped: paused');
|
|
1081
1129
|
return;
|
|
1082
1130
|
}
|
|
1083
1131
|
if (cancelToken.cancelled) return;
|
|
1084
1132
|
|
|
1085
|
-
reconcile(state);
|
|
1133
|
+
await reconcile(state);
|
|
1086
1134
|
const cap = ENV_CAP ?? state.config.concurrencyCap;
|
|
1087
1135
|
const batch = pickNextBatch(state.jobs, runningSet, cap);
|
|
1088
1136
|
if (batch.length === 0) return;
|
|
1089
1137
|
|
|
1090
1138
|
await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
|
|
1091
|
-
broadcast();
|
|
1139
|
+
await broadcast();
|
|
1092
1140
|
|
|
1093
1141
|
const { runId, dir: runDir } = pickRunDir();
|
|
1094
1142
|
for (const job of batch) {
|
|
@@ -1102,7 +1150,7 @@ function tickQueue() {
|
|
|
1102
1150
|
}
|
|
1103
1151
|
|
|
1104
1152
|
async function runDueJobs() {
|
|
1105
|
-
const state = readQueue();
|
|
1153
|
+
const state = await readQueue();
|
|
1106
1154
|
if (state.paused) {
|
|
1107
1155
|
console.log('[scheduler] runDueJobs skipped: paused');
|
|
1108
1156
|
return;
|
|
@@ -1111,7 +1159,7 @@ async function runDueJobs() {
|
|
|
1111
1159
|
await tickQueue();
|
|
1112
1160
|
// Clear the one-shot scheduledFor without waiting for jobs to settle.
|
|
1113
1161
|
await mutate((s) => { s.scheduledFor = null; });
|
|
1114
|
-
broadcast();
|
|
1162
|
+
await broadcast();
|
|
1115
1163
|
}
|
|
1116
1164
|
|
|
1117
1165
|
// ---------- when-available launch logic ----------
|
|
@@ -1123,7 +1171,7 @@ async function maybeLaunchWhenAvailable(state) {
|
|
|
1123
1171
|
if (pending.length === 0) return;
|
|
1124
1172
|
if (cachedUtilization === null || cachedUtilization === undefined) return;
|
|
1125
1173
|
if (cachedUtilization >= state.config.utilizationThreshold) {
|
|
1126
|
-
broadcast();
|
|
1174
|
+
await broadcast();
|
|
1127
1175
|
return;
|
|
1128
1176
|
}
|
|
1129
1177
|
console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pending.length} pending, ${runningSet.size} running — ticking`);
|
|
@@ -1150,7 +1198,7 @@ async function pollLoop() {
|
|
|
1150
1198
|
persistSchedulerState();
|
|
1151
1199
|
|
|
1152
1200
|
// If a 'network' pause resolved, clear it now that we have a good reading.
|
|
1153
|
-
const cur = readQueue();
|
|
1201
|
+
const cur = await readQueue();
|
|
1154
1202
|
if (cur.paused?.reason === 'network') {
|
|
1155
1203
|
await clearPause('network-recovered');
|
|
1156
1204
|
}
|
|
@@ -1160,7 +1208,7 @@ async function pollLoop() {
|
|
|
1160
1208
|
}
|
|
1161
1209
|
|
|
1162
1210
|
await maybeLaunchWhenAvailable(cur);
|
|
1163
|
-
broadcast();
|
|
1211
|
+
await broadcast();
|
|
1164
1212
|
} else if (r.kind === 'meter_rate_limited') {
|
|
1165
1213
|
// Billing meter is itself being rate-limited. Treat as "utilization unknown but safe":
|
|
1166
1214
|
// fire available jobs anyway at utilization=0 rather than pausing the queue.
|
|
@@ -1171,9 +1219,9 @@ async function pollLoop() {
|
|
|
1171
1219
|
// Don't update firstNon429FailureAt — 429s don't count toward the 30-min network-pause threshold.
|
|
1172
1220
|
cachedUtilization = 0; // assume safe; fire any pending work
|
|
1173
1221
|
console.log(`[scheduler] billing meter rate-limited (HTTP 429) — firing on heuristic (failure #${consecutiveFailures})`);
|
|
1174
|
-
const cur = readQueue();
|
|
1222
|
+
const cur = await readQueue();
|
|
1175
1223
|
await maybeLaunchWhenAvailable(cur);
|
|
1176
|
-
broadcast();
|
|
1224
|
+
await broadcast();
|
|
1177
1225
|
} else {
|
|
1178
1226
|
lastPollAt = Date.now();
|
|
1179
1227
|
lastPollOk = false;
|
|
@@ -1194,7 +1242,7 @@ async function pollLoop() {
|
|
|
1194
1242
|
|
|
1195
1243
|
// After 30 minutes of consecutive non-429 failures, set 'network' pause.
|
|
1196
1244
|
if (totalNon429FailureMs > 30 * 60_000) {
|
|
1197
|
-
const cur2 = readQueue();
|
|
1245
|
+
const cur2 = await readQueue();
|
|
1198
1246
|
if (!cur2.paused || cur2.paused.reason === 'network') {
|
|
1199
1247
|
await setPaused('network', null);
|
|
1200
1248
|
}
|
|
@@ -1229,23 +1277,14 @@ function registerScheduleHandlers() {
|
|
|
1229
1277
|
supervisor.registerHandlers();
|
|
1230
1278
|
|
|
1231
1279
|
ipcMain.handle('schedule:state', async () => {
|
|
1232
|
-
const state = readQueue();
|
|
1233
|
-
reconcile(state);
|
|
1234
|
-
writeQueue(state);
|
|
1235
|
-
return {
|
|
1236
|
-
config: state.config,
|
|
1237
|
-
jobs: state.jobs,
|
|
1238
|
-
scheduledFor: state.scheduledFor,
|
|
1239
|
-
lastRunAt: state.lastRunAt,
|
|
1240
|
-
nextReset: getNextResetCached(),
|
|
1241
|
-
paused: state.paused,
|
|
1242
|
-
utilization: cachedUtilization,
|
|
1243
|
-
paths: { root: ROOT, prds: PRDS_DIR, runs: RUNS_DIR, queue: QUEUE_PATH },
|
|
1244
|
-
};
|
|
1280
|
+
const state = await readQueue();
|
|
1281
|
+
await reconcile(state);
|
|
1282
|
+
await writeQueue(state);
|
|
1283
|
+
return buildScheduleStatePayload(state, { withPaths: true });
|
|
1245
1284
|
});
|
|
1246
1285
|
|
|
1247
1286
|
ipcMain.handle('schedule:health', async () => {
|
|
1248
|
-
const state = readQueue();
|
|
1287
|
+
const state = await readQueue();
|
|
1249
1288
|
const runningJobs = [];
|
|
1250
1289
|
for (const j of state.jobs) {
|
|
1251
1290
|
if (j.status === 'running' && j.runtime) {
|
|
@@ -1274,15 +1313,14 @@ function registerScheduleHandlers() {
|
|
|
1274
1313
|
// Bypass the billing-poll gate entirely — fire pending jobs immediately regardless of meter state.
|
|
1275
1314
|
// Clears any existing pause first (same semantics as run-now).
|
|
1276
1315
|
await clearPause('run-now');
|
|
1277
|
-
runDueJobs().catch((e) =>
|
|
1316
|
+
runDueJobs().catch((e) => logs.writeLine({ level: 'error', scope: 'scheduler', message: 'runDueJobs error (force-tick)', meta: { error: e?.message } }));
|
|
1278
1317
|
return { ok: true };
|
|
1279
1318
|
});
|
|
1280
1319
|
|
|
1281
1320
|
ipcMain.handle('schedule:set-config', async (_e, partial) => {
|
|
1282
|
-
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
1283
1321
|
let validated;
|
|
1284
1322
|
try {
|
|
1285
|
-
validated =
|
|
1323
|
+
validated = schemas.setConfigSchema.parse(partial || {});
|
|
1286
1324
|
} catch (e) {
|
|
1287
1325
|
return { ok: false, error: e?.message ?? 'invalid config' };
|
|
1288
1326
|
}
|
|
@@ -1299,18 +1337,13 @@ function registerScheduleHandlers() {
|
|
|
1299
1337
|
});
|
|
1300
1338
|
|
|
1301
1339
|
ipcMain.handle('schedule:reset-job', async (_e, payload) => {
|
|
1302
|
-
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
1303
1340
|
let slug;
|
|
1304
1341
|
try {
|
|
1305
|
-
({ slug } =
|
|
1342
|
+
({ slug } = schemas.scheduleSlug.parse(payload));
|
|
1306
1343
|
} catch (e) {
|
|
1307
1344
|
return { ok: false, error: 'invalid slug' };
|
|
1308
1345
|
}
|
|
1309
|
-
|
|
1310
|
-
const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
|
|
1311
|
-
if (!resolved.startsWith(PRDS_DIR + path.sep)) {
|
|
1312
|
-
return { ok: false, error: 'invalid slug' };
|
|
1313
|
-
}
|
|
1346
|
+
if (!safeSlugPath(slug)) return { ok: false, error: 'invalid slug' };
|
|
1314
1347
|
const found = await mutate((state) => {
|
|
1315
1348
|
const idx = state.jobs.findIndex((j) => j.slug === slug);
|
|
1316
1349
|
if (idx < 0) return false;
|
|
@@ -1318,14 +1351,14 @@ function registerScheduleHandlers() {
|
|
|
1318
1351
|
return true;
|
|
1319
1352
|
});
|
|
1320
1353
|
if (!found) return { ok: false, error: 'not found' };
|
|
1321
|
-
broadcast();
|
|
1354
|
+
await broadcast();
|
|
1322
1355
|
return { ok: true };
|
|
1323
1356
|
});
|
|
1324
1357
|
|
|
1325
1358
|
ipcMain.handle('schedule:run-now', async () => {
|
|
1326
1359
|
// Manual run-now overrides any auto-pause. Clear it first.
|
|
1327
1360
|
await clearPause('run-now');
|
|
1328
|
-
runDueJobs().catch((e) =>
|
|
1361
|
+
runDueJobs().catch((e) => logs.writeLine({ level: 'error', scope: 'scheduler', message: 'runDueJobs error (run-now)', meta: { error: e?.message } }));
|
|
1329
1362
|
return { ok: true };
|
|
1330
1363
|
});
|
|
1331
1364
|
|
|
@@ -1344,11 +1377,11 @@ function registerScheduleHandlers() {
|
|
|
1344
1377
|
// handler already reconciles on read, but this gives the renderer an
|
|
1345
1378
|
// explicit refresh path that also broadcasts so all views update.
|
|
1346
1379
|
ipcMain.handle('schedule:rescan', async () => {
|
|
1347
|
-
await mutate((state) => {
|
|
1348
|
-
reconcile(state);
|
|
1380
|
+
await mutate(async (state) => {
|
|
1381
|
+
await reconcile(state);
|
|
1349
1382
|
return null;
|
|
1350
1383
|
});
|
|
1351
|
-
broadcast();
|
|
1384
|
+
await broadcast();
|
|
1352
1385
|
return { ok: true };
|
|
1353
1386
|
});
|
|
1354
1387
|
|
|
@@ -1360,12 +1393,12 @@ function registerScheduleHandlers() {
|
|
|
1360
1393
|
ensureDirs();
|
|
1361
1394
|
const ts = new Date().toISOString().replace(/[:.]/g, '-');
|
|
1362
1395
|
const archiveDir = path.join(PRDS_ARCHIVE_DIR, ts);
|
|
1363
|
-
const state = readQueue();
|
|
1396
|
+
const state = await readQueue();
|
|
1364
1397
|
const victims = state.jobs.filter((j) => j.status === 'pending' || j.status === 'failed');
|
|
1365
1398
|
if (victims.length === 0) {
|
|
1366
1399
|
return { ok: true, archived: 0, archivedTo: null };
|
|
1367
1400
|
}
|
|
1368
|
-
|
|
1401
|
+
await fsp.mkdir(archiveDir, { recursive: true });
|
|
1369
1402
|
let archived = 0;
|
|
1370
1403
|
for (const job of victims) {
|
|
1371
1404
|
const src = path.resolve(path.join(PRDS_DIR, `${job.slug}.md`));
|
|
@@ -1378,17 +1411,17 @@ function registerScheduleHandlers() {
|
|
|
1378
1411
|
// ENOENT: the .md is already gone (reconcile would drop it on next
|
|
1379
1412
|
// read anyway). Either way, fall through and remove from queue.
|
|
1380
1413
|
if (e?.code !== 'ENOENT') {
|
|
1381
|
-
|
|
1414
|
+
logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'clear-queue: rename failed', meta: { slug: job.slug, error: e?.message } });
|
|
1382
1415
|
}
|
|
1383
1416
|
}
|
|
1384
1417
|
}
|
|
1385
|
-
await mutate((s) => {
|
|
1418
|
+
await mutate(async (s) => {
|
|
1386
1419
|
const victimSlugs = new Set(victims.map((j) => j.slug));
|
|
1387
1420
|
s.jobs = s.jobs.filter((j) => !victimSlugs.has(j.slug));
|
|
1388
|
-
reconcile(s);
|
|
1421
|
+
await reconcile(s);
|
|
1389
1422
|
return null;
|
|
1390
1423
|
});
|
|
1391
|
-
broadcast();
|
|
1424
|
+
await broadcast();
|
|
1392
1425
|
return { ok: true, archived, archivedTo: archiveDir };
|
|
1393
1426
|
});
|
|
1394
1427
|
|
|
@@ -1399,17 +1432,14 @@ function registerScheduleHandlers() {
|
|
|
1399
1432
|
});
|
|
1400
1433
|
|
|
1401
1434
|
ipcMain.handle('schedule:read-prd', async (_e, payload) => {
|
|
1402
|
-
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
1403
1435
|
let slug;
|
|
1404
1436
|
try {
|
|
1405
|
-
({ slug } =
|
|
1437
|
+
({ slug } = schemas.scheduleSlug.parse(payload));
|
|
1406
1438
|
} catch {
|
|
1407
1439
|
return { ok: false, error: 'invalid slug' };
|
|
1408
1440
|
}
|
|
1409
|
-
const filePath =
|
|
1410
|
-
if (!filePath
|
|
1411
|
-
return { ok: false, error: 'invalid slug' };
|
|
1412
|
-
}
|
|
1441
|
+
const filePath = safeSlugPath(slug);
|
|
1442
|
+
if (!filePath) return { ok: false, error: 'invalid slug' };
|
|
1413
1443
|
try {
|
|
1414
1444
|
const text = await fsp.readFile(filePath, 'utf8');
|
|
1415
1445
|
return { ok: true, text };
|
|
@@ -1419,13 +1449,14 @@ function registerScheduleHandlers() {
|
|
|
1419
1449
|
});
|
|
1420
1450
|
|
|
1421
1451
|
ipcMain.handle('schedule:read-log', async (_e, payload) => {
|
|
1422
|
-
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
1423
1452
|
let slug, runId;
|
|
1424
1453
|
try {
|
|
1425
|
-
({ slug, runId } =
|
|
1454
|
+
({ slug, runId } = schemas.scheduleReadLog.parse(payload));
|
|
1426
1455
|
} catch {
|
|
1427
1456
|
return { ok: false, error: 'invalid slug or runId' };
|
|
1428
1457
|
}
|
|
1458
|
+
// Defense-in-depth: re-check containment after path.resolve even though
|
|
1459
|
+
// SLUG_RE / RUN_ID_RE already forbid path separators.
|
|
1429
1460
|
const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
|
|
1430
1461
|
if (!logPath.startsWith(RUNS_DIR + path.sep)) {
|
|
1431
1462
|
return { ok: false, error: 'invalid slug or runId' };
|
|
@@ -1438,21 +1469,23 @@ function registerScheduleHandlers() {
|
|
|
1438
1469
|
}
|
|
1439
1470
|
});
|
|
1440
1471
|
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
if (
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1472
|
+
ipcMain.handle('schedule:write-prd', async (_e, payload) => {
|
|
1473
|
+
let parsed;
|
|
1474
|
+
try { parsed = schemas.scheduleWritePrd.parse(payload); }
|
|
1475
|
+
catch (e) { return { ok: false, error: e?.message ?? 'invalid payload' }; }
|
|
1476
|
+
const resolved = safeSlugPath(parsed.slug);
|
|
1477
|
+
if (!resolved) return { ok: false, error: 'invalid slug' };
|
|
1478
|
+
try {
|
|
1479
|
+
await config.writeTextAtomic(resolved, parsed.body);
|
|
1480
|
+
} catch (e) {
|
|
1481
|
+
return { ok: false, error: e?.message ?? 'write failed' };
|
|
1482
|
+
}
|
|
1483
|
+
try {
|
|
1484
|
+
const stat = await fsp.stat(resolved);
|
|
1485
|
+
return { ok: true, bytesWritten: stat.size };
|
|
1486
|
+
} catch (e) {
|
|
1487
|
+
return { ok: false, error: e?.message ?? 'stat failed' };
|
|
1488
|
+
}
|
|
1456
1489
|
});
|
|
1457
1490
|
|
|
1458
1491
|
ipcMain.handle('schedule:list-prds', async () => {
|
|
@@ -1460,7 +1493,8 @@ function registerScheduleHandlers() {
|
|
|
1460
1493
|
let entries;
|
|
1461
1494
|
try {
|
|
1462
1495
|
entries = await fsp.readdir(PRDS_DIR);
|
|
1463
|
-
} catch {
|
|
1496
|
+
} catch (e) {
|
|
1497
|
+
logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'list-prds: readdir failed', meta: { error: e?.message } });
|
|
1464
1498
|
return [];
|
|
1465
1499
|
}
|
|
1466
1500
|
const out = [];
|
|
@@ -1468,7 +1502,7 @@ function registerScheduleHandlers() {
|
|
|
1468
1502
|
if (!name.endsWith('.md') || name.startsWith('.')) continue;
|
|
1469
1503
|
const filePath = path.join(PRDS_DIR, name);
|
|
1470
1504
|
try {
|
|
1471
|
-
const parsed = parsePrd(filePath);
|
|
1505
|
+
const parsed = await parsePrd(filePath);
|
|
1472
1506
|
const stat = await fsp.stat(filePath);
|
|
1473
1507
|
out.push({
|
|
1474
1508
|
slug: parsed.slug,
|
|
@@ -1479,7 +1513,7 @@ function registerScheduleHandlers() {
|
|
|
1479
1513
|
mtimeMs: stat.mtimeMs,
|
|
1480
1514
|
});
|
|
1481
1515
|
} catch (e) {
|
|
1482
|
-
|
|
1516
|
+
logs.writeLine({ level: 'warn', scope: 'scheduler', message: 'list-prds: skipping unparseable file', meta: { name, error: e?.message } });
|
|
1483
1517
|
}
|
|
1484
1518
|
}
|
|
1485
1519
|
out.sort((a, b) => a.slug.localeCompare(b.slug, undefined, { numeric: true }));
|
|
@@ -1509,7 +1543,7 @@ async function init() {
|
|
|
1509
1543
|
|
|
1510
1544
|
// If we boot up while paused with a resumeAt in the past, clear it. This
|
|
1511
1545
|
// happens when the app was closed across the reset window.
|
|
1512
|
-
const boot = readQueue();
|
|
1546
|
+
const boot = await readQueue();
|
|
1513
1547
|
if (boot.paused && boot.paused.resumeAt && new Date(boot.paused.resumeAt).getTime() <= Date.now()) {
|
|
1514
1548
|
await clearPause('boot-elapsed');
|
|
1515
1549
|
} else if (boot.paused && boot.paused.resumeAt) {
|
|
@@ -1530,15 +1564,20 @@ async function init() {
|
|
|
1530
1564
|
pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
|
|
1531
1565
|
if (pollLoopTimer.unref) pollLoopTimer.unref();
|
|
1532
1566
|
|
|
1533
|
-
// Supervisor: probe running jobs for wedged poll-loops.
|
|
1567
|
+
// Supervisor: probe running jobs for wedged poll-loops. Supervisor calls
|
|
1568
|
+
// its injected readQueue() synchronously from supervisorTick and applyAction,
|
|
1569
|
+
// so pass the sync variant; the 15-min probe cadence makes the blocking cost
|
|
1570
|
+
// negligible vs IPC handler latency.
|
|
1534
1571
|
if (process.env.SM_SUPERVISOR_DISABLE !== '1') {
|
|
1535
|
-
supervisor.startSupervisor({ readQueue
|
|
1572
|
+
supervisor.startSupervisor({ readQueue: readQueueSync });
|
|
1536
1573
|
}
|
|
1537
1574
|
|
|
1538
1575
|
// Heartbeat: once per minute, log queue state for 24h visibility.
|
|
1576
|
+
// setInterval callback is sync; readQueueSync stays sync to avoid awaiting
|
|
1577
|
+
// inside the timer body (and the 60s cadence makes the cost moot).
|
|
1539
1578
|
if (heartbeatInterval) clearInterval(heartbeatInterval);
|
|
1540
1579
|
heartbeatInterval = setInterval(() => {
|
|
1541
|
-
const s =
|
|
1580
|
+
const s = readQueueSync();
|
|
1542
1581
|
const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
|
|
1543
1582
|
for (const j of s.jobs) counts[j.status] = (counts[j.status] || 0) + 1;
|
|
1544
1583
|
appendHeartbeat({
|
|
@@ -1560,8 +1599,10 @@ async function init() {
|
|
|
1560
1599
|
if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
|
|
1561
1600
|
backoffMs = 0;
|
|
1562
1601
|
backoffNextAt = null;
|
|
1563
|
-
// Clear any paused-but-resumeAt-elapsed state immediately.
|
|
1564
|
-
|
|
1602
|
+
// Clear any paused-but-resumeAt-elapsed state immediately. Sync read:
|
|
1603
|
+
// the powerMonitor 'resume' callback fires rarely and isn't on the IPC
|
|
1604
|
+
// hot path; switching to async would require an IIFE wrapper for no gain.
|
|
1605
|
+
const wakeState = readQueueSync();
|
|
1565
1606
|
if (wakeState.paused?.resumeAt && new Date(wakeState.paused.resumeAt).getTime() <= Date.now()) {
|
|
1566
1607
|
clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
|
|
1567
1608
|
}
|