claude-code-session-manager 0.8.0 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/{cssMode-CDGOCAW5.js → cssMode-30PYohIN.js} +1 -1
- package/dist/assets/{editor.main-zj3Myqhk.js → editor.main-CZ_l_CSt.js} +3 -3
- package/dist/assets/{freemarker2-Dh6-Vi35.js → freemarker2-DA5xODSz.js} +1 -1
- package/dist/assets/{handlebars-KgXZ3LUu.js → handlebars-BgJKogMf.js} +1 -1
- package/dist/assets/{html-D12q2PkL.js → html-D3DAPwAR.js} +1 -1
- package/dist/assets/{htmlMode-CCaSY5vs.js → htmlMode-mS5mzFjU.js} +1 -1
- package/dist/assets/index-Bs-mHiD-.js +2976 -0
- package/dist/assets/index-DCK87t79.css +32 -0
- package/dist/assets/{javascript-ClkFzW_a.js → javascript-CJ-Uxk_I.js} +1 -1
- package/dist/assets/{jsonMode-BV7azwkW.js → jsonMode-DbcDRati.js} +1 -1
- package/dist/assets/{liquid-C9Id9V-K.js → liquid-I4DHwPR_.js} +1 -1
- package/dist/assets/{lspLanguageFeatures-COD69jzF.js → lspLanguageFeatures-BntDl6Xn.js} +1 -1
- package/dist/assets/{mdx-D8YvWtiq.js → mdx-DWI58irx.js} +1 -1
- package/dist/assets/{python-YtsitwE4.js → python-DPx3c0QA.js} +1 -1
- package/dist/assets/{razor-T48OHh5u.js → razor-BcxFqE_H.js} +1 -1
- package/dist/assets/{tsMode-DHTMR4b8.js → tsMode-CGTi49DJ.js} +1 -1
- package/dist/assets/{typescript-Ckq032Ud.js → typescript-CE9RqBjC.js} +1 -1
- package/dist/assets/{xml-B7dYWEXB.js → xml-DsrLAWcV.js} +1 -1
- package/dist/assets/{yaml-DUufEgrd.js → yaml-CA8rRsQI.js} +1 -1
- package/dist/index.html +2 -2
- package/package.json +1 -1
- package/src/main/config.cjs +93 -19
- package/src/main/index.cjs +163 -31
- package/src/main/ipcSchemas.cjs +59 -2
- package/src/main/lib/cleanEnv.cjs +20 -0
- package/src/main/lib/credentials.cjs +184 -0
- package/src/main/lib/schedulerConfig.cjs +10 -0
- package/src/main/logs.cjs +1 -1
- package/src/main/otelSettings.cjs +1 -1
- package/src/main/pty.cjs +53 -6
- package/src/main/scheduler.cjs +521 -148
- package/src/main/transcripts.cjs +26 -21
- package/src/main/usage.cjs +76 -25
- package/src/main/voiceSettings.cjs +1 -1
- package/src/main/watchers.cjs +69 -11
- package/src/preload/api.d.ts +53 -11
- package/src/preload/index.cjs +13 -0
- package/dist/assets/index-Dejlz0I1.js +0 -2972
- package/dist/assets/index-DsC4vT8M.css +0 -32
package/src/main/scheduler.cjs
CHANGED
|
@@ -24,7 +24,8 @@
|
|
|
24
24
|
* frees in that group.
|
|
25
25
|
*
|
|
26
26
|
* Execution:
|
|
27
|
-
* - `claude -p "<PRD body>" --dangerously-skip-permissions`
|
|
27
|
+
* - `claude -p "<PRD body>" --model sonnet --dangerously-skip-permissions`
|
|
28
|
+
* per PRD. Backlog runs on Sonnet; interactive sessions stay on Opus.
|
|
28
29
|
* - Stdout/stderr → runs/<ts>/<slug>.log; meta json gets exit + duration.
|
|
29
30
|
* - PRD frontmatter `cwd` → child cwd. Default: PROJECT_CWD const below.
|
|
30
31
|
*
|
|
@@ -46,11 +47,20 @@ const os = require('node:os');
|
|
|
46
47
|
const { spawn } = require('node:child_process');
|
|
47
48
|
const { ipcMain } = require('electron');
|
|
48
49
|
const billing = require('./usage.cjs');
|
|
50
|
+
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
|
+
const {
|
|
52
|
+
POLL_INTERVAL_MS,
|
|
53
|
+
USAGE_REFRESH_INTERVAL_MS,
|
|
54
|
+
MAX_JOB_DURATION_MS,
|
|
55
|
+
} = require('./lib/schedulerConfig.cjs');
|
|
49
56
|
|
|
50
57
|
const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
|
|
51
58
|
const PRDS_DIR = path.join(ROOT, 'prds');
|
|
52
59
|
const RUNS_DIR = path.join(ROOT, 'runs');
|
|
53
60
|
const QUEUE_PATH = path.join(ROOT, 'queue.json');
|
|
61
|
+
const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
|
|
62
|
+
const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
|
|
63
|
+
const HEARTBEAT_MAX_BYTES = 1024 * 1024;
|
|
54
64
|
const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
|
|
55
65
|
|
|
56
66
|
const DEFAULT_CONFIG = {
|
|
@@ -61,7 +71,7 @@ const DEFAULT_CONFIG = {
|
|
|
61
71
|
defaultCwd: DEFAULT_PROJECT_CWD,
|
|
62
72
|
// 'when-available' = poll usage and fire whenever utilization < threshold.
|
|
63
73
|
// 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
|
|
64
|
-
// 'manual' = only fire on explicit Run
|
|
74
|
+
// 'manual' = only fire on explicit Run now click.
|
|
65
75
|
firePolicy: 'when-available',
|
|
66
76
|
// For 'when-available'. Fire only when five_hour utilization < this percent.
|
|
67
77
|
utilizationThreshold: 90,
|
|
@@ -76,11 +86,61 @@ function ensureDirs() {
|
|
|
76
86
|
}
|
|
77
87
|
|
|
78
88
|
/**
 * Atomically replace the JSON file at `p` with a pretty-printed dump of `data`.
 *
 * The payload is written to a uniquely named sibling temp file (pid + timestamp
 * + random suffix, so concurrent writers never share a temp path) and then
 * renamed over the target.
 *
 * @param {string} p - Destination file path.
 * @param {*} data - JSON-serialisable value.
 * @throws Rethrows any serialisation, write, or rename error after removing the
 *   temp file, so a failed write does not leave `*.tmp` litter next to the target.
 */
function atomicWriteJson(p, data) {
  const tmp = `${p}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(data, null, 2));
    fs.renameSync(tmp, p);
  } catch (err) {
    // Best-effort cleanup: the temp file may never have been created.
    try { fs.unlinkSync(tmp); } catch { /* nothing to clean up */ }
    throw err;
  }
}
|
|
83
93
|
|
|
94
|
+
// ---------- scheduler-state.json (sidecar) ----------
|
|
95
|
+
|
|
96
|
+
/**
 * Restore poll/backoff state from SCHEDULER_STATE_PATH into the module-level
 * variables (cachedNextReset, consecutiveFailures, backoffMs,
 * pauseClearedManuallyAt, lastPollAt).
 *
 * A missing or unparsable file is treated as "first boot" and leaves the
 * current in-memory values untouched. Individual fields are only restored when
 * they have the expected shape: the reset must be a non-empty string and the
 * numeric fields must be finite and non-negative. This matters because
 * backoffMs is later used directly as a setTimeout delay — a negative or
 * infinite value from a damaged file would turn the poll loop into a hot loop.
 */
function loadSchedulerState() {
  let s;
  try {
    s = JSON.parse(fs.readFileSync(SCHEDULER_STATE_PATH, 'utf8'));
  } catch {
    return; // first boot or corrupt — start fresh
  }
  // JSON.parse can legally yield null or a primitive; nothing to restore then.
  if (s === null || typeof s !== 'object') return;

  // JSON.parse('1e999') is Infinity, so typeof alone is not a sufficient check.
  const isNonNegativeNumber = (v) => typeof v === 'number' && Number.isFinite(v) && v >= 0;

  if (typeof s.lastObservedReset === 'string' && s.lastObservedReset !== '') {
    cachedNextReset = s.lastObservedReset;
  }
  if (isNonNegativeNumber(s.consecutiveFailures)) consecutiveFailures = s.consecutiveFailures;
  if (isNonNegativeNumber(s.backoffMs)) backoffMs = s.backoffMs;
  if (isNonNegativeNumber(s.pauseClearedManuallyAt)) pauseClearedManuallyAt = s.pauseClearedManuallyAt;
  if (isNonNegativeNumber(s.lastPollAt)) lastPollAt = s.lastPollAt;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Write the current poll/backoff snapshot to SCHEDULER_STATE_PATH.
 *
 * Best effort: a failed write is logged as a warning and never thrown.
 * `pausedReason` / `pausedSince` are always written as null here, and
 * `lastResetObservedAt` is stamped with the time of this write whenever a
 * reset is cached.
 */
function persistSchedulerState() {
  try {
    const snapshot = {
      version: 1,
      lastObservedReset: cachedNextReset,
      lastResetObservedAt: cachedNextReset ? Date.now() : null,
      lastPollAt,
      consecutiveFailures,
      backoffMs,
      pausedReason: null,
      pausedSince: null,
      pauseClearedManuallyAt,
    };
    atomicWriteJson(SCHEDULER_STATE_PATH, snapshot);
  } catch (e) {
    console.warn('[scheduler] failed to persist scheduler state', e?.message);
  }
}
|
|
125
|
+
|
|
126
|
+
// ---------- heartbeat log ----------
|
|
127
|
+
|
|
128
|
+
/**
 * Append one JSON line to the heartbeat log at HEARTBEAT_PATH.
 *
 * When the log has reached HEARTBEAT_MAX_BYTES it is rotated first: any
 * existing `<log>.1` is removed and the current log takes its place, so at
 * most one previous generation is kept. Each rotation step is best effort.
 * Any other failure is logged as a warning and swallowed.
 *
 * @param {*} entry - Value serialised with JSON.stringify as a single line.
 */
function appendHeartbeat(entry) {
  try {
    const serialized = `${JSON.stringify(entry)}\n`;

    let currentSize;
    try {
      currentSize = fs.statSync(HEARTBEAT_PATH).size;
    } catch {
      currentSize = 0; // new file
    }

    if (currentSize >= HEARTBEAT_MAX_BYTES) {
      const previousGeneration = `${HEARTBEAT_PATH}.1`;
      const rotationSteps = [
        () => fs.unlinkSync(previousGeneration),
        () => fs.renameSync(HEARTBEAT_PATH, previousGeneration),
      ];
      for (const step of rotationSteps) {
        try { step(); } catch { /* best effort */ }
      }
    }

    fs.appendFileSync(HEARTBEAT_PATH, serialized);
  } catch (e) {
    console.warn('[scheduler] heartbeat write failed', e?.message);
  }
}
|
|
143
|
+
|
|
84
144
|
function readQueue() {
|
|
85
145
|
try {
|
|
86
146
|
const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
|
|
@@ -102,6 +162,25 @@ function writeQueue(state) {
|
|
|
102
162
|
atomicWriteJson(QUEUE_PATH, state);
|
|
103
163
|
}
|
|
104
164
|
|
|
165
|
+
// ---------- serialized mutation queue ----------
|
|
166
|
+
|
|
167
|
+
// All read-modify-write operations on queue.json go through mutate() so
|
|
168
|
+
// concurrent job completions in a parallel wave cannot lose each other's
|
|
169
|
+
// status updates. mutateTail is always a resolved promise even when the
|
|
170
|
+
// preceding mutate threw, so the chain never deadlocks.
|
|
171
|
+
// Tail of the mutation chain. It always settles successfully, so one failed
// mutation cannot block the ones queued after it.
let mutateTail = Promise.resolve();

/**
 * Run a read-modify-write cycle on queue.json, serialised behind every
 * previously queued mutation.
 *
 * @param {(state: object) => *} fn - Receives the freshly read queue state and
 *   edits it in place; may be async. Its return value is passed through.
 * @returns {Promise<*>} Settles with fn's result. If fn (or the read/write)
 *   throws, the promise rejects and the state is not written back.
 */
function mutate(fn) {
  const runOnce = async () => {
    const state = readQueue();
    const result = await fn(state);
    writeQueue(state);
    return result;
  };

  const pending = mutateTail.then(runOnce);
  // keep chain alive on errors
  mutateTail = pending.then(undefined, () => {});
  return pending;
}
|
|
183
|
+
|
|
105
184
|
// ---------- PRD parsing ----------
|
|
106
185
|
|
|
107
186
|
/**
|
|
@@ -220,32 +299,41 @@ function reconcile(state) {
|
|
|
220
299
|
|
|
221
300
|
// ---------- next-reset detection ----------
|
|
222
301
|
|
|
223
|
-
let cachedNextReset = null;
|
|
302
|
+
let cachedNextReset = null; // bare ISO string or null
|
|
224
303
|
let cachedUtilization = null; // five_hour utilization %, 0–100, or null if unknown
|
|
225
304
|
|
|
305
|
+
/**
 * Fetch current usage from the billing module and refresh the module caches.
 *
 * On success the cached reset is overwritten with the five-hour window's
 * `resets_at` (or null when absent), while the cached utilization keeps its
 * previous value if the response carries none.
 *
 * @returns {Promise<string|null>} The newly cached reset timestamp.
 * @throws {Error} When the fetch result is anything other than `ok`, or when
 *   the fetch itself rejects — callers handle it.
 */
async function refreshNextReset() {
  const result = await billing.fetchUsage();
  if (result.kind !== 'ok') {
    throw new Error(`usage fetch failed (${result.kind}): ${result.message ?? ''}`);
  }

  const fiveHour = result.data?.usage?.five_hour;
  cachedNextReset = fiveHour?.resets_at ?? null;
  cachedUtilization = fiveHour?.utilization ?? cachedUtilization;
  return cachedNextReset;
}
|
|
236
313
|
|
|
237
314
|
/**
 * Return the last cached reset timestamp without touching the billing module.
 *
 * @returns {string|null} The cached value, or null when none is known.
 */
function getNextResetCached() {
  return cachedNextReset;
}
|
|
240
317
|
|
|
318
|
+
// ---------- health / poll state ----------
|
|
319
|
+
|
|
320
|
+
let bootedAt = Date.now();
|
|
321
|
+
let lastPollAt = null;
|
|
322
|
+
let lastPollOk = false;
|
|
323
|
+
let consecutiveFailures = 0;
|
|
324
|
+
let backoffMs = 0;
|
|
325
|
+
let backoffNextAt = null;
|
|
326
|
+
let firstFailureAt = null;
|
|
327
|
+
let pauseClearedManuallyAt = null;
|
|
328
|
+
|
|
241
329
|
// ---------- timer ----------
|
|
242
330
|
|
|
243
331
|
let mainWindow = null;
|
|
244
332
|
let fireTimer = null;
|
|
245
333
|
let resumeTimer = null;
|
|
246
|
-
let
|
|
334
|
+
let pollLoopTimer = null;
|
|
247
335
|
let rescheduleInterval = null;
|
|
248
|
-
let
|
|
336
|
+
let heartbeatInterval = null;
|
|
249
337
|
let isExecuting = false;
|
|
250
338
|
let cancelToken = { cancelled: false };
|
|
251
339
|
let claudeBinPathCached = null;
|
|
@@ -288,65 +376,83 @@ function computeFireAt(state, nextResetIso) {
|
|
|
288
376
|
|
|
289
377
|
/**
 * Recompute when the queue should next fire and (re)arm the fire timer.
 *
 * Steps: drop any armed timer, refresh the reset time (falling back to the
 * cached value if the refresh throws), reconcile the queue and store the new
 * `scheduledFor` inside one serialized mutation, broadcast, then arm a timer
 * if there is a fire time.
 *
 * @returns {Promise<void>}
 */
async function rescheduleTimer() {
  // Largest delay setTimeout honours; anything bigger is coerced to ~1ms by Node.
  const MAX_TIMER_DELAY_MS = 0x7fffffff;

  clearFireTimer();

  // On failure use the cached value so the on-reset timer can still be armed
  // from the last known reset.
  let nextResetIso;
  try {
    nextResetIso = await refreshNextReset();
  } catch {
    nextResetIso = cachedNextReset;
  }

  const fireAt = await mutate((state) => {
    reconcile(state);
    const fa = computeFireAt(state, nextResetIso);
    state.scheduledFor = fa ? new Date(fa).toISOString() : null;
    return fa;
  });
  broadcast();
  if (!fireAt) return;

  // Another rescheduleTimer() call may have armed a timer while this one was
  // awaiting above. Clear it so its handle is not orphaned and left to fire.
  clearFireTimer();

  const remaining = Math.max(1000, fireAt - Date.now());
  const delay = Math.min(remaining, MAX_TIMER_DELAY_MS);
  // If the fire time is beyond setTimeout's range, wake up at the cap and
  // re-plan instead of running the jobs early.
  const onFire = remaining > MAX_TIMER_DELAY_MS ? rescheduleTimer : runDueJobs;
  fireTimer = setTimeout(() => { onFire().catch(() => {}); }, delay);
  console.log(`[scheduler] next fire in ${Math.round(remaining / 1000)}s @ ${new Date(fireAt).toISOString()}`);
}
|
|
313
400
|
|
|
314
401
|
// ---------- pause / resume ----------
|
|
315
402
|
|
|
316
|
-
function setPaused(reason, resumeAtIso) {
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
403
|
+
/**
 * Pause the queue for `reason` and, when a resume time is known, arm a timer
 * that lifts the pause and re-runs due jobs.
 *
 * - Suppressed entirely if the user cleared a pause within the last 5 minutes.
 * - A `network` pause without an explicit resume time auto-resumes after 30 minutes.
 * - Re-pausing for the same reason only updates `resumeAt`; `since` is preserved.
 * - Marks the current cancel token as cancelled.
 *
 * @param {string} reason - Pause reason stored in the queue state.
 * @param {string|null|undefined} resumeAtIso - Timestamp at which to auto-resume, if any.
 * @returns {Promise<void>}
 */
async function setPaused(reason, resumeAtIso) {
  // Honor manual-override cooldown: if the user cleared a pause within the
  // last 5 minutes, suppress auto-pause re-engagement on the same condition.
  if (pauseClearedManuallyAt && Date.now() - pauseClearedManuallyAt < 300_000) {
    console.log(`[scheduler] setPaused(${reason}) suppressed by manual override cooldown`);
    return;
  }

  // For 'network' with no explicit resumeAt, auto-resume after 30 minutes.
  let effectiveResumeAt = resumeAtIso;
  if (reason === 'network' && !resumeAtIso) {
    effectiveResumeAt = new Date(Date.now() + 30 * 60_000).toISOString();
  }

  await mutate((s) => {
    if (s.paused && s.paused.reason === reason) {
      if (effectiveResumeAt) s.paused.resumeAt = effectiveResumeAt;
    } else {
      s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
    }
  });
  broadcast();
  cancelToken.cancelled = true;
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
  if (!effectiveResumeAt) return;

  const resumeAtMs = new Date(effectiveResumeAt).getTime();
  if (!Number.isFinite(resumeAtMs)) {
    // An unparsable timestamp would yield a NaN delay, which setTimeout treats
    // as ~1ms and would lift the pause immediately. Stay paused instead.
    console.warn(`[scheduler] paused (${reason}); resumeAt is not a valid date (${effectiveResumeAt})`);
    return;
  }

  // Resume 30s after the reset to give the auth/billing endpoint time to flip.
  const delay = Math.max(30_000, resumeAtMs - Date.now() + 30_000);
  if (delay > 0x7fffffff) {
    console.warn(`[scheduler] paused (${reason}); resumeAt too far for setTimeout (${delay}ms)`);
    return;
  }
  resumeTimer = setTimeout(async () => {
    try {
      await clearPause('resume-timer');
    } catch (e) {
      // Nothing awaits this callback; without the catch a failed queue write
      // would surface as an unhandled rejection.
      console.error('[scheduler] auto-resume failed', e);
      return;
    }
    runDueJobs().catch(() => {});
  }, delay);
  console.log(`[scheduler] paused (${reason}); auto-resume in ${Math.round(delay / 1000)}s`);
}
|
|
340
441
|
|
|
341
|
-
function clearPause(source) {
|
|
442
|
+
/**
 * Lift the current pause, if any, and cancel a pending auto-resume timer.
 *
 * When the clear comes from the user (`manual` or `run-now`) the time is
 * recorded and persisted so the auto-pause cooldown can honour it — this
 * happens even if the queue was not paused. Broadcasts only when a pause was
 * actually removed.
 *
 * @param {string} [source] - What triggered the clear; used for logging and
 *   to detect user-initiated clears.
 * @returns {Promise<void>}
 */
async function clearPause(source) {
  if (resumeTimer) {
    clearTimeout(resumeTimer);
    resumeTimer = null;
  }

  const wasPaused = await mutate((s) => {
    if (s.paused) {
      console.log(`[scheduler] clearPause (${source || 'manual'})`);
      s.paused = null;
      return true;
    }
    return false;
  });

  // Track manual clears for the auto-pause cooldown.
  const userInitiated = ['manual', 'run-now'].includes(source);
  if (userInitiated) {
    pauseClearedManuallyAt = Date.now();
    persistSchedulerState();
  }

  if (wasPaused) {
    broadcast();
  }
}
|
|
351
457
|
|
|
352
458
|
/** Mutate a job in place to "pending" with cleared run metadata. */
|
|
@@ -357,6 +463,7 @@ function resetJobFields(job, errorMsg) {
|
|
|
357
463
|
job.finishedAt = null;
|
|
358
464
|
job.exitCode = null;
|
|
359
465
|
job.error = errorMsg ?? null;
|
|
466
|
+
delete job.runtime;
|
|
360
467
|
}
|
|
361
468
|
|
|
362
469
|
/** Scan the tail of a job's log for the canonical rate-limit signal. We look
|
|
@@ -407,15 +514,34 @@ function pickRunDir() {
|
|
|
407
514
|
return { runId: ts, dir };
|
|
408
515
|
}
|
|
409
516
|
|
|
410
|
-
|
|
517
|
+
/**
|
|
518
|
+
* Execute a single PRD job. Writes stdout/stderr to a log file and a meta
|
|
519
|
+
* JSON sidecar. Accepts an optional onPid(pid) callback called synchronously
|
|
520
|
+
* after spawn so callers can persist the pid before the job finishes.
|
|
521
|
+
*/
|
|
522
|
+
async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
411
523
|
const logPath = path.join(runDir, `${job.slug}.log`);
|
|
412
524
|
const metaPath = path.join(runDir, `${job.slug}.meta.json`);
|
|
413
525
|
const cwd = job.cwd || defaultCwd;
|
|
414
526
|
const startedAt = Date.now();
|
|
415
527
|
|
|
416
528
|
const fd = fs.openSync(logPath, 'a');
|
|
529
|
+
let fdClosed = false;
|
|
530
|
+
const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
|
|
531
|
+
|
|
417
532
|
fs.writeSync(fd, `[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
|
|
418
533
|
|
|
534
|
+
// Dead-cwd guard: verify the target directory exists and is traversable
|
|
535
|
+
// before handing it to the child process.
|
|
536
|
+
try { fs.accessSync(cwd, fs.constants.X_OK); }
|
|
537
|
+
catch {
|
|
538
|
+
const errMsg = `cwd no longer exists: ${cwd}`;
|
|
539
|
+
fs.writeSync(fd, `[scheduler] ${errMsg}\n`);
|
|
540
|
+
closeFd();
|
|
541
|
+
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
|
|
542
|
+
return { exitCode: -1, durationMs: 0, error: errMsg };
|
|
543
|
+
}
|
|
544
|
+
|
|
419
545
|
// Read full PRD body fresh from disk (queue stored only the preview).
|
|
420
546
|
let prompt;
|
|
421
547
|
try {
|
|
@@ -423,37 +549,54 @@ async function executeJob(job, runDir, defaultCwd) {
|
|
|
423
549
|
prompt = parsed.body;
|
|
424
550
|
} catch (e) {
|
|
425
551
|
fs.writeSync(fd, `[scheduler] failed to read PRD: ${e?.message}\n`);
|
|
426
|
-
|
|
552
|
+
closeFd();
|
|
427
553
|
return { exitCode: -1, durationMs: 0, error: e?.message };
|
|
428
554
|
}
|
|
429
555
|
|
|
430
556
|
return await new Promise((resolve) => {
|
|
431
557
|
const claudeBin = resolveClaudeBin();
|
|
558
|
+
// Strip Claude Code env and secrets that leak in when session-manager is
|
|
559
|
+
// launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
|
|
560
|
+
// overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
|
|
561
|
+
const childEnv = cleanChildEnv();
|
|
432
562
|
const child = spawn(claudeBin, [
|
|
433
563
|
'-p', prompt,
|
|
564
|
+
'--model', 'sonnet',
|
|
434
565
|
'--dangerously-skip-permissions',
|
|
435
566
|
'--output-format', 'stream-json',
|
|
436
567
|
'--verbose',
|
|
437
568
|
], {
|
|
438
569
|
cwd,
|
|
439
|
-
env:
|
|
570
|
+
env: childEnv,
|
|
440
571
|
stdio: ['ignore', fd, fd],
|
|
441
572
|
});
|
|
442
573
|
|
|
443
574
|
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
|
|
444
575
|
|
|
576
|
+
// Fire-and-forget pid persistence — best effort.
|
|
577
|
+
if (onPid) onPid(child.pid).catch(() => {});
|
|
578
|
+
|
|
579
|
+
// Kill the child if it runs past the maximum allowed duration.
|
|
580
|
+
const watchdog = setTimeout(() => {
|
|
581
|
+
fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
|
|
582
|
+
try { child.kill('SIGKILL'); } catch { /* already dead */ }
|
|
583
|
+
}, MAX_JOB_DURATION_MS);
|
|
584
|
+
if (watchdog.unref) watchdog.unref();
|
|
585
|
+
|
|
445
586
|
child.on('error', (err) => {
|
|
587
|
+
clearTimeout(watchdog);
|
|
446
588
|
const durationMs = Date.now() - startedAt;
|
|
447
|
-
fs.writeSync(fd, `\n[scheduler]
|
|
448
|
-
|
|
589
|
+
fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
|
|
590
|
+
closeFd();
|
|
449
591
|
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
|
|
450
592
|
resolve({ exitCode: -1, durationMs, error: err.message });
|
|
451
593
|
});
|
|
452
594
|
|
|
453
595
|
child.on('exit', (code) => {
|
|
596
|
+
clearTimeout(watchdog);
|
|
454
597
|
const durationMs = Date.now() - startedAt;
|
|
455
598
|
fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
|
|
456
|
-
|
|
599
|
+
closeFd();
|
|
457
600
|
const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
|
|
458
601
|
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
|
|
459
602
|
resolve({ exitCode: code, durationMs, rateLimited });
|
|
@@ -477,7 +620,6 @@ async function runDueJobs() {
|
|
|
477
620
|
return;
|
|
478
621
|
}
|
|
479
622
|
const { runId, dir: runDir } = pickRunDir();
|
|
480
|
-
state.lastRunAt = new Date().toISOString();
|
|
481
623
|
|
|
482
624
|
// Group by parallelGroup, ascending. Each group runs serially after the
|
|
483
625
|
// previous group completes.
|
|
@@ -489,7 +631,7 @@ async function runDueJobs() {
|
|
|
489
631
|
}
|
|
490
632
|
const groupKeys = Array.from(groups.keys()).sort((a, b) => a - b);
|
|
491
633
|
|
|
492
|
-
|
|
634
|
+
await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
|
|
493
635
|
broadcast();
|
|
494
636
|
|
|
495
637
|
for (const gk of groupKeys) {
|
|
@@ -501,41 +643,59 @@ async function runDueJobs() {
|
|
|
501
643
|
const inFlight = new Set();
|
|
502
644
|
|
|
503
645
|
const launch = (job) => {
|
|
504
|
-
const
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
const
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
646
|
+
const promise = (async () => {
|
|
647
|
+
try {
|
|
648
|
+
// Mark job running.
|
|
649
|
+
await mutate((s) => {
|
|
650
|
+
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
651
|
+
if (idx >= 0) {
|
|
652
|
+
s.jobs[idx].status = 'running';
|
|
653
|
+
s.jobs[idx].runId = runId;
|
|
654
|
+
s.jobs[idx].startedAt = new Date().toISOString();
|
|
655
|
+
}
|
|
656
|
+
});
|
|
657
|
+
broadcast();
|
|
658
|
+
|
|
659
|
+
// Execute — onPid persists the child PID into the running state.
|
|
660
|
+
const res = await executeJob(job, runDir, state.config.defaultCwd, async (pid) => {
|
|
661
|
+
await mutate((s) => {
|
|
662
|
+
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
663
|
+
if (idx >= 0) {
|
|
664
|
+
s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
|
|
665
|
+
}
|
|
666
|
+
});
|
|
667
|
+
});
|
|
668
|
+
|
|
669
|
+
// Rate-limit: pause before writing terminal status so the status
|
|
670
|
+
// mutate below can read the pause state.
|
|
671
|
+
if (res.rateLimited) {
|
|
672
|
+
const resetIso = await refreshNextReset().catch(() => cachedNextReset);
|
|
673
|
+
await setPaused('rate_limit', resetIso);
|
|
532
674
|
}
|
|
533
|
-
|
|
675
|
+
|
|
676
|
+
// Write terminal status; strip runtime regardless of outcome.
|
|
677
|
+
await mutate((s) => {
|
|
678
|
+
const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
679
|
+
if (i2 >= 0) {
|
|
680
|
+
const treatAsPending = res.rateLimited || (s.paused && s.paused.reason === 'rate_limit');
|
|
681
|
+
if (treatAsPending) {
|
|
682
|
+
resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
|
|
683
|
+
} else {
|
|
684
|
+
s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
|
|
685
|
+
s.jobs[i2].finishedAt = new Date().toISOString();
|
|
686
|
+
s.jobs[i2].exitCode = res.exitCode;
|
|
687
|
+
s.jobs[i2].error = res.error || null;
|
|
688
|
+
delete s.jobs[i2].runtime;
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
});
|
|
534
692
|
broadcast();
|
|
693
|
+
} catch (e) {
|
|
694
|
+
console.error('[scheduler] launch error', job.slug, e);
|
|
535
695
|
}
|
|
536
|
-
|
|
537
|
-
});
|
|
696
|
+
})();
|
|
538
697
|
inFlight.add(promise);
|
|
698
|
+
promise.then(() => inFlight.delete(promise), () => inFlight.delete(promise));
|
|
539
699
|
};
|
|
540
700
|
|
|
541
701
|
// Prime up to cap
|
|
@@ -555,45 +715,97 @@ async function runDueJobs() {
|
|
|
555
715
|
isExecuting = false;
|
|
556
716
|
// No longer auto-disable after a run. The firePolicy now governs whether
|
|
557
717
|
// the next batch fires automatically. Just clear the one-shot scheduledFor.
|
|
558
|
-
|
|
559
|
-
s.scheduledFor = null;
|
|
560
|
-
writeQueue(s);
|
|
718
|
+
await mutate((s) => { s.scheduledFor = null; });
|
|
561
719
|
broadcast();
|
|
562
720
|
}
|
|
563
721
|
}
|
|
564
722
|
|
|
565
|
-
// ---------- when-available
|
|
723
|
+
// ---------- when-available launch logic ----------
|
|
566
724
|
|
|
567
|
-
async function
|
|
725
|
+
/**
 * Under the `when-available` policy, start a run if conditions allow.
 *
 * Does nothing when the policy differs, the queue is paused, a run is already
 * executing, no job is pending, or utilization is unknown. When utilization
 * is at or above the configured threshold it only broadcasts. Otherwise it
 * starts runDueJobs() without awaiting it.
 *
 * @param {object} state - Queue state snapshot (config, paused, jobs).
 * @returns {Promise<void>}
 */
async function maybeLaunchWhenAvailable(state) {
  const policyApplies = state.config.firePolicy === 'when-available';
  if (!policyApplies || state.paused || isExecuting) return;

  const pendingCount = state.jobs.filter((j) => j.status === 'pending').length;
  if (pendingCount === 0) return;

  if (cachedUtilization == null) return;

  const overThreshold = cachedUtilization >= state.config.utilizationThreshold;
  if (overThreshold) {
    broadcast();
    return;
  }

  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pendingCount} pending — firing`);
  runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
}
|
|
739
|
+
|
|
740
|
+
// ---------- poll loop with exponential backoff ----------
|
|
741
|
+
|
|
742
|
+
/**
 * One iteration of the usage poll loop; always re-arms itself in `finally`.
 *
 * Success: refresh the cached reset/utilization, reset the failure counters,
 * persist, lift a `network` (or resolved `reset_failure`) pause, then give the
 * when-available policy a chance to launch.
 *
 * Failure: count it; an `auth` result pauses the queue, any other result
 * doubles the backoff (30s up to 480s) and, after 30 minutes of continuous
 * failures, sets a `network` pause. Unexpected exceptions are logged and
 * handled like a transient failure.
 *
 * The next iteration runs after `backoffMs`, or POLL_INTERVAL_MS when there is
 * no backoff.
 *
 * @returns {Promise<void>}
 */
async function pollLoop() {
  // Shared bookkeeping for every kind of failed poll.
  const noteFailure = () => {
    lastPollAt = Date.now();
    lastPollOk = false;
    consecutiveFailures++;
    if (!firstFailureAt) firstFailureAt = Date.now();
  };
  // Exponential backoff: 30s first, then doubling, capped at 480s.
  const growBackoff = () => {
    backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
  };

  try {
    const r = await billing.fetchUsage();

    if (r.kind === 'ok') {
      cachedNextReset = r.data?.usage?.five_hour?.resets_at ?? cachedNextReset;
      cachedUtilization = r.data?.usage?.five_hour?.utilization ?? cachedUtilization;
      consecutiveFailures = 0;
      backoffMs = 0;
      backoffNextAt = null;
      firstFailureAt = null;
      lastPollAt = Date.now();
      lastPollOk = true;
      persistSchedulerState();

      const cur = readQueue();
      let pauseCleared = false;
      // If a 'network' pause resolved, clear it now that we have a good reading.
      if (cur.paused?.reason === 'network') {
        await clearPause('network-recovered');
        pauseCleared = true;
      }
      // If 'reset_failure' was set and we now have a valid reset, clear it.
      if (cur.paused?.reason === 'reset_failure' && cachedNextReset) {
        await clearPause('reset-recovered');
        pauseCleared = true;
      }

      // `cur` was read before the pause was lifted; handing it on unchanged
      // would make the launcher still see `paused` and skip this cycle.
      await maybeLaunchWhenAvailable(pauseCleared ? readQueue() : cur);
      broadcast();
    } else {
      noteFailure();

      if (r.kind === 'auth') {
        console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
        await setPaused('auth', null);
      } else {
        // transient or config — apply exponential backoff.
        growBackoff();
        const totalFailureMs = Date.now() - firstFailureAt;
        console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);

        // After 30 minutes of consecutive failures, set 'network' pause.
        if (totalFailureMs > 30 * 60_000) {
          const cur2 = readQueue();
          if (!cur2.paused || cur2.paused.reason === 'network') {
            await setPaused('network', null);
          }
        }
      }

      backoffNextAt = Date.now() + backoffMs;
      persistSchedulerState();
    }
  } catch (e) {
    // Unexpected error (e.g., IPC transport failure). Log it so the cause is
    // not lost, then treat it like a transient failure.
    console.error('[scheduler] poll loop error', e);
    noteFailure();
    growBackoff();
    backoffNextAt = Date.now() + backoffMs;
    persistSchedulerState();
  } finally {
    const delay = backoffMs || POLL_INTERVAL_MS;
    pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, delay);
    if (pollLoopTimer.unref) pollLoopTimer.unref();
  }
}
|
|
599
811
|
|
|
@@ -618,44 +830,85 @@ function registerScheduleHandlers() {
|
|
|
618
830
|
};
|
|
619
831
|
});
|
|
620
832
|
|
|
621
|
-
ipcMain.handle('schedule:
|
|
833
|
+
ipcMain.handle('schedule:health', async () => {
|
|
622
834
|
const state = readQueue();
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
835
|
+
const runningJobs = [];
|
|
836
|
+
for (const j of state.jobs) {
|
|
837
|
+
if (j.status === 'running' && j.runtime) {
|
|
838
|
+
runningJobs.push({
|
|
839
|
+
slug: j.slug,
|
|
840
|
+
startedAt: j.startedAt ? Date.parse(j.startedAt) : 0,
|
|
841
|
+
pid: j.runtime.pid ?? 0,
|
|
842
|
+
});
|
|
843
|
+
}
|
|
626
844
|
}
|
|
627
|
-
|
|
628
|
-
|
|
845
|
+
return {
|
|
846
|
+
bootedAt,
|
|
847
|
+
lastPollAt,
|
|
848
|
+
lastPollOk,
|
|
849
|
+
consecutiveFailures,
|
|
850
|
+
backoffNextAt,
|
|
851
|
+
nextResetCached: cachedNextReset,
|
|
852
|
+
pausedSince: state.paused ? Date.parse(state.paused.since) : null,
|
|
853
|
+
pauseReason: state.paused?.reason ?? null,
|
|
854
|
+
runningJobs,
|
|
855
|
+
};
|
|
856
|
+
});
|
|
857
|
+
|
|
858
|
+
ipcMain.handle('schedule:set-config', async (_e, partial) => {
|
|
859
|
+
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
860
|
+
let validated;
|
|
861
|
+
try {
|
|
862
|
+
validated = s.setConfigSchema.parse(partial || {});
|
|
863
|
+
} catch (e) {
|
|
864
|
+
return { ok: false, error: e?.message ?? 'invalid config' };
|
|
629
865
|
}
|
|
630
|
-
|
|
866
|
+
const config = await mutate((state) => {
|
|
867
|
+
state.config = { ...state.config, ...validated };
|
|
868
|
+
return state.config;
|
|
869
|
+
});
|
|
631
870
|
await rescheduleTimer();
|
|
632
|
-
return { ok: true, config
|
|
871
|
+
return { ok: true, config };
|
|
633
872
|
});
|
|
634
873
|
|
|
635
|
-
ipcMain.handle('schedule:reset-job', async (_e,
|
|
636
|
-
const
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
874
|
+
ipcMain.handle('schedule:reset-job', async (_e, payload) => {
|
|
875
|
+
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
876
|
+
let slug;
|
|
877
|
+
try {
|
|
878
|
+
({ slug } = s.scheduleSlug.parse(payload));
|
|
879
|
+
} catch (e) {
|
|
880
|
+
return { ok: false, error: 'invalid slug' };
|
|
881
|
+
}
|
|
882
|
+
// Containment check after path.join.
|
|
883
|
+
const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
|
|
884
|
+
if (!resolved.startsWith(PRDS_DIR + path.sep)) {
|
|
885
|
+
return { ok: false, error: 'invalid slug' };
|
|
886
|
+
}
|
|
887
|
+
const found = await mutate((state) => {
|
|
888
|
+
const idx = state.jobs.findIndex((j) => j.slug === slug);
|
|
889
|
+
if (idx < 0) return false;
|
|
890
|
+
resetJobFields(state.jobs[idx]);
|
|
891
|
+
return true;
|
|
892
|
+
});
|
|
893
|
+
if (!found) return { ok: false, error: 'not found' };
|
|
641
894
|
broadcast();
|
|
642
895
|
return { ok: true };
|
|
643
896
|
});
|
|
644
897
|
|
|
645
898
|
// IPC 'schedule:run-now': user-requested run of whatever is due.
ipcMain.handle('schedule:run-now', async () => {
  // A manual run-now overrides any auto-pause, so the pause is lifted (and
  // that is awaited) before the run is kicked off.
  await clearPause('run-now');

  // Not awaited: the reply goes back immediately and a failing run is only
  // logged.
  runDueJobs().catch((err) => {
    console.error('[scheduler] runDueJobs error', err);
  });

  return { ok: true };
});
|
|
651
904
|
|
|
652
905
|
// IPC 'schedule:resume': lift the current pause on explicit user request.
// The reply is sent only after clearPause('manual') has settled.
ipcMain.handle('schedule:resume', async () => {
  await clearPause('manual');

  return { ok: true };
});
|
|
656
909
|
|
|
657
910
|
// IPC 'schedule:refresh-reset': re-fetch the next reset time, re-run
// rescheduleTimer(), and reply with { ok: true, nextReset }.
ipcMain.handle('schedule:refresh-reset', async () => {
  // If the refresh rejects, fall back to the last cached value instead of
  // failing the whole request.
  const useCachedReset = () => cachedNextReset;
  const nextReset = await refreshNextReset().catch(useCachedReset);

  await rescheduleTimer();

  return { ok: true, nextReset };
});
|
|
@@ -666,39 +919,123 @@ function registerScheduleHandlers() {
|
|
|
666
919
|
return { ok: true };
|
|
667
920
|
});
|
|
668
921
|
|
|
669
|
-
ipcMain.handle('schedule:read-prd', async (_e,
|
|
922
|
+
// IPC 'schedule:read-prd': reply with the UTF-8 text of `<slug>.md` under
// PRDS_DIR as { ok: true, text }. Invalid payloads and read failures are
// reported as { ok: false, error }.
ipcMain.handle('schedule:read-prd', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  try {
    slug = schemas.scheduleSlug.parse(payload).slug;
  } catch {
    return { ok: false, error: 'invalid slug' };
  }

  // Checked again after schema validation: the resolved path must still sit
  // underneath PRDS_DIR.
  const filePath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
  const insidePrdsDir = filePath.startsWith(PRDS_DIR + path.sep);
  if (!insidePrdsDir) {
    return { ok: false, error: 'invalid slug' };
  }

  try {
    return { ok: true, text: await fsp.readFile(filePath, 'utf8') };
  } catch (err) {
    return { ok: false, error: err?.message };
  }
});
|
|
677
941
|
|
|
678
|
-
ipcMain.handle('schedule:read-log', async (_e,
|
|
942
|
+
// IPC 'schedule:read-log': reply with the UTF-8 text of
// `<runId>/<slug>.log` under RUNS_DIR as { ok: true, text }. Invalid payloads
// and read failures are reported as { ok: false, error }.
ipcMain.handle('schedule:read-log', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  let runId;
  try {
    const parsed = schemas.scheduleReadLog.parse(payload);
    slug = parsed.slug;
    runId = parsed.runId;
  } catch {
    return { ok: false, error: 'invalid slug or runId' };
  }

  // Checked again after schema validation: neither path component may move
  // the resolved path out of RUNS_DIR.
  const logPath = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
  const insideRunsDir = logPath.startsWith(RUNS_DIR + path.sep);
  if (!insideRunsDir) {
    return { ok: false, error: 'invalid slug or runId' };
  }

  try {
    return { ok: true, text: await fsp.readFile(logPath, 'utf8') };
  } catch (err) {
    return { ok: false, error: err?.message };
  }
});
|
|
961
|
+
|
|
962
|
+
// Largest PRD body accepted by 'schedule:write-prd', measured in UTF-8 bytes.
const PRD_WRITE_MAX_BYTES = 256 * 1024;
// A slug becomes a file name under PRDS_DIR: 1-128 characters drawn from
// ASCII letters, digits, '.', '_' and '-'.
const SLUG_RE = /^[A-Za-z0-9._-]{1,128}$/;

/**
 * IPC 'schedule:write-prd': write `body` to `<slug>.md` under PRDS_DIR.
 *
 * The text is first written to a sibling temp file and then renamed over the
 * target, so the target is only ever replaced in a single rename step.
 *
 * @param {{ slug: string, body: string }} payload
 * @returns {Promise<{ ok: true, bytesWritten: number }>} size of the file on
 *   disk after the rename.
 * @throws {Error} for a malformed slug, a non-string or oversized body, a
 *   path that escapes PRDS_DIR, or any underlying filesystem failure.
 */
ipcMain.handle('schedule:write-prd', async (_e, payload) => {
  // Destructure defensively so a missing payload reaches the explicit
  // validation errors below instead of failing inside parameter binding.
  const { slug, body } = payload ?? {};

  // RegExp#test() stringifies its argument, so without the typeof guard a
  // missing slug would be tested as "undefined" (which matches SLUG_RE) and
  // end up written to `undefined.md`.
  if (typeof slug !== 'string' || !SLUG_RE.test(slug)) {
    throw new Error(`invalid slug: ${String(slug)}`);
  }
  if (typeof body !== 'string') throw new Error('body must be string');
  if (Buffer.byteLength(body, 'utf8') > PRD_WRITE_MAX_BYTES) throw new Error('body too large');

  const file = path.join(PRDS_DIR, `${slug}.md`);
  const resolved = path.resolve(file);
  if (!resolved.startsWith(PRDS_DIR + path.sep)) throw new Error('path escape');

  // pid + timestamp keep temp names from colliding across processes/calls.
  const tmp = `${resolved}.${process.pid}.${Date.now()}.tmp`;
  try {
    await fsp.writeFile(tmp, body, { encoding: 'utf8', mode: 0o644 });
    await fsp.rename(tmp, resolved);
  } catch (err) {
    // Best-effort cleanup so a failed write/rename does not leave a stray
    // temp file in PRDS_DIR; the original error is what the caller sees.
    await fsp.unlink(tmp).catch(() => {});
    throw err;
  }

  const stat = await fsp.stat(resolved);
  return { ok: true, bytesWritten: stat.size };
});
|
|
978
|
+
|
|
979
|
+
// IPC 'schedule:list-prds': summarise every non-hidden `*.md` file in
// PRDS_DIR. Replies with a bare array of
// { slug, parallelGroup, title, cwd, estimateMinutes, mtimeMs } sorted by
// slug, or [] when the directory cannot be read. Files that fail to parse or
// stat are logged and left out.
ipcMain.handle('schedule:list-prds', async () => {
  ensureDirs();

  let names;
  try {
    names = await fsp.readdir(PRDS_DIR);
  } catch {
    return [];
  }

  const isVisiblePrd = (name) => name.endsWith('.md') && !name.startsWith('.');
  const summaries = [];

  for (const name of names.filter(isVisiblePrd)) {
    const filePath = path.join(PRDS_DIR, name);
    try {
      const { slug, parallelGroup, title, cwd, estimateMinutes } = parsePrd(filePath);
      const { mtimeMs } = await fsp.stat(filePath);
      summaries.push({
        slug,
        parallelGroup,
        title,
        cwd: cwd || '',
        estimateMinutes,
        mtimeMs,
      });
    } catch (err) {
      console.warn('[scheduler] list-prds: skipping unparseable', name, err?.message);
    }
  }

  // numeric: true orders digit runs by value, so e.g. "2-x" sorts before "10-x".
  const bySlug = (a, b) => a.slug.localeCompare(b.slug, undefined, { numeric: true });
  summaries.sort(bySlug);
  return summaries;
});
|
|
687
1009
|
}
|
|
688
1010
|
|
|
689
1011
|
/**
 * powerMonitor 'resume' listener: after the machine wakes, drop any pending
 * poll timer and backoff, clear a pause whose resumeAt has already passed
 * (then run due jobs), and poll again immediately.
 *
 * Kept as a named module-level function so init() can detach it before
 * re-attaching, which keeps repeated init() calls from stacking listeners.
 */
function handleSystemResume() {
  console.log('[scheduler] system resumed; re-polling and re-evaluating queue');
  if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
  backoffMs = 0;
  backoffNextAt = null;

  // The poll timer has just been cleared, so a throw from readQueue() must
  // not prevent the pollLoop() call below — otherwise nothing would be left
  // scheduled to poll again.
  try {
    // Clear any paused-but-resumeAt-elapsed state immediately.
    const wakeState = readQueue();
    const resumeAt = wakeState.paused?.resumeAt;
    if (resumeAt && new Date(resumeAt).getTime() <= Date.now()) {
      clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
    }
  } catch (e) {
    console.warn('[scheduler] resume: could not read queue', e?.message);
  }

  pollLoop().catch(() => {});
}

/**
 * Start (or restart) the scheduler: ensure directories exist, load cached
 * state, fail jobs that were left 'running', restore or clear the pause,
 * and (re)arm the reschedule interval, poll loop, heartbeat and the
 * wake-from-sleep listener. Timers and the listener are replaced rather than
 * added, so calling init() again does not leak them.
 *
 * @returns {Promise<void>}
 */
async function init() {
  ensureDirs();

  // Hydrate cached state from the sidecar before any scheduling decisions.
  loadSchedulerState();
  bootedAt = Date.now();

  // Boot reconciliation: mark any job that was 'running' when the app died as
  // 'failed'. mutate() creates queue.json from defaults if it doesn't exist.
  await mutate((state) => {
    for (const job of state.jobs) {
      if (job.status !== 'running') continue;
      job.status = 'failed';
      job.error = 'orphaned: app restarted while running';
      job.finishedAt = new Date().toISOString();
      delete job.runtime;
    }
  });

  // If we boot up while paused with a resumeAt in the past, clear it. This
  // happens when the app was closed across the reset window.
  const boot = readQueue();
  const bootResumeAt = boot.paused?.resumeAt;
  if (bootResumeAt) {
    if (new Date(bootResumeAt).getTime() <= Date.now()) {
      await clearPause('boot-elapsed');
    } else {
      // Re-arm the resume timer (lost across restart).
      await setPaused(boot.paused.reason, bootResumeAt);
    }
  }

  await rescheduleTimer();

  // Re-run rescheduleTimer() every 10 minutes (per the original note: in case
  // the window resets early or the auth token rotates). Tracked so re-init
  // doesn't leak.
  if (rescheduleInterval) clearInterval(rescheduleInterval);
  rescheduleInterval = setInterval(() => { rescheduleTimer().catch(() => {}); }, 10 * 60_000);

  // Self-rescheduling poll loop with exponential backoff. Replaces the
  // old fixed-interval pollTimer + initialPollTimeout.
  if (pollLoopTimer) clearTimeout(pollLoopTimer);
  // First tick fires after the standard warmup delay so billing is ready.
  pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
  if (pollLoopTimer.unref) pollLoopTimer.unref();

  // Heartbeat: once per minute, log queue state for 24h visibility.
  if (heartbeatInterval) clearInterval(heartbeatInterval);
  heartbeatInterval = setInterval(() => {
    // A throw inside a timer callback would surface as an uncaught exception
    // in the main process; a missed heartbeat is only worth a warning.
    try {
      const queue = readQueue();
      const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
      for (const job of queue.jobs) counts[job.status] = (counts[job.status] || 0) + 1;
      appendHeartbeat({
        ts: Date.now(),
        counts,
        paused: queue.paused ? { reason: queue.paused.reason, resumeAt: queue.paused.resumeAt } : null,
        nextReset: cachedNextReset,
        utilization: cachedUtilization,
        consecutiveFailures,
      });
    } catch (e) {
      console.warn('[scheduler] heartbeat failed', e?.message);
    }
  }, 60_000);
  if (heartbeatInterval.unref) heartbeatInterval.unref();

  // Wake-from-sleep: immediately re-poll and re-evaluate the queue.
  try {
    const { powerMonitor } = require('electron');
    // removeListener() is a no-op when the handler is not attached, so this
    // pair leaves exactly one listener no matter how often init() runs.
    powerMonitor.removeListener('resume', handleSystemResume);
    powerMonitor.on('resume', handleSystemResume);
  } catch (e) {
    console.warn('[scheduler] powerMonitor unavailable', e?.message);
  }
}
|
|
717
1090
|
|
|
718
1091
|
module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR };
|