claude-code-session-manager 0.8.1 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -11
- package/dist/assets/{cssMode-DKTELvb6.js → cssMode-DyaNC2Cs.js} +1 -1
- package/dist/assets/{editor.main-Dx55Am4z.js → editor.main-BhSGi_Jw.js} +3 -3
- package/dist/assets/{freemarker2-CBdvn_u-.js → freemarker2-DZH3si5v.js} +1 -1
- package/dist/assets/{handlebars-B67ay2ue.js → handlebars-DvzTd6uL.js} +1 -1
- package/dist/assets/{html-002uK0_M.js → html-C5GmopAN.js} +1 -1
- package/dist/assets/{htmlMode-DsT8oVY_.js → htmlMode-DwnrHwx1.js} +1 -1
- package/dist/assets/index-BGshD4Pw.js +2976 -0
- package/dist/assets/index-DCK87t79.css +32 -0
- package/dist/assets/{javascript-Cfg-gFlu.js → javascript-JqHrxiCa.js} +1 -1
- package/dist/assets/{jsonMode-CCIKxANa.js → jsonMode-8rZcy09i.js} +1 -1
- package/dist/assets/{liquid-DewgYvox.js → liquid-ClpD_v7G.js} +1 -1
- package/dist/assets/{lspLanguageFeatures-BcMPMUo0.js → lspLanguageFeatures-u0WgQBQz.js} +1 -1
- package/dist/assets/{mdx-BGrrIvjV.js → mdx-DtViUgdm.js} +1 -1
- package/dist/assets/{python-CVhAv32T.js → python-CaAvhRGm.js} +1 -1
- package/dist/assets/{razor-DteXtrPO.js → razor-saGNVU7l.js} +1 -1
- package/dist/assets/{tsMode-DKeWRYvl.js → tsMode-HZwWTCj8.js} +1 -1
- package/dist/assets/{typescript-Dl1KPrAp.js → typescript-BInV4PNE.js} +1 -1
- package/dist/assets/{xml-DdyOGE0N.js → xml-tgO806YR.js} +1 -1
- package/dist/assets/{yaml-BwFXDW6t.js → yaml-CHApZArv.js} +1 -1
- package/dist/index.html +2 -2
- package/package.json +1 -1
- package/src/main/config.cjs +93 -19
- package/src/main/index.cjs +163 -31
- package/src/main/ipcSchemas.cjs +59 -2
- package/src/main/lib/cleanEnv.cjs +20 -0
- package/src/main/lib/credentials.cjs +184 -0
- package/src/main/lib/schedulerConfig.cjs +10 -0
- package/src/main/logs.cjs +1 -1
- package/src/main/otelSettings.cjs +1 -1
- package/src/main/pty.cjs +53 -6
- package/src/main/scheduler.cjs +518 -147
- package/src/main/transcripts.cjs +26 -21
- package/src/main/usage.cjs +76 -25
- package/src/main/voiceSettings.cjs +1 -1
- package/src/main/watchers.cjs +69 -11
- package/src/preload/api.d.ts +51 -11
- package/src/preload/index.cjs +13 -0
- package/dist/assets/index-DsC4vT8M.css +0 -32
- package/dist/assets/index-E14-spyd.js +0 -2972
package/src/main/scheduler.cjs
CHANGED
|
@@ -47,11 +47,20 @@ const os = require('node:os');
|
|
|
47
47
|
const { spawn } = require('node:child_process');
|
|
48
48
|
const { ipcMain } = require('electron');
|
|
49
49
|
const billing = require('./usage.cjs');
|
|
50
|
+
const { cleanChildEnv } = require('./lib/cleanEnv.cjs');
|
|
51
|
+
const {
|
|
52
|
+
POLL_INTERVAL_MS,
|
|
53
|
+
USAGE_REFRESH_INTERVAL_MS,
|
|
54
|
+
MAX_JOB_DURATION_MS,
|
|
55
|
+
} = require('./lib/schedulerConfig.cjs');
|
|
50
56
|
|
|
51
57
|
const ROOT = path.join(os.homedir(), '.claude', 'session-manager', 'scheduled-plans');
|
|
52
58
|
const PRDS_DIR = path.join(ROOT, 'prds');
|
|
53
59
|
const RUNS_DIR = path.join(ROOT, 'runs');
|
|
54
60
|
const QUEUE_PATH = path.join(ROOT, 'queue.json');
|
|
61
|
+
const SCHEDULER_STATE_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-state.json');
|
|
62
|
+
const HEARTBEAT_PATH = path.join(os.homedir(), '.claude', 'session-manager', 'scheduler-heartbeat.log');
|
|
63
|
+
const HEARTBEAT_MAX_BYTES = 1024 * 1024;
|
|
55
64
|
const DEFAULT_PROJECT_CWD = path.join(os.homedir(), 'Projects', 'session-manager');
|
|
56
65
|
|
|
57
66
|
const DEFAULT_CONFIG = {
|
|
@@ -62,7 +71,7 @@ const DEFAULT_CONFIG = {
|
|
|
62
71
|
defaultCwd: DEFAULT_PROJECT_CWD,
|
|
63
72
|
// 'when-available' = poll usage and fire whenever utilization < threshold.
|
|
64
73
|
// 'on-reset' = fire offsetMinutes after the next 5h reset (legacy).
|
|
65
|
-
// 'manual' = only fire on explicit Run
|
|
74
|
+
// 'manual' = only fire on explicit Run now click.
|
|
66
75
|
firePolicy: 'when-available',
|
|
67
76
|
// For 'when-available'. Fire only when five_hour utilization < this percent.
|
|
68
77
|
utilizationThreshold: 90,
|
|
@@ -77,11 +86,61 @@ function ensureDirs() {
|
|
|
77
86
|
}
|
|
78
87
|
|
|
79
88
|
/**
 * Write `data` as pretty-printed JSON to `p` by writing a uniquely named
 * sibling temp file first and then renaming it over the target.
 *
 * The temp name combines pid, timestamp and a random suffix so overlapping
 * writers never share a temp file. Because every call uses a fresh name, a
 * failed write/rename would otherwise leave an orphaned `.tmp` file behind on
 * each failure; the temp file is therefore removed before the error is
 * rethrown.
 *
 * @param {string} p - Destination path of the JSON file.
 * @param {*} data - JSON-serializable value.
 * @throws Rethrows any serialization, write or rename error unchanged.
 */
function atomicWriteJson(p, data) {
  const tmp = `${p}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`;
  try {
    fs.writeFileSync(tmp, JSON.stringify(data, null, 2));
    fs.renameSync(tmp, p);
  } catch (err) {
    // Best-effort cleanup: the temp file may never have been created.
    try { fs.unlinkSync(tmp); } catch { /* nothing to clean up */ }
    throw err;
  }
}
|
|
84
93
|
|
|
94
|
+
// ---------- scheduler-state.json (sidecar) ----------
|
|
95
|
+
|
|
96
|
+
/**
 * Restore poll/backoff bookkeeping from the scheduler-state sidecar file.
 *
 * Each field is copied into its module-level variable only when it passes a
 * basic check (truthy reset value, numeric counters/timestamps). Any read or
 * parse problem is ignored so the scheduler simply starts with its defaults.
 */
function loadSchedulerState() {
  const isNumber = (value) => typeof value === 'number';
  try {
    const {
      lastObservedReset,
      consecutiveFailures: savedFailures,
      backoffMs: savedBackoffMs,
      pauseClearedManuallyAt: savedManualClearAt,
      lastPollAt: savedLastPollAt,
    } = JSON.parse(fs.readFileSync(SCHEDULER_STATE_PATH, 'utf8'));

    if (lastObservedReset) {
      cachedNextReset = lastObservedReset;
    }
    if (isNumber(savedFailures)) {
      consecutiveFailures = savedFailures;
    }
    if (isNumber(savedBackoffMs)) {
      backoffMs = savedBackoffMs;
    }
    if (isNumber(savedManualClearAt)) {
      pauseClearedManuallyAt = savedManualClearAt;
    }
    if (isNumber(savedLastPollAt)) {
      lastPollAt = savedLastPollAt;
    }
  } catch {
    // First boot or corrupt file — keep the in-memory defaults.
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Snapshot the in-memory poll/backoff bookkeeping into the scheduler-state
 * sidecar file. Failures are logged as warnings and never thrown.
 */
function persistSchedulerState() {
  try {
    const snapshot = {
      version: 1,
      lastObservedReset: cachedNextReset,
      // Stamped with the write time whenever a reset value is cached.
      lastResetObservedAt: cachedNextReset ? Date.now() : null,
      lastPollAt,
      consecutiveFailures,
      backoffMs,
      // Always written as null by this function.
      pausedReason: null,
      pausedSince: null,
      pauseClearedManuallyAt,
    };
    atomicWriteJson(SCHEDULER_STATE_PATH, snapshot);
  } catch (err) {
    console.warn('[scheduler] failed to persist scheduler state', err?.message);
  }
}
|
|
125
|
+
|
|
126
|
+
// ---------- heartbeat log ----------
|
|
127
|
+
|
|
128
|
+
/**
 * Append one JSON line to the heartbeat log.
 *
 * When the log has reached HEARTBEAT_MAX_BYTES it is rotated to `<log>.1`
 * (replacing any previous rotation) before the new line is appended. Every
 * rotation step is best-effort; a failed append is logged as a warning.
 *
 * @param {*} entry - Value serialized with JSON.stringify as a single line.
 */
function appendHeartbeat(entry) {
  try {
    const serialized = `${JSON.stringify(entry)}\n`;

    let currentBytes;
    try {
      currentBytes = fs.statSync(HEARTBEAT_PATH).size;
    } catch {
      currentBytes = 0; // log does not exist yet
    }

    if (currentBytes >= HEARTBEAT_MAX_BYTES) {
      const previous = `${HEARTBEAT_PATH}.1`;
      const rotationSteps = [
        () => fs.unlinkSync(previous),
        () => fs.renameSync(HEARTBEAT_PATH, previous),
      ];
      for (const step of rotationSteps) {
        try { step(); } catch { /* best effort */ }
      }
    }

    fs.appendFileSync(HEARTBEAT_PATH, serialized);
  } catch (err) {
    console.warn('[scheduler] heartbeat write failed', err?.message);
  }
}
|
|
143
|
+
|
|
85
144
|
function readQueue() {
|
|
86
145
|
try {
|
|
87
146
|
const raw = fs.readFileSync(QUEUE_PATH, 'utf8');
|
|
@@ -103,6 +162,25 @@ function writeQueue(state) {
|
|
|
103
162
|
atomicWriteJson(QUEUE_PATH, state);
|
|
104
163
|
}
|
|
105
164
|
|
|
165
|
+
// ---------- serialized mutation queue ----------
|
|
166
|
+
|
|
167
|
+
// All read-modify-write operations on queue.json go through mutate() so
|
|
168
|
+
// concurrent job completions in a parallel wave cannot lose each other's
|
|
169
|
+
// status updates. mutateTail is always a resolved promise even when the
|
|
170
|
+
// preceding mutate threw, so the chain never deadlocks.
|
|
171
|
+
let mutateTail = Promise.resolve();
|
|
172
|
+
|
|
173
|
+
/**
 * Run a read-modify-write cycle on the queue file, serialized behind every
 * previously submitted mutation.
 *
 * The callback receives the freshly read queue state, may mutate it (and may
 * be async); the state is written back only if the callback did not throw.
 *
 * @param {(state: object) => any} fn - Mutator invoked with the current state.
 * @returns {Promise<any>} Settles with the callback's result, or rejects with
 *   its error. The internal chain tail swallows that rejection so later
 *   mutations still run.
 */
function mutate(fn) {
  const applyMutation = async () => {
    const snapshot = readQueue();
    const outcome = await fn(snapshot);
    writeQueue(snapshot);
    return outcome;
  };

  const pending = mutateTail.then(applyMutation);
  // The tail must never reject, otherwise every later mutation would be skipped.
  mutateTail = pending.catch(() => {});
  return pending;
}
|
|
183
|
+
|
|
106
184
|
// ---------- PRD parsing ----------
|
|
107
185
|
|
|
108
186
|
/**
|
|
@@ -221,32 +299,41 @@ function reconcile(state) {
|
|
|
221
299
|
|
|
222
300
|
// ---------- next-reset detection ----------
|
|
223
301
|
|
|
224
|
-
let cachedNextReset = null;
|
|
302
|
+
let cachedNextReset = null; // bare ISO string or null
|
|
225
303
|
let cachedUtilization = null; // five_hour utilization %, 0–100, or null if unknown
|
|
226
304
|
|
|
305
|
+
/**
 * Fetch the latest usage from the billing module and refresh the cached
 * five-hour window values.
 *
 * The cached reset becomes null when the response carries no `resets_at`;
 * the cached utilization keeps its previous value when the response carries
 * none.
 *
 * @returns {Promise<string|null>} The newly cached reset value.
 * @throws {Error} When the fetch result kind is anything other than 'ok'.
 */
async function refreshNextReset() {
  const result = await billing.fetchUsage();
  if (result.kind !== 'ok') {
    throw new Error(`usage fetch failed (${result.kind}): ${result.message ?? ''}`);
  }

  const fiveHour = result.data?.usage?.five_hour;
  cachedNextReset = fiveHour?.resets_at ?? null;
  cachedUtilization = fiveHour?.utilization ?? cachedUtilization;
  return cachedNextReset;
}
|
|
237
313
|
|
|
238
314
|
/**
 * Return the last cached five-hour reset value without contacting the
 * billing module.
 *
 * @returns {string|null} Cached reset value, or null when none is known.
 */
function getNextResetCached() {
  const lastKnownReset = cachedNextReset;
  return lastKnownReset;
}
|
|
241
317
|
|
|
318
|
+
// ---------- health / poll state ----------
|
|
319
|
+
|
|
320
|
+
let bootedAt = Date.now();
|
|
321
|
+
let lastPollAt = null;
|
|
322
|
+
let lastPollOk = false;
|
|
323
|
+
let consecutiveFailures = 0;
|
|
324
|
+
let backoffMs = 0;
|
|
325
|
+
let backoffNextAt = null;
|
|
326
|
+
let firstFailureAt = null;
|
|
327
|
+
let pauseClearedManuallyAt = null;
|
|
328
|
+
|
|
242
329
|
// ---------- timer ----------
|
|
243
330
|
|
|
244
331
|
let mainWindow = null;
|
|
245
332
|
let fireTimer = null;
|
|
246
333
|
let resumeTimer = null;
|
|
247
|
-
let
|
|
334
|
+
let pollLoopTimer = null;
|
|
248
335
|
let rescheduleInterval = null;
|
|
249
|
-
let
|
|
336
|
+
let heartbeatInterval = null;
|
|
250
337
|
let isExecuting = false;
|
|
251
338
|
let cancelToken = { cancelled: false };
|
|
252
339
|
let claudeBinPathCached = null;
|
|
@@ -289,65 +376,83 @@ function computeFireAt(state, nextResetIso) {
|
|
|
289
376
|
|
|
290
377
|
/**
 * Recompute when the next batch should fire and (re)arm the fire timer.
 *
 * Steps: clear the current timer, refresh the next-reset value (falling back
 * to the cached value if the refresh throws), reconcile the queue and persist
 * `scheduledFor` through the serialized mutation queue, broadcast, then arm a
 * timer for the computed fire time (minimum delay 1s). When no fire time is
 * computed, no timer is armed.
 *
 * @returns {Promise<void>}
 */
async function rescheduleTimer() {
  clearFireTimer();
  // On failure use the cached value so the on-reset timer can still be armed
  // from the last known reset.
  let nextResetIso;
  try {
    nextResetIso = await refreshNextReset();
  } catch {
    nextResetIso = cachedNextReset;
  }
  const fireAt = await mutate((state) => {
    reconcile(state);
    const fa = computeFireAt(state, nextResetIso);
    state.scheduledFor = fa ? new Date(fa).toISOString() : null;
    return fa;
  });
  broadcast();

  // Another rescheduleTimer() call may have armed a timer while this one was
  // awaiting above. Clear again so the most recent computation wins and no
  // orphaned timer fires a second batch.
  clearFireTimer();
  if (!fireAt) return;

  const delay = Math.max(1000, fireAt - Date.now());
  fireTimer = setTimeout(() => {
    runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
  }, delay);
  console.log(`[scheduler] next fire in ${Math.round(delay / 1000)}s @ ${new Date(fireAt).toISOString()}`);
}
|
|
314
400
|
|
|
315
401
|
// ---------- pause / resume ----------
|
|
316
402
|
|
|
317
|
-
/**
 * Put the queue into a paused state and optionally arm an auto-resume timer.
 *
 * - Suppressed entirely when a manual clear happened within the last 5
 *   minutes (manual-override cooldown).
 * - A 'network' pause without an explicit resume time auto-resumes after 30
 *   minutes.
 * - Re-pausing for the same reason only updates `resumeAt`; a new reason
 *   replaces the pause record.
 * - Marks the current cancel token as cancelled and replaces any existing
 *   resume timer.
 *
 * @param {string} reason - Pause reason stored on the queue state.
 * @param {string|null|undefined} resumeAtIso - Timestamp after which to
 *   auto-resume, if any.
 * @returns {Promise<void>}
 */
async function setPaused(reason, resumeAtIso) {
  // Honor manual-override cooldown: if the user cleared a pause within the
  // last 5 minutes, suppress auto-pause re-engagement on the same condition.
  if (pauseClearedManuallyAt && Date.now() - pauseClearedManuallyAt < 300_000) {
    console.log(`[scheduler] setPaused(${reason}) suppressed by manual override cooldown`);
    return;
  }

  // For 'network' with no explicit resumeAt, auto-resume after 30 minutes.
  let effectiveResumeAt = resumeAtIso;
  if (reason === 'network' && !resumeAtIso) {
    effectiveResumeAt = new Date(Date.now() + 30 * 60_000).toISOString();
  }

  await mutate((s) => {
    if (s.paused && s.paused.reason === reason) {
      if (effectiveResumeAt) s.paused.resumeAt = effectiveResumeAt;
    } else {
      s.paused = { reason, since: new Date().toISOString(), resumeAt: effectiveResumeAt || null };
    }
  });
  broadcast();
  cancelToken.cancelled = true;
  if (resumeTimer) { clearTimeout(resumeTimer); resumeTimer = null; }
  if (!effectiveResumeAt) return;

  // An unparseable resumeAt yields NaN, which setTimeout would treat as an
  // almost-immediate timeout and silently undo the pause. Refuse to arm the
  // timer instead; the pause then stays until it is cleared some other way.
  const resumeAtMs = new Date(effectiveResumeAt).getTime();
  if (!Number.isFinite(resumeAtMs)) {
    console.warn(`[scheduler] paused (${reason}); resumeAt is not a valid date (${effectiveResumeAt}), no auto-resume armed`);
    return;
  }

  // Resume 30s after the reset to give the auth/billing endpoint time to flip.
  const delay = Math.max(30_000, resumeAtMs - Date.now() + 30_000);
  if (delay > 0x7fffffff) {
    console.warn(`[scheduler] paused (${reason}); resumeAt too far for setTimeout (${delay}ms)`);
    return;
  }
  resumeTimer = setTimeout(async () => {
    // A rejection inside a timer callback has no caller to handle it, so it
    // is caught here rather than surfacing as an unhandled rejection.
    try {
      await clearPause('resume-timer');
    } catch (e) {
      console.error('[scheduler] auto-resume failed to clear pause', e);
      return;
    }
    runDueJobs().catch(() => {});
  }, delay);
  console.log(`[scheduler] paused (${reason}); auto-resume in ${Math.round(delay / 1000)}s`);
}
|
|
341
441
|
|
|
342
|
-
/**
 * Lift the current pause, if there is one.
 *
 * Cancels any pending resume timer, clears `paused` on the queue through the
 * serialized mutation queue, and broadcasts only when a pause was actually
 * removed. Clears triggered by 'manual' or 'run-now' also record the time of
 * the clear (persisted to the sidecar state) so the auto-pause cooldown can
 * honor it — this happens whether or not a pause was active.
 *
 * @param {string} [source] - What triggered the clear; used for logging and
 *   for the manual-cooldown bookkeeping.
 * @returns {Promise<void>}
 */
async function clearPause(source) {
  if (resumeTimer) {
    clearTimeout(resumeTimer);
    resumeTimer = null;
  }

  const hadPause = await mutate((queue) => {
    const active = Boolean(queue.paused);
    if (active) {
      console.log(`[scheduler] clearPause (${source || 'manual'})`);
      queue.paused = null;
    }
    return active;
  });

  // Track manual clears for the auto-pause cooldown.
  const isManualClear = ['manual', 'run-now'].includes(source);
  if (isManualClear) {
    pauseClearedManuallyAt = Date.now();
    persistSchedulerState();
  }

  if (hadPause) {
    broadcast();
  }
}
|
|
352
457
|
|
|
353
458
|
/** Mutate a job in place to "pending" with cleared run metadata. */
|
|
@@ -358,6 +463,7 @@ function resetJobFields(job, errorMsg) {
|
|
|
358
463
|
job.finishedAt = null;
|
|
359
464
|
job.exitCode = null;
|
|
360
465
|
job.error = errorMsg ?? null;
|
|
466
|
+
delete job.runtime;
|
|
361
467
|
}
|
|
362
468
|
|
|
363
469
|
/** Scan the tail of a job's log for the canonical rate-limit signal. We look
|
|
@@ -408,15 +514,34 @@ function pickRunDir() {
|
|
|
408
514
|
return { runId: ts, dir };
|
|
409
515
|
}
|
|
410
516
|
|
|
411
|
-
|
|
517
|
+
/**
|
|
518
|
+
* Execute a single PRD job. Writes stdout/stderr to a log file and a meta
|
|
519
|
+
* JSON sidecar. Accepts an optional onPid(pid) callback called synchronously
|
|
520
|
+
* after spawn so callers can persist the pid before the job finishes.
|
|
521
|
+
*/
|
|
522
|
+
async function executeJob(job, runDir, defaultCwd, onPid) {
|
|
412
523
|
const logPath = path.join(runDir, `${job.slug}.log`);
|
|
413
524
|
const metaPath = path.join(runDir, `${job.slug}.meta.json`);
|
|
414
525
|
const cwd = job.cwd || defaultCwd;
|
|
415
526
|
const startedAt = Date.now();
|
|
416
527
|
|
|
417
528
|
const fd = fs.openSync(logPath, 'a');
|
|
529
|
+
let fdClosed = false;
|
|
530
|
+
const closeFd = () => { if (fdClosed) return; fdClosed = true; fs.closeSync(fd); };
|
|
531
|
+
|
|
418
532
|
fs.writeSync(fd, `[scheduler] starting ${job.slug} at ${new Date().toISOString()}\n[scheduler] cwd=${cwd}\n\n`);
|
|
419
533
|
|
|
534
|
+
// Dead-cwd guard: verify the target directory exists and is traversable
|
|
535
|
+
// before handing it to the child process.
|
|
536
|
+
try { fs.accessSync(cwd, fs.constants.X_OK); }
|
|
537
|
+
catch {
|
|
538
|
+
const errMsg = `cwd no longer exists: ${cwd}`;
|
|
539
|
+
fs.writeSync(fd, `[scheduler] ${errMsg}\n`);
|
|
540
|
+
closeFd();
|
|
541
|
+
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: errMsg, startedAt, finishedAt: Date.now(), durationMs: 0 });
|
|
542
|
+
return { exitCode: -1, durationMs: 0, error: errMsg };
|
|
543
|
+
}
|
|
544
|
+
|
|
420
545
|
// Read full PRD body fresh from disk (queue stored only the preview).
|
|
421
546
|
let prompt;
|
|
422
547
|
try {
|
|
@@ -424,12 +549,16 @@ async function executeJob(job, runDir, defaultCwd) {
|
|
|
424
549
|
prompt = parsed.body;
|
|
425
550
|
} catch (e) {
|
|
426
551
|
fs.writeSync(fd, `[scheduler] failed to read PRD: ${e?.message}\n`);
|
|
427
|
-
|
|
552
|
+
closeFd();
|
|
428
553
|
return { exitCode: -1, durationMs: 0, error: e?.message };
|
|
429
554
|
}
|
|
430
555
|
|
|
431
556
|
return await new Promise((resolve) => {
|
|
432
557
|
const claudeBin = resolveClaudeBin();
|
|
558
|
+
// Strip Claude Code env and secrets that leak in when session-manager is
|
|
559
|
+
// launched from a `claude` shell. CLAUDE_EFFORT=xhigh forces Opus and
|
|
560
|
+
// overrides `--model sonnet`, so scheduled jobs burn Opus credits silently.
|
|
561
|
+
const childEnv = cleanChildEnv();
|
|
433
562
|
const child = spawn(claudeBin, [
|
|
434
563
|
'-p', prompt,
|
|
435
564
|
'--model', 'sonnet',
|
|
@@ -438,24 +567,36 @@ async function executeJob(job, runDir, defaultCwd) {
|
|
|
438
567
|
'--verbose',
|
|
439
568
|
], {
|
|
440
569
|
cwd,
|
|
441
|
-
env:
|
|
570
|
+
env: childEnv,
|
|
442
571
|
stdio: ['ignore', fd, fd],
|
|
443
572
|
});
|
|
444
573
|
|
|
445
574
|
fs.writeSync(fd, `[scheduler] spawned pid=${child.pid}\n\n`);
|
|
446
575
|
|
|
576
|
+
// Fire-and-forget pid persistence — best effort.
|
|
577
|
+
if (onPid) onPid(child.pid).catch(() => {});
|
|
578
|
+
|
|
579
|
+
// Kill the child if it runs past the maximum allowed duration.
|
|
580
|
+
const watchdog = setTimeout(() => {
|
|
581
|
+
fs.writeSync(fd, `\n[scheduler] watchdog SIGKILL after ${MAX_JOB_DURATION_MS}ms\n`);
|
|
582
|
+
try { child.kill('SIGKILL'); } catch { /* already dead */ }
|
|
583
|
+
}, MAX_JOB_DURATION_MS);
|
|
584
|
+
if (watchdog.unref) watchdog.unref();
|
|
585
|
+
|
|
447
586
|
child.on('error', (err) => {
|
|
587
|
+
clearTimeout(watchdog);
|
|
448
588
|
const durationMs = Date.now() - startedAt;
|
|
449
|
-
fs.writeSync(fd, `\n[scheduler]
|
|
450
|
-
|
|
589
|
+
fs.writeSync(fd, `\n[scheduler] error: ${err.message}\n`);
|
|
590
|
+
closeFd();
|
|
451
591
|
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: -1, error: err.message, startedAt, finishedAt: Date.now(), durationMs });
|
|
452
592
|
resolve({ exitCode: -1, durationMs, error: err.message });
|
|
453
593
|
});
|
|
454
594
|
|
|
455
595
|
child.on('exit', (code) => {
|
|
596
|
+
clearTimeout(watchdog);
|
|
456
597
|
const durationMs = Date.now() - startedAt;
|
|
457
598
|
fs.writeSync(fd, `\n[scheduler] exit code=${code} duration=${Math.round(durationMs / 1000)}s\n`);
|
|
458
|
-
|
|
599
|
+
closeFd();
|
|
459
600
|
const rateLimited = code !== 0 && detectRateLimitInLog(logPath);
|
|
460
601
|
atomicWriteJson(metaPath, { slug: job.slug, cwd, exitCode: code, rateLimited, startedAt, finishedAt: Date.now(), durationMs });
|
|
461
602
|
resolve({ exitCode: code, durationMs, rateLimited });
|
|
@@ -479,7 +620,6 @@ async function runDueJobs() {
|
|
|
479
620
|
return;
|
|
480
621
|
}
|
|
481
622
|
const { runId, dir: runDir } = pickRunDir();
|
|
482
|
-
state.lastRunAt = new Date().toISOString();
|
|
483
623
|
|
|
484
624
|
// Group by parallelGroup, ascending. Each group runs serially after the
|
|
485
625
|
// previous group completes.
|
|
@@ -491,7 +631,7 @@ async function runDueJobs() {
|
|
|
491
631
|
}
|
|
492
632
|
const groupKeys = Array.from(groups.keys()).sort((a, b) => a - b);
|
|
493
633
|
|
|
494
|
-
|
|
634
|
+
await mutate((s) => { s.lastRunAt = new Date().toISOString(); });
|
|
495
635
|
broadcast();
|
|
496
636
|
|
|
497
637
|
for (const gk of groupKeys) {
|
|
@@ -503,41 +643,59 @@ async function runDueJobs() {
|
|
|
503
643
|
const inFlight = new Set();
|
|
504
644
|
|
|
505
645
|
const launch = (job) => {
|
|
506
|
-
const
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
const
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
646
|
+
const promise = (async () => {
|
|
647
|
+
try {
|
|
648
|
+
// Mark job running.
|
|
649
|
+
await mutate((s) => {
|
|
650
|
+
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
651
|
+
if (idx >= 0) {
|
|
652
|
+
s.jobs[idx].status = 'running';
|
|
653
|
+
s.jobs[idx].runId = runId;
|
|
654
|
+
s.jobs[idx].startedAt = new Date().toISOString();
|
|
655
|
+
}
|
|
656
|
+
});
|
|
657
|
+
broadcast();
|
|
658
|
+
|
|
659
|
+
// Execute — onPid persists the child PID into the running state.
|
|
660
|
+
const res = await executeJob(job, runDir, state.config.defaultCwd, async (pid) => {
|
|
661
|
+
await mutate((s) => {
|
|
662
|
+
const idx = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
663
|
+
if (idx >= 0) {
|
|
664
|
+
s.jobs[idx].runtime = { pid, runId, startedAt: s.jobs[idx].startedAt };
|
|
665
|
+
}
|
|
666
|
+
});
|
|
667
|
+
});
|
|
668
|
+
|
|
669
|
+
// Rate-limit: pause before writing terminal status so the status
|
|
670
|
+
// mutate below can read the pause state.
|
|
671
|
+
if (res.rateLimited) {
|
|
672
|
+
const resetIso = await refreshNextReset().catch(() => cachedNextReset);
|
|
673
|
+
await setPaused('rate_limit', resetIso);
|
|
534
674
|
}
|
|
535
|
-
|
|
675
|
+
|
|
676
|
+
// Write terminal status; strip runtime regardless of outcome.
|
|
677
|
+
await mutate((s) => {
|
|
678
|
+
const i2 = s.jobs.findIndex((x) => x.slug === job.slug);
|
|
679
|
+
if (i2 >= 0) {
|
|
680
|
+
const treatAsPending = res.rateLimited || (s.paused && s.paused.reason === 'rate_limit');
|
|
681
|
+
if (treatAsPending) {
|
|
682
|
+
resetJobFields(s.jobs[i2], res.rateLimited ? 'paused: rate limit' : 'paused: queue halted');
|
|
683
|
+
} else {
|
|
684
|
+
s.jobs[i2].status = res.exitCode === 0 ? 'completed' : 'failed';
|
|
685
|
+
s.jobs[i2].finishedAt = new Date().toISOString();
|
|
686
|
+
s.jobs[i2].exitCode = res.exitCode;
|
|
687
|
+
s.jobs[i2].error = res.error || null;
|
|
688
|
+
delete s.jobs[i2].runtime;
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
});
|
|
536
692
|
broadcast();
|
|
693
|
+
} catch (e) {
|
|
694
|
+
console.error('[scheduler] launch error', job.slug, e);
|
|
537
695
|
}
|
|
538
|
-
|
|
539
|
-
});
|
|
696
|
+
})();
|
|
540
697
|
inFlight.add(promise);
|
|
698
|
+
promise.then(() => inFlight.delete(promise), () => inFlight.delete(promise));
|
|
541
699
|
};
|
|
542
700
|
|
|
543
701
|
// Prime up to cap
|
|
@@ -557,45 +715,97 @@ async function runDueJobs() {
|
|
|
557
715
|
isExecuting = false;
|
|
558
716
|
// No longer auto-disable after a run. The firePolicy now governs whether
|
|
559
717
|
// the next batch fires automatically. Just clear the one-shot scheduledFor.
|
|
560
|
-
|
|
561
|
-
s.scheduledFor = null;
|
|
562
|
-
writeQueue(s);
|
|
718
|
+
await mutate((s) => { s.scheduledFor = null; });
|
|
563
719
|
broadcast();
|
|
564
720
|
}
|
|
565
721
|
}
|
|
566
722
|
|
|
567
|
-
// ---------- when-available
|
|
723
|
+
// ---------- when-available launch logic ----------
|
|
568
724
|
|
|
569
|
-
async function
|
|
725
|
+
/**
 * Fire the pending batch under the 'when-available' policy when cached
 * utilization is below the configured threshold.
 *
 * Does nothing when the policy differs, the queue is paused, a run is already
 * executing, no job is pending, or utilization is unknown. When utilization
 * is at or above the threshold it only broadcasts. The run itself is started
 * without being awaited; its errors are logged.
 *
 * @param {object} state - Queue state snapshot (reads `config`, `paused`, `jobs`).
 * @returns {Promise<void>}
 */
async function maybeLaunchWhenAvailable(state) {
  const eligible = state.config.firePolicy === 'when-available' && !state.paused && !isExecuting;
  if (!eligible) return;

  const pendingJobs = state.jobs.filter((job) => job.status === 'pending');
  if (pendingJobs.length === 0) return;

  // Utilization unknown (null or undefined): no basis for a decision yet.
  if (cachedUtilization == null) return;

  const belowThreshold = !(cachedUtilization >= state.config.utilizationThreshold);
  if (!belowThreshold) {
    broadcast();
    return;
  }

  console.log(`[scheduler] when-available: util=${cachedUtilization}%, ${pendingJobs.length} pending — firing`);
  runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
}
|
|
739
|
+
|
|
740
|
+
// ---------- poll loop with exponential backoff ----------
|
|
741
|
+
|
|
742
|
+
/**
 * One iteration of the usage poll loop. Always re-arms itself in `finally`,
 * using the current backoff delay or POLL_INTERVAL_MS when no backoff is
 * active.
 *
 * Success ('ok'): refresh the cached reset/utilization, reset failure
 * bookkeeping, persist state, clear a recovered 'network' or 'reset_failure'
 * pause, then give the 'when-available' policy a chance to launch.
 *
 * Failure: count it; an 'auth' result pauses the queue with no auto-resume;
 * any other result applies exponential backoff (30s doubling up to 480s) and,
 * after 30 minutes of continuous failure, sets a 'network' pause unless a
 * pause for a different reason is already active. A thrown error is counted
 * and backed off the same way but never pauses.
 *
 * @returns {Promise<void>}
 */
async function pollLoop() {
  try {
    const r = await billing.fetchUsage();

    if (r.kind === 'ok') {
      cachedNextReset = r.data?.usage?.five_hour?.resets_at ?? cachedNextReset;
      cachedUtilization = r.data?.usage?.five_hour?.utilization ?? cachedUtilization;
      consecutiveFailures = 0;
      backoffMs = 0;
      backoffNextAt = null;
      firstFailureAt = null;
      lastPollAt = Date.now();
      lastPollOk = true;
      persistSchedulerState();

      let cur = readQueue();
      const pausedReason = cur.paused?.reason;
      // If a 'network' pause resolved, clear it now that we have a good reading.
      const networkRecovered = pausedReason === 'network';
      // If 'reset_failure' was set and we now have a valid reset, clear it.
      const resetRecovered = pausedReason === 'reset_failure' && Boolean(cachedNextReset);
      if (networkRecovered) await clearPause('network-recovered');
      if (resetRecovered) await clearPause('reset-recovered');
      // The snapshot above still carries the pause that was just cleared;
      // re-read so the launch check below does not bail out on stale state
      // and delay the batch by a whole poll interval.
      if (networkRecovered || resetRecovered) cur = readQueue();

      await maybeLaunchWhenAvailable(cur);
      broadcast();
    } else {
      lastPollAt = Date.now();
      lastPollOk = false;
      consecutiveFailures++;
      if (!firstFailureAt) firstFailureAt = Date.now();

      if (r.kind === 'auth') {
        console.error(`[scheduler] auth failure (HTTP ${r.httpStatus}): ${r.message}`);
        await setPaused('auth', null);
      } else {
        // transient or config — apply exponential backoff.
        backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
        const totalFailureMs = Date.now() - firstFailureAt;
        console.log(`[scheduler] transient failure #${consecutiveFailures}: ${r.kind} ${r.message ?? ''}; retry in ${backoffMs / 1000}s`);

        // After 30 minutes of consecutive failures, set 'network' pause.
        if (totalFailureMs > 30 * 60_000) {
          const cur2 = readQueue();
          if (!cur2.paused || cur2.paused.reason === 'network') {
            await setPaused('network', null);
          }
        }
      }

      backoffNextAt = Date.now() + backoffMs;
      persistSchedulerState();
    }
  } catch (e) {
    // Unexpected error (e.g., IPC transport failure)
    lastPollAt = Date.now();
    lastPollOk = false;
    consecutiveFailures++;
    if (!firstFailureAt) firstFailureAt = Date.now();
    backoffMs = backoffMs ? Math.min(backoffMs * 2, 480_000) : 30_000;
    backoffNextAt = Date.now() + backoffMs;
    persistSchedulerState();
  } finally {
    const delay = backoffMs || POLL_INTERVAL_MS;
    pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, delay);
    // Do not let the poll timer keep the process alive on its own.
    if (pollLoopTimer.unref) pollLoopTimer.unref();
  }
}
|
|
601
811
|
|
|
@@ -620,44 +830,85 @@ function registerScheduleHandlers() {
|
|
|
620
830
|
};
|
|
621
831
|
});
|
|
622
832
|
|
|
623
|
-
ipcMain.handle('schedule:
|
|
833
|
+
ipcMain.handle('schedule:health', async () => {
|
|
624
834
|
const state = readQueue();
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
835
|
+
const runningJobs = [];
|
|
836
|
+
for (const j of state.jobs) {
|
|
837
|
+
if (j.status === 'running' && j.runtime) {
|
|
838
|
+
runningJobs.push({
|
|
839
|
+
slug: j.slug,
|
|
840
|
+
startedAt: j.startedAt ? Date.parse(j.startedAt) : 0,
|
|
841
|
+
pid: j.runtime.pid ?? 0,
|
|
842
|
+
});
|
|
843
|
+
}
|
|
628
844
|
}
|
|
629
|
-
|
|
630
|
-
|
|
845
|
+
return {
|
|
846
|
+
bootedAt,
|
|
847
|
+
lastPollAt,
|
|
848
|
+
lastPollOk,
|
|
849
|
+
consecutiveFailures,
|
|
850
|
+
backoffNextAt,
|
|
851
|
+
nextResetCached: cachedNextReset,
|
|
852
|
+
pausedSince: state.paused ? Date.parse(state.paused.since) : null,
|
|
853
|
+
pauseReason: state.paused?.reason ?? null,
|
|
854
|
+
runningJobs,
|
|
855
|
+
};
|
|
856
|
+
});
|
|
857
|
+
|
|
858
|
+
ipcMain.handle('schedule:set-config', async (_e, partial) => {
|
|
859
|
+
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
860
|
+
let validated;
|
|
861
|
+
try {
|
|
862
|
+
validated = s.setConfigSchema.parse(partial || {});
|
|
863
|
+
} catch (e) {
|
|
864
|
+
return { ok: false, error: e?.message ?? 'invalid config' };
|
|
631
865
|
}
|
|
632
|
-
|
|
866
|
+
const config = await mutate((state) => {
|
|
867
|
+
state.config = { ...state.config, ...validated };
|
|
868
|
+
return state.config;
|
|
869
|
+
});
|
|
633
870
|
await rescheduleTimer();
|
|
634
|
-
return { ok: true, config
|
|
871
|
+
return { ok: true, config };
|
|
635
872
|
});
|
|
636
873
|
|
|
637
|
-
ipcMain.handle('schedule:reset-job', async (_e,
|
|
638
|
-
const
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
874
|
+
ipcMain.handle('schedule:reset-job', async (_e, payload) => {
|
|
875
|
+
const { schemas: s } = require('./ipcSchemas.cjs');
|
|
876
|
+
let slug;
|
|
877
|
+
try {
|
|
878
|
+
({ slug } = s.scheduleSlug.parse(payload));
|
|
879
|
+
} catch (e) {
|
|
880
|
+
return { ok: false, error: 'invalid slug' };
|
|
881
|
+
}
|
|
882
|
+
// Containment check after path.join.
|
|
883
|
+
const resolved = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
|
|
884
|
+
if (!resolved.startsWith(PRDS_DIR + path.sep)) {
|
|
885
|
+
return { ok: false, error: 'invalid slug' };
|
|
886
|
+
}
|
|
887
|
+
const found = await mutate((state) => {
|
|
888
|
+
const idx = state.jobs.findIndex((j) => j.slug === slug);
|
|
889
|
+
if (idx < 0) return false;
|
|
890
|
+
resetJobFields(state.jobs[idx]);
|
|
891
|
+
return true;
|
|
892
|
+
});
|
|
893
|
+
if (!found) return { ok: false, error: 'not found' };
|
|
643
894
|
broadcast();
|
|
644
895
|
return { ok: true };
|
|
645
896
|
});
|
|
646
897
|
|
|
647
898
|
ipcMain.handle('schedule:run-now', async () => {
|
|
648
899
|
// Manual run-now overrides any auto-pause. Clear it first.
|
|
649
|
-
clearPause('run-now');
|
|
900
|
+
await clearPause('run-now');
|
|
650
901
|
runDueJobs().catch((e) => console.error('[scheduler] runDueJobs error', e));
|
|
651
902
|
return { ok: true };
|
|
652
903
|
});
|
|
653
904
|
|
|
654
905
|
ipcMain.handle('schedule:resume', async () => {
|
|
655
|
-
clearPause('manual');
|
|
906
|
+
await clearPause('manual');
|
|
656
907
|
return { ok: true };
|
|
657
908
|
});
|
|
658
909
|
|
|
659
910
|
// IPC: refresh the next-reset time, re-arm the scheduler timer, and return
// the value to the caller as { ok: true, nextReset }.
// The refresh is best-effort: when refreshNextReset() fails, the last cached
// value is returned instead, and the failure is logged rather than silently
// discarded so a persistently failing refresh is visible in the logs.
ipcMain.handle('schedule:refresh-reset', async () => {
  let at;
  try {
    at = await refreshNextReset();
  } catch (e) {
    console.warn('[scheduler] refresh-reset: refreshNextReset failed; using cached value', e?.message);
    at = cachedNextReset;
  }
  await rescheduleTimer();
  return { ok: true, nextReset: at };
});
|
|
@@ -668,39 +919,123 @@ function registerScheduleHandlers() {
|
|
|
668
919
|
return { ok: true };
|
|
669
920
|
});
|
|
670
921
|
|
|
671
|
-
ipcMain.handle('schedule:read-prd', async (_e,
|
|
922
|
+
// IPC: read the text of `<slug>.md` from PRDS_DIR.
// Replies { ok: true, text } on success. Replies { ok: false, error } when the
// payload fails schema validation, the resolved path would leave PRDS_DIR, or
// the file cannot be read (error is then the underlying error's message).
ipcMain.handle('schedule:read-prd', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  try {
    slug = schemas.scheduleSlug.parse(payload).slug;
  } catch {
    return { ok: false, error: 'invalid slug' };
  }

  // Containment check on the fully resolved path, not on the raw slug.
  const prdPath = path.resolve(path.join(PRDS_DIR, `${slug}.md`));
  const insidePrdsDir = prdPath.startsWith(PRDS_DIR + path.sep);
  if (!insidePrdsDir) {
    return { ok: false, error: 'invalid slug' };
  }

  let text;
  try {
    text = await fsp.readFile(prdPath, 'utf8');
  } catch (err) {
    return { ok: false, error: err?.message };
  }
  return { ok: true, text };
});
|
|
679
941
|
|
|
680
|
-
ipcMain.handle('schedule:read-log', async (_e,
|
|
942
|
+
// IPC: read the log at `<RUNS_DIR>/<runId>/<slug>.log`.
// Replies { ok: true, text } on success. Replies { ok: false, error } when the
// payload fails schema validation, the resolved path would leave RUNS_DIR, or
// the file cannot be read (error is then the underlying error's message).
ipcMain.handle('schedule:read-log', async (_e, payload) => {
  const { schemas } = require('./ipcSchemas.cjs');

  let slug;
  let runId;
  try {
    const parsed = schemas.scheduleReadLog.parse(payload);
    slug = parsed.slug;
    runId = parsed.runId;
  } catch {
    return { ok: false, error: 'invalid slug or runId' };
  }

  // Both path components come from the renderer, so the containment check is
  // made on the fully resolved path.
  const resolvedLog = path.resolve(path.join(RUNS_DIR, runId, `${slug}.log`));
  const insideRunsDir = resolvedLog.startsWith(RUNS_DIR + path.sep);
  if (!insideRunsDir) {
    return { ok: false, error: 'invalid slug or runId' };
  }

  let text;
  try {
    text = await fsp.readFile(resolvedLog, 'utf8');
  } catch (err) {
    return { ok: false, error: err?.message };
  }
  return { ok: true, text };
});
|
|
961
|
+
|
|
962
|
+
// Largest PRD body, measured in UTF-8 bytes, accepted over IPC (256 KiB).
const PRD_WRITE_MAX_BYTES = 256 * 1024;
// A slug becomes the `<slug>.md` file name under PRDS_DIR: 1-128 characters
// drawn from ASCII letters, digits, '.', '_' and '-'.
const SLUG_RE = /^[A-Za-z0-9._-]{1,128}$/;

// IPC: write `body` to `<PRDS_DIR>/<slug>.md`.
// The body is written to a sibling temp file first and then renamed over the
// target, so the target path is only ever replaced by a fully written file.
// Resolves with { ok: true, bytesWritten } where bytesWritten is the size
// reported by stat() on the final file.
// Throws (rejecting the invoke) on an invalid slug, a non-string or oversized
// body, a path that would leave PRDS_DIR, or any filesystem error.
ipcMain.handle('schedule:write-prd', async (_e, payload) => {
  // Tolerate a missing payload so the caller gets the validation error below
  // instead of an opaque destructuring TypeError.
  const { slug, body } = payload ?? {};

  // RegExp#test coerces its argument to a string, so without the typeof guard
  // a missing slug would be tested as "undefined", pass, and create
  // `undefined.md`. Require a real string before pattern-matching.
  if (typeof slug !== 'string' || !SLUG_RE.test(slug)) {
    throw new Error(`invalid slug: ${String(slug)}`);
  }
  if (typeof body !== 'string') throw new Error('body must be string');
  if (Buffer.byteLength(body, 'utf8') > PRD_WRITE_MAX_BYTES) throw new Error('body too large');

  const file = path.join(PRDS_DIR, `${slug}.md`);
  const resolved = path.resolve(file);
  if (!resolved.startsWith(PRDS_DIR + path.sep)) throw new Error('path escape');

  // pid + timestamp keep the temp name distinct from the target and from
  // temp files of other processes.
  const tmp = `${resolved}.${process.pid}.${Date.now()}.tmp`;
  try {
    await fsp.writeFile(tmp, body, { encoding: 'utf8', mode: 0o644 });
    await fsp.rename(tmp, resolved);
  } catch (e) {
    // Best-effort cleanup so a failed write or rename does not leave a stray
    // temp file in PRDS_DIR; the original error is what the caller sees.
    await fsp.unlink(tmp).catch(() => {});
    throw e;
  }

  const stat = await fsp.stat(resolved);
  return { ok: true, bytesWritten: stat.size };
});
|
|
978
|
+
|
|
979
|
+
// IPC: list the PRD files in PRDS_DIR.
// Resolves with an array of { slug, parallelGroup, title, cwd,
// estimateMinutes, mtimeMs } sorted by slug (numeric-aware), or with an empty
// array when the directory cannot be read. Files that fail to parse or stat
// are logged and left out rather than failing the whole listing.
ipcMain.handle('schedule:list-prds', async () => {
  ensureDirs();

  let entries;
  try {
    entries = await fsp.readdir(PRDS_DIR);
  } catch {
    return [];
  }

  // Only visible markdown files are candidates; dotfiles are ignored.
  const prdNames = entries.filter((name) => name.endsWith('.md') && !name.startsWith('.'));

  const summaries = [];
  for (const name of prdNames) {
    const prdPath = path.join(PRDS_DIR, name);
    try {
      const prd = parsePrd(prdPath);
      const { mtimeMs } = await fsp.stat(prdPath);
      const summary = {
        slug: prd.slug,
        parallelGroup: prd.parallelGroup,
        title: prd.title,
        cwd: prd.cwd || '',
        estimateMinutes: prd.estimateMinutes,
        mtimeMs,
      };
      summaries.push(summary);
    } catch (err) {
      console.warn('[scheduler] list-prds: skipping unparseable', name, err?.message);
    }
  }

  // Numeric collation so digit runs inside slugs compare by value.
  const bySlug = (left, right) => left.slug.localeCompare(right.slug, undefined, { numeric: true });
  summaries.sort(bySlug);
  return summaries;
});
|
|
689
1009
|
}
|
|
690
1010
|
|
|
691
1011
|
async function init() {
|
|
692
1012
|
ensureDirs();
|
|
693
|
-
|
|
694
|
-
|
|
1013
|
+
|
|
1014
|
+
// Hydrate cached state from the sidecar before any scheduling decisions.
|
|
1015
|
+
loadSchedulerState();
|
|
1016
|
+
bootedAt = Date.now();
|
|
1017
|
+
|
|
1018
|
+
// Boot reconciliation: mark any job that was 'running' when the app died as
|
|
1019
|
+
// 'failed'. mutate() creates queue.json from defaults if it doesn't exist.
|
|
1020
|
+
await mutate((state) => {
|
|
1021
|
+
for (const j of state.jobs) {
|
|
1022
|
+
if (j.status === 'running') {
|
|
1023
|
+
j.status = 'failed';
|
|
1024
|
+
j.error = 'orphaned: app restarted while running';
|
|
1025
|
+
j.finishedAt = new Date().toISOString();
|
|
1026
|
+
delete j.runtime;
|
|
1027
|
+
}
|
|
1028
|
+
}
|
|
1029
|
+
});
|
|
695
1030
|
|
|
696
1031
|
// If we boot up while paused with a resumeAt in the past, clear it. This
|
|
697
1032
|
// happens when the app was closed across the reset window.
|
|
698
1033
|
const boot = readQueue();
|
|
699
1034
|
if (boot.paused && boot.paused.resumeAt && new Date(boot.paused.resumeAt).getTime() <= Date.now()) {
|
|
700
|
-
clearPause('boot-elapsed');
|
|
1035
|
+
await clearPause('boot-elapsed');
|
|
701
1036
|
} else if (boot.paused && boot.paused.resumeAt) {
|
|
702
1037
|
// Re-arm the resume timer (lost across restart).
|
|
703
|
-
setPaused(boot.paused.reason, boot.paused.resumeAt);
|
|
1038
|
+
await setPaused(boot.paused.reason, boot.paused.resumeAt);
|
|
704
1039
|
}
|
|
705
1040
|
|
|
706
1041
|
await rescheduleTimer();
|
|
@@ -708,13 +1043,49 @@ async function init() {
|
|
|
708
1043
|
// resets early or the auth token rotates. Tracked so re-init doesn't leak.
|
|
709
1044
|
if (rescheduleInterval) clearInterval(rescheduleInterval);
|
|
710
1045
|
rescheduleInterval = setInterval(() => { rescheduleTimer().catch(() => {}); }, 10 * 60_000);
|
|
711
|
-
|
|
712
|
-
//
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
// First tick fires after
|
|
716
|
-
|
|
717
|
-
|
|
1046
|
+
|
|
1047
|
+
// Self-rescheduling poll loop with exponential backoff. Replaces the
|
|
1048
|
+
// old fixed-interval pollTimer + initialPollTimeout.
|
|
1049
|
+
if (pollLoopTimer) clearTimeout(pollLoopTimer);
|
|
1050
|
+
// First tick fires after the standard warmup delay so billing is ready.
|
|
1051
|
+
pollLoopTimer = setTimeout(() => { pollLoop().catch(() => {}); }, USAGE_REFRESH_INTERVAL_MS);
|
|
1052
|
+
if (pollLoopTimer.unref) pollLoopTimer.unref();
|
|
1053
|
+
|
|
1054
|
+
// Heartbeat: once per minute, log queue state for 24h visibility.
|
|
1055
|
+
if (heartbeatInterval) clearInterval(heartbeatInterval);
|
|
1056
|
+
heartbeatInterval = setInterval(() => {
|
|
1057
|
+
const s = readQueue();
|
|
1058
|
+
const counts = { pending: 0, running: 0, completed: 0, failed: 0 };
|
|
1059
|
+
for (const j of s.jobs) counts[j.status] = (counts[j.status] || 0) + 1;
|
|
1060
|
+
appendHeartbeat({
|
|
1061
|
+
ts: Date.now(),
|
|
1062
|
+
counts,
|
|
1063
|
+
paused: s.paused ? { reason: s.paused.reason, resumeAt: s.paused.resumeAt } : null,
|
|
1064
|
+
nextReset: cachedNextReset,
|
|
1065
|
+
utilization: cachedUtilization,
|
|
1066
|
+
consecutiveFailures,
|
|
1067
|
+
});
|
|
1068
|
+
}, 60_000);
|
|
1069
|
+
if (heartbeatInterval.unref) heartbeatInterval.unref();
|
|
1070
|
+
|
|
1071
|
+
// Wake-from-sleep: immediately re-poll and re-evaluate the queue.
|
|
1072
|
+
try {
|
|
1073
|
+
const { powerMonitor } = require('electron');
|
|
1074
|
+
powerMonitor.on('resume', () => {
|
|
1075
|
+
console.log('[scheduler] system resumed; re-polling and re-evaluating queue');
|
|
1076
|
+
if (pollLoopTimer) { clearTimeout(pollLoopTimer); pollLoopTimer = null; }
|
|
1077
|
+
backoffMs = 0;
|
|
1078
|
+
backoffNextAt = null;
|
|
1079
|
+
// Clear any paused-but-resumeAt-elapsed state immediately.
|
|
1080
|
+
const wakeState = readQueue();
|
|
1081
|
+
if (wakeState.paused?.resumeAt && new Date(wakeState.paused.resumeAt).getTime() <= Date.now()) {
|
|
1082
|
+
clearPause('boot-elapsed').then(() => { runDueJobs().catch(() => {}); }).catch(() => {});
|
|
1083
|
+
}
|
|
1084
|
+
pollLoop().catch(() => {});
|
|
1085
|
+
});
|
|
1086
|
+
} catch (e) {
|
|
1087
|
+
console.warn('[scheduler] powerMonitor unavailable', e?.message);
|
|
1088
|
+
}
|
|
718
1089
|
}
|
|
719
1090
|
|
|
720
1091
|
module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR };
|