@yemi33/minions 0.1.1929 → 0.1.1931

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -182,24 +182,45 @@ async function _ccDashboardHealth() {
182
182
  // Triggered by the CC "Restart Minions" recovery button when a stale dashboard
183
183
  // connection is killing CC streams with "Failed to fetch". Spawns the same
184
184
  // `minions restart` flow as the CLI command (kills + respawns engine AND
185
- // dashboard). The browser auto-reloads via refresh.js when it sees the new
186
- // dashboardStartedAt; if the POST itself fails (dashboard truly unreachable),
187
- // fall back to a plain location.reload() so behavior is never worse than before.
185
+ // dashboard). Then polls /api/health until the new dashboard is online before
186
+ // reloading — fixed timers either fired too early (reload hits dead port) or
187
+ // fired after the fetch threw (dashboard killed mid-response → 500ms reload
188
+ // to nothing).
188
189
  async function ccRestartMinions(btn) {
189
190
  if (btn) { try { btn.disabled = true; btn.textContent = 'Restarting...'; } catch {} }
191
+ // Fire and forget the POST. We do NOT await it — the dashboard often kills
192
+ // its own process before the response is flushed, so the fetch throws even
193
+ // though the restart child is happily running. Whatever the POST does, the
194
+ // next step (wait-for-healthy) is the truth.
190
195
  try {
191
- var res = await fetch('/api/dashboard/restart', { method: 'POST', headers: { 'Content-Type': 'application/json' } });
192
- if (!res.ok) throw new Error('HTTP ' + res.status);
193
- if (btn) { try { btn.textContent = 'Restarting Minions page will reload...'; } catch {} }
194
- // The dashboard process is about to be killed. refresh.js's status poll
195
- // will detect the new dashboardStartedAt and call location.reload() once
196
- // the new dashboard is online; this fallback handles the case where the
197
- // poll hasn't fired (e.g., user closed the browser tab and reopened).
198
- setTimeout(function() { try { location.reload(); } catch {} }, 8000);
199
- } catch (e) {
200
- if (btn) { try { btn.textContent = 'Restart failed — reloading...'; } catch {} }
201
- setTimeout(function() { try { location.reload(); } catch {} }, 500);
196
+ fetch('/api/dashboard/restart', { method: 'POST', headers: { 'Content-Type': 'application/json' } })
197
+ .catch(function() { /* dashboard process likely killed mid-response — expected */ });
198
+ } catch { /* network layer threw before fetch even queued — also expected */ }
199
+ if (btn) { try { btn.textContent = 'Restarting Minions — waiting for new dashboard...'; } catch {} }
200
+ var startedAt = Date.now();
201
+ var DEADLINE_MS = 60000;
202
+ var INTERVAL_MS = 500;
203
+ function pollHealth() {
204
+ fetch('/api/health', { cache: 'no-store' }).then(function(res) {
205
+ if (res && res.ok) {
206
+ if (btn) { try { btn.textContent = 'Dashboard online — reloading...'; } catch {} }
207
+ try { location.reload(); } catch {}
208
+ return;
209
+ }
210
+ throw new Error('not ok');
211
+ }).catch(function() {
212
+ if (Date.now() - startedAt > DEADLINE_MS) {
213
+ if (btn) { try { btn.textContent = 'Restart timed out — reloading...'; } catch {} }
214
+ try { location.reload(); } catch {}
215
+ return;
216
+ }
217
+ setTimeout(pollHealth, INTERVAL_MS);
218
+ });
202
219
  }
220
+ // Don't poll immediately — give the restart child a moment to actually kill
221
+ // the old dashboard. If we polled at t=0 we'd hit the OLD (dying) dashboard
222
+ // and prematurely reload before the new one is up.
223
+ setTimeout(pollHealth, 2000);
203
224
  }
204
225
 
205
226
  function _ccIsReconnectableStreamError(err) {
package/dashboard.js CHANGED
@@ -4631,14 +4631,17 @@ const server = http.createServer(async (req, res) => {
4631
4631
  if (swept) result.lastSwept = swept.timestamp;
4632
4632
  // Surface in-flight sweep state so the UI can render a 'now sweeping (Xm)'
4633
4633
  // badge alongside the previous-completion 'swept N days ago' indicator.
4634
- // Memory wins when present, disk fallback survives dashboard restarts.
4635
- const sweepState = safeJson(path.join(ENGINE_DIR, 'kb-sweep-state.json'));
4636
- const memInFlight = !!global._kbSweepInFlight;
4637
- const diskInFlight = !!(sweepState && sweepState.status === 'in-flight');
4638
- if (memInFlight || diskInFlight) {
4639
- result.sweepInFlight = true;
4640
- result.sweepStartedAt = global._kbSweepStartedAt || (sweepState && sweepState.startedAt) || null;
4641
- }
4634
+ // Source of truth: kb-sweep-state.json + PID liveness — the in-process
4635
+ // sweep moved to a detached runner so in-memory globals are no longer
4636
+ // authoritative (they die with the dashboard).
4637
+ try {
4638
+ const { readSweepLiveness } = require('./engine/kb-sweep');
4639
+ const liveness = readSweepLiveness({ entryCount: entries.length });
4640
+ if (liveness.inFlight && liveness.alive) {
4641
+ result.sweepInFlight = true;
4642
+ result.sweepStartedAt = liveness.startedAt || null;
4643
+ }
4644
+ } catch { /* best-effort UI indicator */ }
4642
4645
  return jsonReply(res, 200, result);
4643
4646
  }
4644
4647
 
@@ -4657,73 +4660,135 @@ const server = http.createServer(async (req, res) => {
4657
4660
  }
4658
4661
 
4659
4662
  async function handleKnowledgeSweep(req, res) {
4660
- // Auto-release stale guard — dynamic floor based on KB size (30 min min, +1s per entry)
4661
- const { staleGuardMs } = require('./engine/kb-sweep');
4663
+ // Source of truth = kb-sweep-state.json + PID liveness. The sweep now runs
4664
+ // as a detached child (engine/kb-sweep-runner.js) so it survives
4665
+ // `minions restart`; the in-memory `global._kbSweep*` flags from the old
4666
+ // in-process implementation are gone.
4667
+ const {
4668
+ readSweepLiveness, staleGuardMs, KB_SWEEP_STATE_PATH, KB_SWEEP_LOG_PATH, KB_SWEEP_RUNNER_PATH,
4669
+ } = require('./engine/kb-sweep');
4662
4670
  const entryCount = (queries.getKnowledgeBaseEntries() || []).length;
4663
4671
  const guardMs = staleGuardMs(entryCount);
4664
- if (global._kbSweepInFlight && global._kbSweepStartedAt && Date.now() - global._kbSweepStartedAt > guardMs) {
4665
- console.log(`[kb-sweep] Auto-releasing stale guard (>${Math.round(guardMs / 60000)}min for ${entryCount} entries)`);
4666
- global._kbSweepInFlight = false;
4667
- }
4668
- // Disk-state fallback: if a previous dashboard process died mid-sweep, the
4669
- // state file says 'in-flight' forever. Treat it as stale past the guard so
4670
- // a new sweep can start.
4671
- const sweepStateFile = path.join(ENGINE_DIR, 'kb-sweep-state.json');
4672
- const diskState = safeJson(sweepStateFile);
4673
- const diskInFlight = !!(diskState && diskState.status === 'in-flight');
4674
- const diskStartedAt = diskState && diskState.startedAt ? Number(diskState.startedAt) : 0;
4675
- const diskStale = diskInFlight && diskStartedAt && Date.now() - diskStartedAt > guardMs;
4676
- if (diskStale) {
4677
- console.log(`[kb-sweep] Auto-releasing stale disk-state guard (>${Math.round(guardMs / 60000)}min)`);
4678
- try { shared.safeUnlink(sweepStateFile); } catch { /* ignore */ }
4679
- }
4680
- if (global._kbSweepInFlight || (diskInFlight && !diskStale)) {
4672
+
4673
+ // Synchronous pre-claim BEFORE awaiting the body so a concurrent POST
4674
+ // arriving in the same tick sees in-flight state and can't double-spawn.
4675
+ const sweepToken = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
4676
+ const liveness = readSweepLiveness({ entryCount });
4677
+ if (liveness.inFlight && liveness.stale) {
4678
+ const reason = !liveness.alive
4679
+ ? `runner pid=${liveness.pid} is no longer alive`
4680
+ : `>${Math.round(guardMs / 60000)}min for ${entryCount} entries`;
4681
+ console.log(`[kb-sweep] Auto-releasing stale guard (${reason})`);
4682
+ try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
4683
+ } else if (liveness.inFlight) {
4681
4684
  return jsonReply(res, 200, {
4682
4685
  ok: true, alreadyRunning: true,
4683
- startedAt: global._kbSweepStartedAt || diskStartedAt || null,
4686
+ startedAt: liveness.startedAt || null,
4684
4687
  });
4685
4688
  }
4686
- const sweepToken = Date.now() + Math.random();
4687
- global._kbSweepToken = sweepToken;
4688
- global._kbSweepInFlight = true;
4689
- global._kbSweepStartedAt = Date.now();
4689
+
4690
+ // Claim the slot synchronously by writing a "starting" state. The runner
4691
+ // will overwrite this with status:'in-flight' + its real pid once it boots.
4692
+ // readSweepLiveness grants a 15s boot-grace to "starting" records with no pid.
4693
+ const startedAt = Date.now();
4694
+ try {
4695
+ safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
4696
+ status: 'starting', startedAt, startedAtIso: new Date().toISOString(),
4697
+ sweepToken, pid: null,
4698
+ }));
4699
+ } catch (e) {
4700
+ console.error(`[kb-sweep] failed to write starting state: ${e.message}`);
4701
+ }
4702
+
4690
4703
  const body = await readBody(req).catch(() => ({}));
4691
- _runKbSweepBackground(body, sweepToken);
4692
- return jsonReply(res, 202, { ok: true, started: true });
4693
- }
4694
4704
 
4695
- async function _runKbSweepBackground(body, sweepToken) {
4705
+ // Persist body to a temp file so spawn doesn't have to serialize large
4706
+ // pinnedKeys arrays via argv. Skip when body is empty.
4707
+ let bodyFile = null;
4708
+ if (body && (Array.isArray(body.pinnedKeys) || body.dryRun != null)) {
4709
+ bodyFile = path.join(ENGINE_DIR, `tmp-kb-sweep-body-${sweepToken}.json`);
4710
+ try { safeWrite(bodyFile, JSON.stringify(body)); }
4711
+ catch (e) {
4712
+ console.error(`[kb-sweep] failed to write body-file ${bodyFile}: ${e.message}`);
4713
+ bodyFile = null;
4714
+ }
4715
+ }
4716
+
4717
+ const { spawn: cpSpawn } = require('child_process');
4718
+ // Open log fd in append mode so spawn can pipe stdio there. Child inherits
4719
+ // the fd; parent closes its copy after spawn returns successfully.
4720
+ let logFdNum = null;
4721
+ let stdio = ['ignore', 'ignore', 'ignore'];
4696
4722
  try {
4697
- const { runKbSweep } = require('./engine/kb-sweep');
4698
- const result = await runKbSweep({ pinnedKeys: body.pinnedKeys, engineConfig: CONFIG.engine });
4699
- global._kbSweepLastResult = result;
4700
- global._kbSweepLastCompletedAt = Date.now();
4723
+ logFdNum = fs.openSync(KB_SWEEP_LOG_PATH, 'a');
4724
+ stdio = ['ignore', logFdNum, logFdNum];
4725
+ } catch (e) {
4726
+ console.error(`[kb-sweep] failed to open log ${KB_SWEEP_LOG_PATH}: ${e.message}`);
4727
+ }
4728
+
4729
+ const spawnArgs = ['--sweep-token', sweepToken];
4730
+ if (bodyFile) spawnArgs.push('--body-file', bodyFile);
4731
+
4732
+ let proc;
4733
+ try {
4734
+ proc = cpSpawn(process.execPath, [KB_SWEEP_RUNNER_PATH, ...spawnArgs], {
4735
+ cwd: MINIONS_DIR, stdio, detached: true, windowsHide: true,
4736
+ env: { ...process.env },
4737
+ });
4701
4738
  } catch (e) {
4702
- console.error('[kb-sweep] background error:', e.message);
4703
- global._kbSweepLastResult = { ok: false, error: e.message };
4704
- global._kbSweepLastCompletedAt = Date.now();
4705
- } finally { if (global._kbSweepToken === sweepToken) global._kbSweepInFlight = false; }
4739
+ if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
4740
+ if (bodyFile) try { fs.unlinkSync(bodyFile); } catch { /* ignore */ }
4741
+ // Release the "starting" claim on synchronous spawn failure so the user
4742
+ // can retry immediately.
4743
+ try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
4744
+ return jsonReply(res, 500, { error: `spawn failed: ${e.message}` });
4745
+ }
4746
+ if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
4747
+
4748
+ // Conditional CAS: only update the state file from "starting" → "in-flight"
4749
+ // if our sweepToken still owns it. If the (fast) runner already wrote
4750
+ // "completed"/"failed" or its own "in-flight", leave that newer state alone.
4751
+ try {
4752
+ const current = safeJson(KB_SWEEP_STATE_PATH);
4753
+ if (current && current.status === 'starting' && current.sweepToken === sweepToken) {
4754
+ safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
4755
+ status: 'in-flight', startedAt, startedAtIso: new Date().toISOString(),
4756
+ sweepToken, pid: proc.pid,
4757
+ }));
4758
+ }
4759
+ } catch { /* best-effort */ }
4760
+
4761
+ proc.unref();
4762
+ return jsonReply(res, 202, { ok: true, started: true, sweepToken });
4706
4763
  }
4707
4764
 
4708
4765
 
4709
4766
  function handleKnowledgeSweepStatus(req, res) {
4710
- // Disk-state fallback: when the dashboard restarts mid-sweep the in-memory
4711
- // globals get reset, but engine/kb-sweep-state.json survives. Memory still
4712
- // wins when present (faster, no disk read on every poll).
4767
+ // Source of truth = kb-sweep-state.json + PID liveness. Globals are gone —
4768
+ // the runner is detached, so its lifecycle is independent of this process.
4769
+ const { readSweepLiveness } = require('./engine/kb-sweep');
4770
+ const entries = queries.getKnowledgeBaseEntries() || [];
4771
+ const liveness = readSweepLiveness({ entryCount: entries.length });
4713
4772
  const diskState = safeJson(path.join(ENGINE_DIR, 'kb-sweep-state.json'));
4714
- const memInFlight = !!global._kbSweepInFlight;
4715
- const diskInFlight = !!(diskState && diskState.status === 'in-flight');
4716
- const inFlight = memInFlight || diskInFlight;
4717
- const startedAt = global._kbSweepStartedAt || (diskState && diskState.startedAt) || null;
4718
- let lastResult = global._kbSweepLastResult || null;
4719
- let lastCompletedAt = global._kbSweepLastCompletedAt || null;
4720
- if (!lastResult && diskState && (diskState.status === 'completed' || diskState.status === 'failed')) {
4721
- if (diskState.status === 'failed') {
4722
- lastResult = { ok: false, error: diskState.error || 'sweep failed' };
4723
- } else {
4724
- lastResult = diskState.lastResult || { ok: true, summary: diskState.summary };
4725
- }
4726
- if (!lastCompletedAt && diskState.completedAt) lastCompletedAt = diskState.completedAt;
4773
+ let inFlight = false;
4774
+ let startedAt = null;
4775
+ let lastResult = null;
4776
+ let lastCompletedAt = null;
4777
+ if (liveness.inFlight && liveness.alive) {
4778
+ inFlight = true;
4779
+ startedAt = liveness.startedAt || null;
4780
+ } else if (liveness.inFlight && !liveness.alive) {
4781
+ // Runner crashed pre-completion (or "starting" claim expired without a
4782
+ // runner ever booting). Surface a synthetic error so the UI doesn't
4783
+ // silently lose the previous attempt.
4784
+ lastResult = { ok: false, error: 'sweep process exited before reporting completion' };
4785
+ lastCompletedAt = liveness.startedAt || null;
4786
+ } else if (diskState && diskState.status === 'completed') {
4787
+ lastResult = diskState.lastResult || { ok: true, summary: diskState.summary };
4788
+ lastCompletedAt = diskState.completedAt || null;
4789
+ } else if (diskState && diskState.status === 'failed') {
4790
+ lastResult = { ok: false, error: diskState.error || 'sweep failed' };
4791
+ lastCompletedAt = diskState.completedAt || null;
4727
4792
  }
4728
4793
  return jsonReply(res, 200, { inFlight, startedAt, lastResult, lastCompletedAt });
4729
4794
  }
@@ -1,5 +1,5 @@
1
1
  {
2
2
  "runtime": "copilot",
3
3
  "models": null,
4
- "cachedAt": "2026-05-14T02:50:03.274Z"
4
+ "cachedAt": "2026-05-14T03:32:45.485Z"
5
5
  }
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * engine/kb-sweep-runner.js — Detached entrypoint for the KB sweep.
4
+ *
5
+ * Spawned by dashboard.js `handleKnowledgeSweep` with `{ detached: true,
6
+ * stdio: ['ignore', logFd, logFd] }` so the sweep survives dashboard /
7
+ * engine restarts. The sweep regularly runs 1h+ and was previously killed
8
+ * mid-stream every `minions restart`.
9
+ *
10
+ * Args:
11
+ * --sweep-token <token> Opaque token from the dashboard (string/number).
12
+ * --body-file <path> Optional path to a JSON file with request body
13
+ * fields (pinnedKeys, dryRun). Deleted on exit.
14
+ * --dry-run Equivalent to `body.dryRun = true`.
15
+ *
16
+ * State protocol: runKbSweep itself writes `engine/kb-sweep-state.json`
17
+ * (in-flight → completed/failed) and includes `pid: process.pid` (this
18
+ * runner's pid) so the dashboard can liveness-check via `process.kill(pid, 0)`.
19
+ * Exits 0 on success, 1 on error. stdout/stderr land in engine/kb-sweep.log.
20
+ */
21
+
22
+ const fs = require('fs');
23
+
24
+ function getArg(argv, name) {
25
+ const idx = argv.indexOf(name);
26
+ if (idx >= 0 && idx + 1 < argv.length) return argv[idx + 1];
27
+ return null;
28
+ }
29
+ function hasFlag(argv, name) {
30
+ return argv.indexOf(name) >= 0;
31
+ }
32
+
33
+ const argv = process.argv.slice(2);
34
+ const sweepToken = getArg(argv, '--sweep-token') || String(Date.now());
35
+ const bodyFile = getArg(argv, '--body-file');
36
+ const cliDryRun = hasFlag(argv, '--dry-run');
37
+
38
+ let body = {};
39
+ if (bodyFile) {
40
+ try {
41
+ const raw = fs.readFileSync(bodyFile, 'utf8');
42
+ body = JSON.parse(raw || '{}');
43
+ } catch (e) {
44
+ console.error(`[kb-sweep-runner] failed to read body-file ${bodyFile}: ${e.message}`);
45
+ }
46
+ }
47
+ const dryRun = cliDryRun || body.dryRun === true;
48
+
49
+ // Lazy-require AFTER args are parsed so a malformed body-file doesn't drag in
50
+ // the whole sweep stack before we've reported the failure.
51
+ const queries = require('./queries');
52
+ const { runKbSweep } = require('./kb-sweep');
53
+
54
+ function cleanupBodyFile() {
55
+ if (!bodyFile) return;
56
+ try { fs.unlinkSync(bodyFile); } catch { /* ignore */ }
57
+ }
58
+
59
+ (async () => {
60
+ const startedIso = new Date().toISOString();
61
+ console.log(`[kb-sweep-runner] ${startedIso} starting pid=${process.pid} token=${sweepToken} dryRun=${dryRun}`);
62
+ try {
63
+ const engineConfig = (queries.getConfig() || {}).engine || {};
64
+ const result = await runKbSweep({
65
+ pinnedKeys: body.pinnedKeys,
66
+ engineConfig,
67
+ sweepToken,
68
+ dryRun,
69
+ });
70
+ const summary = result && result.summary ? result.summary : 'ok';
71
+ console.log(`[kb-sweep-runner] ${new Date().toISOString()} done: ${summary}`);
72
+ cleanupBodyFile();
73
+ process.exit(0);
74
+ } catch (e) {
75
+ const msg = e && e.message ? e.message : String(e);
76
+ console.error(`[kb-sweep-runner] ${new Date().toISOString()} error: ${msg}`);
77
+ if (e && e.stack) console.error(e.stack);
78
+ cleanupBodyFile();
79
+ process.exit(1);
80
+ }
81
+ })();
@@ -14,12 +14,14 @@ const path = require('path');
14
14
  const crypto = require('crypto');
15
15
  const shared = require('./shared');
16
16
  const queries = require('./queries');
17
- const { safeRead, safeWrite, safeUnlink, log, ts } = shared;
17
+ const { safeRead, safeWrite, safeJson, safeUnlink, log, ts } = shared;
18
18
  const { MINIONS_DIR, ENGINE_DIR } = queries;
19
19
 
20
20
  const KB_DIR = path.join(MINIONS_DIR, 'knowledge');
21
21
  const SWEPT_DIR = path.join(KB_DIR, '_swept');
22
22
  const KB_SWEEP_STATE_PATH = path.join(ENGINE_DIR, 'kb-sweep-state.json');
23
+ const KB_SWEEP_LOG_PATH = path.join(ENGINE_DIR, 'kb-sweep.log');
24
+ const KB_SWEEP_RUNNER_PATH = path.join(__dirname, 'kb-sweep-runner.js');
23
25
  const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
24
26
  const COMPRESS_THRESHOLD_BYTES = 5000;
25
27
  const LLM_BATCH_SIZE = 30;
@@ -279,7 +281,62 @@ function _applyLlmPlan(plan, manifest, opts = {}) {
279
281
  }
280
282
 
281
283
  function _writeSweepState(state) {
282
- try { safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify(state)); } catch { /* ignore */ }
284
+ // Always include the current process pid + the caller-supplied sweepToken so
285
+ // the dashboard's liveness check (process.kill(pid, 0)) and the stale-guard
286
+ // can distinguish "still running" from "runner crashed". When this module is
287
+ // imported by the detached runner, process.pid is the runner's pid — which
288
+ // is exactly what we want.
289
+ const augmented = { pid: process.pid, ...state };
290
+ try { safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify(augmented)); } catch { /* ignore */ }
291
+ }
292
+
293
+ /**
294
+ * Read kb-sweep-state.json and classify whether a sweep is alive + stale.
295
+ *
296
+ * Used by the dashboard's start endpoint, status endpoint, and stale-guard so
297
+ * they share a single source of truth (disk state + PID liveness) instead of
298
+ * relying on in-memory globals that die with the dashboard process.
299
+ *
300
+ * @param {object} [opts]
301
+ * @param {number} [opts.entryCount=0] - KB entry count for staleGuardMs()
302
+ * @param {number} [opts.now=Date.now()] - injectable clock for tests
303
+ * @param {(pid:number)=>boolean} [opts.isPidAlive] - injectable for tests
304
+ * @returns {{ inFlight: boolean, alive?: boolean, stale?: boolean, pid?: number,
305
+ * startedAt?: number, sweepToken?: string|number|null, guardMs?: number,
306
+ * status?: string }}
307
+ */
308
+ function readSweepLiveness(opts = {}) {
309
+ const now = Number(opts.now) || Date.now();
310
+ const entryCount = Number(opts.entryCount) || 0;
311
+ const isPidAlive = typeof opts.isPidAlive === 'function'
312
+ ? opts.isPidAlive
313
+ : (pid) => { try { process.kill(pid, 0); return true; } catch { return false; } };
314
+ const state = safeJson(KB_SWEEP_STATE_PATH);
315
+ if (!state) return { inFlight: false };
316
+ // "starting" is written by the dashboard pre-spawn (no PID yet) to close the
317
+ // race window between two concurrent POSTs. "in-flight" is written by the
318
+ // runner once it boots and has its own pid.
319
+ if (state.status !== 'starting' && state.status !== 'in-flight') {
320
+ return { inFlight: false, status: state.status };
321
+ }
322
+ const pid = Number(state.pid) || 0;
323
+ const startedAt = Number(state.startedAt) || 0;
324
+ const guardMs = staleGuardMs(entryCount);
325
+ const age = startedAt ? now - startedAt : 0;
326
+ let alive;
327
+ if (state.status === 'starting') {
328
+ // No PID yet — grant a short boot-grace so spawn can complete and the
329
+ // runner can overwrite with status:'in-flight' + its pid.
330
+ const STARTING_GRACE_MS = 15000;
331
+ alive = age <= STARTING_GRACE_MS;
332
+ } else {
333
+ alive = pid > 0 ? !!isPidAlive(pid) : false;
334
+ }
335
+ const stale = !alive || (startedAt > 0 && age > guardMs);
336
+ return {
337
+ inFlight: true, alive, stale, pid, startedAt, guardMs,
338
+ sweepToken: state.sweepToken || null, status: state.status,
339
+ };
283
340
  }
284
341
 
285
342
  /**
@@ -298,23 +355,26 @@ function _writeSweepState(state) {
298
355
  async function runKbSweep(opts = {}) {
299
356
  const dryRun = !!opts.dryRun;
300
357
  const startedAt = Date.now();
301
- if (!dryRun) _writeSweepState({ status: 'in-flight', startedAt, startedAtIso: ts() });
358
+ const sweepToken = opts.sweepToken != null ? opts.sweepToken : null;
359
+ // Always write state — even for dryRun — so a runner spawned with dryRun
360
+ // still reports terminal status and the dashboard pre-write doesn't leak
361
+ // a stale "in-flight"/"starting" record. The inner _runKbSweepImpl still
362
+ // honors dryRun for actual file mutations.
363
+ _writeSweepState({ status: 'in-flight', startedAt, startedAtIso: ts(), sweepToken, dryRun });
302
364
  try {
303
365
  const result = await _runKbSweepImpl(opts);
304
- if (!dryRun) {
305
- _writeSweepState({
306
- status: 'completed', startedAt, completedAt: Date.now(), completedAtIso: ts(),
307
- durationMs: result.durationMs, summary: result.summary, lastResult: result,
308
- });
309
- }
366
+ _writeSweepState({
367
+ status: 'completed', startedAt, completedAt: Date.now(), completedAtIso: ts(),
368
+ durationMs: result.durationMs, summary: result.summary, lastResult: result,
369
+ sweepToken, dryRun,
370
+ });
310
371
  return result;
311
372
  } catch (e) {
312
- if (!dryRun) {
313
- _writeSweepState({
314
- status: 'failed', startedAt, completedAt: Date.now(), completedAtIso: ts(),
315
- error: e && e.message ? e.message : String(e),
316
- });
317
- }
373
+ _writeSweepState({
374
+ status: 'failed', startedAt, completedAt: Date.now(), completedAtIso: ts(),
375
+ error: e && e.message ? e.message : String(e),
376
+ sweepToken, dryRun,
377
+ });
318
378
  throw e;
319
379
  }
320
380
  }
@@ -419,7 +479,10 @@ function staleGuardMs(entryCount) {
419
479
  module.exports = {
420
480
  runKbSweep,
421
481
  staleGuardMs,
482
+ readSweepLiveness,
422
483
  KB_SWEEP_STATE_PATH,
484
+ KB_SWEEP_LOG_PATH,
485
+ KB_SWEEP_RUNNER_PATH,
423
486
  // Exported for tests
424
487
  _hashEntry,
425
488
  _parseFrontmatter,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1929",
3
+ "version": "0.1.1931",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"