@yemi33/minions 0.1.2044 → 0.1.2045
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/js/command-center.js +64 -7
- package/dashboard/js/refresh.js +143 -2
- package/dashboard/js/render-prs.js +43 -9
- package/dashboard/js/settings.js +4 -0
- package/dashboard/styles.css +21 -0
- package/dashboard.js +21 -79
- package/docs/auto-discovery.md +3 -1
- package/docs/qa-runbook-lifecycle.md +71 -0
- package/docs/qa-runbooks.md +6 -5
- package/docs/runtime-adapters.md +1 -1
- package/docs/security.md +2 -1
- package/docs/watches.md +19 -19
- package/engine/cleanup.js +84 -2
- package/engine/dispatch.js +6 -0
- package/engine/kb-sweep.js +127 -0
- package/engine/lifecycle.js +18 -0
- package/engine/queries.js +84 -7
- package/engine/shared.js +36 -0
- package/engine/timeout.js +4 -0
- package/engine.js +240 -11
- package/package.json +1 -1
package/docs/qa-runbooks.md
CHANGED
|
@@ -95,10 +95,11 @@ All writes use `mutateJsonFileLocked` per the repo convention. Deletes use
|
|
|
95
95
|
unlink (so an in-progress `saveRunbook` rename can't race with the
|
|
96
96
|
unlink).
|
|
97
97
|
|
|
98
|
-
##
|
|
98
|
+
## Run records, artifacts, and UI
|
|
99
99
|
|
|
100
|
-
|
|
100
|
+
The deferred follow-up items (W-mpeiwz6k0005bf34-b/c/d) have since landed. Brief pointers — see [CLAUDE.md](../CLAUDE.md) → "QA validation runs" for the deep dive:
|
|
101
101
|
|
|
102
|
-
-
|
|
103
|
-
-
|
|
104
|
-
-
|
|
102
|
+
- **Run dispatch + persistence** (`engine/qa-runs.js`): `POST /api/qa/runbooks/run` creates a `qa-runs.json` record with `status ∈ pending|dispatched|running|passed|failed|error` and dispatches a `qa-validate` work item against the runbook's `target`. Read via `GET /api/qa/runs?limit=N&status=...` and `GET /api/qa/runs/<id>`.
|
|
103
|
+
- **Artifact contract**: the `qa-validate` agent writes `agents/<id>/qa-run.json` before exit; the engine copies listed files into `engine/qa-artifacts/<runId>/` and serves them via `GET /api/qa/artifacts/<runId>/<file>` (path-traversal-gated, 403 on escape). Per-file size cap: `engine.qaArtifactsMaxBytes`.
|
|
104
|
+
- **UI**: `/qa` dashboard page (`dashboard/pages/qa.html`, `dashboard/js/qa.js`) polls `GET /api/qa/runs` every 5s while active; auto-detects screenshots/videos/logs for inline preview.
|
|
105
|
+
- **Playbook**: `playbooks/qa-validate.md` (routed via the synthetic `qa-validate` task-type in `routing.md`).
|
package/docs/runtime-adapters.md
CHANGED
|
@@ -103,7 +103,7 @@ directly.
|
|
|
103
103
|
Agent dispatch resolves the runtime once at spawn time:
|
|
104
104
|
|
|
105
105
|
```js
|
|
106
|
-
// engine.js spawnAgent (~line
|
|
106
|
+
// engine.js spawnAgent (~line 1866)
|
|
107
107
|
const runtime = resolveRuntime(shared.resolveAgentCli(agentConfig, engineConfig));
|
|
108
108
|
```
|
|
109
109
|
|
package/docs/security.md
CHANGED
|
@@ -60,7 +60,8 @@ system. Its threat model:
|
|
|
60
60
|
operator visits could in principle issue requests to `http://127.0.0.1:7331`.
|
|
61
61
|
The dashboard defends against this with:
|
|
62
62
|
- An **Origin gate** on mutating methods (`POST`/`PUT`/`PATCH`/`DELETE`)
|
|
63
|
-
and CORS preflights — see `dashboard.js` ~
|
|
63
|
+
and CORS preflights — see `dashboard.js` ~4565–4609 (and additional
|
|
64
|
+
`isAllowedOrigin` enforcement points in the SSE/CC handlers) and
|
|
64
65
|
`shared.isAllowedOrigin` / `shared.buildSecurityHeaders` in
|
|
65
66
|
[`engine/shared.js`](../engine/shared.js). Requests whose `Origin` (or
|
|
66
67
|
`Referer`, if `Origin` is absent) is not in the local allowlist are
|
package/docs/watches.md
CHANGED
|
@@ -20,11 +20,11 @@ A watch is a small JSON record persisted to `engine/watches.json`. It binds:
|
|
|
20
20
|
| `requires` | Optional guard: array of predicate objects evaluated against `state` / `entity` / `prevState`; trigger is suppressed when any guard fails (false-or-error). Used to gate a watch on "PR is mergeable AND build passing" etc. |
|
|
21
21
|
| `status` | `WATCH_STATUS.ACTIVE` \| `PAUSED` \| `TRIGGERED` \| `EXPIRED` |
|
|
22
22
|
|
|
23
|
-
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184-
|
|
23
|
+
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184-248`)*.
|
|
24
24
|
|
|
25
25
|
## Lifecycle (`WATCH_STATUS`)
|
|
26
26
|
|
|
27
|
-
Defined in `engine/shared.js:
|
|
27
|
+
Defined in `engine/shared.js:2523`:
|
|
28
28
|
|
|
29
29
|
| Status | Meaning |
|
|
30
30
|
|-------------|-------------------------------------------------------------------------|
|
|
@@ -37,10 +37,10 @@ Pause/resume flips the `status` field via `POST /api/watches/update` *(source: `
|
|
|
37
37
|
|
|
38
38
|
## Conditions (`WATCH_CONDITION`)
|
|
39
39
|
|
|
40
|
-
Defined in `engine/shared.js:
|
|
40
|
+
Defined in `engine/shared.js:2539-2577`. Conditions split into two families:
|
|
41
41
|
|
|
42
42
|
### Absolute conditions (`WATCH_ABSOLUTE_CONDITIONS`)
|
|
43
|
-
*(source: `engine/shared.js:
|
|
43
|
+
*(source: `engine/shared.js:2586-2602`)*
|
|
44
44
|
|
|
45
45
|
`merged`, `build-fail`, `build-pass`, `completed`, `failed`, `concluded`, `approved`, `rejected`, `ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`.
|
|
46
46
|
|
|
@@ -49,12 +49,12 @@ When `stopAfter === 0`, these are **fire-once** — the engine flips the watch t
|
|
|
49
49
|
> **Per-target override (W-mp7hg58e000b5212):** the global `WATCH_ABSOLUTE_CONDITIONS` set is the legacy fallback. Each target type now declares its own `absoluteConditions: [...]` array in its spec; `registerTargetType` normalizes that into a `Set` that takes precedence at evaluation time. The plugin contract (see below) uses this to keep absolute-vs-change semantics local to each target type. Plugins that omit `absoluteConditions` get an empty set (all change-based).
|
|
50
50
|
|
|
51
51
|
### Change-based conditions
|
|
52
|
-
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, plus the predicate conditions added under P-w4e2f6a1 / P-w5b8d2c9 for the `pr`, `work-item`, `plan`, and `pipeline` target types (`head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `stalled`, `dependency-met`, `stage-advanced`, `stuck-in-stage`). See `engine/shared.js:
|
|
52
|
+
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, plus the predicate conditions added under P-w4e2f6a1 / P-w5b8d2c9 for the `pr`, `work-item`, `plan`, and `pipeline` target types (`head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `stalled`, `dependency-met`, `stage-advanced`, `stuck-in-stage`). See `engine/shared.js:2539-2577` for the canonical enum.
|
|
53
53
|
|
|
54
54
|
These compare the live entity against the watch's `_lastState` snapshot and run forever when `stopAfter === 0`. Baseline `_lastState` is captured on the first check so the very next change triggers the watch *(source: `engine/watches.js:434, 520`)*.
|
|
55
55
|
|
|
56
56
|
### Tick-counted conditions
|
|
57
|
-
`stalled`, `stuck-in-stage` — require N consecutive unchanged captures (default `WATCH_STALLED_DEFAULT_TICKS = 12`, `WATCH_STUCK_STAGE_DEFAULT_TICKS = 12`, both in `engine/shared.js:
|
|
57
|
+
`stalled`, `stuck-in-stage` — require N consecutive unchanged captures (default `WATCH_STALLED_DEFAULT_TICKS = 12`, `WATCH_STUCK_STAGE_DEFAULT_TICKS = 12`, both in `engine/shared.js:2582-2583`). Counters (`_unchangedTicks`, `_stuckStageTicks`) are recomputed inside `_captureState` by comparing the fresh snapshot against `prevState`.
|
|
58
58
|
|
|
59
59
|
### Predicate conditions
|
|
60
60
|
|
|
@@ -65,11 +65,11 @@ Several condition keys evaluate a derived predicate on the captured entity/state
|
|
|
65
65
|
- **plan** — `all-items-done` (`items_done === items_total > 0`), `item-failed-n-times` (any `missing_features[*]._retryCount >= ENGINE_DEFAULTS.maxRetries`).
|
|
66
66
|
- **pipeline** — `stage-advanced` (`current_stage_id` changed within the same `runId`), `stuck-in-stage` (current stage unchanged for `WATCH_STUCK_STAGE_DEFAULT_TICKS` checks, default 12).
|
|
67
67
|
|
|
68
|
-
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire-once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js:
|
|
68
|
+
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire-once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js:2586` `WATCH_ABSOLUTE_CONDITIONS`)*.
|
|
69
69
|
|
|
70
70
|
## Target Types — `TARGET_TYPES` Registry
|
|
71
71
|
|
|
72
|
-
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:124-
|
|
72
|
+
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:124-153`)*. Each spec must provide:
|
|
73
73
|
|
|
74
74
|
- `label` — human name shown in dashboard pickers
|
|
75
75
|
- `description` — short help text
|
|
@@ -79,17 +79,17 @@ Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(
|
|
|
79
79
|
- `captureState(entity)` — snapshot used for change-detection diffs
|
|
80
80
|
- `evaluate(condition, entity, prevState, target)` — returns `{ triggered, message }`
|
|
81
81
|
|
|
82
|
-
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-
|
|
82
|
+
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-174`)*.
|
|
83
83
|
|
|
84
84
|
### User-extensible via `watches.d/` (W-mp7hg58e000b5212)
|
|
85
85
|
|
|
86
|
-
At engine boot, every `*.js` file in `<MINIONS_DIR>/watches.d/` is auto-loaded **after** the built-in registrations *(source: `engine/watches.js:
|
|
86
|
+
At engine boot, every `*.js` file in `<MINIONS_DIR>/watches.d/` is auto-loaded **after** the built-in registrations *(source: `engine/watches.js:1319-1354`)*, so plugins can both add new target types and override built-ins. A plugin file exports either `{ name, spec }` or an array of such objects. Failures are logged-and-skipped — one bad plugin must not break boot or block other plugins. Reloads require an engine restart.
|
|
87
87
|
|
|
88
88
|
Canonical example: `watches.d/http.js` (W-mp7i22mu00191b07) — a generic HTTP poller covering the full plugin contract including `extractState` (custom snapshot fields not on the entity itself) and `extendTemplateVars` (custom action-template vars like `{{httpStatus}}`, `{{prevExtracted}}`).
|
|
89
89
|
|
|
90
90
|
### Built-in target types
|
|
91
91
|
|
|
92
|
-
The eight built-ins are registered at module load *(source: `engine/watches.js:672-1313`)*. Constants live at `engine/shared.js:
|
|
92
|
+
The eight built-ins are registered at module load *(source: `engine/watches.js:672-1313`)*. Constants live at `engine/shared.js:2529-2538` (`WATCH_TARGET_TYPE`).
|
|
93
93
|
|
|
94
94
|
| `targetType` | Target value | Conditions | Notes |
|
|
95
95
|
|---------------|--------------------------------------|----------------------------------------------------------------------------|-------|
|
|
@@ -102,11 +102,11 @@ The eight built-ins are registered at module load *(source: `engine/watches.js:6
|
|
|
102
102
|
| `dispatch` | Dispatch entry id | `completed`, `failed`, `status-change`, `any` | Looks across `pending` / `active` / `completed` lists |
|
|
103
103
|
| `agent` | Agent id | `activity-change`, `status-change`, `any` | `activity-change` fires only on transitions in/out of `'working'` |
|
|
104
104
|
|
|
105
|
-
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318-
|
|
105
|
+
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318-371`)*.
|
|
106
106
|
|
|
107
107
|
### Plugin folder (`watches.d/`) — user-extensible target types
|
|
108
108
|
|
|
109
|
-
W-mp7hg58e000b5212 added a **plugin folder** so operators can register new target types without editing engine source. At engine boot, `engine/watches.js` scans `<MINIONS_DIR>/watches.d/*.js` *after* the eight built-ins are registered (so plugins can override a built-in by re-using its key — last-write-wins) and calls `registerTargetType()` for each export *(source: `engine/watches.js:
|
|
109
|
+
W-mp7hg58e000b5212 added a **plugin folder** so operators can register new target types without editing engine source. At engine boot, `engine/watches.js` scans `<MINIONS_DIR>/watches.d/*.js` *after* the eight built-ins are registered (so plugins can override a built-in by re-using its key — last-write-wins) and calls `registerTargetType()` for each export *(source: `engine/watches.js:1319-1354`)*.
|
|
110
110
|
|
|
111
111
|
Each `watches.d/<name>.js` file must export `{ name, spec }` (or an array of those):
|
|
112
112
|
|
|
@@ -133,7 +133,7 @@ Resolution is `path.join(shared.MINIONS_DIR, 'watches.d')` so it works in both d
|
|
|
133
133
|
|
|
134
134
|
## Tick Integration
|
|
135
135
|
|
|
136
|
-
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:
|
|
136
|
+
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:6386-6440`)*. The engine builds the state object from cached project files + module reads:
|
|
137
137
|
|
|
138
138
|
```
|
|
139
139
|
{
|
|
@@ -144,7 +144,7 @@ Resolution is `path.join(shared.MINIONS_DIR, 'watches.d')` so it works in both d
|
|
|
144
144
|
}
|
|
145
145
|
```
|
|
146
146
|
|
|
147
|
-
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410-
|
|
147
|
+
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410-561`)*:
|
|
148
148
|
|
|
149
149
|
1. Skips paused/expired watches and any watch checked within its `interval`.
|
|
150
150
|
2. Captures a baseline `_lastState` on first check (so change conditions have something to diff).
|
|
@@ -158,7 +158,7 @@ I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up ac
|
|
|
158
158
|
|
|
159
159
|
## Follow-Up Actions on Trigger
|
|
160
160
|
|
|
161
|
-
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:184-
|
|
161
|
+
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:184-248` `createWatch`, `engine/watch-actions.js:56` `registerActionType`)*. `GET /api/watches/action-types` returns the live list for dashboard pickers.
|
|
162
162
|
|
|
163
163
|
### Built-in actions
|
|
164
164
|
|
|
@@ -174,7 +174,7 @@ I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up ac
|
|
|
174
174
|
| `archive-plan` | Set PRD `status="archived"` + `archivedAt` |
|
|
175
175
|
| `resume-plan` | Set PRD `status=PLAN_STATUS.ACTIVE` and clear `planStale` |
|
|
176
176
|
|
|
177
|
-
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:
|
|
177
|
+
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:2608`); handlers in `engine/watch-actions.js`.
|
|
178
178
|
|
|
179
179
|
### Templating
|
|
180
180
|
|
|
@@ -241,11 +241,11 @@ Absolute conditions firing under `stopAfter === 0` flip `status` to `expired`; `
|
|
|
241
241
|
| Webhook action returns `"only http/https allowed"` | URLs must use `http://` or `https://` schemes; other protocols are rejected by design *(source: `engine/watch-actions.js` `WEBHOOK` handler)* |
|
|
242
242
|
| Trigger fires but follow-up `dispatch-work-item` is missing | Check the engine log for `Watch <id> action <type>: <summary>`. Common reasons: missing `title`, the project's `work-items.json` couldn't be written, or the WI landed in central `work-items.json` because no project was specified |
|
|
243
243
|
| Watch `_lastActionResult` shows `"timeout"` for webhook | Webhooks have a 10s safety timeout to keep the watches tick fast *(source: `engine/watch-actions.js:482-484`)* |
|
|
244
|
-
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:
|
|
244
|
+
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:6386`)*. Inspect `engine/log.json` for `Watch check error (<id>)` lines. Regression #1088: the block must use `getProjects(config)`, never the long-removed `PROJECTS` constant |
|
|
245
245
|
|
|
246
246
|
## See Also
|
|
247
247
|
|
|
248
|
-
- `engine/shared.js:
|
|
248
|
+
- `engine/shared.js:2523-2618` — `WATCH_STATUS`, `WATCH_TARGET_TYPE`, `WATCH_CONDITION`, `WATCH_ABSOLUTE_CONDITIONS`, `WATCH_ACTION_TYPE` constants
|
|
249
249
|
- `engine/watches.js` — registry, lifecycle, tick integration, `watches.d/` plugin loader
|
|
250
250
|
- `engine/watch-actions.js` — action registry and built-in handlers (including `minions-api`)
|
|
251
251
|
- `watches.d/http.js` — canonical user-extensible target type plugin
|
package/engine/cleanup.js
CHANGED
|
@@ -419,6 +419,35 @@ async function runCleanup(config, verbose = false) {
|
|
|
419
419
|
}
|
|
420
420
|
}
|
|
421
421
|
|
|
422
|
+
// 2c. Reap orphan agents/temp-* dirs whose dispatch is no longer referenced
|
|
423
|
+
// anywhere in dispatch.json. Temp-agent dirs are created by the engine for
|
|
424
|
+
// ephemeral temp-<uid> agents; once the dispatch ages out of dispatch
|
|
425
|
+
// history they're never touched again, accumulating MB of live-output.log
|
|
426
|
+
// tails over weeks. 1h mtime gate prevents reaping a still-spawning temp
|
|
427
|
+
// agent that races dispatch.json visibility.
|
|
428
|
+
cleaned.orphanTempAgentDirs = 0;
|
|
429
|
+
try {
|
|
430
|
+
const dispatch = getDispatch();
|
|
431
|
+
const referencedAgents = new Set();
|
|
432
|
+
for (const seg of [dispatch.pending || [], dispatch.active || [], dispatch.completed || [], dispatch.history || []]) {
|
|
433
|
+
for (const e of seg) if (e?.agent) referencedAgents.add(String(e.agent));
|
|
434
|
+
}
|
|
435
|
+
let entries;
|
|
436
|
+
try { entries = fs.readdirSync(AGENTS_DIR, { withFileTypes: true }); } catch { entries = []; }
|
|
437
|
+
for (const entry of entries) {
|
|
438
|
+
if (!entry.isDirectory()) continue;
|
|
439
|
+
if (!entry.name.startsWith('temp-')) continue;
|
|
440
|
+
if (referencedAgents.has(entry.name)) continue;
|
|
441
|
+
const full = path.join(AGENTS_DIR, entry.name);
|
|
442
|
+
let stat; try { stat = fs.statSync(full); } catch { continue; }
|
|
443
|
+
if (stat.mtimeMs >= oneHourAgo) continue;
|
|
444
|
+
try {
|
|
445
|
+
fs.rmSync(full, { recursive: true, force: true });
|
|
446
|
+
cleaned.orphanTempAgentDirs++;
|
|
447
|
+
} catch (err) { log('warn', `orphan temp agent dir ${entry.name}: ${err.message}`); }
|
|
448
|
+
}
|
|
449
|
+
} catch (e) { log('warn', `orphan temp agent sweep: ${e.message}`); }
|
|
450
|
+
|
|
422
451
|
// 2b. Detect git worktrees registered inside any linked project's working tree.
|
|
423
452
|
// Nested worktrees cause glob/grep tools running with cwd=projectRoot to match
|
|
424
453
|
// BOTH copies of every file; a single Edit/MultiEdit then writes the same
|
|
@@ -452,6 +481,57 @@ async function runCleanup(config, verbose = false) {
|
|
|
452
481
|
}
|
|
453
482
|
}
|
|
454
483
|
|
|
484
|
+
// 2d. Reap on-disk worktree dirs not registered in `git worktree list`. Can
// be left behind when removeWorktree fails mid-way, when `git worktree prune`
// ran without a follow-up rm -rf, or after manual `git worktree remove
// --force` leaves an empty dir. Phase 3 below only walks dirs already in
// git's list, so these are invisible to it. 2h mtime gate matches the
// existing age sweep further down.
//
// This phase deletes recursively, so it fails closed:
//   * Registered paths are unioned across ALL projects before any root is
//     scanned. Projects can share a worktreeRoot, and judging a shared root
//     against a single project's list would reap the other projects' live
//     worktrees.
//   * A project whose `git worktree list` throws or resolves to nothing
//     (git always lists at least the main worktree for a healthy repo) marks
//     its roots as unverified; unverified roots are never scanned.
//   * A dir that is, or contains, a registered worktree is never removed.
cleaned.orphanWorktreeDirs = 0;
const _twoHoursAgo = Date.now() - 7200000;
const _scannedWtRoots = new Set();
const _candidateWtRoots = new Set();
const _unverifiedWtRoots = new Set();
const _registeredWtPaths = new Set();
for (const project of projects) {
  const root = project.localPath ? path.resolve(project.localPath) : null;
  if (!root || !fs.existsSync(root)) continue;
  const wtRoots = new Set([
    path.resolve(root, config.engine?.worktreeRoot || '../worktrees'),
    path.join(root, 'worktrees'),
    path.join(root, '.claude/worktrees'),
  ].filter(d => fs.existsSync(d)));
  if (wtRoots.size === 0) continue;
  let listed = [];
  let failure = 'no registered worktrees resolved';
  try {
    const raw = String(shared.execSilent('git worktree list --porcelain', { cwd: root, timeout: 10000, windowsHide: true }) || '');
    listed = shared.parseWorktreePorcelain(raw).map(wt => path.resolve(wt.path));
  } catch (e) {
    failure = e.message;
  }
  if (listed.length === 0) {
    // Cannot tell orphans from live worktrees for this project: skip its roots.
    log('warn', `orphan worktree dir scan for ${project.name || root}: ${failure}`);
    for (const d of wtRoots) _unverifiedWtRoots.add(d);
    continue;
  }
  for (const p of listed) _registeredWtPaths.add(p);
  for (const d of wtRoots) _candidateWtRoots.add(d);
}
const _registeredWtList = [..._registeredWtPaths];
for (const wtRoot of _candidateWtRoots) {
  if (_unverifiedWtRoots.has(wtRoot)) continue;
  _scannedWtRoots.add(wtRoot);
  let entries;
  try { entries = fs.readdirSync(wtRoot, { withFileTypes: true }); } catch { continue; }
  for (const entry of entries) {
    if (!entry.isDirectory()) continue;
    const full = path.resolve(wtRoot, entry.name);
    const nestedPrefix = full + path.sep;
    // Keep the dir when it is a registered worktree or an ancestor of one.
    if (_registeredWtList.some(p => p === full || p.startsWith(nestedPrefix))) continue;
    let stat; try { stat = fs.statSync(full); } catch { continue; }
    if (stat.mtimeMs >= _twoHoursAgo) continue;
    try {
      fs.rmSync(full, { recursive: true, force: true });
      cleaned.orphanWorktreeDirs++;
      log('info', `Cleanup: removed orphan worktree dir ${full} (not registered in git)`);
    } catch (err) {
      log('warn', `orphan worktree dir ${full}: ${err.message}`);
    }
  }
}
|
|
534
|
+
|
|
455
535
|
// 3. Clean git worktrees for merged/abandoned PRs
|
|
456
536
|
const _attemptedWorktreePaths = new Set(); // dedup across projects sharing a worktreeRoot
|
|
457
537
|
for (const project of projects) {
|
|
@@ -909,8 +989,10 @@ async function runCleanup(config, verbose = false) {
|
|
|
909
989
|
}
|
|
910
990
|
} catch (e) { log('warn', 'prune orphaned dispatches: ' + e.message); }
|
|
911
991
|
|
|
912
|
-
|
|
913
|
-
|
|
992
|
+
const _orphanTemp = cleaned.orphanTempAgentDirs || 0;
|
|
993
|
+
const _orphanWt = cleaned.orphanWorktreeDirs || 0;
|
|
994
|
+
if (cleaned.tempFiles + cleaned.liveOutputs + cleaned.worktrees + cleaned.zombies + (cleaned.files || 0) + cleaned.orphanedDispatches + (cleaned.nestedWorktrees || 0) + _orphanTemp + _orphanWt > 0) {
|
|
995
|
+
log('info', `Cleanup: ${cleaned.tempFiles} temp, ${cleaned.liveOutputs} live outputs, ${cleaned.worktrees} worktrees, ${cleaned.zombies} zombies, ${cleaned.files || 0} archives, ${cleaned.orphanedDispatches} orphaned dispatches, ${cleaned.nestedWorktrees || 0} nested worktrees flagged, ${_orphanTemp} orphan temp dirs, ${_orphanWt} orphan worktree dirs`);
|
|
914
996
|
}
|
|
915
997
|
|
|
916
998
|
// 8. Clean swept KB files older than 7 days
|
package/engine/dispatch.js
CHANGED
|
@@ -696,6 +696,12 @@ function completeDispatch(id, result = DISPATCH_RESULT.SUCCESS, reason = '', res
|
|
|
696
696
|
// (overloaded_error / 503). Empty string clears any stale
|
|
697
697
|
// value from an earlier failure cycle.
|
|
698
698
|
wi._lastFailureClass = failureClass || '';
|
|
699
|
+
// W-mpmwxn1j — Bump per-agent retry count so the next dispatch
|
|
700
|
+
// can reassign to a different eligible agent once the same
|
|
701
|
+
// agent hits the threshold. Skip when no agent is resolvable
|
|
702
|
+
// (anonymous failures shouldn't corrupt the map shape).
|
|
703
|
+
const failedAgent = item.agent || wi.dispatched_to;
|
|
704
|
+
if (failedAgent) shared.bumpAgentRetryCount(wi, failedAgent);
|
|
699
705
|
delete wi.failReason;
|
|
700
706
|
delete wi.failedAt;
|
|
701
707
|
delete wi.dispatched_at;
|
package/engine/kb-sweep.js
CHANGED
|
@@ -23,6 +23,8 @@ const KB_SWEEP_STATE_PATH = path.join(ENGINE_DIR, 'kb-sweep-state.json');
|
|
|
23
23
|
const KB_SWEEP_LOG_PATH = path.join(ENGINE_DIR, 'kb-sweep.log');
|
|
24
24
|
const KB_SWEEP_RUNNER_PATH = path.join(__dirname, 'kb-sweep-runner.js');
|
|
25
25
|
const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
26
|
+
const AUTO_SWEEP_INTERVAL_MS = 4 * 60 * 60 * 1000;
|
|
27
|
+
const KB_SWEPT_PATH = path.join(ENGINE_DIR, 'kb-swept.json');
|
|
26
28
|
const COMPRESS_THRESHOLD_BYTES = 5000;
|
|
27
29
|
const LLM_BATCH_SIZE = 30;
|
|
28
30
|
const NORMALIZE_CONCURRENCY = 5;
|
|
@@ -555,6 +557,127 @@ async function _runKbSweepImpl(opts = {}) {
|
|
|
555
557
|
return summary;
|
|
556
558
|
}
|
|
557
559
|
|
|
560
|
+
/**
|
|
561
|
+
* Spawn the KB sweep runner (`engine/kb-sweep-runner.js`) as a detached child.
|
|
562
|
+
* Shared between dashboard's POST /api/knowledge/sweep handler and the engine
|
|
563
|
+
* tick's auto-sweep phase. Performs the same synchronous "starting" → "in-flight"
|
|
564
|
+
* CAS dance the dashboard handler used to do inline.
|
|
565
|
+
*
|
|
566
|
+
* Callers are responsible for the in-flight / stale-guard check BEFORE calling
|
|
567
|
+
* (so they can return distinct HTTP responses or log levels).
|
|
568
|
+
*
|
|
569
|
+
* @param {object} opts
|
|
570
|
+
* @param {string[]} [opts.pinnedKeys] - extra pinned KB keys to skip in the sweep
|
|
571
|
+
* @param {boolean} [opts.dryRun] - dry-run mode for the runner
|
|
572
|
+
* @param {string} [opts.cwd=MINIONS_DIR] - working directory for the spawned runner
|
|
573
|
+
* @param {(level:string,msg:string)=>void} [opts.log] - logger (defaults to console)
|
|
574
|
+
* @returns {{ sweepToken:string, pid:number|null, bodyFile:string|null,
|
|
575
|
+
* ok:boolean, error?:string }}
|
|
576
|
+
* ok=false + error on synchronous spawn failure; the "starting" claim is
|
|
577
|
+
* released so the caller can retry immediately.
|
|
578
|
+
*/
|
|
579
|
+
function spawnSweepRunnerDetached(opts = {}) {
|
|
580
|
+
const fsLocal = require('fs');
|
|
581
|
+
const { spawn: cpSpawn } = require('child_process');
|
|
582
|
+
const logFn = typeof opts.log === 'function'
|
|
583
|
+
? opts.log
|
|
584
|
+
: (level, msg) => { (level === 'error' ? console.error : console.log)(`[kb-sweep] ${msg}`); };
|
|
585
|
+
const cwd = opts.cwd || require('./queries').MINIONS_DIR;
|
|
586
|
+
const startedAt = Date.now();
|
|
587
|
+
const sweepToken = `${startedAt}-${Math.random().toString(36).slice(2, 8)}`;
|
|
588
|
+
|
|
589
|
+
try {
|
|
590
|
+
safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
|
|
591
|
+
status: 'starting', startedAt, startedAtIso: new Date().toISOString(),
|
|
592
|
+
sweepToken, pid: null,
|
|
593
|
+
}));
|
|
594
|
+
} catch (e) {
|
|
595
|
+
logFn('error', `failed to write starting state: ${e.message}`);
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
let bodyFile = null;
|
|
599
|
+
const hasBody = (Array.isArray(opts.pinnedKeys) && opts.pinnedKeys.length > 0)
|
|
600
|
+
|| opts.dryRun != null;
|
|
601
|
+
if (hasBody) {
|
|
602
|
+
bodyFile = path.join(ENGINE_DIR, `tmp-kb-sweep-body-${sweepToken}.json`);
|
|
603
|
+
try {
|
|
604
|
+
safeWrite(bodyFile, JSON.stringify({
|
|
605
|
+
pinnedKeys: Array.isArray(opts.pinnedKeys) ? opts.pinnedKeys : undefined,
|
|
606
|
+
dryRun: opts.dryRun != null ? !!opts.dryRun : undefined,
|
|
607
|
+
}));
|
|
608
|
+
} catch (e) {
|
|
609
|
+
logFn('error', `failed to write body-file ${bodyFile}: ${e.message}`);
|
|
610
|
+
bodyFile = null;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
let logFdNum = null;
|
|
615
|
+
let stdio = ['ignore', 'ignore', 'ignore'];
|
|
616
|
+
try {
|
|
617
|
+
logFdNum = fsLocal.openSync(KB_SWEEP_LOG_PATH, 'a');
|
|
618
|
+
stdio = ['ignore', logFdNum, logFdNum];
|
|
619
|
+
} catch (e) {
|
|
620
|
+
logFn('error', `failed to open log ${KB_SWEEP_LOG_PATH}: ${e.message}`);
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
const spawnArgs = ['--sweep-token', sweepToken];
|
|
624
|
+
if (bodyFile) spawnArgs.push('--body-file', bodyFile);
|
|
625
|
+
|
|
626
|
+
let proc;
|
|
627
|
+
try {
|
|
628
|
+
proc = cpSpawn(process.execPath, [KB_SWEEP_RUNNER_PATH, ...spawnArgs], {
|
|
629
|
+
cwd, stdio, detached: true, windowsHide: true,
|
|
630
|
+
env: { ...process.env },
|
|
631
|
+
});
|
|
632
|
+
} catch (e) {
|
|
633
|
+
if (logFdNum != null) try { fsLocal.closeSync(logFdNum); } catch { /* ignore */ }
|
|
634
|
+
if (bodyFile) try { fsLocal.unlinkSync(bodyFile); } catch { /* ignore */ }
|
|
635
|
+
try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
|
|
636
|
+
return { ok: false, error: `spawn failed: ${e.message}`, sweepToken, pid: null, bodyFile: null };
|
|
637
|
+
}
|
|
638
|
+
if (logFdNum != null) try { fsLocal.closeSync(logFdNum); } catch { /* ignore */ }
|
|
639
|
+
|
|
640
|
+
try {
|
|
641
|
+
const current = safeJson(KB_SWEEP_STATE_PATH);
|
|
642
|
+
if (current && current.status === 'starting' && current.sweepToken === sweepToken) {
|
|
643
|
+
safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
|
|
644
|
+
status: 'in-flight', startedAt, startedAtIso: new Date().toISOString(),
|
|
645
|
+
sweepToken, pid: proc.pid,
|
|
646
|
+
}));
|
|
647
|
+
}
|
|
648
|
+
} catch { /* best-effort */ }
|
|
649
|
+
|
|
650
|
+
proc.unref();
|
|
651
|
+
return { ok: true, sweepToken, pid: proc.pid, bodyFile };
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
/**
 * Decide whether the engine tick should auto-spawn a sweep right now.
 * Read-only: it reads the swept marker (and sweep liveness, unless supplied)
 * but writes nothing. Used by the tick's auto-sweep phase.
 *
 * @param {object} [opts]
 * @param {number} [opts.now=Date.now()] injectable clock (tests); any finite
 *   value is honored, including 0
 * @param {number} [opts.intervalMs=AUTO_SWEEP_INTERVAL_MS]
 * @param {object} [opts.liveness] pre-computed liveness (optional)
 * @param {number} [opts.entryCount=0] forwarded to `readSweepLiveness` when
 *   `opts.liveness` is not supplied
 * @returns {{ shouldSpawn:boolean, reason:string, lastCompletedAt:number|null }}
 *   `reason` is one of 'sweep-in-flight', 'within-interval', 'no-prior-sweep',
 *   'interval-elapsed'. `lastCompletedAt` is null when a sweep is in flight or
 *   no parseable swept timestamp exists.
 */
function shouldAutoSweep(opts = {}) {
  // `||` would discard an injected clock of 0, so test for "absent or not a
  // finite number" explicitly before falling back to the wall clock.
  const injectedNow = opts.now == null ? NaN : Number(opts.now);
  const now = Number.isFinite(injectedNow) ? injectedNow : Date.now();
  const intervalMs = Number(opts.intervalMs) || AUTO_SWEEP_INTERVAL_MS;
  const liveness = opts.liveness || readSweepLiveness({ entryCount: opts.entryCount || 0, now });
  if (liveness.inFlight && liveness.alive && !liveness.stale) {
    return { shouldSpawn: false, reason: 'sweep-in-flight', lastCompletedAt: null };
  }
  const swept = safeJson(KB_SWEPT_PATH);
  const sweptTs = swept && swept.timestamp ? Date.parse(swept.timestamp) : NaN;
  const lastCompletedAt = Number.isFinite(sweptTs) ? sweptTs : null;
  if (lastCompletedAt != null && (now - lastCompletedAt) < intervalMs) {
    return { shouldSpawn: false, reason: 'within-interval', lastCompletedAt };
  }
  return { shouldSpawn: true, reason: lastCompletedAt == null ? 'no-prior-sweep' : 'interval-elapsed', lastCompletedAt };
}
|
|
680
|
+
|
|
558
681
|
/** Compute a dynamic stale-guard timeout based on KB size. */
|
|
559
682
|
function staleGuardMs(entryCount) {
|
|
560
683
|
// 30 minutes minimum, plus 1 second per entry (for the rewrite pass)
|
|
@@ -566,6 +689,10 @@ module.exports = {
|
|
|
566
689
|
staleGuardMs,
|
|
567
690
|
readSweepLiveness,
|
|
568
691
|
reconcileSweepStateOnBoot,
|
|
692
|
+
spawnSweepRunnerDetached,
|
|
693
|
+
shouldAutoSweep,
|
|
694
|
+
AUTO_SWEEP_INTERVAL_MS,
|
|
695
|
+
KB_SWEPT_PATH,
|
|
569
696
|
KB_SWEEP_STATE_PATH,
|
|
570
697
|
KB_SWEEP_LOG_PATH,
|
|
571
698
|
KB_SWEEP_RUNNER_PATH,
|
package/engine/lifecycle.js
CHANGED
|
@@ -595,6 +595,7 @@ function updateWorkItemStatus(meta, status, reason) {
|
|
|
595
595
|
delete target.failReason;
|
|
596
596
|
delete target.failedAt;
|
|
597
597
|
delete target._retryCount;
|
|
598
|
+
delete target._retriesByAgent;
|
|
598
599
|
target.completedAgents = Object.entries(target.agentResults)
|
|
599
600
|
.filter(([, r]) => r.status === WI_STATUS.DONE)
|
|
600
601
|
.map(([a]) => a);
|
|
@@ -611,6 +612,7 @@ function updateWorkItemStatus(meta, status, reason) {
|
|
|
611
612
|
delete target.failReason;
|
|
612
613
|
delete target.failedAt;
|
|
613
614
|
delete target._retryCount;
|
|
615
|
+
delete target._retriesByAgent;
|
|
614
616
|
// P-e0b4f7a5 — successful completion (including a phantom-retry
|
|
615
617
|
// succeeding) clears the phantom markers so cleanup can reap the
|
|
616
618
|
// worktree on the next sweep.
|
|
@@ -3218,6 +3220,14 @@ function _deferRetryWithCounter(meta, detection, counterField, maxCount, pending
|
|
|
3218
3220
|
w._lastRetryAt = ts();
|
|
3219
3221
|
w._lastRetryReason = reason;
|
|
3220
3222
|
w._pendingReason = pendingReason;
|
|
3223
|
+
// W-mpmwxn1j — only the standard PR-attachment / nonterminal counter
|
|
3224
|
+
// (_retryCount) participates in per-agent reassignment. Phantom
|
|
3225
|
+
// retries (runtime crashes before any work product) are not
|
|
3226
|
+
// agent-specific failures, so we don't bump _retriesByAgent for them.
|
|
3227
|
+
if (counterField === '_retryCount') {
|
|
3228
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
3229
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
3230
|
+
}
|
|
3221
3231
|
// P-e0b4f7a5 — phantom-retry path stamps _phantomCompletion +
|
|
3222
3232
|
// _phantomBranch so cleanup.js can preserve the worktree across the
|
|
3223
3233
|
// re-dispatch window. Only set for the phantom counter; nonterminal
|
|
@@ -4018,6 +4028,10 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4018
4028
|
w._retryCount = retries + 1;
|
|
4019
4029
|
w._lastRetryAt = ts();
|
|
4020
4030
|
w._lastRetryReason = 'no review verdict';
|
|
4031
|
+
// W-mpmwxn1j — bump per-agent counter so a reviewer who never
|
|
4032
|
+
// emits a verdict gets reassigned after maxRetriesPerAgent hits.
|
|
4033
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
4034
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
4021
4035
|
delete w.dispatched_at;
|
|
4022
4036
|
delete w.completedAt;
|
|
4023
4037
|
delete w._pendingReason;
|
|
@@ -4125,6 +4139,10 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4125
4139
|
if (retries < ENGINE_DEFAULTS.maxRetries) {
|
|
4126
4140
|
w.status = WI_STATUS.PENDING;
|
|
4127
4141
|
w._retryCount = retries + 1;
|
|
4142
|
+
// W-mpmwxn1j — bump per-agent counter so a planner that never
|
|
4143
|
+
// writes the PRD gets reassigned after maxRetriesPerAgent hits.
|
|
4144
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
4145
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
4128
4146
|
delete w.dispatched_at;
|
|
4129
4147
|
delete w.completedAt;
|
|
4130
4148
|
log('warn', `plan-to-prd ${meta.item.id} completed without PRD file — auto-retry ${retries + 1}/${ENGINE_DEFAULTS.maxRetries}`);
|