@yemi33/minions 0.1.2044 → 0.1.2046
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dashboard/js/command-center.js +64 -7
- package/dashboard/js/fre.js +3 -2
- package/dashboard/js/refresh.js +143 -2
- package/dashboard/js/render-prs.js +43 -9
- package/dashboard/js/settings.js +9 -5
- package/dashboard/styles.css +21 -0
- package/dashboard.js +308 -164
- package/docs/auto-discovery.md +3 -1
- package/docs/qa-runbook-lifecycle.md +71 -0
- package/docs/qa-runbooks.md +6 -5
- package/docs/runtime-adapters.md +9 -4
- package/docs/security.md +2 -1
- package/docs/watches.md +19 -19
- package/engine/cc-worker-pool.js +87 -11
- package/engine/cleanup.js +84 -2
- package/engine/dispatch.js +6 -0
- package/engine/kb-sweep.js +127 -0
- package/engine/lifecycle.js +18 -0
- package/engine/llm.js +148 -2
- package/engine/preflight.js +5 -5
- package/engine/queries.js +133 -27
- package/engine/shared.js +40 -3
- package/engine/timeout.js +4 -0
- package/engine.js +240 -11
- package/package.json +1 -1
package/docs/auto-discovery.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Auto-Discovery & Execution Pipeline
|
|
2
2
|
|
|
3
|
-
> Last verified: 2026-05-
|
|
3
|
+
> Last verified: 2026-05-25 against `engine.js` `tickInner()` (lines 6293-6947) and `routing.md`.
|
|
4
4
|
|
|
5
5
|
How the minions engine finds work and dispatches agents automatically.
|
|
6
6
|
|
|
@@ -199,6 +199,8 @@ routing.md table (see the file for the authoritative list):
|
|
|
199
199
|
decompose → ripley (fallback: rebecca)
|
|
200
200
|
meeting → ripley (fallback: lambert)
|
|
201
201
|
docs → lambert (fallback: _any_)
|
|
202
|
+
setup → dallas (fallback: _any_)
|
|
203
|
+
qa-validate → dallas (fallback: ralph)
|
|
202
204
|
```
|
|
203
205
|
|
|
204
206
|
Resolution order:
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# QA runbook lifecycle (W-mpeiwz6k0005bf34)
|
|
2
|
+
|
|
3
|
+
Validation runbooks are dispatched against live managed instances. This mirrors the
|
|
4
|
+
managed-spawn lifecycle (declare → engine spawns → healthcheck → observable)
|
|
5
|
+
but is optimized for human/agent-driven smoke + E2E flows. They are surfaced on the
|
|
6
|
+
`/qa` dashboard page (`dashboard/pages/qa.html`, `dashboard/js/qa.js`); full
|
|
7
|
+
live-process inventory remains on `/engine` (do NOT mirror it; see
|
|
8
|
+
W-mpdad3mq000m53bb).
|
|
9
|
+
|
|
10
|
+
## Runbook location
|
|
11
|
+
|
|
12
|
+
`qa-runbooks.json` (engine state, JSON list keyed by `id`). Each entry:
|
|
13
|
+
`{ id, name, target, steps, expectedArtifacts, createdAt, createdBy }`.
|
|
14
|
+
CRUD via `GET/POST /api/qa/runbooks` (POST returns the new runbook with
|
|
15
|
+
engine-assigned `id`). `target` is a name from `/api/managed-processes` or
|
|
16
|
+
`/api/keep-processes` (deduped by `<project>::<name>`, managed wins).
|
|
17
|
+
|
|
18
|
+
## Run-record path
|
|
19
|
+
|
|
20
|
+
`qa-runs.json` (newest-first, capped). Each run:
|
|
21
|
+
`{ id, runbookId, runbookName, target, status, startedAt, completedAt, workItemId, agentId, artifacts }`.
|
|
22
|
+
`status` ∈ `pending|dispatched|running|passed|failed|error`. Created by
|
|
23
|
+
`POST /api/qa/runbooks/run` (`{ id }`), which dispatches a `qa-validate`
|
|
24
|
+
work item and seeds the run with `dispatched`. Read via
|
|
25
|
+
`GET /api/qa/runs?limit=50` — UI polls every 5s while the QA page is active
|
|
26
|
+
and clears the interval on page navigation via the `switchPage` wrapper in
|
|
27
|
+
`dashboard/js/qa.js` (matches `_stopPlanPoll`/`_stopMeetingPoll` pattern in
|
|
28
|
+
`dashboard/js/state.js`).
|
|
29
|
+
|
|
30
|
+
## Artifact contract
|
|
31
|
+
|
|
32
|
+
`engine/qa-artifacts/<runId>/<file>`, served via
|
|
33
|
+
`GET /api/qa/artifacts/<runId>/<file>`. Files are agent-uploaded screenshots,
|
|
34
|
+
video recordings, and log captures listed in the run record's
|
|
35
|
+
`artifacts: [{ file, kind }]`. UI auto-detects:
|
|
36
|
+
`screenshot`/`png|jpg|jpeg|gif|webp|svg` → `<img>`;
|
|
37
|
+
`video`/`mp4|webm|ogg|mov` → `<video controls>`; everything else → log
|
|
38
|
+
preview (first 40 lines fetched lazily) with a `View full` link to the same
|
|
39
|
+
endpoint. **No direct filesystem paths are exposed** — every artifact URL
|
|
40
|
+
goes through `/api/qa/artifacts/` so path traversal is server-gated.
|
|
41
|
+
Optional config: `engine.qaArtifactsMaxBytes` caps per-file upload size;
|
|
42
|
+
when set, dashboard Settings exposes a matching toggle (CLAUDE.md
|
|
43
|
+
best-practice #15).
|
|
44
|
+
|
|
45
|
+
## Agent sidecar shape
|
|
46
|
+
|
|
47
|
+
The `qa-validate` agent writes `agents/<id>/qa-run.json` before exit:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{ "runId": "qa-run-<id>",
|
|
51
|
+
"status": "passed|failed|error",
|
|
52
|
+
"summary": "...",
|
|
53
|
+
"artifacts": [ { "file": "dashboard.png", "kind": "screenshot" },
|
|
54
|
+
{ "file": "test.log", "kind": "log" } ],
|
|
55
|
+
"written_by": "<agentId>", "wi_id": "<workItemId>" }
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The engine reads the sidecar in `onAgentClose`, copies the listed files into
|
|
59
|
+
`engine/qa-artifacts/<runId>/`, and stamps the run record with `status`,
|
|
60
|
+
`completedAt`, and the recorded `artifacts`. Sidecar validation failure is
|
|
61
|
+
non-retryable (`failure_class: invalid-qa-run`); listed files outside the
|
|
62
|
+
agent worktree or larger than `qaArtifactsMaxBytes` are rejected without
|
|
63
|
+
stamping the run.
|
|
64
|
+
|
|
65
|
+
## Entry point
|
|
66
|
+
|
|
67
|
+
`playbooks/qa-validate.md`. Dispatched by `POST /api/qa/runbooks/run`;
|
|
68
|
+
receives `target`, `steps`, `expectedArtifacts` as template vars; required
|
|
69
|
+
to write the sidecar above before exit. Routing line in `routing.md` maps
|
|
70
|
+
the synthetic `qa-validate` task-type to the playbook so manual dispatches
|
|
71
|
+
work too.
|
package/docs/qa-runbooks.md
CHANGED
|
@@ -95,10 +95,11 @@ All writes use `mutateJsonFileLocked` per the repo convention. Deletes use
|
|
|
95
95
|
unlink (so an in-progress `saveRunbook` rename can't race with the
|
|
96
96
|
unlink).
|
|
97
97
|
|
|
98
|
-
##
|
|
98
|
+
## Run records, artifacts, and UI
|
|
99
99
|
|
|
100
|
-
|
|
100
|
+
The deferred follow-up items (W-mpeiwz6k0005bf34-b/c/d) have since landed. Brief pointers — see [CLAUDE.md](../CLAUDE.md) → "QA validation runs" for the deep dive:
|
|
101
101
|
|
|
102
|
-
-
|
|
103
|
-
-
|
|
104
|
-
-
|
|
102
|
+
- **Run dispatch + persistence** (`engine/qa-runs.js`): `POST /api/qa/runbooks/run` creates a `qa-runs.json` record with `status ∈ pending|dispatched|running|passed|failed|error` and dispatches a `qa-validate` work item against the runbook's `target`. Read via `GET /api/qa/runs?limit=N&status=...` and `GET /api/qa/runs/<id>`.
|
|
103
|
+
- **Artifact contract**: the `qa-validate` agent writes `agents/<id>/qa-run.json` before exit; the engine copies listed files into `engine/qa-artifacts/<runId>/` and serves them via `GET /api/qa/artifacts/<runId>/<file>` (path-traversal-gated, 403 on escape). Per-file size cap: `engine.qaArtifactsMaxBytes`.
|
|
104
|
+
- **UI**: `/qa` dashboard page (`dashboard/pages/qa.html`, `dashboard/js/qa.js`) polls `GET /api/qa/runs` every 5s while active; auto-detects screenshots/videos/logs for inline preview.
|
|
105
|
+
- **Playbook**: `playbooks/qa-validate.md` (routed via the synthetic `qa-validate` task-type in `routing.md`).
|
package/docs/runtime-adapters.md
CHANGED
|
@@ -14,7 +14,12 @@ behavior is hidden behind an adapter object resolved through `resolveRuntime()`.
|
|
|
14
14
|
|
|
15
15
|
`resolveRuntime(name)` throws when `name` is unknown so misconfigurations surface
|
|
16
16
|
at dispatch time instead of producing silent fallbacks deep inside spawn logic.
|
|
17
|
-
|
|
17
|
+
When `name` is `null`/omitted, `resolveRuntime()` falls back to `'claude'` for
|
|
18
|
+
parser-routing compatibility (Copilot's `parseOutput` cannot consume the Claude
|
|
19
|
+
JSONL `{type:"result",result:"..."}` shape — see W-mpmwxkk40007c995). The fleet
|
|
20
|
+
default that determines which runtime *new spawns* use is separate:
|
|
21
|
+
`ENGINE_DEFAULTS.defaultCli` (also in W-mpmwxkk40007c995) is now `'copilot'`, so
|
|
22
|
+
operators with no explicit `engine.defaultCli` get Copilot on dispatch.
|
|
18
23
|
|
|
19
24
|
## Adapter Interface
|
|
20
25
|
|
|
@@ -93,8 +98,8 @@ directly.
|
|
|
93
98
|
|
|
94
99
|
| Helper | Chain |
|
|
95
100
|
|--------|-------|
|
|
96
|
-
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'
|
|
97
|
-
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'
|
|
101
|
+
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'copilot'` |
|
|
102
|
+
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'copilot'` |
|
|
98
103
|
| `resolveAgentModel(agent, engine)` | `agent.model` → `engine.defaultModel` → undefined |
|
|
99
104
|
| `resolveCcModel(engine)` | `engine.ccModel` → `engine.defaultModel` → undefined |
|
|
100
105
|
| `resolveAgentMaxBudget(agent, engine)` | `agent.maxBudgetUsd` → `engine.maxBudgetUsd`. Honors literal `0`. |
|
|
@@ -103,7 +108,7 @@ directly.
|
|
|
103
108
|
Agent dispatch resolves the runtime once at spawn time:
|
|
104
109
|
|
|
105
110
|
```js
|
|
106
|
-
// engine.js spawnAgent (~line
|
|
111
|
+
// engine.js spawnAgent (~line 1866)
|
|
107
112
|
const runtime = resolveRuntime(shared.resolveAgentCli(agentConfig, engineConfig));
|
|
108
113
|
```
|
|
109
114
|
|
package/docs/security.md
CHANGED
|
@@ -60,7 +60,8 @@ system. Its threat model:
|
|
|
60
60
|
operator visits could in principle issue requests to `http://127.0.0.1:7331`.
|
|
61
61
|
The dashboard defends against this with:
|
|
62
62
|
- An **Origin gate** on mutating methods (`POST`/`PUT`/`PATCH`/`DELETE`)
|
|
63
|
-
and CORS preflights — see `dashboard.js` ~
|
|
63
|
+
and CORS preflights — see `dashboard.js` ~4565–4609 (and additional
|
|
64
|
+
`isAllowedOrigin` enforcement points in the SSE/CC handlers) and
|
|
64
65
|
`shared.isAllowedOrigin` / `shared.buildSecurityHeaders` in
|
|
65
66
|
[`engine/shared.js`](../engine/shared.js). Requests whose `Origin` (or
|
|
66
67
|
`Referer`, if `Origin` is absent) is not in the local allowlist are
|
package/docs/watches.md
CHANGED
|
@@ -20,11 +20,11 @@ A watch is a small JSON record persisted to `engine/watches.json`. It binds:
|
|
|
20
20
|
| `requires` | Optional guard: array of predicate objects evaluated against `state` / `entity` / `prevState`; trigger is suppressed when any guard fails (false-or-error). Used to gate a watch on "PR is mergeable AND build passing" etc. |
|
|
21
21
|
| `status` | `WATCH_STATUS.ACTIVE` \| `PAUSED` \| `TRIGGERED` \| `EXPIRED` |
|
|
22
22
|
|
|
23
|
-
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184-
|
|
23
|
+
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184-248`)*.
|
|
24
24
|
|
|
25
25
|
## Lifecycle (`WATCH_STATUS`)
|
|
26
26
|
|
|
27
|
-
Defined in `engine/shared.js:
|
|
27
|
+
Defined in `engine/shared.js:2523`:
|
|
28
28
|
|
|
29
29
|
| Status | Meaning |
|
|
30
30
|
|-------------|-------------------------------------------------------------------------|
|
|
@@ -37,10 +37,10 @@ Pause/resume flips the `status` field via `POST /api/watches/update` *(source: `
|
|
|
37
37
|
|
|
38
38
|
## Conditions (`WATCH_CONDITION`)
|
|
39
39
|
|
|
40
|
-
Defined in `engine/shared.js:
|
|
40
|
+
Defined in `engine/shared.js:2539-2577`. Conditions split into two families:
|
|
41
41
|
|
|
42
42
|
### Absolute conditions (`WATCH_ABSOLUTE_CONDITIONS`)
|
|
43
|
-
*(source: `engine/shared.js:
|
|
43
|
+
*(source: `engine/shared.js:2586-2602`)*
|
|
44
44
|
|
|
45
45
|
`merged`, `build-fail`, `build-pass`, `completed`, `failed`, `concluded`, `approved`, `rejected`, `ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`.
|
|
46
46
|
|
|
@@ -49,12 +49,12 @@ When `stopAfter === 0`, these are **fire-once** — the engine flips the watch t
|
|
|
49
49
|
> **Per-target override (W-mp7hg58e000b5212):** the global `WATCH_ABSOLUTE_CONDITIONS` set is the legacy fallback. Each target type now declares its own `absoluteConditions: [...]` array in its spec; `registerTargetType` normalizes that into a `Set` that takes precedence at evaluation time. The plugin contract (see below) uses this to keep absolute-vs-change semantics local to each target type. Plugins that omit `absoluteConditions` get an empty set (all change-based).
|
|
50
50
|
|
|
51
51
|
### Change-based conditions
|
|
52
|
-
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, plus the predicate conditions added under P-w4e2f6a1 / P-w5b8d2c9 for the `pr`, `work-item`, `plan`, and `pipeline` target types (`head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `stalled`, `dependency-met`, `stage-advanced`, `stuck-in-stage`). See `engine/shared.js:
|
|
52
|
+
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, plus the predicate conditions added under P-w4e2f6a1 / P-w5b8d2c9 for the `pr`, `work-item`, `plan`, and `pipeline` target types (`head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `stalled`, `dependency-met`, `stage-advanced`, `stuck-in-stage`). See `engine/shared.js:2539-2577` for the canonical enum.
|
|
53
53
|
|
|
54
54
|
These compare the live entity against the watch's `_lastState` snapshot and run forever when `stopAfter === 0`. Baseline `_lastState` is captured on the first check so the very next change triggers the watch *(source: `engine/watches.js:434, 520`)*.
|
|
55
55
|
|
|
56
56
|
### Tick-counted conditions
|
|
57
|
-
`stalled`, `stuck-in-stage` — require N consecutive unchanged captures (default `WATCH_STALLED_DEFAULT_TICKS = 12`, `WATCH_STUCK_STAGE_DEFAULT_TICKS = 12`, both in `engine/shared.js:
|
|
57
|
+
`stalled`, `stuck-in-stage` — require N consecutive unchanged captures (default `WATCH_STALLED_DEFAULT_TICKS = 12`, `WATCH_STUCK_STAGE_DEFAULT_TICKS = 12`, both in `engine/shared.js:2582-2583`). Counters (`_unchangedTicks`, `_stuckStageTicks`) are recomputed inside `_captureState` by comparing the fresh snapshot against `prevState`.
|
|
58
58
|
|
|
59
59
|
### Predicate conditions
|
|
60
60
|
|
|
@@ -65,11 +65,11 @@ Several condition keys evaluate a derived predicate on the captured entity/state
|
|
|
65
65
|
- **plan** — `all-items-done` (`items_done === items_total > 0`), `item-failed-n-times` (any `missing_features[*]._retryCount >= ENGINE_DEFAULTS.maxRetries`).
|
|
66
66
|
- **pipeline** — `stage-advanced` (`current_stage_id` changed within the same `runId`), `stuck-in-stage` (current stage unchanged for `WATCH_STUCK_STAGE_DEFAULT_TICKS` checks, default 12).
|
|
67
67
|
|
|
68
|
-
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire-once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js:
|
|
68
|
+
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire-once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js:2586` `WATCH_ABSOLUTE_CONDITIONS`)*.
|
|
69
69
|
|
|
70
70
|
## Target Types — `TARGET_TYPES` Registry
|
|
71
71
|
|
|
72
|
-
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:124-
|
|
72
|
+
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:124-153`)*. Each spec must provide:
|
|
73
73
|
|
|
74
74
|
- `label` — human name shown in dashboard pickers
|
|
75
75
|
- `description` — short help text
|
|
@@ -79,17 +79,17 @@ Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(
|
|
|
79
79
|
- `captureState(entity)` — snapshot used for change-detection diffs
|
|
80
80
|
- `evaluate(condition, entity, prevState, target)` — returns `{ triggered, message }`
|
|
81
81
|
|
|
82
|
-
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-
|
|
82
|
+
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-174`)*.
|
|
83
83
|
|
|
84
84
|
### User-extensible via `watches.d/` (W-mp7hg58e000b5212)
|
|
85
85
|
|
|
86
|
-
At engine boot, every `*.js` file in `<MINIONS_DIR>/watches.d/` is auto-loaded **after** the built-in registrations *(source: `engine/watches.js:
|
|
86
|
+
At engine boot, every `*.js` file in `<MINIONS_DIR>/watches.d/` is auto-loaded **after** the built-in registrations *(source: `engine/watches.js:1319-1354`)*, so plugins can both add new target types and override built-ins. A plugin file exports either `{ name, spec }` or an array of such objects. Failures are logged-and-skipped — one bad plugin must not break boot or block other plugins. Reloads require an engine restart.
|
|
87
87
|
|
|
88
88
|
Canonical example: `watches.d/http.js` (W-mp7i22mu00191b07) — a generic HTTP poller covering the full plugin contract including `extractState` (custom snapshot fields not on the entity itself) and `extendTemplateVars` (custom action-template vars like `{{httpStatus}}`, `{{prevExtracted}}`).
|
|
89
89
|
|
|
90
90
|
### Built-in target types
|
|
91
91
|
|
|
92
|
-
The eight built-ins are registered at module load *(source: `engine/watches.js:672-1313`)*. Constants live at `engine/shared.js:
|
|
92
|
+
The eight built-ins are registered at module load *(source: `engine/watches.js:672-1313`)*. Constants live at `engine/shared.js:2529-2538` (`WATCH_TARGET_TYPE`).
|
|
93
93
|
|
|
94
94
|
| `targetType` | Target value | Conditions | Notes |
|
|
95
95
|
|---------------|--------------------------------------|----------------------------------------------------------------------------|-------|
|
|
@@ -102,11 +102,11 @@ The eight built-ins are registered at module load *(source: `engine/watches.js:6
|
|
|
102
102
|
| `dispatch` | Dispatch entry id | `completed`, `failed`, `status-change`, `any` | Looks across `pending` / `active` / `completed` lists |
|
|
103
103
|
| `agent` | Agent id | `activity-change`, `status-change`, `any` | `activity-change` fires only on transitions in/out of `'working'` |
|
|
104
104
|
|
|
105
|
-
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318-
|
|
105
|
+
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318-371`)*.
|
|
106
106
|
|
|
107
107
|
### Plugin folder (`watches.d/`) — user-extensible target types
|
|
108
108
|
|
|
109
|
-
W-mp7hg58e000b5212 added a **plugin folder** so operators can register new target types without editing engine source. At engine boot, `engine/watches.js` scans `<MINIONS_DIR>/watches.d/*.js` *after* the eight built-ins are registered (so plugins can override a built-in by re-using its key — last-write-wins) and calls `registerTargetType()` for each export *(source: `engine/watches.js:
|
|
109
|
+
W-mp7hg58e000b5212 added a **plugin folder** so operators can register new target types without editing engine source. At engine boot, `engine/watches.js` scans `<MINIONS_DIR>/watches.d/*.js` *after* the eight built-ins are registered (so plugins can override a built-in by re-using its key — last-write-wins) and calls `registerTargetType()` for each export *(source: `engine/watches.js:1319-1354`)*.
|
|
110
110
|
|
|
111
111
|
Each `watches.d/<name>.js` file must export `{ name, spec }` (or an array of those):
|
|
112
112
|
|
|
@@ -133,7 +133,7 @@ Resolution is `path.join(shared.MINIONS_DIR, 'watches.d')` so it works in both d
|
|
|
133
133
|
|
|
134
134
|
## Tick Integration
|
|
135
135
|
|
|
136
|
-
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:
|
|
136
|
+
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:6386-6440`)*. The engine builds the state object from cached project files + module reads:
|
|
137
137
|
|
|
138
138
|
```
|
|
139
139
|
{
|
|
@@ -144,7 +144,7 @@ Resolution is `path.join(shared.MINIONS_DIR, 'watches.d')` so it works in both d
|
|
|
144
144
|
}
|
|
145
145
|
```
|
|
146
146
|
|
|
147
|
-
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410-
|
|
147
|
+
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410-561`)*:
|
|
148
148
|
|
|
149
149
|
1. Skips paused/expired watches and any watch checked within its `interval`.
|
|
150
150
|
2. Captures a baseline `_lastState` on first check (so change conditions have something to diff).
|
|
@@ -158,7 +158,7 @@ I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up ac
|
|
|
158
158
|
|
|
159
159
|
## Follow-Up Actions on Trigger
|
|
160
160
|
|
|
161
|
-
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:184-
|
|
161
|
+
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:184-248` `createWatch`, `engine/watch-actions.js:56` `registerActionType`)*. `GET /api/watches/action-types` returns the live list for dashboard pickers.
|
|
162
162
|
|
|
163
163
|
### Built-in actions
|
|
164
164
|
|
|
@@ -174,7 +174,7 @@ I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up ac
|
|
|
174
174
|
| `archive-plan` | Set PRD `status="archived"` + `archivedAt` |
|
|
175
175
|
| `resume-plan` | Set PRD `status=PLAN_STATUS.ACTIVE` and clear `planStale` |
|
|
176
176
|
|
|
177
|
-
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:
|
|
177
|
+
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:2608`); handlers in `engine/watch-actions.js`.
|
|
178
178
|
|
|
179
179
|
### Templating
|
|
180
180
|
|
|
@@ -241,11 +241,11 @@ Absolute conditions firing under `stopAfter === 0` flip `status` to `expired`; `
|
|
|
241
241
|
| Webhook action returns `"only http/https allowed"` | URLs must use `http://` or `https://` schemes; other protocols are rejected by design *(source: `engine/watch-actions.js` `WEBHOOK` handler)* |
|
|
242
242
|
| Trigger fires but follow-up `dispatch-work-item` is missing | Check the engine log for `Watch <id> action <type>: <summary>`. Common reasons: missing `title`, the project's `work-items.json` couldn't be written, or the WI landed in central `work-items.json` because no project was specified |
|
|
243
243
|
| Watch `_lastActionResult` shows `"timeout"` for webhook | Webhooks have a 10s safety timeout to keep the watches tick fast *(source: `engine/watch-actions.js:482-484`)* |
|
|
244
|
-
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:
|
|
244
|
+
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:6386`)*. Inspect `engine/log.json` for `Watch check error (<id>)` lines. Regression #1088: the block must use `getProjects(config)`, never the long-removed `PROJECTS` constant |
|
|
245
245
|
|
|
246
246
|
## See Also
|
|
247
247
|
|
|
248
|
-
- `engine/shared.js:
|
|
248
|
+
- `engine/shared.js:2523-2618` — `WATCH_STATUS`, `WATCH_TARGET_TYPE`, `WATCH_CONDITION`, `WATCH_ABSOLUTE_CONDITIONS`, `WATCH_ACTION_TYPE` constants
|
|
249
249
|
- `engine/watches.js` — registry, lifecycle, tick integration, `watches.d/` plugin loader
|
|
250
250
|
- `engine/watch-actions.js` — action registry and built-in handlers (including `minions-api`)
|
|
251
251
|
- `watches.d/http.js` — canonical user-extensible target type plugin
|
package/engine/cc-worker-pool.js
CHANGED
|
@@ -54,6 +54,45 @@
|
|
|
54
54
|
const { spawn } = require('child_process');
|
|
55
55
|
const crypto = require('crypto');
|
|
56
56
|
|
|
57
|
+
// W-mpmwxni2000c25c7-c — typed error codes the pool emits through every
|
|
58
|
+
// failure exit so the consumer (CC streaming handler / doc-chat pool
|
|
59
|
+
// wrapper / SSE writer) can render a structured error envelope instead of
|
|
60
|
+
// parsing the stderr string. Matches the `{ message, code, retriable }`
|
|
61
|
+
// shape sub-item b standardized on for the dashboard's SSE envelope and
|
|
62
|
+
// the runtime adapter parseError() contract (engine/runtimes/*.js).
|
|
63
|
+
const ERROR_CODES = Object.freeze({
|
|
64
|
+
// spawn() threw synchronously OR the child process emitted an 'error'
|
|
65
|
+
// event (binary missing on PATH, exec failed, EPERM, etc.). Retriable
|
|
66
|
+
// because a transient PATH / fs glitch may recover.
|
|
67
|
+
WORKER_SPAWN_FAILED: 'worker-spawn-failed',
|
|
68
|
+
// The worker process exited DURING the ACP handshake (initialize or
|
|
69
|
+
// session/new) — usually `copilot login` is incomplete or the CLI
|
|
70
|
+
// version is too old. Also fires when session/new returns no
|
|
71
|
+
// sessionId. Retriable: the engine swaps to a fallback model / a re-auth
|
|
72
|
+
// may unblock the next attempt.
|
|
73
|
+
ACP_HANDSHAKE_FAILED: 'acp-handshake-failed',
|
|
74
|
+
// The worker process exited AFTER a successful handshake (the daemon
|
|
75
|
+
// died mid-turn). Retriable — the next call cold-spawns a fresh worker.
|
|
76
|
+
WORKER_DIED: 'worker-died',
|
|
77
|
+
// The consumer's per-turn timeout fired before the ACP session/prompt
|
|
78
|
+
// resolved. Owned by the dashboard pool wrappers (cc-worker-pool itself
|
|
79
|
+
// has no turn timeout) but exported here so all callers stringify the
|
|
80
|
+
// same constant. Retriable — most timeouts are transient.
|
|
81
|
+
CC_TURN_TIMEOUT: 'cc-turn-timeout',
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
// Build a typed Error carrying the `{ message, code, retriable }` envelope
|
|
85
|
+
// fields the consumer expects. Plain Errors flow through unchanged; the
|
|
86
|
+
// helper only stamps the extra metadata. Keep retriable defaulting to
|
|
87
|
+
// `true` so a caller that forgets to set it still gets the safe default
|
|
88
|
+
// (the legacy pre-typed-error code path treated every failure as retriable).
|
|
89
|
+
function _typedError(message, code, retriable = true) {
|
|
90
|
+
const err = new Error(message);
|
|
91
|
+
err.code = code;
|
|
92
|
+
err.retriable = retriable;
|
|
93
|
+
return err;
|
|
94
|
+
}
|
|
95
|
+
|
|
57
96
|
// 10 minutes — matches the work-item spec.
|
|
58
97
|
const IDLE_REAPER_MS = 10 * 60 * 1000;
|
|
59
98
|
// Reaper sweep cadence. Not exposed as ENGINE_DEFAULTS to keep the pool
|
|
@@ -176,8 +215,13 @@ class Worker {
|
|
|
176
215
|
try {
|
|
177
216
|
proc = _internals.spawnAcp({ cwd: this.cwd });
|
|
178
217
|
} catch (err) {
|
|
179
|
-
|
|
180
|
-
|
|
218
|
+
// spawn() threw synchronously — typically ENOENT (copilot binary not
|
|
219
|
+
// on PATH) or EACCES. Surface as worker-spawn-failed so the consumer
|
|
220
|
+
// can show "install the CLI / fix PATH" guidance.
|
|
221
|
+
throw _typedError(
|
|
222
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
223
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
224
|
+
true
|
|
181
225
|
);
|
|
182
226
|
}
|
|
183
227
|
this.proc = proc;
|
|
@@ -193,8 +237,13 @@ class Worker {
|
|
|
193
237
|
const earlyExitPromise = new Promise((_, reject) => {
|
|
194
238
|
earlyExitReject = (code) => {
|
|
195
239
|
this.killed = true;
|
|
196
|
-
|
|
197
|
-
|
|
240
|
+
// Early exit DURING the handshake = acp-handshake-failed (almost
|
|
241
|
+
// always missing `copilot login`, stale CLI, or daemon crash on
|
|
242
|
+
// boot). Retriable so re-auth or a CLI upgrade can recover.
|
|
243
|
+
const err = _typedError(
|
|
244
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`,
|
|
245
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
246
|
+
true
|
|
198
247
|
);
|
|
199
248
|
this.spawnError = err;
|
|
200
249
|
this._failAllPending(err);
|
|
@@ -205,8 +254,13 @@ class Worker {
|
|
|
205
254
|
proc.once('exit', earlyExitHandler);
|
|
206
255
|
|
|
207
256
|
const errorHandler = (err) => {
|
|
208
|
-
|
|
209
|
-
|
|
257
|
+
// proc 'error' event fires when the OS can't actually start the child
|
|
258
|
+
// (ENOENT after a successful spawn() call, etc.). Treat as a spawn
|
|
259
|
+
// failure even though we made it past the synchronous spawn() above.
|
|
260
|
+
const wrapped = _typedError(
|
|
261
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
262
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
263
|
+
true
|
|
210
264
|
);
|
|
211
265
|
this.spawnError = wrapped;
|
|
212
266
|
this.killed = true;
|
|
@@ -227,7 +281,13 @@ class Worker {
|
|
|
227
281
|
]);
|
|
228
282
|
this.sessionId = result && result.sessionId;
|
|
229
283
|
if (!this.sessionId) {
|
|
230
|
-
|
|
284
|
+
// Handshake completed without an error but the daemon didn't hand
|
|
285
|
+
// back a sessionId — protocol violation or partial init failure.
|
|
286
|
+
throw _typedError(
|
|
287
|
+
'copilot --acp failed -- session/new returned no sessionId',
|
|
288
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
289
|
+
true
|
|
290
|
+
);
|
|
231
291
|
}
|
|
232
292
|
} finally {
|
|
233
293
|
// Either the handshake finished (swap to a persistent exit handler that
|
|
@@ -236,7 +296,13 @@ class Worker {
|
|
|
236
296
|
}
|
|
237
297
|
proc.on('exit', () => {
|
|
238
298
|
this.killed = true;
|
|
239
|
-
|
|
299
|
+
// Post-handshake exit = the daemon died mid-conversation. Retriable
|
|
300
|
+
// because the next call will cold-spawn a fresh worker.
|
|
301
|
+
const err = _typedError(
|
|
302
|
+
'copilot --acp process exited',
|
|
303
|
+
ERROR_CODES.WORKER_DIED,
|
|
304
|
+
true
|
|
305
|
+
);
|
|
240
306
|
this._failAllPending(err);
|
|
241
307
|
// Settle inflight too if it's still hanging
|
|
242
308
|
if (this.inflight && !this.inflight.settled) {
|
|
@@ -656,9 +722,13 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
|
|
|
656
722
|
// This is the bug class the ab141995 fix closed; if it ever recurs the
|
|
657
723
|
// engine should fail loudly rather than hand back a half-initialized
|
|
658
724
|
// handle. Throwing here lets the dashboard surface spawn-failed instead
|
|
659
|
-
// of the silent thinking-dots-forever symptom.
|
|
660
|
-
|
|
661
|
-
|
|
725
|
+
// of the silent thinking-dots-forever symptom. Mark non-retriable —
|
|
726
|
+
// this is a real engine bug, not a transient pool failure; the next
|
|
727
|
+
// attempt would hit the same race.
|
|
728
|
+
throw _typedError(
|
|
729
|
+
`cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`,
|
|
730
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
731
|
+
false
|
|
662
732
|
);
|
|
663
733
|
}
|
|
664
734
|
|
|
@@ -766,4 +836,10 @@ module.exports = {
|
|
|
766
836
|
IDLE_REAPER_MS,
|
|
767
837
|
REAPER_INTERVAL_MS,
|
|
768
838
|
WARM_MAX_CONCURRENT,
|
|
839
|
+
// W-mpmwxni2000c25c7-c — typed-error envelope contract. Exported so the
|
|
840
|
+
// dashboard pool wrappers (and their tests) reference the same string
|
|
841
|
+
// constants and so the doc-chat timeout path can stamp the same
|
|
842
|
+
// `{ message, code, retriable }` shape the pool itself emits.
|
|
843
|
+
ERROR_CODES,
|
|
844
|
+
_typedError,
|
|
769
845
|
};
|
package/engine/cleanup.js
CHANGED
|
@@ -419,6 +419,35 @@ async function runCleanup(config, verbose = false) {
|
|
|
419
419
|
}
|
|
420
420
|
}
|
|
421
421
|
|
|
422
|
+
// 2c. Reap orphan agents/temp-* dirs whose dispatch is no longer referenced
|
|
423
|
+
// anywhere in dispatch.json. Temp-agent dirs are created by the engine for
|
|
424
|
+
// ephemeral temp-<uid> agents; once the dispatch ages out of dispatch
|
|
425
|
+
// history they're never touched again, accumulating MB of live-output.log
|
|
426
|
+
// tails over weeks. 1h mtime gate prevents reaping a still-spawning temp
|
|
427
|
+
// agent that races dispatch.json visibility.
|
|
428
|
+
cleaned.orphanTempAgentDirs = 0;
|
|
429
|
+
try {
|
|
430
|
+
const dispatch = getDispatch();
|
|
431
|
+
const referencedAgents = new Set();
|
|
432
|
+
for (const seg of [dispatch.pending || [], dispatch.active || [], dispatch.completed || [], dispatch.history || []]) {
|
|
433
|
+
for (const e of seg) if (e?.agent) referencedAgents.add(String(e.agent));
|
|
434
|
+
}
|
|
435
|
+
let entries;
|
|
436
|
+
try { entries = fs.readdirSync(AGENTS_DIR, { withFileTypes: true }); } catch { entries = []; }
|
|
437
|
+
for (const entry of entries) {
|
|
438
|
+
if (!entry.isDirectory()) continue;
|
|
439
|
+
if (!entry.name.startsWith('temp-')) continue;
|
|
440
|
+
if (referencedAgents.has(entry.name)) continue;
|
|
441
|
+
const full = path.join(AGENTS_DIR, entry.name);
|
|
442
|
+
let stat; try { stat = fs.statSync(full); } catch { continue; }
|
|
443
|
+
if (stat.mtimeMs >= oneHourAgo) continue;
|
|
444
|
+
try {
|
|
445
|
+
fs.rmSync(full, { recursive: true, force: true });
|
|
446
|
+
cleaned.orphanTempAgentDirs++;
|
|
447
|
+
} catch (err) { log('warn', `orphan temp agent dir ${entry.name}: ${err.message}`); }
|
|
448
|
+
}
|
|
449
|
+
} catch (e) { log('warn', `orphan temp agent sweep: ${e.message}`); }
|
|
450
|
+
|
|
422
451
|
// 2b. Detect git worktrees registered inside any linked project's working tree.
|
|
423
452
|
// Nested worktrees cause glob/grep tools running with cwd=projectRoot to match
|
|
424
453
|
// BOTH copies of every file; a single Edit/MultiEdit then writes the same
|
|
@@ -452,6 +481,57 @@ async function runCleanup(config, verbose = false) {
|
|
|
452
481
|
}
|
|
453
482
|
}
|
|
454
483
|
|
|
484
|
+
// 2d. Reap on-disk worktree dirs not registered in `git worktree list`. Can
|
|
485
|
+
// be left behind when removeWorktree fails mid-way, when `git worktree prune`
|
|
486
|
+
// ran without a follow-up rm -rf, or when a manual `git worktree remove
|
|
487
|
+
// --force` leaves an empty dir. Phase 3 below only walks dirs already in
|
|
488
|
+
// git's list, so these are invisible to it. 2h mtime gate matches the
|
|
489
|
+
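// existing age sweep further down. NOTE(review): a worktree root shared by several projects is scanned only once, against the first project's `git worktree list` — confirm dirs registered by the other projects cannot be reaped here.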
|
|
490
|
+
cleaned.orphanWorktreeDirs = 0;
|
|
491
|
+
const _twoHoursAgo = Date.now() - 7200000;
|
|
492
|
+
const _scannedWtRoots = new Set();
|
|
493
|
+
for (const project of projects) {
|
|
494
|
+
const root = project.localPath ? path.resolve(project.localPath) : null;
|
|
495
|
+
if (!root || !fs.existsSync(root)) continue;
|
|
496
|
+
const wtRoots = new Set();
|
|
497
|
+
const configuredRoot = path.resolve(root, config.engine?.worktreeRoot || '../worktrees');
|
|
498
|
+
if (fs.existsSync(configuredRoot)) wtRoots.add(configuredRoot);
|
|
499
|
+
for (const d of ['worktrees', '.claude/worktrees'].map(d => path.join(root, d))) {
|
|
500
|
+
if (fs.existsSync(d)) wtRoots.add(d);
|
|
501
|
+
}
|
|
502
|
+
let registered = null;
|
|
503
|
+
for (const wtRoot of wtRoots) {
|
|
504
|
+
if (_scannedWtRoots.has(wtRoot)) continue;
|
|
505
|
+
_scannedWtRoots.add(wtRoot);
|
|
506
|
+
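// Resolve `git worktree list` once per project; reused across its roots. NOTE(review): if the listing throws, `registered` falls back to an empty set, so every dir past the mtime gate in these roots is removed — confirm that skipping the project would not be safer.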
|
|
507
|
+
if (registered === null) {
|
|
508
|
+
try {
|
|
509
|
+
const raw = String(shared.execSilent('git worktree list --porcelain', { cwd: root, timeout: 10000, windowsHide: true }) || '');
|
|
510
|
+
registered = new Set(shared.parseWorktreePorcelain(raw).map(wt => path.resolve(wt.path)));
|
|
511
|
+
} catch (e) {
|
|
512
|
+
log('warn', `orphan worktree dir scan for ${project.name || root}: ${e.message}`);
|
|
513
|
+
registered = new Set();
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
let entries;
|
|
517
|
+
try { entries = fs.readdirSync(wtRoot, { withFileTypes: true }); } catch { continue; }
|
|
518
|
+
for (const entry of entries) {
|
|
519
|
+
if (!entry.isDirectory()) continue;
|
|
520
|
+
const full = path.resolve(wtRoot, entry.name);
|
|
521
|
+
if (registered.has(full)) continue;
|
|
522
|
+
let stat; try { stat = fs.statSync(full); } catch { continue; }
|
|
523
|
+
if (stat.mtimeMs >= _twoHoursAgo) continue;
|
|
524
|
+
try {
|
|
525
|
+
fs.rmSync(full, { recursive: true, force: true });
|
|
526
|
+
cleaned.orphanWorktreeDirs++;
|
|
527
|
+
log('info', `Cleanup: removed orphan worktree dir ${full} (not registered in git)`);
|
|
528
|
+
} catch (err) {
|
|
529
|
+
log('warn', `orphan worktree dir ${full}: ${err.message}`);
|
|
530
|
+
}
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
455
535
|
// 3. Clean git worktrees for merged/abandoned PRs
|
|
456
536
|
const _attemptedWorktreePaths = new Set(); // dedup across projects sharing a worktreeRoot
|
|
457
537
|
for (const project of projects) {
|
|
@@ -909,8 +989,10 @@ async function runCleanup(config, verbose = false) {
|
|
|
909
989
|
}
|
|
910
990
|
} catch (e) { log('warn', 'prune orphaned dispatches: ' + e.message); }
|
|
911
991
|
|
|
912
|
-
|
|
913
|
-
|
|
992
|
+
const _orphanTemp = cleaned.orphanTempAgentDirs || 0;
|
|
993
|
+
const _orphanWt = cleaned.orphanWorktreeDirs || 0;
|
|
994
|
+
if (cleaned.tempFiles + cleaned.liveOutputs + cleaned.worktrees + cleaned.zombies + (cleaned.files || 0) + cleaned.orphanedDispatches + (cleaned.nestedWorktrees || 0) + _orphanTemp + _orphanWt > 0) {
|
|
995
|
+
log('info', `Cleanup: ${cleaned.tempFiles} temp, ${cleaned.liveOutputs} live outputs, ${cleaned.worktrees} worktrees, ${cleaned.zombies} zombies, ${cleaned.files || 0} archives, ${cleaned.orphanedDispatches} orphaned dispatches, ${cleaned.nestedWorktrees || 0} nested worktrees flagged, ${_orphanTemp} orphan temp dirs, ${_orphanWt} orphan worktree dirs`);
|
|
914
996
|
}
|
|
915
997
|
|
|
916
998
|
// 8. Clean swept KB files older than 7 days
|
package/engine/dispatch.js
CHANGED
|
@@ -696,6 +696,12 @@ function completeDispatch(id, result = DISPATCH_RESULT.SUCCESS, reason = '', res
|
|
|
696
696
|
// (overloaded_error / 503). Empty string clears any stale
|
|
697
697
|
// value from an earlier failure cycle.
|
|
698
698
|
wi._lastFailureClass = failureClass || '';
|
|
699
|
+
// W-mpmwxn1j — Bump per-agent retry count so the next dispatch
|
|
700
|
+
// can reassign to a different eligible agent once the same
|
|
701
|
+
// agent hits the threshold. Skip when no agent is resolvable
|
|
702
|
+
// (anonymous failures shouldn't corrupt the map shape).
|
|
703
|
+
const failedAgent = item.agent || wi.dispatched_to;
|
|
704
|
+
if (failedAgent) shared.bumpAgentRetryCount(wi, failedAgent);
|
|
699
705
|
delete wi.failReason;
|
|
700
706
|
delete wi.failedAt;
|
|
701
707
|
delete wi.dispatched_at;
|