@yemi33/minions 0.1.1965 → 0.1.1967
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/minions.js +6 -6
- package/dashboard/js/refresh.js +5 -0
- package/dashboard/js/render-managed.js +261 -0
- package/dashboard/js/render-other.js +5 -2
- package/dashboard/pages/engine.html +6 -0
- package/dashboard/styles.css +21 -4
- package/dashboard-build.js +1 -1
- package/dashboard.js +250 -1
- package/docs/README.md +10 -13
- package/docs/managed-spawn.md +259 -0
- package/docs/watches.md +47 -20
- package/engine/cli.js +39 -0
- package/engine/managed-spawn.js +1325 -0
- package/engine/playbook.js +34 -0
- package/engine/projects.js +13 -0
- package/engine/shared.js +118 -0
- package/engine.js +264 -14
- package/package.json +2 -1
package/docs/README.md
CHANGED
|
@@ -14,18 +14,21 @@ Hands-on stories and distribution guides for people running or evaluating Minion
|
|
|
14
14
|
Architecture, design proposals, and lifecycle references for people working on the engine, dashboard, or playbooks.
|
|
15
15
|
|
|
16
16
|
- [command-center.md](command-center.md) — Command Center (CC) chat panel: persistent Sonnet sessions, `--resume` semantics, system-prompt invalidation, and per-tab session storage.
|
|
17
|
-
- [completion-reports.md](completion-reports.md) — Canonical schema for the per-
|
|
17
|
+
- [completion-reports.md](completion-reports.md) — Canonical schema for the per-spawn completion JSON: trust nonce, `failure_class` enum, `noop` semantics, `retryable` / `needs_rerun` shape, and the artifacts array.
|
|
18
18
|
- [copilot-cli-schema.md](copilot-cli-schema.md) — Behavior and schema reference for the GitHub Copilot CLI adapter (capability flags, stdin vs `-p`, model discovery, effort levels).
|
|
19
19
|
- [design-state-storage.md](design-state-storage.md) — Design proposal evaluating five database options for replacing Minions' file-based JSON state; recommends `node:sqlite` as the medium-term target.
|
|
20
20
|
- [kb-sweep.md](kb-sweep.md) — Knowledge-base consolidation sweep (hash dedup → LLM batch dedup/reclassify → per-entry compress) and the detached runner that keeps it alive across `minions restart`.
|
|
21
|
+
- [managed-spawn.md](managed-spawn.md) — Engine-owned long-running services (managed-spawn primitive): sidecar schema, healthcheck examples, lifecycle, dashboard API, and the WI 1 (build) → WI 2 (test) chained-validation pattern.
|
|
21
22
|
- [plan-lifecycle.md](plan-lifecycle.md) — Full plan pipeline from `/plan` through PRD materialization, dispatch with dependency gating, verify task, and human archive.
|
|
22
23
|
- [pr-review-fix-loop.md](pr-review-fix-loop.md) — How the engine moves a PR from creation through review, fix dispatch, and re-review, including stale-status guards.
|
|
23
24
|
- [rfc-completion-json.md](rfc-completion-json.md) — RFC for replacing stdout regex-scraping with a structured `completion.json` control-plane protocol.
|
|
24
|
-
- [runtime-adapters.md](runtime-adapters.md) —
|
|
25
|
+
- [runtime-adapters.md](runtime-adapters.md) — Runtime adapter contract (`engine/runtimes/*`): how the engine talks to Claude Code, Copilot CLI, and future CLIs through a single capability-flagged interface.
|
|
25
26
|
- [self-improvement.md](self-improvement.md) — The six self-improvement mechanisms (learnings inbox, per-agent history, review feedback, quality metrics, etc.) that form Minions' continuous feedback loop.
|
|
26
|
-
- [skills.md](skills.md) —
|
|
27
|
-
- [
|
|
28
|
-
- [
|
|
27
|
+
- [skills.md](skills.md) — Skill block format: how agents emit reusable ```` ```skill ```` blocks and how the engine extracts them into native personal-skill directories.
|
|
28
|
+
- [slim-ux/concepts.md](slim-ux/concepts.md) — Slim-UX design notes: simplified surface concepts driving the project picker, inline project link, and decoupled folder picker.
|
|
29
|
+
- [slim-ux/architecture-suggestions.md](slim-ux/architecture-suggestions.md) — Slim-UX follow-up architecture suggestions paired with `concepts.md`.
|
|
30
|
+
- [team-memory.md](team-memory.md) — Per-agent memory layer (`knowledge/agents/<id>.md`) and the consolidation/routing rules that populate it from `notes/inbox/`.
|
|
31
|
+
- [watches.md](watches.md) — Persistent monitoring jobs (`engine/watches.json`): target-type registry, conditions, follow-up actions, and the `watches.d/` plugin folder.
|
|
29
32
|
|
|
30
33
|
## Operations
|
|
31
34
|
|
|
@@ -34,14 +37,8 @@ Operational runbooks for engine operators and fleet maintainers.
|
|
|
34
37
|
- [auto-discovery.md](auto-discovery.md) — Auto-discovery and execution pipeline: the per-tick orchestration loop and the four work-discovery sources.
|
|
35
38
|
- [engine-restart.md](engine-restart.md) — How agents survive an engine restart: state persistence, the 20-minute startup grace period, and orphan reattachment via PID files and `live-output.log`.
|
|
36
39
|
- [human-vs-automated.md](human-vs-automated.md) — Quick reference table of which features humans start, run, decide, and recover, and the two human approval gates.
|
|
37
|
-
- [
|
|
38
|
-
|
|
39
|
-
## Design notes & explorations
|
|
40
|
-
|
|
41
|
-
In-progress UX or architecture proposals; not yet shipped behavior.
|
|
42
|
-
|
|
43
|
-
- [slim-ux/concepts.md](slim-ux/concepts.md) — Slim-UX dashboard concept exploration (project picker, inline project link, decoupled folder picker).
|
|
44
|
-
- [slim-ux/architecture-suggestions.md](slim-ux/architecture-suggestions.md) — Companion architecture suggestions for the slim-UX track.
|
|
40
|
+
- [kb-sweep.md](kb-sweep.md) — Knowledge-base sweep runbook: how `engine/kb-sweep.js` consolidates `notes/inbox/` into `knowledge/` and survives `minions restart`.
|
|
41
|
+
- [onboarding.md](onboarding.md) — First-30-minutes walkthrough for a new operator: install, init, dispatch a first work item, watch it land.
|
|
45
42
|
|
|
46
43
|
---
|
|
47
44
|
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# Managed-spawn primitive
|
|
2
|
+
|
|
3
|
+
> Engine-owned long-running services with sidecar declaration, healthcheck-gated dispatch SUCCESS, and dashboard discovery.
|
|
4
|
+
> Plan: [`plans/plan-w-mp7k1r760003b5dd-2026-05-15.md`](../plans/plan-w-mp7k1r760003b5dd-2026-05-15.md) · Module: [`engine/managed-spawn.js`](../engine/managed-spawn.js) · Dashboard panel: `/engine` → "Managed Processes".
|
|
5
|
+
|
|
6
|
+
## Why this exists
|
|
7
|
+
|
|
8
|
+
Before managed-spawn, Constellation (and similar multi-process projects) lost dev servers every time `minions restart` ran. The original Lambert/Ralph failure pattern:
|
|
9
|
+
|
|
10
|
+
1. Agent spawned `bun dev` as a detached child.
|
|
11
|
+
2. Agent wrote the PID to `keep-pids.json` so the engine wouldn't kill it.
|
|
12
|
+
3. `minions restart` killed the engine but left the child running — *on Node*. **On `bun` + Windows the child died with the parent** because the Windows detached-spawn semantics differ across runtimes.
|
|
13
|
+
4. Next dispatch landed against a dead URL with no signal it had died.
|
|
14
|
+
|
|
15
|
+
Managed-spawn moves the spawn ownership from the agent into the engine. Agents *describe* services in a sidecar; the engine spawns them using the proven [`bin/minions.js spawnDashboard`](../bin/minions.js) pattern (works for Node, bun, python, docker, …), runs healthchecks, gates dispatch SUCCESS on first-healthy, sweeps dead PIDs / expired TTL, auto-injects live processes into downstream agent prompts, and exposes everything through `/api/managed-processes` + the dashboard.
|
|
16
|
+
|
|
17
|
+
## When to use managed-spawn vs `keep_processes`
|
|
18
|
+
|
|
19
|
+
| Need | Use this |
|
|
20
|
+
|---|---|
|
|
21
|
+
| Long-running dev server / emulator / build daemon that downstream agents need to hit | **managed-spawn** |
|
|
22
|
+
| Short-lived helper the *same* agent needs alive past its exit (e.g. `gradle --daemon` for the next gradle invocation) | `meta.keep_processes` (existing per-agent `keep-pids.json`) |
|
|
23
|
+
| One-shot script that exits on its own | Neither — just run it inline |
|
|
24
|
+
|
|
25
|
+
Both can coexist. There's no plan to deprecate `keep_processes`; a future cleanup PR can revisit if managed-spawn fully subsumes its use cases.
|
|
26
|
+
|
|
27
|
+
## Sidecar schema
|
|
28
|
+
|
|
29
|
+
The sidecar lives at `<MINIONS_DIR>/agents/<agentId>/managed-spawn.json` and is read **once** by the engine in `onAgentClose`, after the agent's process exits. (Per-tick polling for sidecar edits is explicitly out of scope — pick a final shape before exit.)
|
|
30
|
+
|
|
31
|
+
```jsonc
|
|
32
|
+
{
|
|
33
|
+
"specs": [
|
|
34
|
+
{
|
|
35
|
+
"name": "constellation-host", // kebab-case, ≤64 chars, unique within file
|
|
36
|
+
"cmd": "bun", // must be on engine.managedSpawn.executableAllowlist
|
|
37
|
+
"args": ["run", "dev"], // ≤64 entries
|
|
38
|
+
"cwd": "D:/repos/constellation", // must be a real git worktree (requireGitWorkdir: true)
|
|
39
|
+
"env": { "VITE_HOST": "127.0.0.1" }, // ≤32 keys; allowlist + prefix-allowlist enforced
|
|
40
|
+
"ports": [3001], // 1024-65535; ≤20 per spec; advisory only (engine doesn't bind)
|
|
41
|
+
"ttl_minutes": 240, // ≤1440 (24h hard cap); defaults to 240 (4h)
|
|
42
|
+
"attrs": { // opaque per-spec metadata, ≤2048 bytes serialized
|
|
43
|
+
"base_url": "http://localhost:3001",
|
|
44
|
+
"framework": "vite"
|
|
45
|
+
},
|
|
46
|
+
"healthcheck": { // required for SUCCESS gating
|
|
47
|
+
"type": "http", // "http" | "command"
|
|
48
|
+
"url": "http://localhost:3001/health",
|
|
49
|
+
"expect_status": 200,
|
|
50
|
+
"interval_s": 1,
|
|
51
|
+
"timeout_s": 60 // total wait for first-healthy before failing dispatch
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
],
|
|
55
|
+
"written_by": "<your-agent-id>",
|
|
56
|
+
"wi_id": "<this-work-item-id>"
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The renderer [`buildManagedSpawnHint`](../engine/managed-spawn.js) (`engine/managed-spawn.js:419`) emits this exact shape (with allowlist + cap reminders) into the agent's prompt whenever the work item has `meta.managed_spawn: true`. Treat the rendered hint as the source of truth — if this doc and the hint drift, the hint wins.
|
|
61
|
+
|
|
62
|
+
## Healthcheck examples
|
|
63
|
+
|
|
64
|
+
### HTTP — most common
|
|
65
|
+
|
|
66
|
+
```jsonc
|
|
67
|
+
"healthcheck": {
|
|
68
|
+
"type": "http",
|
|
69
|
+
"url": "http://localhost:3001/health",
|
|
70
|
+
"expect_status": 200,
|
|
71
|
+
"interval_s": 1,
|
|
72
|
+
"timeout_s": 60
|
|
73
|
+
}
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Probes `GET <url>` every `interval_s` seconds until `response.status === expect_status` or `timeout_s` elapses. No body assertion (deferred — see the plan's Rejected items). Auth headers and cookies are not supported (local dev only).
|
|
77
|
+
|
|
78
|
+
### Command — anything that exits 0 + matches a regex
|
|
79
|
+
|
|
80
|
+
```jsonc
|
|
81
|
+
"healthcheck": {
|
|
82
|
+
"type": "command",
|
|
83
|
+
"cmd": "curl",
|
|
84
|
+
"args": ["-fsS", "http://localhost:3001/ready"],
|
|
85
|
+
"shell": false,
|
|
86
|
+
"expect_regex": "^ready$",
|
|
87
|
+
"interval_s": 2,
|
|
88
|
+
"timeout_s": 30
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Runs the command with `child_process.exec` / `spawn`, asserts both exit code 0 AND stdout matching `expect_regex`. The `cmd` must be on the same executable allowlist as `spec.cmd`. Use `command` healthchecks for:
|
|
93
|
+
- Raw TCP — `Test-NetConnection -Port 3001 -ComputerName localhost` on Windows / `nc -z localhost 3001` on POSIX.
|
|
94
|
+
- Log/file content — `Get-Content -Tail 1 log.txt` matching `^ready$`.
|
|
95
|
+
- Database connectivity — `docker exec pg pg_isready`.
|
|
96
|
+
|
|
97
|
+
`tcp` and `log-match` healthcheck types are intentionally not implemented — see the plan's Rejected items for the reasoning.
|
|
98
|
+
|
|
99
|
+
## Lifecycle
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
103
|
+
│ 1. Agent runs │
|
|
104
|
+
│ - Playbook injects the managed-spawn hint when │
|
|
105
|
+
│ work_item.meta.managed_spawn === true │
|
|
106
|
+
│ - Agent writes agents/<id>/managed-spawn.json before exit │
|
|
107
|
+
└───────────────────────────────────┬───────────────────────────────┘
|
|
108
|
+
│ engine.onAgentClose
|
|
109
|
+
▼
|
|
110
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
111
|
+
│ 2. evaluateManagedSpawnAcceptance(agentId) │
|
|
112
|
+
│ - Validates schema, allowlists, caps │
|
|
113
|
+
│ - Rejects → ERROR + alert + sidecar deleted │
|
|
114
|
+
│ failure_class: invalid-managed-spawn (non-retryable) │
|
|
115
|
+
└───────────────────────────────────┬───────────────────────────────┘
|
|
116
|
+
│ accepted
|
|
117
|
+
▼
|
|
118
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
119
|
+
│ 3. spawnManagedSpec(spec) per spec │
|
|
120
|
+
│ - Uses the proven Windows detached-spawn pattern │
|
|
121
|
+
│ - Records each in engine/managed-processes.json (locked write) │
|
|
122
|
+
│ - Stdio → engine/managed-logs/<name>.log (append fd, NOT pipe) │
|
|
123
|
+
└───────────────────────────────────┬───────────────────────────────┘
|
|
124
|
+
│
|
|
125
|
+
▼
|
|
126
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
127
|
+
│ 4. waitForFirstHealth(spec) per spec, in parallel │
|
|
128
|
+
│ - Probes every interval_s, gives up at timeout_s │
|
|
129
|
+
│ - First success → state.healthy = true, last_health_at = now │
|
|
130
|
+
│ - Any failure → dispatch ERROR + sibling spawns left alive │
|
|
131
|
+
│ failure_class: managed-spawn-healthcheck (retryable) │
|
|
132
|
+
└───────────────────────────────────┬───────────────────────────────┘
|
|
133
|
+
│ all healthy
|
|
134
|
+
▼
|
|
135
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
136
|
+
│ 5. Dispatch SUCCESS │
|
|
137
|
+
│ - Spec survives engine restart (PID detached + boot reconcile) │
|
|
138
|
+
│ - Visible at /api/managed-processes + dashboard "Managed │
|
|
139
|
+
│ Processes" panel │
|
|
140
|
+
│ - Downstream agent prompts auto-inject a "## Live managed │
|
|
141
|
+
│ processes" block scoped to this project │
|
|
142
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Background work the engine does without further agent involvement:
|
|
146
|
+
- **Per-tick** (cadence `engine.managedSpawn.sweepEvery: 30`, ~30 min): `sweepManagedSpawn()` drops dead-PID rows, kills + unlinks TTL-expired specs, rotates `managed-logs/<name>.log` past `logRotateBytes` (10MB).
|
|
147
|
+
- **Per-engine-boot**: `bootReconcileManagedSpawn()` (timeout-bounded by `bootReconcileMaxMs: 2000` ms) drops dead PIDs, kills expired, runs one immediate healthcheck per survivor.
|
|
148
|
+
- **Per-project-removal**: `removeManagedSpecsForProject(name)` (called from `engine/projects.js removeProject`) kills + unlinks specs whose `owner_project` matches.
|
|
149
|
+
|
|
150
|
+
## WI 1 (build) → WI 2 (test) chained-validation pattern
|
|
151
|
+
|
|
152
|
+
The canonical use case: split a build + smoke-test workflow across two work items so each can be retried / restarted independently.
|
|
153
|
+
|
|
154
|
+
**WI 1 — build & host:**
|
|
155
|
+
- `meta.managed_spawn: true`
|
|
156
|
+
- Playbook: `implement` (or whichever fits)
|
|
157
|
+
- Sidecar spec: dev server, emulator, or whatever the next WI needs.
|
|
158
|
+
- `task_description`: build + verify locally, write the sidecar.
|
|
159
|
+
- Engine gates SUCCESS on healthcheck → spec is healthy when this WI completes.
|
|
160
|
+
|
|
161
|
+
**WI 2 — test against the live service:**
|
|
162
|
+
- `dependencies: [WI-1-id]` (engine won't dispatch until WI 1 is SUCCESS)
|
|
163
|
+
- `meta.managed_spawn: false` (or omit — no new specs to write)
|
|
164
|
+
- Playbook auto-inject (from `engine/playbook.js`) ships a `## Live managed processes for project <name>` block in WI 2's prompt with WI 1's specs' `name`, `pid`, `attrs`, `log_path`, `ttl_expires_at`, and base URL.
|
|
165
|
+
- WI 2 reads `attrs.base_url` from its prompt context, hits the live service, runs whatever test it needs to.
|
|
166
|
+
|
|
167
|
+
Outcome: if WI 2 fails, the spec stays alive (TTL-managed) and the next dispatch of WI 2 reuses the same instance — no rebuild churn. If WI 1 fails healthcheck, WI 2 never dispatches; you fix WI 1 and retry.
|
|
168
|
+
|
|
169
|
+
## Operating
|
|
170
|
+
|
|
171
|
+
### Inspect at runtime
|
|
172
|
+
|
|
173
|
+
| What you want | Where |
|
|
174
|
+
|---|---|
|
|
175
|
+
| All live specs (JSON) | `GET /api/managed-processes` |
|
|
176
|
+
| Filtered to one project | `GET /api/managed-processes?project=<name>` |
|
|
177
|
+
| One spec | `GET /api/managed-processes/by-name/<name>` |
|
|
178
|
+
| Live log tail (SSE) | `GET /api/managed-processes/log-stream/<name>` |
|
|
179
|
+
| Dashboard | http://localhost:7331/engine → "Managed Processes" |
|
|
180
|
+
|
|
181
|
+
The list endpoint returns an ETag; pass it as `If-None-Match` on follow-up polls to get `304 Not Modified` when nothing changed. The dashboard panel does this automatically.
|
|
182
|
+
|
|
183
|
+
### Force a restart / kill
|
|
184
|
+
|
|
185
|
+
| Action | Endpoint | Dashboard |
|
|
186
|
+
|---|---|---|
|
|
187
|
+
| Kill (PID terminated, removed from state) | `POST /api/managed-processes/kill` `{"name":"…"}` | Per-row "Kill" button |
|
|
188
|
+
| Restart (kill old PID, respawn from saved state, kick first healthcheck) | `POST /api/managed-processes/restart` `{"name":"…"}` | Per-row "Restart" button |
|
|
189
|
+
|
|
190
|
+
Killing a spec from outside Minions (raw `Stop-Process`) leaves a stale row in `managed-processes.json` until the next 30-tick sweep notices the dead PID. Prefer the API.
|
|
191
|
+
|
|
192
|
+
### View logs
|
|
193
|
+
|
|
194
|
+
`engine/managed-logs/<name>.log` — append-only, rotated to `<name>.log.1` at 10MB. The dashboard's "Log" button opens an SSE stream of the live tail.
|
|
195
|
+
|
|
196
|
+
## Configuration
|
|
197
|
+
|
|
198
|
+
All knobs live under `engine.managedSpawn` in `engine/shared.js:1500` (`ENGINE_DEFAULTS.managedSpawn`). Override per install via `config.json`:
|
|
199
|
+
|
|
200
|
+
| Key | Default | Notes |
|
|
201
|
+
|---|---|---|
|
|
202
|
+
| `enabled` | `true` | Global kill switch. `false` makes the engine ignore all sidecars + skip the sweep. |
|
|
203
|
+
| `maxSpecsPerFile` | `5` | Per-agent cap. |
|
|
204
|
+
| `maxTtlMinutes` | `1440` | Hard cap (24h). |
|
|
205
|
+
| `defaultTtlMinutes` | `240` | Fallback when `ttl_minutes` omitted (4h). |
|
|
206
|
+
| `sweepEvery` | `30` | Ticks between sweeps. Default tick = 60s ⇒ ~30 min. |
|
|
207
|
+
| `defaultHealthIntervalSec` | `1` | Healthcheck cadence pre-first-healthy. |
|
|
208
|
+
| `healthBackoffSec` | `30` | Healthcheck cadence post-first-healthy. |
|
|
209
|
+
| `logRotateBytes` | `10485760` | Rotation threshold for `<name>.log`. |
|
|
210
|
+
| `bootReconcileMaxMs` | `2000` | Boot-time reconcile hard timeout. |
|
|
211
|
+
| `promptContextMaxBytes` | `2048` | Auto-injected `## Live managed processes` block cap. |
|
|
212
|
+
| `requireGitWorkdir` | `true` | Reject specs whose `cwd` isn't a git worktree. |
|
|
213
|
+
| `executableAllowlist` | `[node, bun, npm, …]` | Single global. Applies to `spec.cmd` AND `command` healthcheck `cmd`. |
|
|
214
|
+
| `envKeyAllowlist` | `[NODE_ENV, PORT, …]` | Exact-match env keys. |
|
|
215
|
+
| `envKeyAllowlistPrefixes` | `[VITE_, NEXT_, …]` | Prefix-match env keys. |
|
|
216
|
+
|
|
217
|
+
## Failure modes
|
|
218
|
+
|
|
219
|
+
| Symptom | Likely cause | Fix |
|
|
220
|
+
|---|---|---|
|
|
221
|
+
| Dispatch ERROR `failure_class: invalid-managed-spawn` | Sidecar schema/allowlist violation | Read inbox alert; the validator includes a precise reason. Non-retryable — fix and re-dispatch. |
|
|
222
|
+
| Dispatch ERROR `failure_class: managed-spawn-healthcheck` | `timeout_s` elapsed before one or more specs passed their first healthcheck | Check `engine/managed-logs/<name>.log` for the child's crash output. Sibling spawns are left alive. Retryable. |
|
|
223
|
+
| Spec gone after `minions restart` | Bun child died with parent (the original failure mode) | Should be fixed by item 2's detached-spawn pattern. If it recurs, verify `bin/minions.js spawnDashboard` semantics still work for the runtime — that's the canonical reference. |
|
|
224
|
+
| Spec listed `alive: true, healthy: false` for >30s | Healthcheck loop self-detected service degradation | The spec did not pass a subsequent healthcheck. Inspect the service; restart via API once recovered. |
|
|
225
|
+
| Stale row sticks around with dead PID | Spec killed outside Minions | Wait one sweep cycle (~30 min) or call `POST /api/managed-processes/kill` manually. |
|
|
226
|
+
|
|
227
|
+
## Performance budget
|
|
228
|
+
|
|
229
|
+
- **Per-tick contribution**: zero unless the 30-tick sweep fires. The sweep is `O(N)` over specs in `managed-processes.json` with one `process.kill(pid, 0)` per spec.
|
|
230
|
+
- **Healthcheck loops**: per-spec, self-scheduled (not tick-coupled). 10 specs × 1s pre-healthy cadence ⇒ 10 probes/s peak. Post-healthy drops to one probe per `healthBackoffSec` (30s).
|
|
231
|
+
- **Lock contention** on `managed-processes.json`: writes batch on transitions (healthy ↔ unhealthy), and `last_health_at` only persists every `healthBackoffSec` to avoid flap. See [`test/perf/managed-spawn-load.test.js`](../test/perf/managed-spawn-load.test.js) for the 10-spec / 3-project load assertion.
|
|
232
|
+
- **Hard cap recommendation**: 50 specs across one engine. Above that, the per-spec `process.kill(pid, 0)` liveness probes add up enough on Windows to dent tick latency.
|
|
233
|
+
|
|
234
|
+
## Source map
|
|
235
|
+
|
|
236
|
+
| File | Purpose |
|
|
237
|
+
|---|---|
|
|
238
|
+
| `engine/managed-spawn.js` | Schema, validator, spawn, healthcheck, sweep, state-file I/O. All pure helpers; no side effects on import. |
|
|
239
|
+
| `engine.js` `onAgentClose` (line ~2247) | Acceptance gate (item 2) + healthcheck gate (item 3) wired into dispatch result. |
|
|
240
|
+
| `engine.js` `tickInner` (line ~5687) | Per-30-tick `sweepManagedSpawn()` invocation. |
|
|
241
|
+
| `engine/cli.js` | Boot-path `bootReconcileManagedSpawn()` invocation. |
|
|
242
|
+
| `engine/playbook.js` | Auto-inject `## Live managed processes` block + `managed_spawn` hint section. |
|
|
243
|
+
| `engine/projects.js` `removeProject` | `removeManagedSpecsForProject` cleanup hook. |
|
|
244
|
+
| `dashboard.js` | 5 routes (`/api/managed-processes`, `…/by-name/<n>`, `…/kill`, `…/restart`, `…/log-stream/<n>`). |
|
|
245
|
+
| `dashboard/js/render-managed.js` + `dashboard/pages/engine.html` | "Managed Processes" panel + log-viewer modal. |
|
|
246
|
+
| `engine/shared.js` `ENGINE_DEFAULTS.managedSpawn` | All configurable knobs in one place. |
|
|
247
|
+
|
|
248
|
+
## Plan items
|
|
249
|
+
|
|
250
|
+
| # | PRD id | Lands what |
|
|
251
|
+
|---|---|---|
|
|
252
|
+
| 1 | P-7a3b1c92 | Schema, validator, sidecar reader, defaults |
|
|
253
|
+
| 2 | P-2d5e8f04 | Engine spawn + locked state file |
|
|
254
|
+
| 3 | P-9c1f47a6 | Healthcheck implementations + dispatch SUCCESS/ERROR gate |
|
|
255
|
+
| 4 | P-4b8d2e57 | Discovery API (list / by-name / kill / restart + ETag) |
|
|
256
|
+
| 5 | P-6e2a8b13 | Dashboard panel + SSE log viewer |
|
|
257
|
+
| 6 | P-1f9c3a45 | Playbook auto-inject + dispatch hint |
|
|
258
|
+
| 7 | P-8a4d6f29 | TTL sweep + boot reconcile + project-removal + log rotation |
|
|
259
|
+
| 8 | P-3c7e9d18 | Load test + e2e + docs + Constellation smoke run *(this doc)* |
|
package/docs/watches.md
CHANGED
|
@@ -20,7 +20,7 @@ A watch is a small JSON record persisted to `engine/watches.json`. It binds:
|
|
|
20
20
|
| `requires` | Optional guard: array of predicate objects evaluated against `state` / `entity` / `prevState`; trigger is suppressed when any guard fails (false-or-error). Used to gate a watch on "PR is mergeable AND build passing" etc. |
|
|
21
21
|
| `status` | `WATCH_STATUS.ACTIVE` \| `PAUSED` \| `TRIGGERED` \| `EXPIRED` |
|
|
22
22
|
|
|
23
|
-
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184`)*.
|
|
23
|
+
`createWatch()` allocates a `watch-<uid>` id, defaults the fields above, and persists atomically via `mutateJsonFileLocked` *(source: `engine/watches.js:184-247`)*.
|
|
24
24
|
|
|
25
25
|
## Lifecycle (`WATCH_STATUS`)
|
|
26
26
|
|
|
@@ -33,7 +33,7 @@ Defined in `engine/shared.js:1875`:
|
|
|
33
33
|
| `triggered` | Reserved status (set on demand by callers; not auto-applied) |
|
|
34
34
|
| `expired` | Auto-set when `stopAfter` is reached, or on first trigger for absolute conditions when `stopAfter === 0`. The watch is left on disk for audit but no longer evaluated *(source: `engine/watches.js:507-508`)* |
|
|
35
35
|
|
|
36
|
-
Pause/resume flips the `status` field via `POST /api/watches/update` *(source: `engine/watches.js
|
|
36
|
+
Pause/resume flips the `status` field via `POST /api/watches/update` *(source: `engine/watches.js` `updateWatch`, `dashboard.js` `handleWatchesUpdate`)*.
|
|
37
37
|
|
|
38
38
|
## Conditions (`WATCH_CONDITION`)
|
|
39
39
|
|
|
@@ -46,14 +46,16 @@ Defined in `engine/shared.js:1891-1929`. Conditions split into two families:
|
|
|
46
46
|
|
|
47
47
|
When `stopAfter === 0`, these are **fire-once** — the engine flips the watch to `expired` after the first trigger so a permanently-merged PR (or a permanently-true compound state assertion like `ready-for-merge`) doesn't keep notifying *(source: `engine/watches.js:507-508`)*.
|
|
48
48
|
|
|
49
|
+
> **Per-target override (W-mp7hg58e000b5212):** the global `WATCH_ABSOLUTE_CONDITIONS` set is the legacy fallback. Each target type now declares its own `absoluteConditions: [...]` array in its spec; `registerTargetType` normalizes that into a `Set` that takes precedence at evaluation time. The plugin contract (see below) uses this to keep absolute-vs-change semantics local to each target type. Plugins that omit `absoluteConditions` get an empty set (all change-based).
|
|
50
|
+
|
|
49
51
|
### Change-based conditions
|
|
50
|
-
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, `head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `dependency-met`, `stage-advanced
|
|
52
|
+
`status-change`, `any`, `new-comments`, `vote-change`, `stage-complete`, `ran`, `enabled`, `disabled`, `activity-change`, plus the predicate conditions added under P-w4e2f6a1 / P-w5b8d2c9 for the `pr`, `work-item`, `plan`, and `pipeline` target types (`head-commit-change`, `mergeable-flipped`, `behind-master`, `draft-flipped`, `stalled`, `dependency-met`, `stage-advanced`, `stuck-in-stage`). Note that `stalled` and `stuck-in-stage` are tick-counted rather than plain diffs — see "Tick-counted conditions" below. See `engine/shared.js:1891-1929` for the canonical enum.
|
|
53
|
+
|
|
54
|
+
These compare the live entity against the watch's `_lastState` snapshot and run forever when `stopAfter === 0`. Baseline `_lastState` is captured on the first check so the very next change triggers the watch *(source: `engine/watches.js:434, 520`)*.
|
|
51
55
|
|
|
52
56
|
### Tick-counted conditions
|
|
53
57
|
`stalled`, `stuck-in-stage` — require N consecutive unchanged captures (default `WATCH_STALLED_DEFAULT_TICKS = 12`, `WATCH_STUCK_STAGE_DEFAULT_TICKS = 12`, both in `engine/shared.js:1934-1935`). Counters (`_unchangedTicks`, `_stuckStageTicks`) are recomputed inside `_captureState` by comparing the fresh snapshot against `prevState`.
|
|
54
58
|
|
|
55
|
-
Change-based and tick-counted conditions compare the live entity against the watch's `_lastState` snapshot and run forever when `stopAfter === 0`. Baseline `_lastState` is captured on the first check so the very next change triggers the watch *(source: `engine/watches.js:434, 520`)*.
|
|
56
|
-
|
|
57
59
|
### Predicate conditions
|
|
58
60
|
|
|
59
61
|
Several condition keys evaluate a derived predicate on the captured entity/state rather than comparing to `_lastState`. Built-in predicates per target type:
|
|
@@ -63,20 +65,21 @@ Several condition keys evaluate a derived predicate on the captured entity/state
|
|
|
63
65
|
- **plan** — `all-items-done` (`items_done === items_total > 0`), `item-failed-n-times` (any `missing_features[*]._retryCount >= ENGINE_DEFAULTS.maxRetries`).
|
|
64
66
|
- **pipeline** — `stage-advanced` (`current_stage_id` changed within the same `runId`), `stuck-in-stage` (current stage unchanged for `WATCH_STUCK_STAGE_DEFAULT_TICKS` checks, default 12).
|
|
65
67
|
|
|
66
|
-
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire-once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js` `WATCH_ABSOLUTE_CONDITIONS`)*.
|
|
68
|
+
Compound state-assertion predicates (`ready-for-merge`, `retry-limit-reached`, `all-items-done`, `item-failed-n-times`) live in `WATCH_ABSOLUTE_CONDITIONS` so they fire once when `stopAfter === 0` — without that they would re-fire every tick while the assertion holds *(source: `engine/shared.js:1938` `WATCH_ABSOLUTE_CONDITIONS`)*.
|
|
67
69
|
|
|
68
70
|
## Target Types — `TARGET_TYPES` Registry
|
|
69
71
|
|
|
70
|
-
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:
|
|
72
|
+
Target-type behavior in `engine/watches.js` is **data-driven via a registry** *(source: `engine/watches.js:124-160`)*. Each spec must provide:
|
|
71
73
|
|
|
72
74
|
- `label` — human name shown in dashboard pickers
|
|
73
75
|
- `description` — short help text
|
|
74
76
|
- `conditions` — non-empty array of accepted condition keys (also acts as the per-type allowlist)
|
|
77
|
+
- `absoluteConditions` — *(optional)* subset of `conditions` that fire once when `stopAfter === 0`; defaults to `[]` (all conditions treated as change-based)
|
|
75
78
|
- `fetchEntity(target, state)` — entity-or-null lookup
|
|
76
79
|
- `captureState(entity)` — snapshot used for change-detection diffs
|
|
77
80
|
- `evaluate(condition, entity, prevState, target)` — returns `{ triggered, message }`
|
|
78
81
|
|
|
79
|
-
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-
|
|
82
|
+
The registry IS the allowlist for `createWatch` and `/api/watches/target-types`; the old hardcoded "pr or work-item" check is gone. Add a new target type at runtime with `registerTargetType(type, spec)` and look one up with `getTargetType(type)`. `listTargetTypes()` returns the serializable form used by the dashboard *(source: `engine/watches.js:124-178`)*.
|
|
80
83
|
|
|
81
84
|
### User-extensible via `watches.d/` (W-mp7hg58e000b5212)
|
|
82
85
|
|
|
@@ -86,7 +89,7 @@ Canonical example: `watches.d/http.js` (W-mp7i22mu00191b07) — a generic HTTP p
|
|
|
86
89
|
|
|
87
90
|
### Built-in target types
|
|
88
91
|
|
|
89
|
-
The eight built-ins are registered at module load *(source: `engine/watches.js:672-
|
|
92
|
+
The eight built-ins are registered at module load *(source: `engine/watches.js:672-1311`)*. Constants live at `engine/shared.js:1881-1890` (`WATCH_TARGET_TYPE`).
|
|
90
93
|
|
|
91
94
|
| `targetType` | Target value | Conditions | Notes |
|
|
92
95
|
|---------------|--------------------------------------|----------------------------------------------------------------------------|-------|
|
|
@@ -99,15 +102,38 @@ The eight built-ins are registered at module load *(source: `engine/watches.js:6
|
|
|
99
102
|
| `dispatch` | Dispatch entry id | `completed`, `failed`, `status-change`, `any` | Looks across `pending` / `active` / `completed` lists |
|
|
100
103
|
| `agent` | Agent id | `activity-change`, `status-change`, `any` | `activity-change` fires only on transitions in/out of `'working'` |
|
|
101
104
|
|
|
102
|
-
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318
|
|
105
|
+
`evaluateWatch` dispatches to `tt.evaluate(...)`; unknown target types return `"Unknown target type: ..."` and unknown conditions return `"Unknown condition: ..."` — both are non-triggering *(source: `engine/watches.js:318-360`)*.
|
|
106
|
+
|
|
107
|
+
### Plugin folder (`watches.d/`) — user-extensible target types
|
|
108
|
+
|
|
109
|
+
W-mp7hg58e000b5212 added a **plugin folder** so operators can register new target types without editing engine source. At engine boot, `engine/watches.js` scans `<MINIONS_DIR>/watches.d/*.js` *after* the eight built-ins are registered (so plugins can override a built-in by re-using its key — last-write-wins) and calls `registerTargetType()` for each export *(source: `engine/watches.js:1313-1354`)*.
|
|
110
|
+
|
|
111
|
+
Each `watches.d/<name>.js` file must export `{ name, spec }` (or an array of those):
|
|
112
|
+
|
|
113
|
+
```js
|
|
114
|
+
module.exports = {
|
|
115
|
+
name: 'my-target',
|
|
116
|
+
spec: {
|
|
117
|
+
label: 'My Target',
|
|
118
|
+
description: 'What it watches',
|
|
119
|
+
conditions: ['status-change', 'value-equals'],
|
|
120
|
+
absoluteConditions: ['value-equals'], // optional, defaults to []
|
|
121
|
+
fetchEntity(target, state) { /* sync entity lookup */ },
|
|
122
|
+
captureState(entity) { /* snapshot for diffs */ },
|
|
123
|
+
evaluate(condition, entity, prevState, target) {
|
|
124
|
+
return { triggered: <bool>, message: '...' };
|
|
125
|
+
},
|
|
126
|
+
},
|
|
127
|
+
};
|
|
128
|
+
```
|
|
103
129
|
|
|
104
|
-
|
|
130
|
+
Resolution is `path.join(shared.MINIONS_DIR, 'watches.d')`, so it works in both dev checkouts and installed `~/.minions/` layouts. A missing folder is silently ignored; per-file failures are isolated (a load error logs a `WARN` line but does not abort boot). Tests can re-scan via the exported `_loadPluginTargetTypes()`.
|
|
105
131
|
|
|
106
|
-
`
|
|
132
|
+
**Canonical plugin example:** [`watches.d/http.js`](../watches.d/http.js) (W-mp7i22mu00191b07) ships a generic `http` target type — point a watch at any URL or `{ url, method, headers, body, extract, expected, regex, timeoutMs }` config and the plugin exposes `status-change`, `value-change`, `value-equals`, `value-matches`, and `http-error` conditions. Auth headers support `{{env.GH_TOKEN}}`-style substitution at fetch time (secrets are never stored in `watches.json`). All conditions are change-based (`absoluteConditions: []`).
|
|
107
133
|
|
|
108
134
|
## Tick Integration
|
|
109
135
|
|
|
110
|
-
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:
|
|
136
|
+
`engine.js` calls `checkWatches(config, state)` every 3 ticks (~3 min at the default 60s tick) inside its own `safe('checkWatches', ...)` block *(source: `engine.js:5432-5485`)*. The engine builds the state object from cached project files + module reads:
|
|
111
137
|
|
|
112
138
|
```
|
|
113
139
|
{
|
|
@@ -118,7 +144,7 @@ The eight built-ins are registered at module load *(source: `engine/watches.js:6
|
|
|
118
144
|
}
|
|
119
145
|
```
|
|
120
146
|
|
|
121
|
-
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410
|
|
147
|
+
`checkWatches` walks every active watch and, inside a single `mutateJsonFileLocked` callback *(source: `engine/watches.js:410-560`)*:
|
|
122
148
|
|
|
123
149
|
1. Skips paused/expired watches and any watch checked within its `interval`.
|
|
124
150
|
2. Captures a baseline `_lastState` on first check (so change conditions have something to diff).
|
|
@@ -128,11 +154,11 @@ The eight built-ins are registered at module load *(source: `engine/watches.js:6
|
|
|
128
154
|
6. On non-trigger: writes a per-poll inbox note when `onNotMet === 'notify'`.
|
|
129
155
|
7. Refreshes `_lastState` for the next check.
|
|
130
156
|
|
|
131
|
-
I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up actions via `_runActionTask` (`Promise` per action, failures isolated). Each action's result is persisted back onto the watch as `_lastActionResult` in a follow-up locked write *(source: `engine/watches.js:
|
|
157
|
+
I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up actions via `_runActionTask` (`Promise` per action, failures isolated). Each action's result is persisted back onto the watch as `_lastActionResult` in a follow-up locked write *(source: `engine/watches.js:562+` `_runActionTask`)*.
|
|
132
158
|
|
|
133
159
|
## Follow-Up Actions on Trigger
|
|
134
160
|
|
|
135
|
-
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:
|
|
161
|
+
`watch.action` is an optional structured action that runs after the inbox notification fires. Action types live in a sibling registry in `engine/watch-actions.js` and are validated at create/update time *(source: `engine/watches.js:184-247` `createWatch`, `engine/watch-actions.js:223-330` `registerActionType`)*. `GET /api/watches/action-types` returns the live list for dashboard pickers.
|
|
136
162
|
|
|
137
163
|
### Built-in actions
|
|
138
164
|
|
|
@@ -148,7 +174,7 @@ I/O happens **outside the lock**: notifications via `writeToInbox`, follow-up ac
|
|
|
148
174
|
| `archive-plan` | Set PRD `status="archived"` + `archivedAt` |
|
|
149
175
|
| `resume-plan` | Set PRD `status=PLAN_STATUS.ACTIVE` and clear `planStale` |
|
|
150
176
|
|
|
151
|
-
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:1960
|
|
177
|
+
Constants live in `WATCH_ACTION_TYPE` (`engine/shared.js:1960`); handlers in `engine/watch-actions.js`.
|
|
152
178
|
|
|
153
179
|
### Templating
|
|
154
180
|
|
|
@@ -208,20 +234,21 @@ Absolute conditions firing under `stopAfter === 0` flip `status` to `expired`; `
|
|
|
208
234
|
| Watch never fires | Check `status === 'active'`; check `last_checked` advancing each cycle; confirm engine tick is running and `interval` isn't longer than your test window |
|
|
209
235
|
| `evaluateWatch` returns `"<label> <target> not found"` | `fetchEntity` got nothing back — wrong `target` (e.g. PR display id vs canonical id), the target type isn't loaded, or the underlying file (PR cache, plan PRD) doesn't exist |
|
|
210
236
|
| `"Unknown target type"` / `"Unknown condition"` | The registry doesn't recognise the value. Check `GET /api/watches/target-types` to see what's registered server-side; condition must be in that target type's `conditions[]` |
|
|
211
|
-
| Change condition fires immediately on first tick | Won't happen — baseline `_lastState` is captured on the first check before `evaluate` runs *(source: `engine/watches.js:434
|
|
237
|
+
| Change condition fires immediately on first tick | Won't happen — baseline `_lastState` is captured on the first check before `evaluate` runs *(source: `engine/watches.js:434`)*. If you see this, suspect manual edits to `watches.json` |
|
|
212
238
|
| Absolute watch fires forever instead of once | `stopAfter` is set to a non-zero value; only `stopAfter === 0` triggers fire-once expiration |
|
|
213
239
|
| Action runs but inbox notification doesn't | `notify` field isn't `'inbox'`, or `owner` is empty. `notify` and `action` are independent — both can fire, or only one |
|
|
214
240
|
| `_lastActionResult.ok === false` with `"unknown action type"` | The `action.type` isn't registered. List with `listActionTypes()` / `GET /api/watches/action-types` |
|
|
215
241
|
| Webhook action returns `"only http/https allowed"` | URLs must use `http://` or `https://` schemes; other protocols are rejected by design *(source: `engine/watch-actions.js` `WEBHOOK` handler)* |
|
|
216
242
|
| Trigger fires but follow-up `dispatch-work-item` is missing | Check the engine log for `Watch <id> action <type>: <summary>`. Common reasons: missing `title`, the project's `work-items.json` couldn't be written, or the WI landed in central `work-items.json` because no project was specified |
|
|
217
|
-
| Watch `_lastActionResult` shows `"timeout"` for webhook | Webhooks have a 10s safety timeout to keep the watches tick fast *(source: `engine/watch-actions.js:482`)* |
|
|
218
|
-
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:
|
|
243
|
+
| Watch `_lastActionResult` shows `"timeout"` for webhook | Webhooks have a 10s safety timeout to keep the watches tick fast *(source: `engine/watch-actions.js:482-484`)* |
|
|
244
|
+
| `checkWatches` block crashes silently | Wrapped in `safe('checkWatches', ...)` so one failure doesn't abort the tick *(source: `engine.js:5432`)*. Inspect `engine/log.json` for `Watch check error (<id>)` lines. Regression #1088: the block must use `getProjects(config)`, never the long-removed `PROJECTS` constant |
|
|
219
245
|
|
|
220
246
|
## See Also
|
|
221
247
|
|
|
222
248
|
- `engine/shared.js:1875-1960` — `WATCH_STATUS`, `WATCH_TARGET_TYPE`, `WATCH_CONDITION`, `WATCH_ABSOLUTE_CONDITIONS`, `WATCH_ACTION_TYPE` constants
|
|
223
249
|
- `engine/watches.js` — registry, lifecycle, tick integration, `watches.d/` plugin loader
|
|
224
250
|
- `engine/watch-actions.js` — action registry and built-in handlers (including `minions-api`)
|
|
251
|
+
- `watches.d/http.js` — canonical user-extensible target type plugin
|
|
225
252
|
- `dashboard/pages/watches.html`, `dashboard/js/render-watches.js` — dashboard UI
|
|
226
253
|
- `test/unit/watches-module.test.js`, `test/unit/watch-actions.test.js`, `test/unit/watches-plugin-loader.test.js` — module-level tests
|
|
227
254
|
- [`auto-discovery.md`](auto-discovery.md) — overall tick cycle context
|
package/engine/cli.js
CHANGED
|
@@ -751,6 +751,45 @@ const commands = {
|
|
|
751
751
|
}
|
|
752
752
|
})();
|
|
753
753
|
|
|
754
|
+
// P-8a4d6f29 — Boot reconcile for managed-spawn: drop dead-PID rows from
// engine/managed-processes.json, kill TTL-expired specs, re-probe
// survivors once. The reconcile is raced against a timer of
// ENGINE_DEFAULTS.managedSpawn.bootReconcileMaxMs (2s default) so a
// malformed or oversized state file cannot hold up engine boot. Async +
// fire-and-forget — the first tick is allowed to start regardless.
// NOTE(review): the race only bounds how long we wait on the promise; it
// cannot interrupt synchronous work inside bootReconcileManagedSpawn().
(function startupReconcileManagedSpawn() {
  try {
    const managedSpawn = require('./managed-spawn');
    const limits = (require('./shared').ENGINE_DEFAULTS.managedSpawn || {});
    // Feature is opt-out: only an explicit `enabled: false` disables it.
    if (limits.enabled === false) return;
    // Missing, non-numeric, or zero budget falls back to 2000ms; floor at 1ms.
    const maxMs = Math.max(1, Number(limits.bootReconcileMaxMs) || 2000);

    let timer = null;
    const timeoutPromise = new Promise((resolve) => {
      timer = setTimeout(() => resolve({ timedOut: true }), maxMs);
      // Never let the watchdog timer keep the process alive on its own.
      timer.unref?.();
    });

    // Normalise both outcomes into a plain record. An explicit `failed` flag
    // is used instead of testing `error` for truthiness, so a rejection with
    // a falsy reason is still reported as a failure.
    const reconcilePromise = Promise.resolve()
      .then(() => managedSpawn.bootReconcileManagedSpawn())
      .then(
        (result) => ({ timedOut: false, failed: false, result }),
        (error) => ({ timedOut: false, failed: true, error })
      )
      // Reconcile finished (either way): the watchdog is no longer needed.
      .finally(() => clearTimeout(timer));

    Promise.race([reconcilePromise, timeoutPromise])
      .then((outcome) => {
        if (outcome.timedOut) {
          e.log('warn', `Managed-spawn boot reconcile exceeded ${maxMs}ms; continuing without it`);
          return;
        }
        if (outcome.failed) {
          // Rejection reasons are not guaranteed to be Error instances.
          const reason = outcome.error?.message ?? String(outcome.error);
          e.log('warn', `Managed-spawn boot reconcile failed: ${reason}`);
          return;
        }
        // Only print a summary when the reconcile actually changed something.
        const s = outcome.result && outcome.result.stats;
        if (s && s.scanned > 0 && (s.ttlExpired || s.deadDropped || s.rotatedLogs)) {
          console.log(` Managed-spawn boot reconcile: scanned=${s.scanned} ttl=${s.ttlExpired} dead=${s.deadDropped} killed=${s.killedPids} rotated=${s.rotatedLogs}`);
        }
      })
      .catch((err) => {
        // Reporting itself threw (e.g. the logger failed). This chain is
        // fire-and-forget, so without a handler the throw would surface as
        // an unhandled rejection and could take the engine down at boot.
        console.error(`Managed-spawn boot reconcile reporting failed: ${err?.message ?? String(err)}`);
      });
  } catch (err) {
    // Synchronous setup failure (e.g. the module failed to load).
    e.log('warn', `Managed-spawn boot reconcile failed to start: ${err.message}`);
  }
})();
|
|
792
|
+
|
|
754
793
|
// W-mp73ya3e000me6c5 — Boot reconcile for the worktree pool: drop entries
|
|
755
794
|
// whose path is gone (manual rm), whose borrower crashed before
|
|
756
795
|
// returning, or whose idle TTL expired while the engine was down. The
|