instar 1.3.577 → 1.3.578
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/server.d.ts.map +1 -1
- package/dist/commands/server.js +13 -0
- package/dist/commands/server.js.map +1 -1
- package/dist/core/PostUpdateMigrator.d.ts.map +1 -1
- package/dist/core/PostUpdateMigrator.js +16 -0
- package/dist/core/PostUpdateMigrator.js.map +1 -1
- package/dist/core/types.d.ts +9 -0
- package/dist/core/types.d.ts.map +1 -1
- package/dist/core/types.js.map +1 -1
- package/dist/monitoring/ResumeQueue.d.ts +50 -0
- package/dist/monitoring/ResumeQueue.d.ts.map +1 -1
- package/dist/monitoring/ResumeQueue.js +179 -3
- package/dist/monitoring/ResumeQueue.js.map +1 -1
- package/dist/monitoring/guardManifest.d.ts.map +1 -1
- package/dist/monitoring/guardManifest.js +17 -0
- package/dist/monitoring/guardManifest.js.map +1 -1
- package/dist/scaffold/templates.d.ts.map +1 -1
- package/dist/scaffold/templates.js +1 -0
- package/dist/scaffold/templates.js.map +1 -1
- package/package.json +1 -1
- package/src/data/builtin-manifest.json +19 -19
- package/src/scaffold/templates.ts +1 -0
- package/upgrades/1.3.578.md +64 -0
- package/upgrades/side-effects/autonomous-run-outlives-session.md +114 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "./builtin-manifest.schema.json",
|
|
3
3
|
"schemaVersion": 1,
|
|
4
|
-
"generatedAt": "2026-06-15T19:
|
|
5
|
-
"instarVersion": "1.3.
|
|
4
|
+
"generatedAt": "2026-06-15T19:52:27.554Z",
|
|
5
|
+
"instarVersion": "1.3.578",
|
|
6
6
|
"entryCount": 201,
|
|
7
7
|
"entries": {
|
|
8
8
|
"hook:session-start": {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"domain": "identity",
|
|
12
12
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
13
13
|
"installedPath": ".instar/hooks/instar/session-start.sh",
|
|
14
|
-
"contentHash": "
|
|
14
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
15
15
|
"since": "2025-01-01"
|
|
16
16
|
},
|
|
17
17
|
"hook:dangerous-command-guard": {
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"domain": "safety",
|
|
21
21
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
22
22
|
"installedPath": ".instar/hooks/instar/dangerous-command-guard.sh",
|
|
23
|
-
"contentHash": "
|
|
23
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
24
24
|
"since": "2025-01-01"
|
|
25
25
|
},
|
|
26
26
|
"hook:grounding-before-messaging": {
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
"domain": "safety",
|
|
30
30
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
31
31
|
"installedPath": ".instar/hooks/instar/grounding-before-messaging.sh",
|
|
32
|
-
"contentHash": "
|
|
32
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
33
33
|
"since": "2025-01-01"
|
|
34
34
|
},
|
|
35
35
|
"hook:compaction-recovery": {
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"domain": "identity",
|
|
39
39
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
40
40
|
"installedPath": ".instar/hooks/instar/compaction-recovery.sh",
|
|
41
|
-
"contentHash": "
|
|
41
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
42
42
|
"since": "2025-01-01"
|
|
43
43
|
},
|
|
44
44
|
"hook:external-operation-gate": {
|
|
@@ -47,7 +47,7 @@
|
|
|
47
47
|
"domain": "safety",
|
|
48
48
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
49
49
|
"installedPath": ".instar/hooks/instar/external-operation-gate.js",
|
|
50
|
-
"contentHash": "
|
|
50
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
51
51
|
"since": "2025-01-01"
|
|
52
52
|
},
|
|
53
53
|
"hook:deferral-detector": {
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
"domain": "safety",
|
|
57
57
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
58
58
|
"installedPath": ".instar/hooks/instar/deferral-detector.js",
|
|
59
|
-
"contentHash": "
|
|
59
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
60
60
|
"since": "2025-01-01"
|
|
61
61
|
},
|
|
62
62
|
"hook:self-stop-guard": {
|
|
@@ -65,7 +65,7 @@
|
|
|
65
65
|
"domain": "coherence",
|
|
66
66
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
67
67
|
"installedPath": ".instar/hooks/instar/self-stop-guard.js",
|
|
68
|
-
"contentHash": "
|
|
68
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
69
69
|
"since": "2025-01-01"
|
|
70
70
|
},
|
|
71
71
|
"hook:post-action-reflection": {
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
"domain": "evolution",
|
|
75
75
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
76
76
|
"installedPath": ".instar/hooks/instar/post-action-reflection.js",
|
|
77
|
-
"contentHash": "
|
|
77
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
78
78
|
"since": "2025-01-01"
|
|
79
79
|
},
|
|
80
80
|
"hook:external-communication-guard": {
|
|
@@ -83,7 +83,7 @@
|
|
|
83
83
|
"domain": "safety",
|
|
84
84
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
85
85
|
"installedPath": ".instar/hooks/instar/external-communication-guard.js",
|
|
86
|
-
"contentHash": "
|
|
86
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
87
87
|
"since": "2025-01-01"
|
|
88
88
|
},
|
|
89
89
|
"hook:scope-coherence-collector": {
|
|
@@ -92,7 +92,7 @@
|
|
|
92
92
|
"domain": "coherence",
|
|
93
93
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
94
94
|
"installedPath": ".instar/hooks/instar/scope-coherence-collector.js",
|
|
95
|
-
"contentHash": "
|
|
95
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
96
96
|
"since": "2025-01-01"
|
|
97
97
|
},
|
|
98
98
|
"hook:scope-coherence-checkpoint": {
|
|
@@ -101,7 +101,7 @@
|
|
|
101
101
|
"domain": "coherence",
|
|
102
102
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
103
103
|
"installedPath": ".instar/hooks/instar/scope-coherence-checkpoint.js",
|
|
104
|
-
"contentHash": "
|
|
104
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
105
105
|
"since": "2025-01-01"
|
|
106
106
|
},
|
|
107
107
|
"hook:free-text-guard": {
|
|
@@ -110,7 +110,7 @@
|
|
|
110
110
|
"domain": "safety",
|
|
111
111
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
112
112
|
"installedPath": ".instar/hooks/instar/free-text-guard.sh",
|
|
113
|
-
"contentHash": "
|
|
113
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
114
114
|
"since": "2025-01-01"
|
|
115
115
|
},
|
|
116
116
|
"hook:claim-intercept": {
|
|
@@ -119,7 +119,7 @@
|
|
|
119
119
|
"domain": "coherence",
|
|
120
120
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
121
121
|
"installedPath": ".instar/hooks/instar/claim-intercept.js",
|
|
122
|
-
"contentHash": "
|
|
122
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
123
123
|
"since": "2025-01-01"
|
|
124
124
|
},
|
|
125
125
|
"hook:claim-intercept-response": {
|
|
@@ -128,7 +128,7 @@
|
|
|
128
128
|
"domain": "coherence",
|
|
129
129
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
130
130
|
"installedPath": ".instar/hooks/instar/claim-intercept-response.js",
|
|
131
|
-
"contentHash": "
|
|
131
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
132
132
|
"since": "2025-01-01"
|
|
133
133
|
},
|
|
134
134
|
"hook:stop-gate-router": {
|
|
@@ -137,7 +137,7 @@
|
|
|
137
137
|
"domain": "safety",
|
|
138
138
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
139
139
|
"installedPath": ".instar/hooks/instar/stop-gate-router.js",
|
|
140
|
-
"contentHash": "
|
|
140
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
141
141
|
"since": "2025-01-01"
|
|
142
142
|
},
|
|
143
143
|
"hook:auto-approve-permissions": {
|
|
@@ -146,7 +146,7 @@
|
|
|
146
146
|
"domain": "safety",
|
|
147
147
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
148
148
|
"installedPath": ".instar/hooks/instar/auto-approve-permissions.js",
|
|
149
|
-
"contentHash": "
|
|
149
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
150
150
|
"since": "2025-01-01"
|
|
151
151
|
},
|
|
152
152
|
"job:health-check": {
|
|
@@ -1554,7 +1554,7 @@
|
|
|
1554
1554
|
"type": "subsystem",
|
|
1555
1555
|
"domain": "updates",
|
|
1556
1556
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
1557
|
-
"contentHash": "
|
|
1557
|
+
"contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
|
|
1558
1558
|
"since": "2025-01-01"
|
|
1559
1559
|
},
|
|
1560
1560
|
"subsystem:scheduler": {
|
|
@@ -422,6 +422,7 @@ This routes feedback to the Instar maintainers automatically. Valid types: \`bug
|
|
|
422
422
|
- **Mid-work resume queue** (ships observe-only/dry-run by default): a session reaped MID-WORK (strong work evidence at kill time) is queued for ordered automatic revival once the machine recovers — at most one resume per minute, only after sustained calm + quota headroom. \`GET /sessions/resume-queue\` shows entries, paused/breaker state, and lastTickAt; \`POST /sessions/resume-queue/:id/cancel\` · \`/:id/requeue\` (gave-up entries only) · \`/resume\` (unpause) · \`/drain\` (manual single step). Emergency stops pause the queue; an explicit per-topic stop cancels that topic's entries. Jobs only auto-resume when their definition sets \`resumeOnReap: true\`.
|
|
423
423
|
- **A stale emergency-stop pause self-heals**: an emergency-stop pauses the WHOLE revival queue, and that pause used to never lift — silently stranding later, unrelated active-run revivals (the 2026-06-14 4-hour-silent-strand). Now: while the queue is paused with sessions waiting, you get ONE plain-English heads-up that revival is paused (Layer 1, always on); and if the pause is a stale emergency/sentinel stop AND an active autonomous run has since been recycled and queued well after the stop, the queue auto-resumes itself (Layer 2, on by default — \`monitoring.resumeQueue.autoResumeStalePause: false\` to disable; \`staleEmergencyPauseAutoResumeMin\` tunes the window, default 60). Any topic you actually stopped stays blocked by its per-topic operator-stop record even after the queue resumes, and a deliberate \`autonomous stop-all\` halt is NEVER auto-cleared. Proactive: user asks "why did my session restart by itself after a stop?" / "why is revival paused?" → GET /sessions/resume-queue (paused state) and the resume-queue audit log, then explain in plain words.
|
|
424
424
|
- Proactive: user asks "where did my session go?" / "why did X disappear?" / "did something get killed?" → GET /sessions/reap-log and explain the most recent reaped/skipped entries for that session. User asks "did my interrupted work come back?" / "is a restart queued?" → GET /sessions/resume-queue and report the entry's status in plain words.
|
|
425
|
+
- **An autonomous run must outlive its session** (standard; dev-enabled, fleet-default-OFF self-heal): the revival queue takes a host-local lock so two machines can't share its state. A machine RENAME used to leave a stale lock the queue mistook for a shared-volume conflict → it silently disabled the whole revival guard (the 2026-06-15 incident). Now: on the dev agent, a stale FOREIGN-host lock that is provably a single-host rename (host-local disk + dead pid + ≥5min-stale heartbeat) is AUTO-HEALED instead of disabling (fail-closed on any uncertainty; \`monitoring.resumeQueue.autoHealStaleHostLock\`, fleet-default false). And a disabled revival queue now self-reports to the guard-posture inventory — it shows as \`off-runtime-divergent\` on \`GET /guards\` and raises one aggregated attention item, never silently inert. Proactive: user asks "why didn't my autonomous run come back after a restart/rename?" → GET /guards (is the resume queue off-runtime-divergent?) and GET /sessions/resume-queue (disabled reason), then explain.
|
|
425
426
|
- **Build-Session Yield Safety** (ACT-839; ships dev-enabled, dark on the fleet, per the Maturation Path standard): a session reaped while its WORKTREE holds uncommitted work (a build that died "standing by for tests") is resume-eligible on that alone — the killer collects a bounded, fail-open dirty-check pre-kill and tags \`uncommitted-worktree-work\`. On revival the continuation prompt leads with a commit-first directive, and a durable beacon-enabled commitment (\`GET /commitments\`) re-surfaces the obligation if the revived session stalls. An explicit operator/user kill is NEVER auto-revived on a dirty worktree alone. The die-again case is caught by the OrphanedWorkSentinel (\`GET /orphaned-work\`). Proactive: user asks "why did my build come back / why am I being told to commit?" → it was revived because its worktree had unsaved work; commit it or deliberately discard it.
|
|
426
427
|
|
|
427
428
|
### Guard Posture — which safety systems are genuinely on (\`GET /guards\`)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Upgrade Guide — v1.3.578
|
|
2
|
+
|
|
3
|
+
<!-- assembled-by: assemble-next-md -->
|
|
4
|
+
<!-- bump: patch -->
|
|
5
|
+
|
|
6
|
+
## What Changed
|
|
7
|
+
|
|
8
|
+
A new constitutional standard ("An Autonomous Run Must Outlive Its Session") plus the
|
|
9
|
+
fix behind it. The mid-work resume queue (the system that revives a reaped autonomous
|
|
10
|
+
run, #1157) takes a host-local lock so two machines can't share and corrupt its state.
|
|
11
|
+
A machine RENAME used to leave a stale lock the queue mistook for a shared-volume
|
|
12
|
+
conflict — so it silently disabled the entire run-revival guard and never said so
|
|
13
|
+
(the 2026-06-15 incident). Two changes:
|
|
14
|
+
|
|
15
|
+
- **Rename-aware lock (GAP-D).** When the lock shows a different host, the queue now
|
|
16
|
+
distinguishes a single-host rename (provably host-local disk + dead pid + ≥5min
|
|
17
|
+
stale heartbeat → auto-heal the lock via an O_EXCL first-writer-wins takeover) from
|
|
18
|
+
a genuine shared-volume conflict (stay disabled). FAIL-CLOSED on any uncertainty
|
|
19
|
+
(unknown filesystem, `df` failure, live pid, fresh heartbeat). The original HARD
|
|
20
|
+
INVARIANT — never pid-probe a foreign-host lock — is fully preserved when auto-heal
|
|
21
|
+
is off. Ships **fleet-default OFF** (`monitoring.resumeQueue.autoHealStaleHostLock`),
|
|
22
|
+
dev-agent dryRun-first (logs "would auto-heal" without rewriting) before going live.
|
|
23
|
+
- **A disabled revival queue is now LOUD (D2).** The queue self-reports to the
|
|
24
|
+
guard-posture inventory (`GUARD_MANIFEST` entry + `guardStatus()` + an unconditional
|
|
25
|
+
registration), so a disabled revival queue reads `off-runtime-divergent` on
|
|
26
|
+
`GET /guards` and raises one aggregated attention item — never silently inert.
|
|
27
|
+
|
|
28
|
+
No new route. New code-defaulted config key (kept out of ConfigDefaults to preserve
|
|
29
|
+
the fleet flip, consistent with #1157). Signal-only surfacing; the only authority is
|
|
30
|
+
the queue refusing to start itself (bounded self-recovery, fail-closed).
|
|
31
|
+
|
|
32
|
+
## What to Tell Your User
|
|
33
|
+
|
|
34
|
+
- **Your autonomous work survives a machine rename now** (dev agent): "If I rename or
|
|
35
|
+
restore this machine, the system that brings a reaped autonomous run back no longer
|
|
36
|
+
quietly switches itself off. On a provable same-machine rename it heals its own lock
|
|
37
|
+
(carefully — only on a local disk, with the old process gone); on anything uncertain
|
|
38
|
+
it stays cautious. And if that revival system is ever genuinely disabled, you'll see
|
|
39
|
+
it flagged on the guards view with one alert instead of silence." ⚗️ Experimental —
|
|
40
|
+
the self-heal ships dark on the fleet (dev-agent first) and rolls out more widely
|
|
41
|
+
only after it's proven safe.
|
|
42
|
+
|
|
43
|
+
## Summary of New Capabilities
|
|
44
|
+
|
|
45
|
+
| Capability | How to Use |
|
|
46
|
+
|-----------|-----------|
|
|
47
|
+
| A machine rename auto-heals the resume-queue lock instead of silently disabling revival | Automatic on the dev agent (`monitoring.resumeQueue.autoHealStaleHostLock`; fleet default off) |
|
|
48
|
+
| A disabled revival queue surfaces as `off-runtime-divergent` | `GET /guards` (automatic) |
|
|
49
|
+
| Diagnose "why didn't my autonomous run come back?" | `GET /guards` + `GET /sessions/resume-queue` (disabled reason) |
|
|
50
|
+
|
|
51
|
+
## Evidence
|
|
52
|
+
|
|
53
|
+
- Unit (`tests/unit/resume-queue-autoheal-lock.test.ts`, 11 tests): the FD1 device-source
|
|
54
|
+
truth-table (local `/dev/*` → local; `//host`/`host:/path` → not; unknown/tmpfs/map →
|
|
55
|
+
fail-closed); auto-heal fires only on local-FS + dead-pid + stale-heartbeat; stays
|
|
56
|
+
disabled on a non-local FS or a live pid; dryRun logs-but-does-not-rewrite; auto-heal
|
|
57
|
+
off preserves today's behavior; `guardStatus()` reporting.
|
|
58
|
+
- Integration (`tests/integration/resume-queue-guard-posture.test.ts`, 3 tests): a
|
|
59
|
+
runtime-disabled queue classifies `off-runtime-divergent` through the real
|
|
60
|
+
GUARD_MANIFEST entry + `deriveGuardRow`; a healthy queue does not.
|
|
61
|
+
- Regression: the full resume-queue unit + route suite (100 tests) stays green —
|
|
62
|
+
including the existing HARD-INVARIANT test that a foreign-host lock is never
|
|
63
|
+
pid-probed (which guards against re-introducing the cross-host probe). tsc clean;
|
|
64
|
+
`lint-guard-manifest` clean. Independent Phase-5 second-pass review concurred.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Side-Effects Review — autonomous-run-outlives-session
|
|
2
|
+
|
|
3
|
+
Spec: `docs/specs/autonomous-run-outlives-session.md` (converged + approved).
|
|
4
|
+
Change: GAP-D — the resume-queue host-lock distinguishes a single-host RENAME
|
|
5
|
+
(auto-heal) from a genuine shared-volume conflict (stay disabled), fail-closed;
|
|
6
|
+
a disabled revival queue self-reports to the guard-posture inventory; + the
|
|
7
|
+
constitutional standard "An Autonomous Run Must Outlive Its Session".
|
|
8
|
+
|
|
9
|
+
Files:
|
|
10
|
+
- `src/monitoring/ResumeQueue.ts` — `classifyDfSourceLocal` + `isStateDirHostLocalDefault` (FD1), foreign-host rename-vs-conflict classifier (FD2), `takeOverLockAtomic` (FD4), `guardStatus()` (D2), `autoHealStaleHostLock` config field (FD5).
|
|
11
|
+
- `src/monitoring/guardManifest.ts` — `GUARD_MANIFEST` entry `monitoring.resumeQueue.enabled` (component `ResumeQueue`).
|
|
12
|
+
- `src/commands/server.ts` — dev-gate resolves `autoHealStaleHostLock`; UNCONDITIONAL `guardRegistry.register` for the queue.
|
|
13
|
+
- `src/core/types.ts` — `autoHealStaleHostLock?` config field.
|
|
14
|
+
- `docs/STANDARDS-REGISTRY.md` — the new standard.
|
|
15
|
+
- `src/scaffold/templates.ts` + `src/core/PostUpdateMigrator.ts` — Agent Awareness line (new + deployed agents).
|
|
16
|
+
- Tests: `tests/unit/resume-queue-autoheal-lock.test.ts` (11), `tests/integration/resume-queue-guard-posture.test.ts` (3).
|
|
17
|
+
|
|
18
|
+
## 1. Over-block (what legitimate inputs does this reject that it shouldn't?)
|
|
19
|
+
The auto-heal is STRICTLY ADDITIVE and gated: it can only turn a currently-DISABLED
|
|
20
|
+
foreign-host case into an enabled one. It never disables a case that previously
|
|
21
|
+
worked. The risk direction is "fails to heal a legitimate rename" → the queue
|
|
22
|
+
stays disabled exactly as today (no regression), just with a louder surface.
|
|
23
|
+
Fail-closed on any uncertainty (unknown FS, df failure, live pid, fresh heartbeat)
|
|
24
|
+
means some genuine renames won't auto-heal — acceptable: the operator clears the
|
|
25
|
+
lock manually as before, and the guard-posture alert now tells them to.
|
|
26
|
+
|
|
27
|
+
## 2. Under-block (what failure modes does this still miss?)
|
|
28
|
+
- pid recycling (FD3, accepted): a dead holder's pid that has been reused by a live unrelated
|
|
29
|
+
process reads as a live conflict → stays disabled + LOUD (safe direction; worst
|
|
30
|
+
case a false escalation, never corruption).
|
|
31
|
+
- The narrow double-boot unlink race in `takeOverLockAtomic` (two server boots on
|
|
32
|
+
one machine within ms of each other post-rename): O_EXCL gives EEXIST to the
|
|
33
|
+
loser in the common case; the residual double-unlink window is backstopped by
|
|
34
|
+
the next-acquire live-pid + heartbeat check. Not corruption — at worst a
|
|
35
|
+
transient re-evaluation.
|
|
36
|
+
- Genuine shared-volume setups where `df -P` reports a device string we don't
|
|
37
|
+
recognize as network: classified unknown → NOT local → stays disabled (correct).
|
|
38
|
+
|
|
39
|
+
## 3. Level-of-abstraction fit
|
|
40
|
+
Correct layer. The lock classifier lives IN `ResumeQueue.acquireLock` (the only
|
|
41
|
+
place that owns the lock), and the surfacing rides the EXISTING guard-posture
|
|
42
|
+
inventory (GUARD_MANIFEST + GuardRegistry + GuardPostureProbe) rather than a new
|
|
43
|
+
parallel alert path. No new notification surface invented — it feeds the one that
|
|
44
|
+
already aggregates and dedups (Bounded Notification Surface).
|
|
45
|
+
|
|
46
|
+
## 4. Signal vs authority compliance (docs/signal-vs-authority.md)
|
|
47
|
+
COMPLIANT. The auto-heal is bounded SELF-RECOVERY of the queue's own lock with a
|
|
48
|
+
fail-closed default — not a brittle gate holding blocking authority over agent
|
|
49
|
+
behavior or message flow. The guard-posture surfacing is a pure SIGNAL-producer
|
|
50
|
+
(it reports a disabled state; it never blocks, delays, or rewrites anything). The
|
|
51
|
+
default `autoHealStaleHostLock:false` keeps the behavior change off the fleet until
|
|
52
|
+
proven; the dev-agent runs it dryRun-first (logs intent without rewriting).
|
|
53
|
+
|
|
54
|
+
## 5. Interactions
|
|
55
|
+
- Preserves the original HARD INVARIANT (never pid-probe a foreign lock) when
|
|
56
|
+
auto-heal is OFF — verified by the existing `resume-queue.test.ts:417` invariant
|
|
57
|
+
test (which initially regressed and was fixed by gating all probing behind
|
|
58
|
+
`autoHealStaleHostLock`).
|
|
59
|
+
- The new GUARD_MANIFEST entry passes `lint-guard-manifest` (the drainer is not
|
|
60
|
+
auto-flagged, so no orphan NOT_A_GUARD entry — which would itself fail the lint).
|
|
61
|
+
- `guardRegistry.register` is UNCONDITIONAL (even when start() returns false) so a
|
|
62
|
+
lock-disabled queue reads `off-runtime-divergent`, not `missing`.
|
|
63
|
+
- Does NOT touch `evidenceEligible` / the #1157 revival path — strictly the lock
|
|
64
|
+
gate. No double-fire with the existing same-host stale reclaim (that path is
|
|
65
|
+
unchanged; this is the foreign-host branch only).
|
|
66
|
+
|
|
67
|
+
## 6. External surfaces
|
|
68
|
+
- New config key `monitoring.resumeQueue.autoHealStaleHostLock` (fleet default
|
|
69
|
+
false). No new route. `GET /guards` and `GET /sessions/resume-queue` gain a
|
|
70
|
+
truthful disabled-state read; no schema break (additive).
|
|
71
|
+
- CLAUDE.md template + migrator add one awareness bullet (new + deployed agents).
|
|
72
|
+
- No external network/timing dependence beyond a single bounded (3000ms) `df -P`
|
|
73
|
+
at lock-acquisition.
|
|
74
|
+
|
|
75
|
+
## 7. Multi-machine posture (Cross-Machine Coherence)
|
|
76
|
+
MACHINE-LOCAL BY DESIGN, and that is the whole point: the resume-queue lock + its
|
|
77
|
+
state dir are deliberately host-local (a shared volume across two hosts is
|
|
78
|
+
unsupported — the invariant this change PROTECTS). The fix makes the host-local
|
|
79
|
+
assumption ROBUST to a rename of the SAME machine without ever weakening the
|
|
80
|
+
cross-host protection (a genuine foreign live host still disables). guardStatus is
|
|
81
|
+
read per-machine; each machine's `/guards` reports its own queue. No replication
|
|
82
|
+
needed or wanted (a lock is intrinsically local).
|
|
83
|
+
|
|
84
|
+
## 8. Rollback cost
|
|
85
|
+
Cheap and immediate. `monitoring.resumeQueue.autoHealStaleHostLock:false` (the
|
|
86
|
+
fleet default) fully disables the new auto-heal — reverting to today's
|
|
87
|
+
disable-on-mismatch behavior — with no restart-data implications (config read at
|
|
88
|
+
queue construction; next server start picks it up). The guard-posture surfacing is
|
|
89
|
+
inert when the queue is healthy and harmless when disabled (it only reads state).
|
|
90
|
+
No migration, no data repair. The constitutional-standard doc + CLAUDE.md lines are
|
|
91
|
+
documentation (no runtime surface).
|
|
92
|
+
|
|
93
|
+
## Test coverage (Testing Integrity)
|
|
94
|
+
- Unit: `resume-queue-autoheal-lock.test.ts` — FD1 truth-table; auto-heal on
|
|
95
|
+
provable rename; stays-disabled on non-local FS / live pid / fresh heartbeat;
|
|
96
|
+
dryRun no-rewrite; auto-heal-off preserves original behavior; guardStatus.
|
|
97
|
+
- Integration: `resume-queue-guard-posture.test.ts` — a runtime-disabled queue
|
|
98
|
+
classifies `off-runtime-divergent` through the real GUARD_MANIFEST entry +
|
|
99
|
+
`deriveGuardRow` (the route's path); a healthy queue does not.
|
|
100
|
+
- E2E: the existing `tests/e2e/resume-idle-autonomous-lifecycle.test.ts` exercises
|
|
101
|
+
the live queue end-to-end; this change is additive and those tests still pass. (A dedicated
|
|
102
|
+
boot-with-stale-lock E2E is a candidate enhancement; the unit+integration tiers
|
|
103
|
+
cover the new logic and its wiring.)
|
|
104
|
+
- Regression: full resume-queue unit + route suite (100 tests) green; tsc clean;
|
|
105
|
+
lint-guard-manifest clean.
|
|
106
|
+
|
|
107
|
+
## Second-pass review
|
|
108
|
+
**Concur with the review.** Independent Phase-5 audit (guard/recovery path) verified, citing code:
|
|
109
|
+
1. Auto-heal can NEVER fire on a genuine shared volume with a live remote holder — `fsLocal` is dispositive and `&&`-short-circuits before any pid probe; `df -P` on a network mount never reports `/dev/*`.
|
|
110
|
+
2. The HARD INVARIANT (never pid-probe a foreign lock) is preserved when auto-heal is OFF (the fleet default) — all probing is gated behind `if (this.cfg.autoHealStaleHostLock)`; the existing invariant test (default config) still asserts `probed===false`.
|
|
111
|
+
3. `takeOverLockAtomic` O_EXCL first-writer-wins is correct; the only residual is the documented narrow double-boot unlink window, backstopped by the next-acquire live-pid+heartbeat check — transient/self-correcting, never durable corruption.
|
|
112
|
+
4. Signal-vs-Authority compliant — the only authority is the queue refusing to start itself (bounded self-recovery, fail-closed); `guardStatus()` is a pure signal producer.
|
|
113
|
+
5. No common-path regression — a healthy boot never enters the foreign-host branch and never calls `df`.
|
|
114
|
+
Verdict: sound, fail-closed in the right direction, well-tested on both sides of every decision boundary, safely gated.
|