instar 1.3.577 → 1.3.578

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "$schema": "./builtin-manifest.schema.json",
3
3
  "schemaVersion": 1,
4
- "generatedAt": "2026-06-15T19:37:21.883Z",
5
- "instarVersion": "1.3.577",
4
+ "generatedAt": "2026-06-15T19:52:27.554Z",
5
+ "instarVersion": "1.3.578",
6
6
  "entryCount": 201,
7
7
  "entries": {
8
8
  "hook:session-start": {
@@ -11,7 +11,7 @@
11
11
  "domain": "identity",
12
12
  "sourcePath": "src/core/PostUpdateMigrator.ts",
13
13
  "installedPath": ".instar/hooks/instar/session-start.sh",
14
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
14
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
15
15
  "since": "2025-01-01"
16
16
  },
17
17
  "hook:dangerous-command-guard": {
@@ -20,7 +20,7 @@
20
20
  "domain": "safety",
21
21
  "sourcePath": "src/core/PostUpdateMigrator.ts",
22
22
  "installedPath": ".instar/hooks/instar/dangerous-command-guard.sh",
23
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
23
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
24
24
  "since": "2025-01-01"
25
25
  },
26
26
  "hook:grounding-before-messaging": {
@@ -29,7 +29,7 @@
29
29
  "domain": "safety",
30
30
  "sourcePath": "src/core/PostUpdateMigrator.ts",
31
31
  "installedPath": ".instar/hooks/instar/grounding-before-messaging.sh",
32
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
32
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
33
33
  "since": "2025-01-01"
34
34
  },
35
35
  "hook:compaction-recovery": {
@@ -38,7 +38,7 @@
38
38
  "domain": "identity",
39
39
  "sourcePath": "src/core/PostUpdateMigrator.ts",
40
40
  "installedPath": ".instar/hooks/instar/compaction-recovery.sh",
41
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
41
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
42
42
  "since": "2025-01-01"
43
43
  },
44
44
  "hook:external-operation-gate": {
@@ -47,7 +47,7 @@
47
47
  "domain": "safety",
48
48
  "sourcePath": "src/core/PostUpdateMigrator.ts",
49
49
  "installedPath": ".instar/hooks/instar/external-operation-gate.js",
50
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
50
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
51
51
  "since": "2025-01-01"
52
52
  },
53
53
  "hook:deferral-detector": {
@@ -56,7 +56,7 @@
56
56
  "domain": "safety",
57
57
  "sourcePath": "src/core/PostUpdateMigrator.ts",
58
58
  "installedPath": ".instar/hooks/instar/deferral-detector.js",
59
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
59
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
60
60
  "since": "2025-01-01"
61
61
  },
62
62
  "hook:self-stop-guard": {
@@ -65,7 +65,7 @@
65
65
  "domain": "coherence",
66
66
  "sourcePath": "src/core/PostUpdateMigrator.ts",
67
67
  "installedPath": ".instar/hooks/instar/self-stop-guard.js",
68
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
68
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
69
69
  "since": "2025-01-01"
70
70
  },
71
71
  "hook:post-action-reflection": {
@@ -74,7 +74,7 @@
74
74
  "domain": "evolution",
75
75
  "sourcePath": "src/core/PostUpdateMigrator.ts",
76
76
  "installedPath": ".instar/hooks/instar/post-action-reflection.js",
77
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
77
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
78
78
  "since": "2025-01-01"
79
79
  },
80
80
  "hook:external-communication-guard": {
@@ -83,7 +83,7 @@
83
83
  "domain": "safety",
84
84
  "sourcePath": "src/core/PostUpdateMigrator.ts",
85
85
  "installedPath": ".instar/hooks/instar/external-communication-guard.js",
86
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
86
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
87
87
  "since": "2025-01-01"
88
88
  },
89
89
  "hook:scope-coherence-collector": {
@@ -92,7 +92,7 @@
92
92
  "domain": "coherence",
93
93
  "sourcePath": "src/core/PostUpdateMigrator.ts",
94
94
  "installedPath": ".instar/hooks/instar/scope-coherence-collector.js",
95
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
95
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
96
96
  "since": "2025-01-01"
97
97
  },
98
98
  "hook:scope-coherence-checkpoint": {
@@ -101,7 +101,7 @@
101
101
  "domain": "coherence",
102
102
  "sourcePath": "src/core/PostUpdateMigrator.ts",
103
103
  "installedPath": ".instar/hooks/instar/scope-coherence-checkpoint.js",
104
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
104
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
105
105
  "since": "2025-01-01"
106
106
  },
107
107
  "hook:free-text-guard": {
@@ -110,7 +110,7 @@
110
110
  "domain": "safety",
111
111
  "sourcePath": "src/core/PostUpdateMigrator.ts",
112
112
  "installedPath": ".instar/hooks/instar/free-text-guard.sh",
113
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
113
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
114
114
  "since": "2025-01-01"
115
115
  },
116
116
  "hook:claim-intercept": {
@@ -119,7 +119,7 @@
119
119
  "domain": "coherence",
120
120
  "sourcePath": "src/core/PostUpdateMigrator.ts",
121
121
  "installedPath": ".instar/hooks/instar/claim-intercept.js",
122
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
122
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
123
123
  "since": "2025-01-01"
124
124
  },
125
125
  "hook:claim-intercept-response": {
@@ -128,7 +128,7 @@
128
128
  "domain": "coherence",
129
129
  "sourcePath": "src/core/PostUpdateMigrator.ts",
130
130
  "installedPath": ".instar/hooks/instar/claim-intercept-response.js",
131
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
131
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
132
132
  "since": "2025-01-01"
133
133
  },
134
134
  "hook:stop-gate-router": {
@@ -137,7 +137,7 @@
137
137
  "domain": "safety",
138
138
  "sourcePath": "src/core/PostUpdateMigrator.ts",
139
139
  "installedPath": ".instar/hooks/instar/stop-gate-router.js",
140
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
140
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
141
141
  "since": "2025-01-01"
142
142
  },
143
143
  "hook:auto-approve-permissions": {
@@ -146,7 +146,7 @@
146
146
  "domain": "safety",
147
147
  "sourcePath": "src/core/PostUpdateMigrator.ts",
148
148
  "installedPath": ".instar/hooks/instar/auto-approve-permissions.js",
149
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
149
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
150
150
  "since": "2025-01-01"
151
151
  },
152
152
  "job:health-check": {
@@ -1554,7 +1554,7 @@
1554
1554
  "type": "subsystem",
1555
1555
  "domain": "updates",
1556
1556
  "sourcePath": "src/core/PostUpdateMigrator.ts",
1557
- "contentHash": "0ed164972ade364a81d33b54524f4893d080b86bdb37af6a1f9304fe7b5600e8",
1557
+ "contentHash": "79f741ce474e9e82ba385484e7d587f07d8279f9dfcf41b9b53a797bb890c717",
1558
1558
  "since": "2025-01-01"
1559
1559
  },
1560
1560
  "subsystem:scheduler": {
@@ -422,6 +422,7 @@ This routes feedback to the Instar maintainers automatically. Valid types: \`bug
422
422
  - **Mid-work resume queue** (ships observe-only/dry-run by default): a session reaped MID-WORK (strong work evidence at kill time) is queued for ordered automatic revival once the machine recovers — at most one resume per minute, only after sustained calm + quota headroom. \`GET /sessions/resume-queue\` shows entries, paused/breaker state, and lastTickAt; \`POST /sessions/resume-queue/:id/cancel\` · \`/:id/requeue\` (gave-up entries only) · \`/resume\` (unpause) · \`/drain\` (manual single step). Emergency stops pause the queue; an explicit per-topic stop cancels that topic's entries. Jobs only auto-resume when their definition sets \`resumeOnReap: true\`.
423
423
  - **A stale emergency-stop pause self-heals**: an emergency-stop pauses the WHOLE revival queue, and that pause used to never lift — silently stranding later, unrelated active-run revivals (the 2026-06-14 4-hour-silent-strand). Now: while the queue is paused with sessions waiting, you get ONE plain-English heads-up that revival is paused (Layer 1, always on); and if the pause is a stale emergency/sentinel stop AND an active autonomous run has since been recycled and queued well after the stop, the queue auto-resumes itself (Layer 2, on by default — \`monitoring.resumeQueue.autoResumeStalePause: false\` to disable; \`staleEmergencyPauseAutoResumeMin\` tunes the window, default 60). Any topic you actually stopped stays blocked by its per-topic operator-stop record even after the queue resumes, and a deliberate \`autonomous stop-all\` halt is NEVER auto-cleared. Proactive: user asks "why did my session restart by itself after a stop?" / "why is revival paused?" → GET /sessions/resume-queue (paused state) and the resume-queue audit log, then explain in plain words.
424
424
  - Proactive: user asks "where did my session go?" / "why did X disappear?" / "did something get killed?" → GET /sessions/reap-log and explain the most recent reaped/skipped entries for that session. User asks "did my interrupted work come back?" / "is a restart queued?" → GET /sessions/resume-queue and report the entry's status in plain words.
425
+ - **An autonomous run must outlive its session** (standard; dev-enabled, fleet-default-OFF self-heal): the revival queue takes a host-local lock so two machines can't share its state. A machine RENAME used to leave a stale lock the queue mistook for a shared-volume conflict → it silently disabled the whole revival guard (the 2026-06-15 incident). Now: on the dev agent, a stale FOREIGN-host lock that is provably a single-host rename (host-local disk + dead pid + ≥5min-stale heartbeat) is AUTO-HEALED instead of disabling (fail-closed on any uncertainty; \`monitoring.resumeQueue.autoHealStaleHostLock\`, fleet-default false). And a disabled revival queue now self-reports to the guard-posture inventory — it shows as \`off-runtime-divergent\` on \`GET /guards\` and raises one aggregated attention item, never silently inert. Proactive: user asks "why didn't my autonomous run come back after a restart/rename?" → GET /guards (is the resume queue off-runtime-divergent?) and GET /sessions/resume-queue (disabled reason), then explain.
425
426
  - **Build-Session Yield Safety** (ACT-839; ships dev-enabled, dark on the fleet, per the Maturation Path standard): a session reaped while its WORKTREE holds uncommitted work (a build that died "standing by for tests") is resume-eligible on that alone — the killer collects a bounded, fail-open dirty-check pre-kill and tags \`uncommitted-worktree-work\`. On revival the continuation prompt leads with a commit-first directive, and a durable beacon-enabled commitment (\`GET /commitments\`) re-surfaces the obligation if the revived session stalls. An explicit operator/user kill is NEVER auto-revived on a dirty worktree alone. The die-again case is caught by the OrphanedWorkSentinel (\`GET /orphaned-work\`). Proactive: user asks "why did my build come back / why am I being told to commit?" → it was revived because its worktree had unsaved work; commit it or deliberately discard it.
426
427
 
427
428
  ### Guard Posture — which safety systems are genuinely on (\`GET /guards\`)
@@ -0,0 +1,64 @@
1
+ # Upgrade Guide — vNEXT
2
+
3
+ <!-- assembled-by: assemble-next-md -->
4
+ <!-- bump: patch -->
5
+
6
+ ## What Changed
7
+
8
+ A new constitutional standard ("An Autonomous Run Must Outlive Its Session") plus the
9
+ fix behind it. The mid-work resume queue (the system that revives a reaped autonomous
10
+ run, #1157) takes a host-local lock so two machines can't corrupt its shared state.
11
+ A machine RENAME used to leave a stale lock the queue mistook for a shared-volume
12
+ conflict — so it silently disabled the entire run-revival guard and never said so
13
+ (the 2026-06-15 incident). Two changes:
14
+
15
+ - **Rename-aware lock (GAP-D).** When the lock shows a different host, the queue now
16
+ distinguishes a single-host rename (provably host-local disk + dead pid + ≥5min
17
+ stale heartbeat → auto-heal the lock via an O_EXCL first-writer-wins takeover) from
18
+ a genuine shared-volume conflict (stay disabled). FAIL-CLOSED on any uncertainty
19
+ (unknown filesystem, `df` failure, live pid, fresh heartbeat). The original HARD
20
+ INVARIANT — never pid-probe a foreign-host lock — is fully preserved when auto-heal
21
+ is off. Ships **fleet-default OFF** (`monitoring.resumeQueue.autoHealStaleHostLock`),
22
+ dev-agent dryRun-first (logs "would auto-heal" without rewriting) before going live.
23
+ - **A disabled revival queue is now LOUD (D2).** The queue self-reports to the
24
+ guard-posture inventory (`GUARD_MANIFEST` entry + `guardStatus()` + an unconditional
25
+ registration), so a disabled revival queue reads `off-runtime-divergent` on
26
+ `GET /guards` and raises one aggregated attention item — never silently inert.
27
+
28
+ No new route. New code-defaulted config key (kept out of ConfigDefaults to preserve
29
+ the fleet flip, consistent with #1157). Signal-only surfacing; the only authority is
30
+ the queue refusing to start itself (bounded self-recovery, fail-closed).
31
+
32
+ ## What to Tell Your User
33
+
34
+ - **Your autonomous work survives a machine rename now** (dev agent): "If I rename or
35
+ restore this machine, the system that brings a reaped autonomous run back no longer
36
+ quietly switches itself off. On a provable same-machine rename it heals its own lock
37
+ (carefully — only on a local disk, with the old process gone); on anything uncertain
38
+ it stays cautious. And if that revival system is ever genuinely disabled, you'll see
39
+ it flagged on the guards view with one alert instead of silence." ⚗️ Experimental —
40
+ the self-heal ships dark on the fleet (dev-agent first) and rolls out more widely
41
+ only after it's proven safe.
42
+
43
+ ## Summary of New Capabilities
44
+
45
+ | Capability | How to Use |
46
+ |-----------|-----------|
47
+ | A machine rename auto-heals the resume-queue lock instead of silently disabling revival | Automatic on the dev agent (`monitoring.resumeQueue.autoHealStaleHostLock`; fleet default off) |
48
+ | A disabled revival queue surfaces as `off-runtime-divergent` | `GET /guards` (automatic) |
49
+ | Diagnose "why didn't my autonomous run come back?" | `GET /guards` + `GET /sessions/resume-queue` (disabled reason) |
50
+
51
+ ## Evidence
52
+
53
+ - Unit (`tests/unit/resume-queue-autoheal-lock.test.ts`, 11): the FD1 device-source
54
+ truth-table (local `/dev/*` → local; `//host`/`host:/path` → not local; unknown/tmpfs/map →
55
+ fail-closed); auto-heal fires only on local-FS + dead-pid + stale-heartbeat; stays
56
+ disabled on a non-local FS or a live pid; dryRun logs-but-does-not-rewrite; auto-heal
57
+ off preserves today's behavior; `guardStatus()` reporting.
58
+ - Integration (`tests/integration/resume-queue-guard-posture.test.ts`, 3): a
59
+ runtime-disabled queue classifies `off-runtime-divergent` through the real
60
+ GUARD_MANIFEST entry + `deriveGuardRow`; a healthy queue does not.
61
+ - Regression: the full resume-queue unit + route suite (100 tests) stays green —
62
+ including the existing HARD-INVARIANT test that a foreign-host lock is never
63
+ pid-probed (which guards against re-introducing the cross-host probe). tsc clean;
64
+ `lint-guard-manifest` clean. Independent Phase-5 second-pass review concurred.
@@ -0,0 +1,114 @@
1
+ # Side-Effects Review — autonomous-run-outlives-session
2
+
3
+ Spec: `docs/specs/autonomous-run-outlives-session.md` (converged + approved).
4
+ Change: GAP-D — the resume-queue host-lock distinguishes a single-host RENAME
5
+ (auto-heal) from a genuine shared-volume conflict (stay disabled), fail-closed;
6
+ a disabled revival queue self-reports to the guard-posture inventory; plus the
7
+ constitutional standard "An Autonomous Run Must Outlive Its Session".
8
+
9
+ Files:
10
+ - `src/monitoring/ResumeQueue.ts` — `classifyDfSourceLocal` + `isStateDirHostLocalDefault` (FD1), foreign-host rename-vs-conflict classifier (FD2), `takeOverLockAtomic` (FD4), `guardStatus()` (D2), `autoHealStaleHostLock` config field (FD5).
11
+ - `src/monitoring/guardManifest.ts` — `GUARD_MANIFEST` entry `monitoring.resumeQueue.enabled` (component `ResumeQueue`).
12
+ - `src/commands/server.ts` — dev-gate resolves `autoHealStaleHostLock`; UNCONDITIONAL `guardRegistry.register` for the queue.
13
+ - `src/core/types.ts` — `autoHealStaleHostLock?` config field.
14
+ - `docs/STANDARDS-REGISTRY.md` — the new standard.
15
+ - `src/scaffold/templates.ts` + `src/core/PostUpdateMigrator.ts` — Agent Awareness line (new + deployed agents).
16
+ - Tests: `tests/unit/resume-queue-autoheal-lock.test.ts` (11), `tests/integration/resume-queue-guard-posture.test.ts` (3).
17
+
18
+ ## 1. Over-block (what legitimate inputs does this reject that it shouldn't?)
19
+ The auto-heal is STRICTLY ADDITIVE and gated: it can only turn a currently-DISABLED
20
+ foreign-host case into an enabled one. It never disables a case that previously
21
+ worked. The risk direction is "fails to heal a legitimate rename" → the queue
22
+ stays disabled exactly as today (no regression), just with a louder surface.
23
+ Fail-closed on any uncertainty (unknown FS, df failure, live pid, fresh heartbeat)
24
+ means some genuine renames won't auto-heal — acceptable: the operator clears the
25
+ lock manually as before, and the guard-posture alert now tells them to.
26
+
27
+ ## 2. Under-block (what failure modes does this still miss?)
28
+ - pid recycling (FD3, accepted): a recycled dead pid that maps to a live unrelated
29
+ process reads as a live conflict → stays disabled + LOUD (safe direction; worst
30
+ case a false escalation, never corruption).
31
+ - The narrow double-boot unlink race in `takeOverLockAtomic` (two server boots on
32
+ one machine within ms of each other post-rename): O_EXCL gives EEXIST to the
33
+ loser in the common case; the residual double-unlink window is backstopped by
34
+ the next-acquire live-pid + heartbeat check. Not corruption — at worst a
35
+ transient re-evaluation.
36
+ - Genuine shared-volume setups where `df -P` reports a device string we don't
37
+ recognize as network: classified unknown → NOT local → stays disabled (correct).
38
+
39
+ ## 3. Level-of-abstraction fit
40
+ Correct layer. The lock classifier lives IN `ResumeQueue.acquireLock` (the only
41
+ place that owns the lock), and the surfacing rides the EXISTING guard-posture
42
+ inventory (GUARD_MANIFEST + GuardRegistry + GuardPostureProbe) rather than a new
43
+ parallel alert path. No new notification surface invented — it feeds the one that
44
+ already aggregates and dedups (Bounded Notification Surface).
45
+
46
+ ## 4. Signal vs authority compliance (docs/signal-vs-authority.md)
47
+ COMPLIANT. The auto-heal is bounded SELF-RECOVERY of the queue's own lock with a
48
+ fail-closed default — not a brittle gate holding blocking authority over agent
49
+ behavior or message flow. The guard-posture surfacing is a pure SIGNAL-producer
50
+ (it reports a disabled state; it never blocks, delays, or rewrites anything). The
51
+ default `autoHealStaleHostLock:false` keeps the behavior change off the fleet until
52
+ proven; the dev-agent runs it dryRun-first (logs intent without rewriting).
53
+
54
+ ## 5. Interactions
55
+ - Preserves the original HARD INVARIANT (never pid-probe a foreign lock) when
56
+ auto-heal is OFF — verified by the existing `resume-queue.test.ts:417` invariant
57
+ test (which initially regressed and was fixed by gating all probing behind
58
+ `autoHealStaleHostLock`).
59
+ - The new GUARD_MANIFEST entry passes `lint-guard-manifest` (the drainer is not
60
+ auto-flagged, so no orphan NOT_A_GUARD entry — which would itself fail the lint).
61
+ - `guardRegistry.register` is UNCONDITIONAL (even when start() returns false) so a
62
+ lock-disabled queue reads `off-runtime-divergent`, not `missing`.
63
+ - Does NOT touch `evidenceEligible` / the #1157 revival path — strictly the lock
64
+ gate. No double-fire with the existing same-host stale reclaim (that path is
65
+ unchanged; this is the foreign-host branch only).
66
+
67
+ ## 6. External surfaces
68
+ - New config key `monitoring.resumeQueue.autoHealStaleHostLock` (fleet default
69
+ false). No new route. `GET /guards` and `GET /sessions/resume-queue` gain a
70
+ truthful disabled-state read; no schema break (additive).
71
+ - CLAUDE.md template + migrator add one awareness bullet (new + deployed agents).
72
+ - No external network/timing dependence beyond a single bounded (3000ms) `df -P`
73
+ at lock-acquisition.
74
+
75
+ ## 7. Multi-machine posture (Cross-Machine Coherence)
76
+ MACHINE-LOCAL BY DESIGN, and that is the whole point: the resume-queue lock + its
77
+ state dir are deliberately host-local (a shared volume across two hosts is
78
+ unsupported — the invariant this change PROTECTS). The fix makes the host-local
79
+ assumption ROBUST to a rename of the SAME machine without ever weakening the
80
+ cross-host protection (a genuine foreign live host still disables). guardStatus is
81
+ read per-machine; each machine's `/guards` reports its own queue. No replication
82
+ needed or wanted (a lock is intrinsically local).
83
+
84
+ ## 8. Rollback cost
85
+ Cheap and immediate. `monitoring.resumeQueue.autoHealStaleHostLock:false` (the
86
+ fleet default) fully disables the new auto-heal — reverting to today's
87
+ disable-on-mismatch behavior — with no restart-data implications (config read at
88
+ queue construction; next server start picks it up). The guard-posture surfacing is
89
+ inert when the queue is healthy and harmless when disabled (it only reads state).
90
+ No migration, no data repair. The constitutional-standard doc + CLAUDE.md lines are
91
+ documentation (no runtime surface).
92
+
93
+ ## Test coverage (Testing Integrity)
94
+ - Unit: `resume-queue-autoheal-lock.test.ts` — FD1 truth-table; auto-heal on
95
+ provable rename; stays-disabled on non-local FS / live pid / fresh heartbeat;
96
+ dryRun no-rewrite; auto-heal-off preserves original behavior; guardStatus.
97
+ - Integration: `resume-queue-guard-posture.test.ts` — a runtime-disabled queue
98
+ classifies `off-runtime-divergent` through the real GUARD_MANIFEST entry +
99
+ `deriveGuardRow` (the route's path); a healthy queue does not.
100
+ - E2E: the existing `tests/e2e/resume-idle-autonomous-lifecycle.test.ts` exercises
101
+ the queue alive end-to-end; this change is additive and those tests still pass. (A dedicated
102
+ boot-with-stale-lock E2E is a candidate enhancement; the unit+integration tiers
103
+ cover the new logic and its wiring.)
104
+ - Regression: full resume-queue unit + route suite (100 tests) green; tsc clean;
105
+ lint-guard-manifest clean.
106
+
107
+ ## Second-pass review
108
+ **Concur with the review.** Independent Phase-5 audit (guard/recovery path) verified, citing code:
109
+ 1. Auto-heal can NEVER fire on a genuine shared volume with a live remote holder — `fsLocal` is dispositive and `&&`-short-circuits before any pid probe; `df -P` on a network mount never reports `/dev/*`.
110
+ 2. The HARD INVARIANT (never pid-probe a foreign lock) is preserved when auto-heal is OFF (the fleet default) — all probing is gated behind `if (this.cfg.autoHealStaleHostLock)`; the existing invariant test (default config) still asserts `probed===false`.
111
+ 3. `takeOverLockAtomic` O_EXCL first-writer-wins is correct; the only residual is the documented narrow double-boot unlink window, backstopped by the next-acquire live-pid+heartbeat check — transient/self-correcting, never durable corruption.
112
+ 4. Signal-vs-Authority compliant — the only authority is the queue refusing to start itself (bounded self-recovery, fail-closed); `guardStatus()` is a pure signal producer.
113
+ 5. No common-path regression — a healthy boot never enters the foreign-host branch and never calls `df`.
114
+ Verdict: sound, fail-closed in the right direction, well-tested on both sides of every decision boundary, safely gated.