swarph-cli 0.7.1__tar.gz → 0.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {swarph_cli-0.7.1/src/swarph_cli.egg-info → swarph_cli-0.7.3}/PKG-INFO +41 -1
  2. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/README.md +40 -0
  3. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/pyproject.toml +7 -1
  4. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/__init__.py +1 -1
  5. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/watchdog.py +308 -15
  6. swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.default +9 -0
  7. swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.service +15 -0
  8. swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.timer +13 -0
  9. {swarph_cli-0.7.1 → swarph_cli-0.7.3/src/swarph_cli.egg-info}/PKG-INFO +41 -1
  10. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/SOURCES.txt +3 -0
  11. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_watchdog.py +195 -1
  12. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/LICENSE +0 -0
  13. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/setup.cfg +0 -0
  14. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/caller.py +0 -0
  15. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/cell.py +0 -0
  16. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/__init__.py +0 -0
  17. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/chat.py +0 -0
  18. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/daemon.py +0 -0
  19. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/hook_output.py +0 -0
  20. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/import_session.py +0 -0
  21. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/install_hook.py +0 -0
  22. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/onboard.py +0 -0
  23. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/ratify.py +0 -0
  24. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/spawn.py +0 -0
  25. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/main.py +0 -0
  26. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/parsers/__init__.py +0 -0
  27. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/parsers/claude.py +0 -0
  28. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/dependency_links.txt +0 -0
  29. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/entry_points.txt +0 -0
  30. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/requires.txt +0 -0
  31. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/top_level.txt +0 -0
  32. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_cell_loader.py +0 -0
  33. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_chat_command.py +0 -0
  34. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_claude_parser.py +0 -0
  35. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_daemon_command.py +0 -0
  36. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_hook_output.py +0 -0
  37. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_import_command.py +0 -0
  38. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_install_hook.py +0 -0
  39. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_main.py +0 -0
  40. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_onboard_command.py +0 -0
  41. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_ratify_command.py +0 -0
  42. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_chat.py +0 -0
  43. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_one_shot.py +0 -0
  44. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_phase_5_5.py +0 -0
  45. {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_spawn_command.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: swarph-cli
3
- Version: 0.7.1
3
+ Version: 0.7.3
4
4
  Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
5
5
  Author: Pierre Samson, Claude Opus
6
6
  License: MIT
@@ -138,6 +138,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
138
138
 
139
139
  `--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
140
140
 
141
+ ### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
142
+
143
+ Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
144
+
145
+ **One-shot mode (cron-callable, v0.7+):**
146
+ ```bash
147
+ */5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
148
+ ```
149
+
150
+ **Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
151
+
152
+ ```bash
153
+ # Preview without writing (any user):
154
+ swarph watchdog --install-service --cell droplet --dry-run
155
+
156
+ # Install + enable (requires root for /etc/systemd/system writes):
157
+ sudo swarph watchdog --install-service --cell droplet
158
+ ```
159
+
160
+ This writes three files:
161
+
162
+ | Path | Purpose |
163
+ |------|---------|
164
+ | `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
165
+ | `/etc/systemd/system/swarph-watchdog.timer` | Fires every 5 minutes (`OnUnitActiveSec=5min`) |
166
+ | `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
167
+
168
+ Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites the three installed files with the templates bundled in the currently installed package version (newer-version semantics).
169
+
170
+ Monitoring:
171
+
172
+ ```bash
173
+ systemctl status swarph-watchdog.timer # is it scheduled?
174
+ systemctl list-timers swarph-watchdog.timer # next fire?
175
+ journalctl -u swarph-watchdog.service -f # unit start/exit events (check output itself is appended to the log file below)
176
+ tail -f /var/log/swarph-watchdog.log # append-log alternative
177
+ ```
178
+
179
+ Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (droplet's ~24-minute silence on 2026-05-14, 08:38→09:02 UTC, after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
180
+
141
181
  ### `swarph onboard` + `swarph ratify` (Phase 5.5)
142
182
 
143
183
  Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
@@ -105,6 +105,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
105
105
 
106
106
  `--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
107
107
 
108
+ ### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
109
+
110
+ Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
111
+
112
+ **One-shot mode (cron-callable, v0.7+):**
113
+ ```bash
114
+ */5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
115
+ ```
116
+
117
+ **Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
118
+
119
+ ```bash
120
+ # Preview without writing (any user):
121
+ swarph watchdog --install-service --cell droplet --dry-run
122
+
123
+ # Install + enable (requires root for /etc/systemd/system writes):
124
+ sudo swarph watchdog --install-service --cell droplet
125
+ ```
126
+
127
+ This writes three files:
128
+
129
+ | Path | Purpose |
130
+ |------|---------|
131
+ | `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
132
+ | `/etc/systemd/system/swarph-watchdog.timer` | Fires every 5 minutes (`OnUnitActiveSec=5min`) |
133
+ | `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
134
+
135
+ Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites the three installed files with the templates bundled in the currently installed package version (newer-version semantics).
136
+
137
+ Monitoring:
138
+
139
+ ```bash
140
+ systemctl status swarph-watchdog.timer # is it scheduled?
141
+ systemctl list-timers swarph-watchdog.timer # next fire?
142
+ journalctl -u swarph-watchdog.service -f # unit start/exit events (check output itself is appended to the log file below)
143
+ tail -f /var/log/swarph-watchdog.log # append-log alternative
144
+ ```
145
+
146
+ Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (droplet's ~24-minute silence on 2026-05-14, 08:38→09:02 UTC, after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
147
+
108
148
  ### `swarph onboard` + `swarph ratify` (Phase 5.5)
109
149
 
110
150
  Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "swarph-cli"
7
- version = "0.7.1"
7
+ version = "0.7.3"
8
8
  description = "The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED)."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -59,6 +59,12 @@ swarph = "swarph_cli.main:main"
59
59
  [tool.setuptools.packages.find]
60
60
  where = ["src"]
61
61
 
62
+ [tool.setuptools.package-data]
63
+ # v0.7.3: ship bundled systemd unit + timer + default templates so
64
+ # `swarph watchdog --install-service` can read them via importlib.resources.
65
+ # Closes ev_6954f748 substrate-component-installation-gap.
66
+ swarph_cli = ["systemd/*.service", "systemd/*.timer", "systemd/*.default"]
67
+
62
68
  [tool.pytest.ini_options]
63
69
  testpaths = ["tests"]
64
70
  addopts = "-v --tb=short"
@@ -16,6 +16,6 @@ The architecture splits CLI from substrate so:
16
16
 
17
17
  from __future__ import annotations
18
18
 
19
- __version__ = "0.7.1"
19
+ __version__ = "0.7.3"
20
20
 
21
21
  __all__ = ["__version__"]
@@ -79,6 +79,11 @@ _DEFAULT_THRESHOLD_SEC = 1800 # 30 minutes
79
79
  _DEFAULT_A1_RETRIES = 3
80
80
  _DEFAULT_A1_BACKOFF_SEC = 60
81
81
  _DEFAULT_GATEWAY_URL = "http://localhost:8788"
82
+ # F3 — tmux pane_activity gate threshold. If pane has activity within this
83
+ # many seconds, suppress A1 (session is working, not stalled). 600s (10min)
84
+ # is comfortably above legitimate-pause noise + comfortably below the
85
+ # 30min cursor-staleness threshold, so the two gates compose cleanly.
86
+ _DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC = 600
82
87
 
83
88
  _USAGE = """\
84
89
  Usage:
@@ -86,6 +91,7 @@ Usage:
86
91
  [--gateway URL] [--tmux-session NAME]
87
92
  [--peer NAME] [--no-respawn]
88
93
  [--log PATH] [--verbose]
94
+ swarph watchdog --install-service [--cell ROLE] [--dry-run]
89
95
 
90
96
  Detects stranded Claude sessions (API throttle / harness death) and attempts
91
97
  recovery via tmux send-keys A1 wake-prompt, escalating to swarph spawn
@@ -94,6 +100,12 @@ respawn (A2) on persistent darkness.
94
100
  Designed for cron invocation:
95
101
  */5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
96
102
 
103
+ OR systemd timer (v0.7.3+, closes ev_6954f748 substrate-component-installation-gap):
104
+ sudo swarph watchdog --install-service [--cell <role>]
105
+ # → installs /etc/systemd/system/swarph-watchdog.{service,timer}
106
+ # → installs /etc/default/swarph-watchdog with SWARPH_CELL=<role>
107
+ # → daemon-reload + enable --now swarph-watchdog.timer
108
+
97
109
  Detection (mother #1021 AND-gate design):
98
110
  PRIMARY: cursor file mtime — most-recent Claude action (drain script touches it)
99
111
  FALLBACK: pgrep claude on tmux session — confirms process aliveness
@@ -118,11 +130,12 @@ Flags:
118
130
  --verbose also write diagnostics to stderr
119
131
 
120
132
  Exit codes:
121
- 0 no action taken (session healthy or no unread DMs queued)
133
+ 0 no action taken (session healthy or no unread DMs queued); install ok
122
134
  1 A1 fired (wake-prompt sent)
123
135
  2 A2 fired (full respawn triggered)
124
136
  3 detection error (cursor unreadable / gateway unreachable)
125
- 4 configuration error (invalid args, no cell.yaml resolved)
137
+ 4 configuration error (invalid args, no cell.yaml resolved); install needs sudo
138
+ 5 install error (file write failed / systemctl failed)
126
139
  """
127
140
 
128
141
 
@@ -137,10 +150,29 @@ def _stat_mtime(path: Path) -> Optional[int]:
137
150
  return None
138
151
 
139
152
 
140
- def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
141
- """Resolve cursor file path with documented fallback chain."""
153
+ def _resolve_cursor_path(
154
+ role: str,
155
+ explicit: Optional[str],
156
+ cell_yaml_value: Optional[str] = None,
157
+ ) -> Path:
158
+ """Resolve cursor file path with documented fallback chain.
159
+
160
+ Precedence (F4 — mother #1057/#1060 + beta #1061/#1065):
161
+ 1. Explicit ``--cursor`` CLI arg (highest)
162
+ 2. ``cell.yaml`` extra.cursor_path when --cell present
163
+ 3. ``$TMPDIR/<role>-cursor.json``
164
+ 4. ``/tmp/lab-claude-cursor.json`` (legacy lab-orchestrator default)
165
+
166
+ F4 closes the host-prefix-variant + sibling-instance-variant gap
167
+ class — cell.yaml carries the canonical cursor path per-cell, watchdog
168
+ auto-resolves when --cell is provided. Eliminates the silent-default-
169
+ to-lab-prefix failure mode that gave droplet 23hr of cursor-unreadable
170
+ errors before catch.
171
+ """
142
172
  if explicit:
143
173
  return Path(explicit).expanduser()
174
+ if cell_yaml_value:
175
+ return Path(cell_yaml_value).expanduser()
144
176
  tmpdir = os.environ.get("TMPDIR", "/tmp")
145
177
  primary = Path(tmpdir) / f"{role}-cursor.json"
146
178
  if primary.exists():
@@ -149,6 +181,74 @@ def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
149
181
  return Path("/tmp/lab-claude-cursor.json")
150
182
 
151
183
 
184
+ def _resolve_tmux_session(
185
+ role: str,
186
+ explicit: Optional[str],
187
+ cell_yaml_value: Optional[str] = None,
188
+ ) -> str:
189
+ """Resolve tmux session name with documented fallback chain.
190
+
191
+ Precedence (F4 sibling to cursor_path):
192
+ 1. Explicit ``--tmux-session`` CLI arg
193
+ 2. ``cell.yaml`` extra.tmux_session when --cell present
194
+ 3. Role itself (convention default)
195
+
196
+ Mother's sibling-instance variant (#1061): when slot-N siblings spawn,
197
+ each slot needs its own tmux session name; the cell.yaml that pins the
198
+ slot SHOULD also pin the tmux_session to keep the watchdog's reads
199
+ consistent with the spawn's writes.
200
+ """
201
+ if explicit:
202
+ return explicit
203
+ if cell_yaml_value:
204
+ return cell_yaml_value
205
+ return role
206
+
207
+
208
+ def _read_cell_yaml_pins(role: str) -> tuple[Optional[str], Optional[str]]:
209
+ """Best-effort read of cell.yaml extra.cursor_path + extra.tmux_session.
210
+
211
+ Tries the cwd-local ``./cell.yaml`` first (matches hook_output discovery),
212
+ falls back to ``<cells_dir>/<role>.yaml``. Returns (None, None) on any
213
+ failure — F4 is additive non-breaking, malformed cell.yaml falls through
214
+ to the legacy convention defaults.
215
+
216
+ NOTE: ``cursor_path`` / ``tmux_session`` live in ``Cell.extra`` (forward-
217
+ compat catch-all per swarph-shared v0.3) in v0.7.2. swarph-shared 0.4
218
+ will graduate them to first-class typed fields on ``Cell``; this reader
219
+ will continue to work because graduate-to-typed-field preserves the
220
+ extra-dict reading path (per swarph-shared's documented forward-compat
221
+ discipline).
222
+ """
223
+ from swarph_cli.cell import (
224
+ cells_dir,
225
+ discover_cell_in_cwd,
226
+ load_cell,
227
+ CellError,
228
+ )
229
+
230
+ cell_path = discover_cell_in_cwd()
231
+ if cell_path is None:
232
+ candidate = cells_dir() / f"{role}.yaml"
233
+ if candidate.is_file():
234
+ cell_path = candidate
235
+ if cell_path is None:
236
+ return None, None
237
+
238
+ try:
239
+ cell = load_cell(cell_path)
240
+ except (CellError, OSError):
241
+ return None, None
242
+
243
+ extra = cell.extra or {}
244
+ cursor_path = extra.get("cursor_path")
245
+ tmux_session = extra.get("tmux_session")
246
+ return (
247
+ str(cursor_path) if cursor_path else None,
248
+ str(tmux_session) if tmux_session else None,
249
+ )
250
+
251
+
152
252
  def _resolve_log_path(explicit: Optional[str]) -> Path:
153
253
  if explicit:
154
254
  return Path(explicit).expanduser()
@@ -228,6 +328,39 @@ def _tmux_send_keys(name: str, text: str) -> bool:
228
328
  return False
229
329
 
230
330
 
331
+ def _pane_activity_age_sec(name: str) -> Optional[int]:
332
+ """Age in seconds since the tmux pane's last activity event.
333
+
334
+ Reads tmux's `#{pane_activity}` format variable, which returns a unix
335
+ epoch timestamp of the most recent activity in the active pane of the
336
+ target session. Returns None if tmux is missing, the session doesn't
337
+ exist, or tmux's output isn't parseable as an integer epoch.
338
+
339
+ Used by F3 (mother #1087 / drop-on-meta-edge proposal) as a third
340
+ AND-gate input to distinguish (a) session genuinely stalled from (b)
341
+ session actively working in a long bash block. cursor-mtime alone
342
+ measures "time since last turn-end" not "time since last activity";
343
+ pane_activity covers the mid-turn-active case.
344
+
345
+ Returns None on detection error so the caller can fall through to
346
+ the legacy AND-gate behavior — F3 is a strengthening of the gate,
347
+ not a replacement of it.
348
+ """
349
+ try:
350
+ result = subprocess.run(
351
+ ["tmux", "display", "-p", "-t", name, "#{pane_activity}"],
352
+ capture_output=True, text=True, timeout=5,
353
+ )
354
+ if result.returncode != 0:
355
+ return None
356
+ out = result.stdout.strip()
357
+ if not out:
358
+ return None
359
+ return max(0, _now() - int(out))
360
+ except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError):
361
+ return None
362
+
363
+
231
364
  def _tmux_kill_session(name: str) -> bool:
232
365
  try:
233
366
  result = subprocess.run(
@@ -263,7 +396,7 @@ def _spawn_via_swarph(role: str, tmux_session: str) -> bool:
263
396
  return False
264
397
 
265
398
 
266
- def _a1_marker_path(log_path: Path, role: str) -> Path:
399
+ def _a1_marker_path(log_path: Path, role: str, tmux_session: Optional[str] = None) -> Path:
267
400
  """Marker file recording the cursor_mtime at which A1 was last fired.
268
401
 
269
402
  Co-located with the watchdog log so it inherits the same XDG_STATE_HOME
@@ -272,14 +405,27 @@ def _a1_marker_path(log_path: Path, role: str) -> Path:
272
405
  where cron fired A1 every 5min for 65min into an active session's tmux
273
406
  input buffer (commander #1092 + droplet #1087).
274
407
 
275
- Keyed on ``role`` alone today. When the F4 follow-up (cell.yaml-pinned
276
- cursor_path + tmux_session per mother+beta #1064/#1065) lands and the
277
- sibling-instance pattern (alpha+beta drop-on-meta-edge per
278
- project_drop_mitosis_to_meta_edge) ships at scale, two siblings sharing
279
- the same base ``role`` would clobber each other's markers. Re-key on
280
- ``(role, tmux_session)`` once F4 lands — flagged by mother in #1103.
408
+ Keyed on ``(role, tmux_session)`` post-F4 so sibling-instance patterns
409
+ (alpha+beta drop-on-meta-edge per project_drop_mitosis_to_meta_edge)
410
+ don't clobber each other's markers — mother's flag from #1103 closed in
411
+ v0.7.2. tmux_session is sanitized to alphanumeric + ``-_.`` for the
412
+ filename to avoid path-traversal or weird characters from cell.yaml-
413
+ pinned values.
414
+
415
+ NOTE (mother #1138 sanitization edge case): two siblings whose
416
+ ``tmux_session`` values differ ONLY in disallowed characters collide
417
+ post-sanitization: ``cell:a`` and ``cell$a`` both become ``cell_a``.
418
+ (``cell:a`` vs ``cell:b`` is fine — they map to ``cell_a`` and
419
+ ``cell_b``.) cell.yaml-pinned ``tmux_session`` values SHOULD therefore
420
+ differ in alphanumeric content, not just punctuation. Unlikely in
421
+ practice (operators rarely choose session names that similar), but
422
+ worth knowing.
281
423
  """
282
- return log_path.parent / f"a1-fired-{role}.marker"
424
+ safe_tmux = "".join(
425
+ c if (c.isalnum() or c in "-_.") else "_"
426
+ for c in (tmux_session or role)
427
+ )
428
+ return log_path.parent / f"a1-fired-{role}-{safe_tmux}.marker"
283
429
 
284
430
 
285
431
  def _a1_already_fired_at(marker: Path, cursor_mtime: int) -> bool:
@@ -331,10 +477,17 @@ def _log_event(log_path: Path, event: str, details: dict, verbose: bool = False)
331
477
 
332
478
  def run_check(args: argparse.Namespace) -> int:
333
479
  role = args.cell
334
- cursor = _resolve_cursor_path(role, args.cursor)
480
+ # F4 — cell.yaml-pinned cursor_path + tmux_session (mother #1057/#1060
481
+ # + beta #1061/#1065). Reads cell.yaml `extra.cursor_path` /
482
+ # `extra.tmux_session` when --cell is provided; explicit CLI args still
483
+ # win. Best-effort: malformed cell.yaml falls through to legacy
484
+ # convention defaults (additive non-breaking).
485
+ cell_cursor, cell_tmux = _read_cell_yaml_pins(role)
486
+ cursor = _resolve_cursor_path(role, args.cursor, cell_cursor)
487
+ tmux_session = _resolve_tmux_session(role, args.tmux_session, cell_tmux)
335
488
  log_path = _resolve_log_path(args.log)
336
489
  threshold = args.threshold
337
- tmux_session = args.tmux_session or role
490
+ pane_activity_threshold = args.pane_activity_threshold
338
491
  peer = args.peer or role
339
492
  gateway = args.gateway
340
493
  token = os.environ.get("MESH_GATEWAY_TOKEN")
@@ -346,6 +499,9 @@ def run_check(args: argparse.Namespace) -> int:
346
499
  "threshold_sec": threshold,
347
500
  "tmux_session": tmux_session,
348
501
  "peer": peer,
502
+ "pane_activity_threshold_sec": pane_activity_threshold,
503
+ "cell_yaml_pinned_cursor": cell_cursor is not None,
504
+ "cell_yaml_pinned_tmux": cell_tmux is not None,
349
505
  }
350
506
 
351
507
  # PRIMARY signal: cursor file mtime
@@ -378,7 +534,7 @@ def run_check(args: argparse.Namespace) -> int:
378
534
  # cursor_stale + process_alive + unread None → noop (F2 fail-closed: can't verify work, don't poke)
379
535
  # cursor_stale + a1_marker matches cursor_mtime → noop (F1 same-window suppression)
380
536
 
381
- marker = _a1_marker_path(log_path, role)
537
+ marker = _a1_marker_path(log_path, role, tmux_session)
382
538
  diag["a1_marker"] = str(marker)
383
539
 
384
540
  if not process_alive:
@@ -434,6 +590,21 @@ def run_check(args: argparse.Namespace) -> int:
434
590
  _log_event(log_path, "noop", diag, verbose)
435
591
  return 0
436
592
 
593
+ # F3 — tmux pane_activity AND-gate (mother #1087). cursor-mtime measures
594
+ # "time since last turn-end" not "time since last activity"; mid-long-
595
+ # turn cursor is stale even though session is maximally alive. tmux's
596
+ # `#{pane_activity}` covers the mid-turn-active case. If the pane has
597
+ # had activity within `pane_activity_threshold_sec`, suppress A1 — the
598
+ # session is working, not stalled. Falls through to firing A1 when
599
+ # pane_activity is None (tmux missing / older tmux without the format)
600
+ # so F3 is a strengthening of the gate, not a hard dependency.
601
+ pane_age = _pane_activity_age_sec(tmux_session)
602
+ diag["pane_activity_age_sec"] = pane_age
603
+ if pane_age is not None and pane_age < pane_activity_threshold:
604
+ diag["decision"] = "noop_pane_activity_recent"
605
+ _log_event(log_path, "noop", diag, verbose)
606
+ return 0
607
+
437
608
  diag["decision"] = "a1_send_keys"
438
609
  wake_text = (
439
610
  f"watchdog wake — cursor stale {cursor_age}s, "
@@ -447,6 +618,107 @@ def run_check(args: argparse.Namespace) -> int:
447
618
  return 1 if sent else 4
448
619
 
449
620
 
621
+ _SYSTEMD_UNIT_DIR = Path("/etc/systemd/system")
622
+ _SYSTEMD_DEFAULT_DIR = Path("/etc/default")
623
+ _SYSTEMD_UNIT_NAMES = ("swarph-watchdog.service", "swarph-watchdog.timer")
624
+ _SYSTEMD_DEFAULT_NAME = "swarph-watchdog" # /etc/default/swarph-watchdog
625
+
626
+
627
+ def _bundled_systemd_files() -> dict[str, str]:
628
+ """Return {filename: content} for the 3 bundled systemd templates.
629
+
630
+ Reads from the package's bundled `systemd/` data directory via
631
+ importlib.resources. Works regardless of install method (pipx, pip,
632
+ editable, wheel-from-PyPI).
633
+ """
634
+ try:
635
+ from importlib.resources import files as _files
636
+ except ImportError: # pragma: no cover — Python <3.9 not supported anyway
637
+ from importlib_resources import files as _files # type: ignore[no-redef]
638
+
639
+ pkg_root = _files("swarph_cli") / "systemd"
640
+ out: dict[str, str] = {}
641
+ for name in (*_SYSTEMD_UNIT_NAMES, "swarph-watchdog.default"):
642
+ out[name] = (pkg_root / name).read_text(encoding="utf-8")
643
+ return out
644
+
645
+
646
+ def run_install_service(args: argparse.Namespace) -> int:
647
+ """Install systemd timer + service for periodic watchdog --check.
648
+
649
+ Idempotent: overwrites existing unit files (newer-version semantics).
650
+ Requires sudo for /etc/systemd/system writes unless --dry-run.
651
+
652
+ Exit codes:
653
+ 0 success (or dry-run completed)
654
+ 4 configuration error (non-root without --dry-run)
655
+ 5 install error (file write failed / systemctl failed)
656
+ """
657
+ files = _bundled_systemd_files()
658
+
659
+ # Template the default file with the requested role
660
+ default_content = files["swarph-watchdog.default"].replace(
661
+ "SWARPH_CELL=lab",
662
+ f"SWARPH_CELL={args.cell}",
663
+ 1,
664
+ )
665
+
666
+ targets = [
667
+ (_SYSTEMD_UNIT_DIR / _SYSTEMD_UNIT_NAMES[0], files[_SYSTEMD_UNIT_NAMES[0]]),
668
+ (_SYSTEMD_UNIT_DIR / _SYSTEMD_UNIT_NAMES[1], files[_SYSTEMD_UNIT_NAMES[1]]),
669
+ (_SYSTEMD_DEFAULT_DIR / _SYSTEMD_DEFAULT_NAME, default_content),
670
+ ]
671
+
672
+ if args.dry_run:
673
+ print(f"# DRY RUN — cell={args.cell}", file=sys.stderr)
674
+ for path, content in targets:
675
+ print(f"\n# would write {path}:", file=sys.stderr)
676
+ print(content, file=sys.stderr)
677
+ print(
678
+ "\n# would then run:\n"
679
+ "# sudo systemctl daemon-reload\n"
680
+ "# sudo systemctl enable --now swarph-watchdog.timer",
681
+ file=sys.stderr,
682
+ )
683
+ return 0
684
+
685
+ if os.geteuid() != 0:
686
+ print(
687
+ "ERROR: --install-service requires root. Re-run with sudo, or use "
688
+ "--dry-run to preview the install without writing.",
689
+ file=sys.stderr,
690
+ )
691
+ return 4
692
+
693
+ try:
694
+ for path, content in targets:
695
+ path.write_text(content, encoding="utf-8")
696
+ print(f"wrote {path}", file=sys.stderr)
697
+ except (OSError, PermissionError) as exc:
698
+ print(f"ERROR: failed to write unit files: {exc}", file=sys.stderr)
699
+ return 5
700
+
701
+ try:
702
+ subprocess.run(["systemctl", "daemon-reload"], check=True)
703
+ subprocess.run(
704
+ ["systemctl", "enable", "--now", "swarph-watchdog.timer"],
705
+ check=True,
706
+ )
707
+ except (subprocess.CalledProcessError, FileNotFoundError) as exc:
708
+ print(f"ERROR: systemctl failed: {exc}", file=sys.stderr)
709
+ return 5
710
+
711
+ print(
712
+ f"\nswarph-watchdog.timer installed + enabled for cell={args.cell}.\n"
713
+ f" status: systemctl status swarph-watchdog.timer\n"
714
+ f" logs: journalctl -u swarph-watchdog.service -f\n"
715
+ f" OR /var/log/swarph-watchdog.log\n"
716
+ f" next: systemctl list-timers swarph-watchdog.timer",
717
+ file=sys.stderr,
718
+ )
719
+ return 0
720
+
721
+
450
722
  def run_watchdog(argv: Optional[list[str]] = None) -> int:
451
723
  if argv is None:
452
724
  argv = sys.argv[2:] # skip "swarph watchdog"
@@ -460,9 +732,27 @@ def run_watchdog(argv: Optional[list[str]] = None) -> int:
460
732
  "--check", action="store_true",
461
733
  help="One-shot check (cron-callable; exits with status code).",
462
734
  )
735
+ p.add_argument(
736
+ "--install-service", action="store_true",
737
+ help="Install systemd timer + service for periodic --check invocation. "
738
+ "Requires sudo. Closes ev_6954f748 substrate-component-install gap.",
739
+ )
740
+ p.add_argument(
741
+ "--dry-run", action="store_true",
742
+ help="With --install-service: show what would be written without "
743
+ "writing. Useful for review or non-root preview.",
744
+ )
463
745
  p.add_argument("--cell", default=os.environ.get("SWARPH_CELL", "lab"))
464
746
  p.add_argument("--cursor", default=None)
465
747
  p.add_argument("--threshold", type=int, default=_DEFAULT_THRESHOLD_SEC)
748
+ p.add_argument(
749
+ "--pane-activity-threshold",
750
+ type=int,
751
+ default=_DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC,
752
+ help="F3 gate: suppress A1 if tmux pane had activity within this "
753
+ "many seconds (covers mid-long-turn working sessions where "
754
+ "cursor-mtime is stale but session is alive).",
755
+ )
466
756
  p.add_argument("--gateway", default=_DEFAULT_GATEWAY_URL)
467
757
  p.add_argument("--tmux-session", default=None)
468
758
  p.add_argument("--peer", default=None)
@@ -475,6 +765,9 @@ def run_watchdog(argv: Optional[list[str]] = None) -> int:
475
765
  except SystemExit as exc:
476
766
  return int(exc.code or 0)
477
767
 
768
+ if args.install_service:
769
+ return run_install_service(args)
770
+
478
771
  if not args.check:
479
772
  print(_USAGE, file=sys.stderr)
480
773
  return 4
@@ -0,0 +1,9 @@
1
+ # /etc/default/swarph-watchdog — environment for swarph-watchdog.service
2
+ #
3
+ # SWARPH_CELL sets the cell role for the watchdog. The watchdog reads
4
+ # cell.yaml for this role to discover the cursor path + tmux session pin
5
+ # (per F4 cell.yaml-pinning landed in v0.7.2).
6
+ #
7
+ # If unset, watchdog defaults to "lab" (per the --cell argparse default).
8
+ # Override here per-vertex: droplet, gpu-wsl, razorpeter, etc.
9
+ SWARPH_CELL=lab
@@ -0,0 +1,15 @@
1
+ [Unit]
2
+ Description=Swarph watchdog one-shot check (stranded-session recovery)
3
+ Documentation=https://github.com/darw007d/swarph-cli
4
+ After=network-online.target
5
+ Wants=network-online.target
6
+
7
+ [Service]
8
+ Type=oneshot
9
+ EnvironmentFile=-/etc/default/swarph-watchdog
10
+ ExecStart=/usr/local/bin/swarph watchdog --check
11
+ StandardOutput=append:/var/log/swarph-watchdog.log
12
+ StandardError=append:/var/log/swarph-watchdog.log
13
+
14
+ [Install]
15
+ WantedBy=multi-user.target
@@ -0,0 +1,13 @@
1
+ [Unit]
2
+ Description=Run swarph watchdog every 5 minutes (stranded-session recovery)
3
+ Documentation=https://github.com/darw007d/swarph-cli
4
+ # No Requires= on the service: a timer activates the same-named .service on each elapse; Requires= would also start it at boot, bypassing OnBootSec.
5
+
6
+ [Timer]
7
+ OnBootSec=2min
8
+ OnUnitActiveSec=5min
9
+ AccuracySec=1min
10
+ # Persistent= intentionally omitted: it only affects OnCalendar= timers, not the monotonic OnBootSec=/OnUnitActiveSec= used here.
11
+
12
+ [Install]
13
+ WantedBy=timers.target
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: swarph-cli
3
- Version: 0.7.1
3
+ Version: 0.7.3
4
4
  Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
5
5
  Author: Pierre Samson, Claude Opus
6
6
  License: MIT
@@ -138,6 +138,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
138
138
 
139
139
  `--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
140
140
 
141
+ ### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
142
+
143
+ Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
144
+
145
+ **One-shot mode (cron-callable, v0.7+):**
146
+ ```bash
147
+ */5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
148
+ ```
149
+
150
+ **Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
151
+
152
+ ```bash
153
+ # Preview without writing (any user):
154
+ swarph watchdog --install-service --cell droplet --dry-run
155
+
156
+ # Install + enable (requires root for /etc/systemd/system writes):
157
+ sudo swarph watchdog --install-service --cell droplet
158
+ ```
159
+
160
+ This writes three files:
161
+
162
+ | Path | Purpose |
163
+ |------|---------|
164
+ | `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
165
+ | `/etc/systemd/system/swarph-watchdog.timer` | First fires 2 minutes after boot (`OnBootSec=2min`), then every 5 minutes (`OnUnitActiveSec=5min`) |
166
+ | `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
167
+
168
+ Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites with current package version (newer-version semantics).
169
+
170
+ Monitoring:
171
+
172
+ ```bash
173
+ systemctl status swarph-watchdog.timer # is it scheduled?
174
+ systemctl list-timers swarph-watchdog.timer # next fire?
175
+ journalctl -u swarph-watchdog.service -f # unit start/stop/failure events only (check output is redirected to /var/log/swarph-watchdog.log)
176
+ tail -f /var/log/swarph-watchdog.log # watchdog check output (the service appends stdout/stderr here)
177
+ ```
178
+
179
+ Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (drop's ~24min mute 2026-05-14 08:38→09:02 UTC after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
180
+
141
181
  ### `swarph onboard` + `swarph ratify` (Phase 5.5)
142
182
 
143
183
  Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
@@ -23,6 +23,9 @@ src/swarph_cli/commands/spawn.py
23
23
  src/swarph_cli/commands/watchdog.py
24
24
  src/swarph_cli/parsers/__init__.py
25
25
  src/swarph_cli/parsers/claude.py
26
+ src/swarph_cli/systemd/swarph-watchdog.default
27
+ src/swarph_cli/systemd/swarph-watchdog.service
28
+ src/swarph_cli/systemd/swarph-watchdog.timer
26
29
  tests/test_cell_loader.py
27
30
  tests/test_chat_command.py
28
31
  tests/test_claude_parser.py
@@ -341,6 +341,135 @@ def test_a1_rearms_after_cursor_advance(
341
341
  assert send_mock.call_count == 2
342
342
 
343
343
 
344
+ # ---------------------------------------------------------------------------
345
+ # F3 — tmux pane_activity AND-gate (mother #1087)
346
+ # ---------------------------------------------------------------------------
347
+
348
+
349
+ def test_pane_activity_recent_suppresses_a1(
350
+ isolated_state, stale_cursor, monkeypatch
351
+ ):
352
+ """F3 — cursor stale + alive + unread > 0 but pane_activity recent →
353
+ suppress A1. Session is working in a long bash block; cursor only
354
+ updates at turn-end. Same incident class as commander #1092 65-min
355
+ spam, but caught upstream of F1 marker by checking pane_activity
356
+ BEFORE firing."""
357
+ with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
358
+ patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
359
+ patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
360
+ patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=30), \
361
+ patch("swarph_cli.commands.watchdog._tmux_send_keys") as send_mock:
362
+ rc = run_watchdog(argv=[
363
+ "--check", "--cell", "lab",
364
+ "--cursor", str(stale_cursor),
365
+ "--threshold", "60",
366
+ "--pane-activity-threshold", "600",
367
+ ])
368
+ assert rc == 0
369
+ send_mock.assert_not_called()
370
+
371
+
372
+ def test_pane_activity_old_falls_through_to_a1(
373
+ isolated_state, stale_cursor, monkeypatch
374
+ ):
375
+ """F3 — pane_activity OLDER than threshold means session has actually
376
+ been quiet; A1 still fires. Stop signal compatibility check."""
377
+ with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
378
+ patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
379
+ patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
380
+ patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=1200), \
381
+ patch("swarph_cli.commands.watchdog._tmux_send_keys", return_value=True) as send_mock:
382
+ rc = run_watchdog(argv=[
383
+ "--check", "--cell", "lab",
384
+ "--cursor", str(stale_cursor),
385
+ "--threshold", "60",
386
+ "--pane-activity-threshold", "600",
387
+ ])
388
+ assert rc == 1
389
+ send_mock.assert_called_once()
390
+
391
+
392
+ def test_pane_activity_unavailable_falls_through_to_a1(
393
+ isolated_state, stale_cursor, monkeypatch
394
+ ):
395
+ """F3 — detection error (tmux missing / older tmux without
396
+ #{pane_activity}) returns None; A1 still fires. F3 is a strengthening
397
+ of the gate, not a hard dependency."""
398
+ with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
399
+ patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
400
+ patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
401
+ patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=None), \
402
+ patch("swarph_cli.commands.watchdog._tmux_send_keys", return_value=True) as send_mock:
403
+ rc = run_watchdog(argv=[
404
+ "--check", "--cell", "lab",
405
+ "--cursor", str(stale_cursor),
406
+ "--threshold", "60",
407
+ ])
408
+ assert rc == 1
409
+ send_mock.assert_called_once()
410
+
411
+
412
+ # ---------------------------------------------------------------------------
413
+ # F4 — cell.yaml-pinned cursor_path + tmux_session (mother #1057/#1060 + beta #1061/#1065)
414
+ # ---------------------------------------------------------------------------
415
+
416
+
417
+ def test_resolve_cursor_path_cell_yaml_pin_beats_default(isolated_state):
418
+ """F4 — cell.yaml extra.cursor_path takes precedence over the
419
+ /tmp/lab-claude-cursor.json fallback when no explicit --cursor."""
420
+ from swarph_cli.commands.watchdog import _resolve_cursor_path
421
+ pinned = isolated_state / "custom-cursor.json"
422
+ assert _resolve_cursor_path("lab", None, str(pinned)) == pinned
423
+
424
+
425
+ def test_resolve_cursor_path_explicit_beats_cell_yaml_pin(isolated_state):
426
+ """F4 — explicit --cursor still wins over cell.yaml pin."""
427
+ from swarph_cli.commands.watchdog import _resolve_cursor_path
428
+ explicit = isolated_state / "explicit-cursor.json"
429
+ pinned = isolated_state / "pinned-cursor.json"
430
+ assert _resolve_cursor_path("lab", str(explicit), str(pinned)) == explicit
431
+
432
+
433
+ def test_resolve_tmux_session_cell_yaml_pin_beats_role(isolated_state):
434
+ """F4 — cell.yaml extra.tmux_session takes precedence over role
435
+ default when no explicit --tmux-session."""
436
+ from swarph_cli.commands.watchdog import _resolve_tmux_session
437
+ assert _resolve_tmux_session("drop-mother", None, "drop-mother-tmux") == "drop-mother-tmux"
438
+
439
+
440
+ def test_resolve_tmux_session_explicit_beats_cell_yaml_pin(isolated_state):
441
+ """F4 — explicit --tmux-session still wins over cell.yaml pin."""
442
+ from swarph_cli.commands.watchdog import _resolve_tmux_session
443
+ assert _resolve_tmux_session("lab", "explicit-name", "pinned-name") == "explicit-name"
444
+
445
+
446
+ def test_resolve_tmux_session_falls_back_to_role(isolated_state):
447
+ """F4 — no explicit + no cell.yaml pin → role itself."""
448
+ from swarph_cli.commands.watchdog import _resolve_tmux_session
449
+ assert _resolve_tmux_session("lab", None, None) == "lab"
450
+
451
+
452
+ def test_a1_marker_path_keyed_on_role_and_tmux_session(isolated_state):
453
+ """F4 — marker filename includes both role + tmux_session to prevent
454
+ sibling-instance marker collisions (mother #1103 follow-up)."""
455
+ from swarph_cli.commands.watchdog import _a1_marker_path
456
+ log_path = isolated_state / "wd.log"
457
+ m1 = _a1_marker_path(log_path, "drop-on-meta-edge", "drop-on-meta-edge")
458
+ m2 = _a1_marker_path(log_path, "drop-on-meta-edge", "drop-on-meta-edge-2")
459
+ assert m1 != m2
460
+ assert m1.name == "a1-fired-drop-on-meta-edge-drop-on-meta-edge.marker"
461
+ assert m2.name == "a1-fired-drop-on-meta-edge-drop-on-meta-edge-2.marker"
462
+
463
+
464
+ def test_a1_marker_path_sanitizes_tmux_session(isolated_state):
465
+ """F4 — tmux_session is sanitized to filename-safe characters (hyphens
466
+ are kept; path separators and spaces are not) so cell.yaml-pinned values with weird chars don't break the filename."""
467
+ from swarph_cli.commands.watchdog import _a1_marker_path
468
+ log_path = isolated_state / "wd.log"
469
+ m = _a1_marker_path(log_path, "lab", "weird/name with spaces!")
470
+ assert ":" not in m.name and "/" not in m.name and " " not in m.name
471
+
472
+
344
473
  def test_a2_escalation_clears_a1_marker(
345
474
  isolated_state, stale_cursor, monkeypatch
346
475
  ):
@@ -361,7 +490,10 @@ def test_a2_escalation_clears_a1_marker(
361
490
  "--threshold", "60",
362
491
  "--log", str(log_path),
363
492
  ])
364
- marker = log_path.parent / "a1-fired-lab.marker"
493
+ # F4 v0.7.2 marker keyed on (role, tmux_session) — tmux_session defaults
494
+ # to role when no --tmux-session arg + no cell.yaml pin, so filename is
495
+ # a1-fired-{role}-{role}.marker.
496
+ marker = log_path.parent / "a1-fired-lab-lab.marker"
365
497
  assert marker.exists()
366
498
 
367
499
  # Now force A2 path (process dead) and confirm marker is gone
@@ -430,3 +562,65 @@ def test_watchdog_log_appends_across_invocations(isolated_state, monkeypatch):
430
562
  parsed_second = json.loads(lines[1])
431
563
  assert parsed_first["details"]["decision"] == "healthy_cursor_fresh"
432
564
  assert parsed_second["details"]["decision"] == "noop_no_unread"
565
+
566
+
567
+ # ---------------------------------------------------------------------------
568
+ # --install-service (v0.7.3 — closes ev_6954f748 substrate-component-install)
569
+ # ---------------------------------------------------------------------------
570
+
571
+
572
+ def test_install_service_dry_run_writes_no_files(isolated_state, capsys):
573
+ """--dry-run prints what would be written without touching the filesystem."""
574
+ rc = run_watchdog(argv=["--install-service", "--cell", "droplet", "--dry-run"])
575
+ assert rc == 0
576
+ captured = capsys.readouterr()
577
+ # Dry-run output goes to stderr
578
+ assert "DRY RUN" in captured.err
579
+ assert "cell=droplet" in captured.err
580
+ # All three target files surface in the preview
581
+ assert "/etc/systemd/system/swarph-watchdog.service" in captured.err
582
+ assert "/etc/systemd/system/swarph-watchdog.timer" in captured.err
583
+ assert "/etc/default/swarph-watchdog" in captured.err
584
+ # SWARPH_CELL was templated to the requested role
585
+ assert "SWARPH_CELL=droplet" in captured.err
586
+ # The bundled service file's identifying line shows up
587
+ assert "Swarph watchdog one-shot check" in captured.err
588
+
589
+
590
+ def test_install_service_dry_run_default_cell_is_lab(isolated_state, capsys):
591
+ """Without --cell, the dry-run preview keeps SWARPH_CELL=lab default."""
592
+ rc = run_watchdog(argv=["--install-service", "--dry-run"])
593
+ assert rc == 0
594
+ captured = capsys.readouterr()
595
+ assert "SWARPH_CELL=lab" in captured.err
596
+
597
+
598
+ def test_install_service_without_sudo_returns_4(isolated_state, capsys, monkeypatch):
599
+ """Non-root install (no --dry-run) refuses with helpful message + exit 4."""
600
+ monkeypatch.setattr("os.geteuid", lambda: 1000)
601
+ rc = run_watchdog(argv=["--install-service", "--cell", "droplet"])
602
+ assert rc == 4
603
+ captured = capsys.readouterr()
604
+ assert "requires root" in captured.err
605
+ assert "--dry-run" in captured.err # hint surfaces
606
+
607
+
608
+ def test_bundled_systemd_files_readable():
609
+ """Package-data manifest correctness — importlib.resources can read all
610
+ three bundled templates. Regression guard for pyproject package-data
611
+ declaration."""
612
+ from swarph_cli.commands.watchdog import _bundled_systemd_files
613
+
614
+ files = _bundled_systemd_files()
615
+ assert set(files.keys()) == {
616
+ "swarph-watchdog.service",
617
+ "swarph-watchdog.timer",
618
+ "swarph-watchdog.default",
619
+ }
620
+ # Service file has the expected Type=oneshot shape
621
+ assert "Type=oneshot" in files["swarph-watchdog.service"]
622
+ assert "ExecStart=/usr/local/bin/swarph watchdog --check" in files["swarph-watchdog.service"]
623
+ # Timer fires every 5 minutes
624
+ assert "OnUnitActiveSec=5min" in files["swarph-watchdog.timer"]
625
+ # Default file has the SWARPH_CELL=lab template line
626
+ assert "SWARPH_CELL=lab" in files["swarph-watchdog.default"]
File without changes
File without changes