swarph-cli 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {swarph_cli-0.7.1/src/swarph_cli.egg-info → swarph_cli-0.7.3}/PKG-INFO +41 -1
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/README.md +40 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/pyproject.toml +7 -1
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/__init__.py +1 -1
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/watchdog.py +308 -15
- swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.default +9 -0
- swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.service +15 -0
- swarph_cli-0.7.3/src/swarph_cli/systemd/swarph-watchdog.timer +13 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3/src/swarph_cli.egg-info}/PKG-INFO +41 -1
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/SOURCES.txt +3 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_watchdog.py +195 -1
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/LICENSE +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/setup.cfg +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/caller.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/cell.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/__init__.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/chat.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/daemon.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/hook_output.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/import_session.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/install_hook.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/onboard.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/ratify.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/commands/spawn.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/main.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/parsers/__init__.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli/parsers/claude.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/dependency_links.txt +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/entry_points.txt +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/requires.txt +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/src/swarph_cli.egg-info/top_level.txt +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_cell_loader.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_chat_command.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_claude_parser.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_daemon_command.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_hook_output.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_import_command.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_install_hook.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_main.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_onboard_command.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_ratify_command.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_chat.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_one_shot.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_smoke_phase_5_5.py +0 -0
- {swarph_cli-0.7.1 → swarph_cli-0.7.3}/tests/test_spawn_command.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: swarph-cli
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
|
|
5
5
|
Author: Pierre Samson, Claude Opus
|
|
6
6
|
License: MIT
|
|
@@ -138,6 +138,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
|
|
|
138
138
|
|
|
139
139
|
`--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
|
|
140
140
|
|
|
141
|
+
### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
|
|
142
|
+
|
|
143
|
+
Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
|
|
144
|
+
|
|
145
|
+
**One-shot mode (cron-callable, v0.7+):**
|
|
146
|
+
```bash
|
|
147
|
+
*/5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
# Preview without writing (any user):
|
|
154
|
+
swarph watchdog --install-service --cell droplet --dry-run
|
|
155
|
+
|
|
156
|
+
# Install + enable (requires root for /etc/systemd/system writes):
|
|
157
|
+
sudo swarph watchdog --install-service --cell droplet
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
This writes three files:
|
|
161
|
+
|
|
162
|
+
| Path | Purpose |
|
|
163
|
+
|------|---------|
|
|
164
|
+
| `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
|
|
165
|
+
| `/etc/systemd/system/swarph-watchdog.timer` | Fires every 5 minutes (`OnUnitActiveSec=5min`) |
|
|
166
|
+
| `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
|
|
167
|
+
|
|
168
|
+
Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites with current package version (newer-version semantics).
|
|
169
|
+
|
|
170
|
+
Monitoring:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
systemctl status swarph-watchdog.timer # is it scheduled?
|
|
174
|
+
systemctl list-timers swarph-watchdog.timer # next fire?
|
|
175
|
+
journalctl -u swarph-watchdog.service -f # live log
|
|
176
|
+
tail -f /var/log/swarph-watchdog.log # append-log alternative
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (drop's ~24min mute 2026-05-14 08:38→09:02 UTC after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
|
|
180
|
+
|
|
141
181
|
### `swarph onboard` + `swarph ratify` (Phase 5.5)
|
|
142
182
|
|
|
143
183
|
Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
|
|
@@ -105,6 +105,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
|
|
|
105
105
|
|
|
106
106
|
`--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
|
|
107
107
|
|
|
108
|
+
### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
|
|
109
|
+
|
|
110
|
+
Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
|
|
111
|
+
|
|
112
|
+
**One-shot mode (cron-callable, v0.7+):**
|
|
113
|
+
```bash
|
|
114
|
+
*/5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Preview without writing (any user):
|
|
121
|
+
swarph watchdog --install-service --cell droplet --dry-run
|
|
122
|
+
|
|
123
|
+
# Install + enable (requires root for /etc/systemd/system writes):
|
|
124
|
+
sudo swarph watchdog --install-service --cell droplet
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
This writes three files:
|
|
128
|
+
|
|
129
|
+
| Path | Purpose |
|
|
130
|
+
|------|---------|
|
|
131
|
+
| `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
|
|
132
|
+
| `/etc/systemd/system/swarph-watchdog.timer` | Fires every 5 minutes (`OnUnitActiveSec=5min`) |
|
|
133
|
+
| `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
|
|
134
|
+
|
|
135
|
+
Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites with current package version (newer-version semantics).
|
|
136
|
+
|
|
137
|
+
Monitoring:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
systemctl status swarph-watchdog.timer # is it scheduled?
|
|
141
|
+
systemctl list-timers swarph-watchdog.timer # next fire?
|
|
142
|
+
journalctl -u swarph-watchdog.service -f # live log
|
|
143
|
+
tail -f /var/log/swarph-watchdog.log # append-log alternative
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (drop's ~24min mute 2026-05-14 08:38→09:02 UTC after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
|
|
147
|
+
|
|
108
148
|
### `swarph onboard` + `swarph ratify` (Phase 5.5)
|
|
109
149
|
|
|
110
150
|
Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "swarph-cli"
|
|
7
|
-
version = "0.7.1"
|
|
7
|
+
version = "0.7.3"
|
|
8
8
|
description = "The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -59,6 +59,12 @@ swarph = "swarph_cli.main:main"
|
|
|
59
59
|
[tool.setuptools.packages.find]
|
|
60
60
|
where = ["src"]
|
|
61
61
|
|
|
62
|
+
[tool.setuptools.package-data]
|
|
63
|
+
# v0.7.3: ship bundled systemd unit + timer + default templates so
|
|
64
|
+
# `swarph watchdog --install-service` can read them via importlib.resources.
|
|
65
|
+
# Closes ev_6954f748 substrate-component-installation-gap.
|
|
66
|
+
swarph_cli = ["systemd/*.service", "systemd/*.timer", "systemd/*.default"]
|
|
67
|
+
|
|
62
68
|
[tool.pytest.ini_options]
|
|
63
69
|
testpaths = ["tests"]
|
|
64
70
|
addopts = "-v --tb=short"
|
|
@@ -79,6 +79,11 @@ _DEFAULT_THRESHOLD_SEC = 1800 # 30 minutes
|
|
|
79
79
|
_DEFAULT_A1_RETRIES = 3
|
|
80
80
|
_DEFAULT_A1_BACKOFF_SEC = 60
|
|
81
81
|
_DEFAULT_GATEWAY_URL = "http://localhost:8788"
|
|
82
|
+
# F3 — tmux pane_activity gate threshold. If pane has activity within this
|
|
83
|
+
# many seconds, suppress A1 (session is working, not stalled). 600s (10min)
|
|
84
|
+
# is comfortably above legitimate-pause noise + comfortably below the
|
|
85
|
+
# 30min cursor-staleness threshold, so the two gates compose cleanly.
|
|
86
|
+
_DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC = 600
|
|
82
87
|
|
|
83
88
|
_USAGE = """\
|
|
84
89
|
Usage:
|
|
@@ -86,6 +91,7 @@ Usage:
|
|
|
86
91
|
[--gateway URL] [--tmux-session NAME]
|
|
87
92
|
[--peer NAME] [--no-respawn]
|
|
88
93
|
[--log PATH] [--verbose]
|
|
94
|
+
swarph watchdog --install-service [--cell ROLE] [--dry-run]
|
|
89
95
|
|
|
90
96
|
Detects stranded Claude sessions (API throttle / harness death) and attempts
|
|
91
97
|
recovery via tmux send-keys A1 wake-prompt, escalating to swarph spawn
|
|
@@ -94,6 +100,12 @@ respawn (A2) on persistent darkness.
|
|
|
94
100
|
Designed for cron invocation:
|
|
95
101
|
*/5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
|
|
96
102
|
|
|
103
|
+
OR systemd timer (v0.7.3+, closes ev_6954f748 substrate-component-installation-gap):
|
|
104
|
+
sudo swarph watchdog --install-service [--cell <role>]
|
|
105
|
+
# → installs /etc/systemd/system/swarph-watchdog.{service,timer}
|
|
106
|
+
# → installs /etc/default/swarph-watchdog with SWARPH_CELL=<role>
|
|
107
|
+
# → daemon-reload + enable --now swarph-watchdog.timer
|
|
108
|
+
|
|
97
109
|
Detection (mother #1021 AND-gate design):
|
|
98
110
|
PRIMARY: cursor file mtime — most-recent Claude action (drain script touches it)
|
|
99
111
|
FALLBACK: pgrep claude on tmux session — confirms process aliveness
|
|
@@ -118,11 +130,12 @@ Flags:
|
|
|
118
130
|
--verbose also write diagnostics to stderr
|
|
119
131
|
|
|
120
132
|
Exit codes:
|
|
121
|
-
0 no action taken (session healthy or no unread DMs queued)
|
|
133
|
+
0 no action taken (session healthy or no unread DMs queued); install ok
|
|
122
134
|
1 A1 fired (wake-prompt sent)
|
|
123
135
|
2 A2 fired (full respawn triggered)
|
|
124
136
|
3 detection error (cursor unreadable / gateway unreachable)
|
|
125
|
-
4 configuration error (invalid args, no cell.yaml resolved)
|
|
137
|
+
4 configuration error (invalid args, no cell.yaml resolved); install needs sudo
|
|
138
|
+
5 install error (file write failed / systemctl failed)
|
|
126
139
|
"""
|
|
127
140
|
|
|
128
141
|
|
|
@@ -137,10 +150,29 @@ def _stat_mtime(path: Path) -> Optional[int]:
|
|
|
137
150
|
return None
|
|
138
151
|
|
|
139
152
|
|
|
140
|
-
def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
|
|
141
|
-
|
|
153
|
+
def _resolve_cursor_path(
|
|
154
|
+
role: str,
|
|
155
|
+
explicit: Optional[str],
|
|
156
|
+
cell_yaml_value: Optional[str] = None,
|
|
157
|
+
) -> Path:
|
|
158
|
+
"""Resolve cursor file path with documented fallback chain.
|
|
159
|
+
|
|
160
|
+
Precedence (F4 — mother #1057/#1060 + beta #1061/#1065):
|
|
161
|
+
1. Explicit ``--cursor`` CLI arg (highest)
|
|
162
|
+
2. ``cell.yaml`` extra.cursor_path when --cell present
|
|
163
|
+
3. ``$TMPDIR/<role>-cursor.json``
|
|
164
|
+
4. ``/tmp/lab-claude-cursor.json`` (legacy lab-orchestrator default)
|
|
165
|
+
|
|
166
|
+
F4 closes the host-prefix-variant + sibling-instance-variant gap
|
|
167
|
+
class — cell.yaml carries the canonical cursor path per-cell, watchdog
|
|
168
|
+
auto-resolves when --cell is provided. Eliminates the silent-default-
|
|
169
|
+
to-lab-prefix failure mode that gave droplet 23hr of cursor-unreadable
|
|
170
|
+
errors before catch.
|
|
171
|
+
"""
|
|
142
172
|
if explicit:
|
|
143
173
|
return Path(explicit).expanduser()
|
|
174
|
+
if cell_yaml_value:
|
|
175
|
+
return Path(cell_yaml_value).expanduser()
|
|
144
176
|
tmpdir = os.environ.get("TMPDIR", "/tmp")
|
|
145
177
|
primary = Path(tmpdir) / f"{role}-cursor.json"
|
|
146
178
|
if primary.exists():
|
|
@@ -149,6 +181,74 @@ def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
|
|
|
149
181
|
return Path("/tmp/lab-claude-cursor.json")
|
|
150
182
|
|
|
151
183
|
|
|
184
|
+
def _resolve_tmux_session(
|
|
185
|
+
role: str,
|
|
186
|
+
explicit: Optional[str],
|
|
187
|
+
cell_yaml_value: Optional[str] = None,
|
|
188
|
+
) -> str:
|
|
189
|
+
"""Resolve tmux session name with documented fallback chain.
|
|
190
|
+
|
|
191
|
+
Precedence (F4 sibling to cursor_path):
|
|
192
|
+
1. Explicit ``--tmux-session`` CLI arg
|
|
193
|
+
2. ``cell.yaml`` extra.tmux_session when --cell present
|
|
194
|
+
3. Role itself (convention default)
|
|
195
|
+
|
|
196
|
+
Mother's sibling-instance variant (#1061): when slot-N siblings spawn,
|
|
197
|
+
each slot needs its own tmux session name; the cell.yaml that pins the
|
|
198
|
+
slot SHOULD also pin the tmux_session to keep the watchdog's reads
|
|
199
|
+
consistent with the spawn's writes.
|
|
200
|
+
"""
|
|
201
|
+
if explicit:
|
|
202
|
+
return explicit
|
|
203
|
+
if cell_yaml_value:
|
|
204
|
+
return cell_yaml_value
|
|
205
|
+
return role
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _read_cell_yaml_pins(role: str) -> tuple[Optional[str], Optional[str]]:
    """Best-effort read of cell.yaml ``extra.cursor_path`` + ``extra.tmux_session``.

    Tries the cwd-local cell file first (via ``discover_cell_in_cwd``), then
    falls back to ``<cells_dir>/<role>.yaml``.

    Args:
        role: Cell role name; only used to build the fallback file name.

    Returns:
        ``(cursor_path, tmux_session)``, each a ``str`` when the key is
        present and truthy in the cell's ``extra`` mapping, else ``None``.
        ``(None, None)`` is returned when no cell file is found or on any
        discovery/load failure, so callers fall through to the legacy
        convention defaults (F4 is additive and non-breaking).

    NOTE: ``cursor_path`` / ``tmux_session`` are read from ``Cell.extra``
    (the forward-compat catch-all) as of v0.7.2.
    """
    from swarph_cli.cell import (
        cells_dir,
        discover_cell_in_cwd,
        load_cell,
        CellError,
    )

    # Discovery is guarded together with loading: a vanished cwd or an
    # unreadable cells directory must degrade to "no pins", not a traceback
    # in a cron/systemd-driven run.
    try:
        cell_path = discover_cell_in_cwd()
        if cell_path is None:
            candidate = cells_dir() / f"{role}.yaml"
            if candidate.is_file():
                cell_path = candidate
        if cell_path is None:
            return None, None
        cell = load_cell(cell_path)
    except (CellError, OSError):
        return None, None

    extra = getattr(cell, "extra", None) or {}
    try:
        cursor_path = extra.get("cursor_path")
        tmux_session = extra.get("tmux_session")
    except AttributeError:
        # ``extra`` is not mapping-like (malformed cell.yaml): treat as unpinned.
        return None, None

    return (
        str(cursor_path) if cursor_path else None,
        str(tmux_session) if tmux_session else None,
    )
|
|
250
|
+
|
|
251
|
+
|
|
152
252
|
def _resolve_log_path(explicit: Optional[str]) -> Path:
|
|
153
253
|
if explicit:
|
|
154
254
|
return Path(explicit).expanduser()
|
|
@@ -228,6 +328,39 @@ def _tmux_send_keys(name: str, text: str) -> bool:
|
|
|
228
328
|
return False
|
|
229
329
|
|
|
230
330
|
|
|
331
|
+
def _pane_activity_age_sec(name: str) -> Optional[int]:
|
|
332
|
+
"""Age in seconds since the tmux pane's last activity event.
|
|
333
|
+
|
|
334
|
+
Reads tmux's `#{pane_activity}` format variable, which returns a unix
|
|
335
|
+
epoch timestamp of the most recent activity in the active pane of the
|
|
336
|
+
target session. Returns None if tmux is missing, the session doesn't
|
|
337
|
+
exist, or tmux's output isn't parseable as an integer epoch.
|
|
338
|
+
|
|
339
|
+
Used by F3 (mother #1087 / drop-on-meta-edge proposal) as a third
|
|
340
|
+
AND-gate input to distinguish (a) session genuinely stalled from (b)
|
|
341
|
+
session actively working in a long bash block. cursor-mtime alone
|
|
342
|
+
measures "time since last turn-end" not "time since last activity";
|
|
343
|
+
pane_activity covers the mid-turn-active case.
|
|
344
|
+
|
|
345
|
+
Returns None on detection error so the caller can fall through to
|
|
346
|
+
the legacy AND-gate behavior — F3 is a strengthening of the gate,
|
|
347
|
+
not a replacement of it.
|
|
348
|
+
"""
|
|
349
|
+
try:
|
|
350
|
+
result = subprocess.run(
|
|
351
|
+
["tmux", "display", "-p", "-t", name, "#{pane_activity}"],
|
|
352
|
+
capture_output=True, text=True, timeout=5,
|
|
353
|
+
)
|
|
354
|
+
if result.returncode != 0:
|
|
355
|
+
return None
|
|
356
|
+
out = result.stdout.strip()
|
|
357
|
+
if not out:
|
|
358
|
+
return None
|
|
359
|
+
return max(0, _now() - int(out))
|
|
360
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError):
|
|
361
|
+
return None
|
|
362
|
+
|
|
363
|
+
|
|
231
364
|
def _tmux_kill_session(name: str) -> bool:
|
|
232
365
|
try:
|
|
233
366
|
result = subprocess.run(
|
|
@@ -263,7 +396,7 @@ def _spawn_via_swarph(role: str, tmux_session: str) -> bool:
|
|
|
263
396
|
return False
|
|
264
397
|
|
|
265
398
|
|
|
266
|
-
def _a1_marker_path(log_path: Path, role: str) -> Path:
|
|
399
|
+
def _a1_marker_path(log_path: Path, role: str, tmux_session: Optional[str] = None) -> Path:
|
|
267
400
|
"""Marker file recording the cursor_mtime at which A1 was last fired.
|
|
268
401
|
|
|
269
402
|
Co-located with the watchdog log so it inherits the same XDG_STATE_HOME
|
|
@@ -272,14 +405,27 @@ def _a1_marker_path(log_path: Path, role: str) -> Path:
|
|
|
272
405
|
where cron fired A1 every 5min for 65min into an active session's tmux
|
|
273
406
|
input buffer (commander #1092 + droplet #1087).
|
|
274
407
|
|
|
275
|
-
Keyed on ``role``
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
408
|
+
Keyed on ``(role, tmux_session)`` post-F4 so sibling-instance patterns
|
|
409
|
+
(alpha+beta drop-on-meta-edge per project_drop_mitosis_to_meta_edge)
|
|
410
|
+
don't clobber each other's markers — mother's flag from #1103 closed in
|
|
411
|
+
v0.7.2. tmux_session is sanitized to alphanumeric + ``-_.`` for the
|
|
412
|
+
filename to avoid path-traversal or weird characters from cell.yaml-
|
|
413
|
+
pinned values.
|
|
414
|
+
|
|
415
|
+
NOTE (mother #1138 sanitization edge case): two siblings whose
|
|
416
|
+
``tmux_session`` values differ ONLY in disallowed characters (e.g.,
|
|
417
|
+
``cell:a`` vs ``cell:b`` — colons sanitized to ``_`` collapsing both
|
|
418
|
+
to ``cell_a`` / ``cell_b`` — fine in this example, but ``cell:a`` vs
|
|
419
|
+
``cell$a`` would both collapse to ``cell_a``) would collide post-
|
|
420
|
+
sanitization. cell.yaml-pinned ``tmux_session`` values SHOULD differ
|
|
421
|
+
in alphanumeric content, not just punctuation. Cosmetic in practice
|
|
422
|
+
(operators don't choose session names that close), but worth knowing.
|
|
281
423
|
"""
|
|
282
|
-
|
|
424
|
+
safe_tmux = "".join(
|
|
425
|
+
c if (c.isalnum() or c in "-_.") else "_"
|
|
426
|
+
for c in (tmux_session or role)
|
|
427
|
+
)
|
|
428
|
+
return log_path.parent / f"a1-fired-{role}-{safe_tmux}.marker"
|
|
283
429
|
|
|
284
430
|
|
|
285
431
|
def _a1_already_fired_at(marker: Path, cursor_mtime: int) -> bool:
|
|
@@ -331,10 +477,17 @@ def _log_event(log_path: Path, event: str, details: dict, verbose: bool = False)
|
|
|
331
477
|
|
|
332
478
|
def run_check(args: argparse.Namespace) -> int:
|
|
333
479
|
role = args.cell
|
|
334
|
-
|
|
480
|
+
# F4 — cell.yaml-pinned cursor_path + tmux_session (mother #1057/#1060
|
|
481
|
+
# + beta #1061/#1065). Reads cell.yaml `extra.cursor_path` /
|
|
482
|
+
# `extra.tmux_session` when --cell is provided; explicit CLI args still
|
|
483
|
+
# win. Best-effort: malformed cell.yaml falls through to legacy
|
|
484
|
+
# convention defaults (additive non-breaking).
|
|
485
|
+
cell_cursor, cell_tmux = _read_cell_yaml_pins(role)
|
|
486
|
+
cursor = _resolve_cursor_path(role, args.cursor, cell_cursor)
|
|
487
|
+
tmux_session = _resolve_tmux_session(role, args.tmux_session, cell_tmux)
|
|
335
488
|
log_path = _resolve_log_path(args.log)
|
|
336
489
|
threshold = args.threshold
|
|
337
|
-
|
|
490
|
+
pane_activity_threshold = args.pane_activity_threshold
|
|
338
491
|
peer = args.peer or role
|
|
339
492
|
gateway = args.gateway
|
|
340
493
|
token = os.environ.get("MESH_GATEWAY_TOKEN")
|
|
@@ -346,6 +499,9 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
346
499
|
"threshold_sec": threshold,
|
|
347
500
|
"tmux_session": tmux_session,
|
|
348
501
|
"peer": peer,
|
|
502
|
+
"pane_activity_threshold_sec": pane_activity_threshold,
|
|
503
|
+
"cell_yaml_pinned_cursor": cell_cursor is not None,
|
|
504
|
+
"cell_yaml_pinned_tmux": cell_tmux is not None,
|
|
349
505
|
}
|
|
350
506
|
|
|
351
507
|
# PRIMARY signal: cursor file mtime
|
|
@@ -378,7 +534,7 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
378
534
|
# cursor_stale + process_alive + unread None → noop (F2 fail-closed: can't verify work, don't poke)
|
|
379
535
|
# cursor_stale + a1_marker matches cursor_mtime → noop (F1 same-window suppression)
|
|
380
536
|
|
|
381
|
-
marker = _a1_marker_path(log_path, role)
|
|
537
|
+
marker = _a1_marker_path(log_path, role, tmux_session)
|
|
382
538
|
diag["a1_marker"] = str(marker)
|
|
383
539
|
|
|
384
540
|
if not process_alive:
|
|
@@ -434,6 +590,21 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
434
590
|
_log_event(log_path, "noop", diag, verbose)
|
|
435
591
|
return 0
|
|
436
592
|
|
|
593
|
+
# F3 — tmux pane_activity AND-gate (mother #1087). cursor-mtime measures
|
|
594
|
+
# "time since last turn-end" not "time since last activity"; mid-long-
|
|
595
|
+
# turn cursor is stale even though session is maximally alive. tmux's
|
|
596
|
+
# `#{pane_activity}` covers the mid-turn-active case. If the pane has
|
|
597
|
+
# had activity within `pane_activity_threshold_sec`, suppress A1 — the
|
|
598
|
+
# session is working, not stalled. Falls through to firing A1 when
|
|
599
|
+
# pane_activity is None (tmux missing / older tmux without the format)
|
|
600
|
+
# so F3 is a strengthening of the gate, not a hard dependency.
|
|
601
|
+
pane_age = _pane_activity_age_sec(tmux_session)
|
|
602
|
+
diag["pane_activity_age_sec"] = pane_age
|
|
603
|
+
if pane_age is not None and pane_age < pane_activity_threshold:
|
|
604
|
+
diag["decision"] = "noop_pane_activity_recent"
|
|
605
|
+
_log_event(log_path, "noop", diag, verbose)
|
|
606
|
+
return 0
|
|
607
|
+
|
|
437
608
|
diag["decision"] = "a1_send_keys"
|
|
438
609
|
wake_text = (
|
|
439
610
|
f"watchdog wake — cursor stale {cursor_age}s, "
|
|
@@ -447,6 +618,107 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
447
618
|
return 1 if sent else 4
|
|
448
619
|
|
|
449
620
|
|
|
621
|
+
_SYSTEMD_UNIT_DIR = Path("/etc/systemd/system")
|
|
622
|
+
_SYSTEMD_DEFAULT_DIR = Path("/etc/default")
|
|
623
|
+
_SYSTEMD_UNIT_NAMES = ("swarph-watchdog.service", "swarph-watchdog.timer")
|
|
624
|
+
_SYSTEMD_DEFAULT_NAME = "swarph-watchdog" # /etc/default/swarph-watchdog
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _bundled_systemd_files() -> dict[str, str]:
    """Load the bundled systemd templates as a ``{filename: content}`` mapping.

    The templates live in the ``systemd/`` data directory of the
    ``swarph_cli`` package and are located through ``importlib.resources``,
    so the lookup does not depend on how the package was installed.

    The mapping holds one entry per name in ``_SYSTEMD_UNIT_NAMES`` plus
    ``swarph-watchdog.default``; each value is the file's text decoded as
    UTF-8.
    """
    try:
        from importlib.resources import files as _files
    except ImportError:  # pragma: no cover — Python <3.9 not supported anyway
        from importlib_resources import files as _files  # type: ignore[no-redef]

    template_dir = _files("swarph_cli") / "systemd"
    wanted = (*_SYSTEMD_UNIT_NAMES, "swarph-watchdog.default")
    return {
        fname: (template_dir / fname).read_text(encoding="utf-8")
        for fname in wanted
    }
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def run_install_service(args: argparse.Namespace) -> int:
    """Install the systemd timer + service for periodic ``watchdog --check``.

    Writes the two bundled unit files into ``_SYSTEMD_UNIT_DIR`` and the
    environment file into ``_SYSTEMD_DEFAULT_DIR`` (with the template's
    ``SWARPH_CELL=lab`` line rewritten to ``args.cell``), then runs
    ``systemctl daemon-reload`` and
    ``systemctl enable --now swarph-watchdog.timer``.

    Idempotent: existing files are overwritten (newer-version semantics).
    Requires root unless ``args.dry_run`` is set, in which case the planned
    writes and commands are printed to stderr and nothing is modified.

    Args:
        args: Parsed CLI namespace; reads ``args.cell`` and ``args.dry_run``.

    Returns:
        0  success (or dry-run completed)
        4  configuration error (unusable cell name, or non-root without
           --dry-run)
        5  install error (bundled templates unreadable / file write failed /
           systemctl failed)
    """
    cell = args.cell
    # The cell name is written verbatim into a root-owned environment file.
    # Reject values that would produce an empty assignment or smuggle extra
    # lines/quoting into that file.
    if not cell or any(
        ch.isspace() or not ch.isprintable() or ch in "\"'\\" for ch in cell
    ):
        print(
            f"ERROR: invalid cell name {cell!r}: must be non-empty and free "
            "of whitespace, control characters, quotes and backslashes.",
            file=sys.stderr,
        )
        return 4

    # A missing/broken package-data install should surface as the documented
    # install error, not as an uncaught traceback.
    try:
        files = _bundled_systemd_files()
    except (OSError, ImportError) as exc:
        print(
            f"ERROR: failed to read bundled systemd templates: {exc}",
            file=sys.stderr,
        )
        return 5

    # Template the default file with the requested role (first match only).
    default_content = files["swarph-watchdog.default"].replace(
        "SWARPH_CELL=lab",
        f"SWARPH_CELL={cell}",
        1,
    )

    targets = [
        (_SYSTEMD_UNIT_DIR / _SYSTEMD_UNIT_NAMES[0], files[_SYSTEMD_UNIT_NAMES[0]]),
        (_SYSTEMD_UNIT_DIR / _SYSTEMD_UNIT_NAMES[1], files[_SYSTEMD_UNIT_NAMES[1]]),
        (_SYSTEMD_DEFAULT_DIR / _SYSTEMD_DEFAULT_NAME, default_content),
    ]

    if args.dry_run:
        print(f"# DRY RUN — cell={cell}", file=sys.stderr)
        for path, content in targets:
            print(f"\n# would write {path}:", file=sys.stderr)
            print(content, file=sys.stderr)
        print(
            "\n# would then run:\n"
            "#   sudo systemctl daemon-reload\n"
            "#   sudo systemctl enable --now swarph-watchdog.timer",
            file=sys.stderr,
        )
        return 0

    # The root check comes after the dry-run branch so previews work for any user.
    if os.geteuid() != 0:
        print(
            "ERROR: --install-service requires root. Re-run with sudo, or use "
            "--dry-run to preview the install without writing.",
            file=sys.stderr,
        )
        return 4

    try:
        for path, content in targets:
            path.write_text(content, encoding="utf-8")
            print(f"wrote {path}", file=sys.stderr)
    except OSError as exc:  # PermissionError is a subclass of OSError
        print(f"ERROR: failed to write unit files: {exc}", file=sys.stderr)
        return 5

    try:
        subprocess.run(["systemctl", "daemon-reload"], check=True)
        subprocess.run(
            ["systemctl", "enable", "--now", "swarph-watchdog.timer"],
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as exc:
        print(f"ERROR: systemctl failed: {exc}", file=sys.stderr)
        return 5

    print(
        f"\nswarph-watchdog.timer installed + enabled for cell={cell}.\n"
        f"  status:  systemctl status swarph-watchdog.timer\n"
        f"  logs:    journalctl -u swarph-watchdog.service -f\n"
        f"           OR /var/log/swarph-watchdog.log\n"
        f"  next:    systemctl list-timers swarph-watchdog.timer",
        file=sys.stderr,
    )
    return 0
|
|
720
|
+
|
|
721
|
+
|
|
450
722
|
def run_watchdog(argv: Optional[list[str]] = None) -> int:
|
|
451
723
|
if argv is None:
|
|
452
724
|
argv = sys.argv[2:] # skip "swarph watchdog"
|
|
@@ -460,9 +732,27 @@ def run_watchdog(argv: Optional[list[str]] = None) -> int:
|
|
|
460
732
|
"--check", action="store_true",
|
|
461
733
|
help="One-shot check (cron-callable; exits with status code).",
|
|
462
734
|
)
|
|
735
|
+
p.add_argument(
|
|
736
|
+
"--install-service", action="store_true",
|
|
737
|
+
help="Install systemd timer + service for periodic --check invocation. "
|
|
738
|
+
"Requires sudo. Closes ev_6954f748 substrate-component-install gap.",
|
|
739
|
+
)
|
|
740
|
+
p.add_argument(
|
|
741
|
+
"--dry-run", action="store_true",
|
|
742
|
+
help="With --install-service: show what would be written without "
|
|
743
|
+
"writing. Useful for review or non-root preview.",
|
|
744
|
+
)
|
|
463
745
|
p.add_argument("--cell", default=os.environ.get("SWARPH_CELL", "lab"))
|
|
464
746
|
p.add_argument("--cursor", default=None)
|
|
465
747
|
p.add_argument("--threshold", type=int, default=_DEFAULT_THRESHOLD_SEC)
|
|
748
|
+
p.add_argument(
|
|
749
|
+
"--pane-activity-threshold",
|
|
750
|
+
type=int,
|
|
751
|
+
default=_DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC,
|
|
752
|
+
help="F3 gate: suppress A1 if tmux pane had activity within this "
|
|
753
|
+
"many seconds (covers mid-long-turn working sessions where "
|
|
754
|
+
"cursor-mtime is stale but session is alive).",
|
|
755
|
+
)
|
|
466
756
|
p.add_argument("--gateway", default=_DEFAULT_GATEWAY_URL)
|
|
467
757
|
p.add_argument("--tmux-session", default=None)
|
|
468
758
|
p.add_argument("--peer", default=None)
|
|
@@ -475,6 +765,9 @@ def run_watchdog(argv: Optional[list[str]] = None) -> int:
|
|
|
475
765
|
except SystemExit as exc:
|
|
476
766
|
return int(exc.code or 0)
|
|
477
767
|
|
|
768
|
+
if args.install_service:
|
|
769
|
+
return run_install_service(args)
|
|
770
|
+
|
|
478
771
|
if not args.check:
|
|
479
772
|
print(_USAGE, file=sys.stderr)
|
|
480
773
|
return 4
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# /etc/default/swarph-watchdog — environment for swarph-watchdog.service
|
|
2
|
+
#
|
|
3
|
+
# SWARPH_CELL sets the cell role for the watchdog. The watchdog reads
|
|
4
|
+
# cell.yaml for this role to discover the cursor path + tmux session pin
|
|
5
|
+
# (per the F4 cell.yaml pinning that landed in v0.7.2).
|
|
6
|
+
#
|
|
7
|
+
# If unset, watchdog defaults to "lab" (per the --cell argparse default).
|
|
8
|
+
# Override here per-vertex: droplet, gpu-wsl, razorpeter, etc.
|
|
9
|
+
SWARPH_CELL=lab
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[Unit]
|
|
2
|
+
Description=Swarph watchdog one-shot check (stranded-session recovery)
|
|
3
|
+
Documentation=https://github.com/darw007d/swarph-cli
|
|
4
|
+
After=network-online.target
|
|
5
|
+
Wants=network-online.target
|
|
6
|
+
|
|
7
|
+
[Service]
|
|
8
|
+
Type=oneshot
|
|
9
|
+
EnvironmentFile=-/etc/default/swarph-watchdog
|
|
10
|
+
ExecStart=/usr/local/bin/swarph watchdog --check
|
|
11
|
+
StandardOutput=append:/var/log/swarph-watchdog.log
|
|
12
|
+
StandardError=append:/var/log/swarph-watchdog.log
|
|
13
|
+
|
|
14
|
+
[Install]
|
|
15
|
+
WantedBy=multi-user.target
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
[Unit]
|
|
2
|
+
Description=Run swarph watchdog every 5 minutes (stranded-session recovery)
|
|
3
|
+
Documentation=https://github.com/darw007d/swarph-cli
|
|
4
|
+
Requires=swarph-watchdog.service
|
|
5
|
+
|
|
6
|
+
[Timer]
|
|
7
|
+
OnBootSec=2min
|
|
8
|
+
OnUnitActiveSec=5min
|
|
9
|
+
AccuracySec=1min
|
|
10
|
+
Persistent=true
|
|
11
|
+
|
|
12
|
+
[Install]
|
|
13
|
+
WantedBy=timers.target
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: swarph-cli
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
|
|
5
5
|
Author: Pierre Samson, Claude Opus
|
|
6
6
|
License: MIT
|
|
@@ -138,6 +138,46 @@ Loud-on-down (PLAN §16.5): never silently exits. Cursor writes are atomic (writ
|
|
|
138
138
|
|
|
139
139
|
`--auto-act` flag is documented for v0.5.1+ when handler registration via `@swarph.on_dm(...)` lands; v0.5.0 ships surface-only mode (DMs printed + JSONL-logged to `inbox.log`, no automatic replies).
|
|
140
140
|
|
|
141
|
+
### `swarph watchdog` (Phase 7 — v0.7 stranded-session detection, v0.7.3 systemd install)
|
|
142
|
+
|
|
143
|
+
Detects stranded Claude sessions (API throttle / harness death) via cursor-mtime + tmux pgrep AND-gate, and recovers via A1 tmux send-keys wake-prompt → A2 `swarph spawn` respawn. Cell.yaml-pinned cursor + tmux session (F4) since v0.7.2.
|
|
144
|
+
|
|
145
|
+
**One-shot mode (cron-callable, v0.7+):**
|
|
146
|
+
```bash
|
|
147
|
+
*/5 * * * * swarph watchdog --check --cell lab >> ~/.local/log/swarph-watchdog.log 2>&1
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Systemd timer install (v0.7.3+ — closes ev_6954f748 substrate-component-installation-gap):**
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
# Preview without writing (any user):
|
|
154
|
+
swarph watchdog --install-service --cell droplet --dry-run
|
|
155
|
+
|
|
156
|
+
# Install + enable (requires root for /etc/systemd/system writes):
|
|
157
|
+
sudo swarph watchdog --install-service --cell droplet
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
This writes three files:
|
|
161
|
+
|
|
162
|
+
| Path | Purpose |
|
|
163
|
+
|------|---------|
|
|
164
|
+
| `/etc/systemd/system/swarph-watchdog.service` | `Type=oneshot`, runs `swarph watchdog --check` |
|
|
165
|
+
| `/etc/systemd/system/swarph-watchdog.timer` | Fires every 5 minutes (`OnUnitActiveSec=5min`) |
|
|
166
|
+
| `/etc/default/swarph-watchdog` | Sets `SWARPH_CELL=<role>` for the service env |
|
|
167
|
+
|
|
168
|
+
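Then runs `systemctl daemon-reload && systemctl enable --now swarph-watchdog.timer`. Idempotent — re-running overwrites all three files with the templates bundled in the currently installed package version (newer-version semantics), so any manual edits to them, including `/etc/default/swarph-watchdog`, are replaced.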
|
|
169
|
+
|
|
170
|
+
Monitoring:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
systemctl status swarph-watchdog.timer # is it scheduled?
|
|
174
|
+
systemctl list-timers swarph-watchdog.timer # next fire?
|
|
175
|
+
journalctl -u swarph-watchdog.service -f # live log
|
|
176
|
+
tail -f /var/log/swarph-watchdog.log # append-log alternative
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Why this matters: pre-v0.7.3, swarph-cli shipped the watchdog code but no install path. Lab ran it via cron (manual setup); droplet never installed it at all. A real production silence-window (drop's ~24min mute 2026-05-14 08:38→09:02 UTC after an Anthropic API error) made the install-gap visible. v0.7.3 closes it for any peer with one command.
|
|
180
|
+
|
|
141
181
|
### `swarph onboard` + `swarph ratify` (Phase 5.5)
|
|
142
182
|
|
|
143
183
|
Per PLAN.md §15, onboarding splits into a **mechanics phase** (`swarph onboard`) that automates the boring parts (registry POST, scaffolding, token resolution) and a **manual contract phase** (the new peer composes the handshake DM in their own words). A witness peer judges the handshake and runs `swarph ratify <peer>` to flip `ratified=true`, gating `task_claim` server-side.
|
|
@@ -23,6 +23,9 @@ src/swarph_cli/commands/spawn.py
|
|
|
23
23
|
src/swarph_cli/commands/watchdog.py
|
|
24
24
|
src/swarph_cli/parsers/__init__.py
|
|
25
25
|
src/swarph_cli/parsers/claude.py
|
|
26
|
+
src/swarph_cli/systemd/swarph-watchdog.default
|
|
27
|
+
src/swarph_cli/systemd/swarph-watchdog.service
|
|
28
|
+
src/swarph_cli/systemd/swarph-watchdog.timer
|
|
26
29
|
tests/test_cell_loader.py
|
|
27
30
|
tests/test_chat_command.py
|
|
28
31
|
tests/test_claude_parser.py
|
|
@@ -341,6 +341,135 @@ def test_a1_rearms_after_cursor_advance(
|
|
|
341
341
|
assert send_mock.call_count == 2
|
|
342
342
|
|
|
343
343
|
|
|
344
|
+
# ---------------------------------------------------------------------------
|
|
345
|
+
# F3 — tmux pane_activity AND-gate (mother #1087)
|
|
346
|
+
# ---------------------------------------------------------------------------
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_pane_activity_recent_suppresses_a1(
|
|
350
|
+
isolated_state, stale_cursor, monkeypatch
|
|
351
|
+
):
|
|
352
|
+
"""F3 — cursor stale + alive + unread > 0 but pane_activity recent →
|
|
353
|
+
suppress A1. Session is working in a long bash block; cursor only
|
|
354
|
+
updates at turn-end. Same incident class as commander #1092 65-min
|
|
355
|
+
spam, but caught upstream of F1 marker by checking pane_activity
|
|
356
|
+
BEFORE firing."""
|
|
357
|
+
with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
|
|
358
|
+
patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
|
|
359
|
+
patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
|
|
360
|
+
patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=30), \
|
|
361
|
+
patch("swarph_cli.commands.watchdog._tmux_send_keys") as send_mock:
|
|
362
|
+
rc = run_watchdog(argv=[
|
|
363
|
+
"--check", "--cell", "lab",
|
|
364
|
+
"--cursor", str(stale_cursor),
|
|
365
|
+
"--threshold", "60",
|
|
366
|
+
"--pane-activity-threshold", "600",
|
|
367
|
+
])
|
|
368
|
+
assert rc == 0
|
|
369
|
+
send_mock.assert_not_called()
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def test_pane_activity_old_falls_through_to_a1(
|
|
373
|
+
isolated_state, stale_cursor, monkeypatch
|
|
374
|
+
):
|
|
375
|
+
"""F3 — pane_activity OLDER than threshold means session has actually
|
|
376
|
+
been quiet; A1 still fires. Stop signal compatibility check."""
|
|
377
|
+
with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
|
|
378
|
+
patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
|
|
379
|
+
patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
|
|
380
|
+
patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=1200), \
|
|
381
|
+
patch("swarph_cli.commands.watchdog._tmux_send_keys", return_value=True) as send_mock:
|
|
382
|
+
rc = run_watchdog(argv=[
|
|
383
|
+
"--check", "--cell", "lab",
|
|
384
|
+
"--cursor", str(stale_cursor),
|
|
385
|
+
"--threshold", "60",
|
|
386
|
+
"--pane-activity-threshold", "600",
|
|
387
|
+
])
|
|
388
|
+
assert rc == 1
|
|
389
|
+
send_mock.assert_called_once()
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def test_pane_activity_unavailable_falls_through_to_a1(
|
|
393
|
+
isolated_state, stale_cursor, monkeypatch
|
|
394
|
+
):
|
|
395
|
+
"""F3 — detection error (tmux missing / older tmux without
|
|
396
|
+
#{pane_activity}) returns None; A1 still fires. F3 is a strengthening
|
|
397
|
+
of the gate, not a hard dependency."""
|
|
398
|
+
with patch("swarph_cli.commands.watchdog._process_alive", return_value=True), \
|
|
399
|
+
patch("swarph_cli.commands.watchdog._gateway_unread_count", return_value=3), \
|
|
400
|
+
patch("swarph_cli.commands.watchdog._tmux_session_exists", return_value=True), \
|
|
401
|
+
patch("swarph_cli.commands.watchdog._pane_activity_age_sec", return_value=None), \
|
|
402
|
+
patch("swarph_cli.commands.watchdog._tmux_send_keys", return_value=True) as send_mock:
|
|
403
|
+
rc = run_watchdog(argv=[
|
|
404
|
+
"--check", "--cell", "lab",
|
|
405
|
+
"--cursor", str(stale_cursor),
|
|
406
|
+
"--threshold", "60",
|
|
407
|
+
])
|
|
408
|
+
assert rc == 1
|
|
409
|
+
send_mock.assert_called_once()
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
# ---------------------------------------------------------------------------
|
|
413
|
+
# F4 — cell.yaml-pinned cursor_path + tmux_session (mother #1057/#1060 + beta #1061/#1065)
|
|
414
|
+
# ---------------------------------------------------------------------------
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def test_resolve_cursor_path_cell_yaml_pin_beats_default(isolated_state):
|
|
418
|
+
"""F4 — cell.yaml extra.cursor_path takes precedence over the
|
|
419
|
+
/tmp/lab-claude-cursor.json fallback when no explicit --cursor."""
|
|
420
|
+
from swarph_cli.commands.watchdog import _resolve_cursor_path
|
|
421
|
+
pinned = isolated_state / "custom-cursor.json"
|
|
422
|
+
assert _resolve_cursor_path("lab", None, str(pinned)) == pinned
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def test_resolve_cursor_path_explicit_beats_cell_yaml_pin(isolated_state):
|
|
426
|
+
"""F4 — explicit --cursor still wins over cell.yaml pin."""
|
|
427
|
+
from swarph_cli.commands.watchdog import _resolve_cursor_path
|
|
428
|
+
explicit = isolated_state / "explicit-cursor.json"
|
|
429
|
+
pinned = isolated_state / "pinned-cursor.json"
|
|
430
|
+
assert _resolve_cursor_path("lab", str(explicit), str(pinned)) == explicit
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def test_resolve_tmux_session_cell_yaml_pin_beats_role(isolated_state):
|
|
434
|
+
"""F4 — cell.yaml extra.tmux_session takes precedence over role
|
|
435
|
+
default when no explicit --tmux-session."""
|
|
436
|
+
from swarph_cli.commands.watchdog import _resolve_tmux_session
|
|
437
|
+
assert _resolve_tmux_session("drop-mother", None, "drop-mother-tmux") == "drop-mother-tmux"
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def test_resolve_tmux_session_explicit_beats_cell_yaml_pin(isolated_state):
|
|
441
|
+
"""F4 — explicit --tmux-session still wins over cell.yaml pin."""
|
|
442
|
+
from swarph_cli.commands.watchdog import _resolve_tmux_session
|
|
443
|
+
assert _resolve_tmux_session("lab", "explicit-name", "pinned-name") == "explicit-name"
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def test_resolve_tmux_session_falls_back_to_role(isolated_state):
|
|
447
|
+
"""F4 — no explicit + no cell.yaml pin → role itself."""
|
|
448
|
+
from swarph_cli.commands.watchdog import _resolve_tmux_session
|
|
449
|
+
assert _resolve_tmux_session("lab", None, None) == "lab"
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def test_a1_marker_path_keyed_on_role_and_tmux_session(isolated_state):
|
|
453
|
+
"""F4 — marker filename includes both role + tmux_session to prevent
|
|
454
|
+
sibling-instance marker collisions (mother #1103 follow-up)."""
|
|
455
|
+
from swarph_cli.commands.watchdog import _a1_marker_path
|
|
456
|
+
log_path = isolated_state / "wd.log"
|
|
457
|
+
m1 = _a1_marker_path(log_path, "drop-on-meta-edge", "drop-on-meta-edge")
|
|
458
|
+
m2 = _a1_marker_path(log_path, "drop-on-meta-edge", "drop-on-meta-edge-2")
|
|
459
|
+
assert m1 != m2
|
|
460
|
+
assert m1.name == "a1-fired-drop-on-meta-edge-drop-on-meta-edge.marker"
|
|
461
|
+
assert m2.name == "a1-fired-drop-on-meta-edge-drop-on-meta-edge-2.marker"
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def test_a1_marker_path_sanitizes_tmux_session(isolated_state):
|
|
465
|
+
"""F4 — tmux_session sanitized to alphanumeric+underscore so
|
|
466
|
+
cell.yaml-pinned values with weird chars don't break the filename."""
|
|
467
|
+
from swarph_cli.commands.watchdog import _a1_marker_path
|
|
468
|
+
log_path = isolated_state / "wd.log"
|
|
469
|
+
m = _a1_marker_path(log_path, "lab", "weird/name with spaces!")
|
|
470
|
+
assert ":" not in m.name and "/" not in m.name and " " not in m.name
|
|
471
|
+
|
|
472
|
+
|
|
344
473
|
def test_a2_escalation_clears_a1_marker(
|
|
345
474
|
isolated_state, stale_cursor, monkeypatch
|
|
346
475
|
):
|
|
@@ -361,7 +490,10 @@ def test_a2_escalation_clears_a1_marker(
|
|
|
361
490
|
"--threshold", "60",
|
|
362
491
|
"--log", str(log_path),
|
|
363
492
|
])
|
|
364
|
-
|
|
493
|
+
# F4 v0.7.2 marker keyed on (role, tmux_session) — tmux_session defaults
|
|
494
|
+
# to role when no --tmux-session arg + no cell.yaml pin, so filename is
|
|
495
|
+
# a1-fired-{role}-{role}.marker.
|
|
496
|
+
marker = log_path.parent / "a1-fired-lab-lab.marker"
|
|
365
497
|
assert marker.exists()
|
|
366
498
|
|
|
367
499
|
# Now force A2 path (process dead) and confirm marker is gone
|
|
@@ -430,3 +562,65 @@ def test_watchdog_log_appends_across_invocations(isolated_state, monkeypatch):
|
|
|
430
562
|
parsed_second = json.loads(lines[1])
|
|
431
563
|
assert parsed_first["details"]["decision"] == "healthy_cursor_fresh"
|
|
432
564
|
assert parsed_second["details"]["decision"] == "noop_no_unread"
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
# ---------------------------------------------------------------------------
|
|
568
|
+
# --install-service (v0.7.3 — closes ev_6954f748 substrate-component-install)
|
|
569
|
+
# ---------------------------------------------------------------------------
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def test_install_service_dry_run_writes_no_files(isolated_state, capsys):
|
|
573
|
+
"""--dry-run prints what would be written without touching the filesystem."""
|
|
574
|
+
rc = run_watchdog(argv=["--install-service", "--cell", "droplet", "--dry-run"])
|
|
575
|
+
assert rc == 0
|
|
576
|
+
captured = capsys.readouterr()
|
|
577
|
+
# Dry-run output goes to stderr
|
|
578
|
+
assert "DRY RUN" in captured.err
|
|
579
|
+
assert "cell=droplet" in captured.err
|
|
580
|
+
# All three target files surface in the preview
|
|
581
|
+
assert "/etc/systemd/system/swarph-watchdog.service" in captured.err
|
|
582
|
+
assert "/etc/systemd/system/swarph-watchdog.timer" in captured.err
|
|
583
|
+
assert "/etc/default/swarph-watchdog" in captured.err
|
|
584
|
+
# SWARPH_CELL was templated to the requested role
|
|
585
|
+
assert "SWARPH_CELL=droplet" in captured.err
|
|
586
|
+
# The bundled service file's identifying line shows up
|
|
587
|
+
assert "Swarph watchdog one-shot check" in captured.err
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def test_install_service_dry_run_default_cell_is_lab(isolated_state, capsys):
|
|
591
|
+
"""Without --cell, the dry-run preview keeps SWARPH_CELL=lab default."""
|
|
592
|
+
rc = run_watchdog(argv=["--install-service", "--dry-run"])
|
|
593
|
+
assert rc == 0
|
|
594
|
+
captured = capsys.readouterr()
|
|
595
|
+
assert "SWARPH_CELL=lab" in captured.err
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def test_install_service_without_sudo_returns_4(isolated_state, capsys, monkeypatch):
|
|
599
|
+
"""Non-root install (no --dry-run) refuses with helpful message + exit 4."""
|
|
600
|
+
monkeypatch.setattr("os.geteuid", lambda: 1000)
|
|
601
|
+
rc = run_watchdog(argv=["--install-service", "--cell", "droplet"])
|
|
602
|
+
assert rc == 4
|
|
603
|
+
captured = capsys.readouterr()
|
|
604
|
+
assert "requires root" in captured.err
|
|
605
|
+
assert "--dry-run" in captured.err # hint surfaces
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def test_bundled_systemd_files_readable():
|
|
609
|
+
"""Package-data manifest correctness — importlib.resources can read all
|
|
610
|
+
three bundled templates. Regression guard for pyproject package-data
|
|
611
|
+
declaration."""
|
|
612
|
+
from swarph_cli.commands.watchdog import _bundled_systemd_files
|
|
613
|
+
|
|
614
|
+
files = _bundled_systemd_files()
|
|
615
|
+
assert set(files.keys()) == {
|
|
616
|
+
"swarph-watchdog.service",
|
|
617
|
+
"swarph-watchdog.timer",
|
|
618
|
+
"swarph-watchdog.default",
|
|
619
|
+
}
|
|
620
|
+
# Service file has the expected Type=oneshot shape
|
|
621
|
+
assert "Type=oneshot" in files["swarph-watchdog.service"]
|
|
622
|
+
assert "ExecStart=/usr/local/bin/swarph watchdog --check" in files["swarph-watchdog.service"]
|
|
623
|
+
# Timer fires every 5 minutes
|
|
624
|
+
assert "OnUnitActiveSec=5min" in files["swarph-watchdog.timer"]
|
|
625
|
+
# Default file has the SWARPH_CELL=lab template line
|
|
626
|
+
assert "SWARPH_CELL=lab" in files["swarph-watchdog.default"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|