swarph-cli 0.7.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {swarph_cli-0.7.0/src/swarph_cli.egg-info → swarph_cli-0.7.2}/PKG-INFO +1 -1
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/pyproject.toml +1 -1
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/__init__.py +1 -1
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/watchdog.py +258 -9
- {swarph_cli-0.7.0 → swarph_cli-0.7.2/src/swarph_cli.egg-info}/PKG-INFO +1 -1
- swarph_cli-0.7.2/tests/test_watchdog.py +564 -0
- swarph_cli-0.7.0/tests/test_watchdog.py +0 -305
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/LICENSE +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/README.md +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/setup.cfg +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/caller.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/cell.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/__init__.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/chat.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/daemon.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/hook_output.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/import_session.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/install_hook.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/onboard.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/ratify.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/commands/spawn.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/main.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/parsers/__init__.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli/parsers/claude.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli.egg-info/SOURCES.txt +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli.egg-info/dependency_links.txt +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli.egg-info/entry_points.txt +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli.egg-info/requires.txt +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/src/swarph_cli.egg-info/top_level.txt +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_cell_loader.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_chat_command.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_claude_parser.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_daemon_command.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_hook_output.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_import_command.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_install_hook.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_main.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_onboard_command.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_ratify_command.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_smoke_chat.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_smoke_one_shot.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_smoke_phase_5_5.py +0 -0
- {swarph_cli-0.7.0 → swarph_cli-0.7.2}/tests/test_spawn_command.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: swarph-cli
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
|
|
5
5
|
Author: Pierre Samson, Claude Opus
|
|
6
6
|
License: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "swarph-cli"
|
|
7
|
-
version = "0.7.0"
|
|
7
|
+
version = "0.7.2"
|
|
8
8
|
description = "The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -79,6 +79,11 @@ _DEFAULT_THRESHOLD_SEC = 1800 # 30 minutes
|
|
|
79
79
|
_DEFAULT_A1_RETRIES = 3
|
|
80
80
|
_DEFAULT_A1_BACKOFF_SEC = 60
|
|
81
81
|
_DEFAULT_GATEWAY_URL = "http://localhost:8788"
|
|
82
|
+
# F3 — tmux pane_activity gate threshold. If pane has activity within this
|
|
83
|
+
# many seconds, suppress A1 (session is working, not stalled). 600s (10min)
|
|
84
|
+
# is comfortably above legitimate-pause noise + comfortably below the
|
|
85
|
+
# 30min cursor-staleness threshold, so the two gates compose cleanly.
|
|
86
|
+
_DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC = 600
|
|
82
87
|
|
|
83
88
|
_USAGE = """\
|
|
84
89
|
Usage:
|
|
@@ -137,10 +142,29 @@ def _stat_mtime(path: Path) -> Optional[int]:
|
|
|
137
142
|
return None
|
|
138
143
|
|
|
139
144
|
|
|
140
|
-
def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
|
|
141
|
-
|
|
145
|
+
def _resolve_cursor_path(
|
|
146
|
+
role: str,
|
|
147
|
+
explicit: Optional[str],
|
|
148
|
+
cell_yaml_value: Optional[str] = None,
|
|
149
|
+
) -> Path:
|
|
150
|
+
"""Resolve cursor file path with documented fallback chain.
|
|
151
|
+
|
|
152
|
+
Precedence (F4 — mother #1057/#1060 + beta #1061/#1065):
|
|
153
|
+
1. Explicit ``--cursor`` CLI arg (highest)
|
|
154
|
+
2. ``cell.yaml`` extra.cursor_path when --cell present
|
|
155
|
+
3. ``$TMPDIR/<role>-cursor.json``
|
|
156
|
+
4. ``/tmp/lab-claude-cursor.json`` (legacy lab-orchestrator default)
|
|
157
|
+
|
|
158
|
+
F4 closes the host-prefix-variant + sibling-instance-variant gap
|
|
159
|
+
class — cell.yaml carries the canonical cursor path per-cell, watchdog
|
|
160
|
+
auto-resolves when --cell is provided. Eliminates the silent-default-
|
|
161
|
+
to-lab-prefix failure mode that gave droplet 23hr of cursor-unreadable
|
|
162
|
+
errors before catch.
|
|
163
|
+
"""
|
|
142
164
|
if explicit:
|
|
143
165
|
return Path(explicit).expanduser()
|
|
166
|
+
if cell_yaml_value:
|
|
167
|
+
return Path(cell_yaml_value).expanduser()
|
|
144
168
|
tmpdir = os.environ.get("TMPDIR", "/tmp")
|
|
145
169
|
primary = Path(tmpdir) / f"{role}-cursor.json"
|
|
146
170
|
if primary.exists():
|
|
@@ -149,6 +173,74 @@ def _resolve_cursor_path(role: str, explicit: Optional[str]) -> Path:
|
|
|
149
173
|
return Path("/tmp/lab-claude-cursor.json")
|
|
150
174
|
|
|
151
175
|
|
|
176
|
+
def _resolve_tmux_session(
|
|
177
|
+
role: str,
|
|
178
|
+
explicit: Optional[str],
|
|
179
|
+
cell_yaml_value: Optional[str] = None,
|
|
180
|
+
) -> str:
|
|
181
|
+
"""Resolve tmux session name with documented fallback chain.
|
|
182
|
+
|
|
183
|
+
Precedence (F4 sibling to cursor_path):
|
|
184
|
+
1. Explicit ``--tmux-session`` CLI arg
|
|
185
|
+
2. ``cell.yaml`` extra.tmux_session when --cell present
|
|
186
|
+
3. Role itself (convention default)
|
|
187
|
+
|
|
188
|
+
Mother's sibling-instance variant (#1061): when slot-N siblings spawn,
|
|
189
|
+
each slot needs its own tmux session name; the cell.yaml that pins the
|
|
190
|
+
slot SHOULD also pin the tmux_session to keep the watchdog's reads
|
|
191
|
+
consistent with the spawn's writes.
|
|
192
|
+
"""
|
|
193
|
+
if explicit:
|
|
194
|
+
return explicit
|
|
195
|
+
if cell_yaml_value:
|
|
196
|
+
return cell_yaml_value
|
|
197
|
+
return role
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _read_cell_yaml_pins(role: str) -> tuple[Optional[str], Optional[str]]:
    """Best-effort read of cell.yaml ``extra.cursor_path`` / ``extra.tmux_session``.

    Discovery order: the cwd-local cell file reported by
    ``discover_cell_in_cwd()`` first, then ``<cells_dir>/<role>.yaml``.

    The lookup never raises for a missing, unreadable or malformed cell:
    any such failure yields ``(None, None)`` so the caller falls through to
    the legacy convention defaults (the pins are additive, non-breaking).
    This includes a cell whose ``extra`` is absent, empty, or not a mapping.

    NOTE: in v0.7.2 ``cursor_path`` / ``tmux_session`` are read from
    ``Cell.extra`` (the forward-compat catch-all). A later swarph-shared
    release is expected to promote them to typed fields on ``Cell``; this
    reader relies on the extra-dict path remaining available — confirm
    against swarph-shared before upgrading.

    NOTE(review): a cwd-local cell file is used without checking that it
    belongs to ``role`` — confirm that is intended when ``--cell`` names a
    different cell than the one in the working directory.

    Args:
        role: Cell role name, used to build the ``<role>.yaml`` fallback.

    Returns:
        ``(cursor_path, tmux_session)``; each element is a string when the
        corresponding pin is present and truthy, otherwise None.
    """
    from collections.abc import Mapping

    from swarph_cli.cell import (
        cells_dir,
        discover_cell_in_cwd,
        load_cell,
        CellError,
    )

    cell_path = discover_cell_in_cwd()
    if cell_path is None:
        candidate = cells_dir() / f"{role}.yaml"
        if candidate.is_file():
            cell_path = candidate
    if cell_path is None:
        return None, None

    try:
        cell = load_cell(cell_path)
    except (CellError, OSError):
        return None, None

    # A hand-edited cell.yaml can carry ``extra`` as a list or scalar; calling
    # ``.get`` on that would raise and break the "never raises" contract.
    extra = getattr(cell, "extra", None)
    if not isinstance(extra, Mapping):
        return None, None

    cursor_path = extra.get("cursor_path")
    tmux_session = extra.get("tmux_session")
    return (
        str(cursor_path) if cursor_path else None,
        str(tmux_session) if tmux_session else None,
    )
|
|
242
|
+
|
|
243
|
+
|
|
152
244
|
def _resolve_log_path(explicit: Optional[str]) -> Path:
|
|
153
245
|
if explicit:
|
|
154
246
|
return Path(explicit).expanduser()
|
|
@@ -228,6 +320,39 @@ def _tmux_send_keys(name: str, text: str) -> bool:
|
|
|
228
320
|
return False
|
|
229
321
|
|
|
230
322
|
|
|
323
|
+
def _pane_activity_age_sec(name: str) -> Optional[int]:
|
|
324
|
+
"""Age in seconds since the tmux pane's last activity event.
|
|
325
|
+
|
|
326
|
+
Reads tmux's `#{pane_activity}` format variable, which returns a unix
|
|
327
|
+
epoch timestamp of the most recent activity in the active pane of the
|
|
328
|
+
target session. Returns None if tmux is missing, the session doesn't
|
|
329
|
+
exist, or tmux's output isn't parseable as an integer epoch.
|
|
330
|
+
|
|
331
|
+
Used by F3 (mother #1087 / drop-on-meta-edge proposal) as a third
|
|
332
|
+
AND-gate input to distinguish (a) session genuinely stalled from (b)
|
|
333
|
+
session actively working in a long bash block. cursor-mtime alone
|
|
334
|
+
measures "time since last turn-end" not "time since last activity";
|
|
335
|
+
pane_activity covers the mid-turn-active case.
|
|
336
|
+
|
|
337
|
+
Returns None on detection error so the caller can fall through to
|
|
338
|
+
the legacy AND-gate behavior — F3 is a strengthening of the gate,
|
|
339
|
+
not a replacement of it.
|
|
340
|
+
"""
|
|
341
|
+
try:
|
|
342
|
+
result = subprocess.run(
|
|
343
|
+
["tmux", "display", "-p", "-t", name, "#{pane_activity}"],
|
|
344
|
+
capture_output=True, text=True, timeout=5,
|
|
345
|
+
)
|
|
346
|
+
if result.returncode != 0:
|
|
347
|
+
return None
|
|
348
|
+
out = result.stdout.strip()
|
|
349
|
+
if not out:
|
|
350
|
+
return None
|
|
351
|
+
return max(0, _now() - int(out))
|
|
352
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError, ValueError):
|
|
353
|
+
return None
|
|
354
|
+
|
|
355
|
+
|
|
231
356
|
def _tmux_kill_session(name: str) -> bool:
|
|
232
357
|
try:
|
|
233
358
|
result = subprocess.run(
|
|
@@ -263,6 +388,67 @@ def _spawn_via_swarph(role: str, tmux_session: str) -> bool:
|
|
|
263
388
|
return False
|
|
264
389
|
|
|
265
390
|
|
|
391
|
+
def _a1_marker_path(log_path: Path, role: str, tmux_session: Optional[str] = None) -> Path:
|
|
392
|
+
"""Marker file recording the cursor_mtime at which A1 was last fired.
|
|
393
|
+
|
|
394
|
+
Co-located with the watchdog log so it inherits the same XDG_STATE_HOME
|
|
395
|
+
discipline. Cleared on cursor-advance OR A2 escalation. Used to suppress
|
|
396
|
+
repeat A1 fires within the same stale window — fix for the spam incident
|
|
397
|
+
where cron fired A1 every 5min for 65min into an active session's tmux
|
|
398
|
+
input buffer (commander #1092 + droplet #1087).
|
|
399
|
+
|
|
400
|
+
Keyed on ``(role, tmux_session)`` post-F4 so sibling-instance patterns
|
|
401
|
+
(alpha+beta drop-on-meta-edge per project_drop_mitosis_to_meta_edge)
|
|
402
|
+
don't clobber each other's markers — mother's flag from #1103 closed in
|
|
403
|
+
v0.7.2. tmux_session is sanitized to alphanumeric + ``-_.`` for the
|
|
404
|
+
filename to avoid path-traversal or weird characters from cell.yaml-
|
|
405
|
+
pinned values.
|
|
406
|
+
|
|
407
|
+
NOTE (mother #1138 sanitization edge case): two siblings whose
|
|
408
|
+
``tmux_session`` values differ ONLY in disallowed characters (e.g.,
|
|
409
|
+
``cell:a`` vs ``cell:b`` — colons sanitized to ``_`` collapsing both
|
|
410
|
+
to ``cell_a`` / ``cell_b`` — fine in this example, but ``cell:a`` vs
|
|
411
|
+
``cell$a`` would both collapse to ``cell_a``) would collide post-
|
|
412
|
+
sanitization. cell.yaml-pinned ``tmux_session`` values SHOULD differ
|
|
413
|
+
in alphanumeric content, not just punctuation. Cosmetic in practice
|
|
414
|
+
(operators don't choose session names that close), but worth knowing.
|
|
415
|
+
"""
|
|
416
|
+
safe_tmux = "".join(
|
|
417
|
+
c if (c.isalnum() or c in "-_.") else "_"
|
|
418
|
+
for c in (tmux_session or role)
|
|
419
|
+
)
|
|
420
|
+
return log_path.parent / f"a1-fired-{role}-{safe_tmux}.marker"
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _a1_already_fired_at(marker: Path, cursor_mtime: int) -> bool:
|
|
424
|
+
"""Returns True if a previous A1 was fired with this exact cursor_mtime.
|
|
425
|
+
|
|
426
|
+
Same cursor_mtime ⇒ no cursor advance since last fire ⇒ we're still in
|
|
427
|
+
the same stale window ⇒ another A1 would spam. Suppresses the fire.
|
|
428
|
+
"""
|
|
429
|
+
try:
|
|
430
|
+
return int(marker.read_text().strip()) == cursor_mtime
|
|
431
|
+
except (FileNotFoundError, OSError, ValueError):
|
|
432
|
+
return False
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _record_a1_fired(marker: Path, cursor_mtime: int) -> None:
|
|
436
|
+
"""Best-effort marker write; failures are logged elsewhere but never block."""
|
|
437
|
+
try:
|
|
438
|
+
marker.parent.mkdir(parents=True, exist_ok=True)
|
|
439
|
+
marker.write_text(str(cursor_mtime))
|
|
440
|
+
except OSError:
|
|
441
|
+
pass
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _clear_a1_marker(marker: Path) -> None:
|
|
445
|
+
"""Idempotent marker removal. Called on A2 escalation paths."""
|
|
446
|
+
try:
|
|
447
|
+
marker.unlink()
|
|
448
|
+
except (FileNotFoundError, OSError):
|
|
449
|
+
pass
|
|
450
|
+
|
|
451
|
+
|
|
266
452
|
def _log_event(log_path: Path, event: str, details: dict, verbose: bool = False) -> None:
|
|
267
453
|
log_path.parent.mkdir(parents=True, exist_ok=True)
|
|
268
454
|
entry = {
|
|
@@ -283,10 +469,17 @@ def _log_event(log_path: Path, event: str, details: dict, verbose: bool = False)
|
|
|
283
469
|
|
|
284
470
|
def run_check(args: argparse.Namespace) -> int:
|
|
285
471
|
role = args.cell
|
|
286
|
-
|
|
472
|
+
# F4 — cell.yaml-pinned cursor_path + tmux_session (mother #1057/#1060
|
|
473
|
+
# + beta #1061/#1065). Reads cell.yaml `extra.cursor_path` /
|
|
474
|
+
# `extra.tmux_session` when --cell is provided; explicit CLI args still
|
|
475
|
+
# win. Best-effort: malformed cell.yaml falls through to legacy
|
|
476
|
+
# convention defaults (additive non-breaking).
|
|
477
|
+
cell_cursor, cell_tmux = _read_cell_yaml_pins(role)
|
|
478
|
+
cursor = _resolve_cursor_path(role, args.cursor, cell_cursor)
|
|
479
|
+
tmux_session = _resolve_tmux_session(role, args.tmux_session, cell_tmux)
|
|
287
480
|
log_path = _resolve_log_path(args.log)
|
|
288
481
|
threshold = args.threshold
|
|
289
|
-
|
|
482
|
+
pane_activity_threshold = args.pane_activity_threshold
|
|
290
483
|
peer = args.peer or role
|
|
291
484
|
gateway = args.gateway
|
|
292
485
|
token = os.environ.get("MESH_GATEWAY_TOKEN")
|
|
@@ -298,6 +491,9 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
298
491
|
"threshold_sec": threshold,
|
|
299
492
|
"tmux_session": tmux_session,
|
|
300
493
|
"peer": peer,
|
|
494
|
+
"pane_activity_threshold_sec": pane_activity_threshold,
|
|
495
|
+
"cell_yaml_pinned_cursor": cell_cursor is not None,
|
|
496
|
+
"cell_yaml_pinned_tmux": cell_tmux is not None,
|
|
301
497
|
}
|
|
302
498
|
|
|
303
499
|
# PRIMARY signal: cursor file mtime
|
|
@@ -323,13 +519,19 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
323
519
|
unread = _gateway_unread_count(gateway, peer, token)
|
|
324
520
|
diag["unread_count"] = unread
|
|
325
521
|
|
|
326
|
-
# Decision matrix:
|
|
327
|
-
# cursor_stale + process_alive + unread > 0 → A1 (alive but throttled, prompt may unblock)
|
|
522
|
+
# Decision matrix (post commander #1092 + droplet #1087 + #1089 hardening):
|
|
328
523
|
# cursor_stale + not process_alive → A2 (dead, respawn regardless of unread)
|
|
524
|
+
# cursor_stale + process_alive + unread > 0 → A1 (alive but throttled, prompt may unblock)
|
|
329
525
|
# cursor_stale + process_alive + unread = 0 → noop (no DMs to drain anyway)
|
|
330
|
-
# cursor_stale + unread
|
|
526
|
+
# cursor_stale + process_alive + unread None → noop (F2 fail-closed: can't verify work, don't poke)
|
|
527
|
+
# cursor_stale + a1_marker matches cursor_mtime → noop (F1 same-window suppression)
|
|
528
|
+
|
|
529
|
+
marker = _a1_marker_path(log_path, role, tmux_session)
|
|
530
|
+
diag["a1_marker"] = str(marker)
|
|
331
531
|
|
|
332
532
|
if not process_alive:
|
|
533
|
+
# A2 escalation — clear the A1 marker so the next A1 (after respawn) fires.
|
|
534
|
+
_clear_a1_marker(marker)
|
|
333
535
|
diag["decision"] = "a2_respawn_process_dead"
|
|
334
536
|
if args.no_respawn:
|
|
335
537
|
diag["dry_run_skip"] = True
|
|
@@ -340,17 +542,27 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
340
542
|
_log_event(log_path, "a2_respawn", diag, verbose)
|
|
341
543
|
return 2 if spawn_ok else 4
|
|
342
544
|
|
|
343
|
-
# Process is alive but cursor is stale
|
|
545
|
+
# Process is alive but cursor is stale.
|
|
546
|
+
# F2 — fail-closed when unread can't be verified. Trade false-negative for
|
|
547
|
+
# false-positive ("respect peer-time when uncertain" per droplet #1089).
|
|
548
|
+
# Production incident shape (commander #1092): gateway returned None for
|
|
549
|
+
# unread; old code fell through to A1, spamming the tmux buffer for 65min.
|
|
550
|
+
if unread is None:
|
|
551
|
+
diag["decision"] = "noop_unread_unknown"
|
|
552
|
+
_log_event(log_path, "noop", diag, verbose)
|
|
553
|
+
return 0
|
|
554
|
+
|
|
344
555
|
if unread == 0:
|
|
345
556
|
diag["decision"] = "noop_no_unread"
|
|
346
557
|
_log_event(log_path, "noop", diag, verbose)
|
|
347
558
|
return 0
|
|
348
559
|
|
|
349
|
-
diag["decision"] = "a1_send_keys"
|
|
350
560
|
if not _tmux_session_exists(tmux_session):
|
|
351
561
|
# Process alive somewhere but tmux session gone — partial state.
|
|
352
562
|
# Treat as A2 case: respawn fresh sibling.
|
|
563
|
+
_clear_a1_marker(marker)
|
|
353
564
|
diag["tmux_missing"] = True
|
|
565
|
+
diag["decision"] = "a2_respawn_tmux_missing"
|
|
354
566
|
if args.no_respawn:
|
|
355
567
|
_log_event(log_path, "a2_dry_run", diag, verbose)
|
|
356
568
|
return 2
|
|
@@ -359,12 +571,41 @@ def run_check(args: argparse.Namespace) -> int:
|
|
|
359
571
|
_log_event(log_path, "a2_respawn", diag, verbose)
|
|
360
572
|
return 2 if spawn_ok else 4
|
|
361
573
|
|
|
574
|
+
# F1 — same-stale-window suppression. If A1 already fired at this exact
|
|
575
|
+
# cursor_mtime, further A1s would only stack wake-prompts in the tmux
|
|
576
|
+
# input buffer (commander #1092: 13 fires across 65min on a session that
|
|
577
|
+
# was actively working but cursor only updates at turn-end). Fire AT MOST
|
|
578
|
+
# ONCE per stale window; re-arm only when cursor advances (recovery) or
|
|
579
|
+
# A2 escalates (respawn clears the marker above).
|
|
580
|
+
if _a1_already_fired_at(marker, cursor_mtime):
|
|
581
|
+
diag["decision"] = "noop_a1_already_fired_this_window"
|
|
582
|
+
_log_event(log_path, "noop", diag, verbose)
|
|
583
|
+
return 0
|
|
584
|
+
|
|
585
|
+
# F3 — tmux pane_activity AND-gate (mother #1087). cursor-mtime measures
|
|
586
|
+
# "time since last turn-end" not "time since last activity"; mid-long-
|
|
587
|
+
# turn cursor is stale even though session is maximally alive. tmux's
|
|
588
|
+
# `#{pane_activity}` covers the mid-turn-active case. If the pane has
|
|
589
|
+
# had activity within `pane_activity_threshold_sec`, suppress A1 — the
|
|
590
|
+
# session is working, not stalled. Falls through to firing A1 when
|
|
591
|
+
# pane_activity is None (tmux missing / older tmux without the format)
|
|
592
|
+
# so F3 is a strengthening of the gate, not a hard dependency.
|
|
593
|
+
pane_age = _pane_activity_age_sec(tmux_session)
|
|
594
|
+
diag["pane_activity_age_sec"] = pane_age
|
|
595
|
+
if pane_age is not None and pane_age < pane_activity_threshold:
|
|
596
|
+
diag["decision"] = "noop_pane_activity_recent"
|
|
597
|
+
_log_event(log_path, "noop", diag, verbose)
|
|
598
|
+
return 0
|
|
599
|
+
|
|
600
|
+
diag["decision"] = "a1_send_keys"
|
|
362
601
|
wake_text = (
|
|
363
602
|
f"watchdog wake — cursor stale {cursor_age}s, "
|
|
364
603
|
f"unread={unread}; please drain inbox"
|
|
365
604
|
)
|
|
366
605
|
sent = _tmux_send_keys(tmux_session, wake_text)
|
|
367
606
|
diag["send_keys_ok"] = sent
|
|
607
|
+
if sent:
|
|
608
|
+
_record_a1_fired(marker, cursor_mtime)
|
|
368
609
|
_log_event(log_path, "a1_send_keys", diag, verbose)
|
|
369
610
|
return 1 if sent else 4
|
|
370
611
|
|
|
@@ -385,6 +626,14 @@ def run_watchdog(argv: Optional[list[str]] = None) -> int:
|
|
|
385
626
|
p.add_argument("--cell", default=os.environ.get("SWARPH_CELL", "lab"))
|
|
386
627
|
p.add_argument("--cursor", default=None)
|
|
387
628
|
p.add_argument("--threshold", type=int, default=_DEFAULT_THRESHOLD_SEC)
|
|
629
|
+
p.add_argument(
|
|
630
|
+
"--pane-activity-threshold",
|
|
631
|
+
type=int,
|
|
632
|
+
default=_DEFAULT_PANE_ACTIVITY_THRESHOLD_SEC,
|
|
633
|
+
help="F3 gate: suppress A1 if tmux pane had activity within this "
|
|
634
|
+
"many seconds (covers mid-long-turn working sessions where "
|
|
635
|
+
"cursor-mtime is stale but session is alive).",
|
|
636
|
+
)
|
|
388
637
|
p.add_argument("--gateway", default=_DEFAULT_GATEWAY_URL)
|
|
389
638
|
p.add_argument("--tmux-session", default=None)
|
|
390
639
|
p.add_argument("--peer", default=None)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: swarph-cli
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: The `swarph` binary — multi-LLM CLI with mesh-gateway integration. v0.7.0 ships Phase 7 substrate-doc R7 §11.1.7 operator-tooling layer in 5 increments: PR-A `--new-instance` flag (sibling-spawn case) + PR-B auto-suffix on collision (sibling-slot persistence) + PR-C SessionStart hook (closes bare-claude operator-paste gap) + watchdog (stranded-session recovery) + PR-D swarph-shared cell.yaml relocation (cell-yaml schema graduates to swarph-shared 0.3.0 kernel-tier; substrate-doc R7 §11.1.5 (O5) RESOLVED).
|
|
5
5
|
Author: Pierre Samson, Claude Opus
|
|
6
6
|
License: MIT
|