@team-agent/installer 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/schemas/team.schema.json +6 -0
- package/src/team_agent/approvals/runtime_prompts.py +1 -1
- package/src/team_agent/cli/commands.py +122 -6
- package/src/team_agent/cli/parser.py +42 -1
- package/src/team_agent/coordinator/__main__.py +21 -2
- package/src/team_agent/coordinator/lifecycle.py +11 -0
- package/src/team_agent/diagnose/orphan_cleanup.py +364 -0
- package/src/team_agent/events.py +47 -0
- package/src/team_agent/launch/core.py +2 -1
- package/src/team_agent/leader/__init__.py +273 -60
- package/src/team_agent/lifecycle/agents.py +54 -2
- package/src/team_agent/lifecycle/operations.py +87 -9
- package/src/team_agent/lifecycle/start.py +1 -1
- package/src/team_agent/message_store/core.py +8 -7
- package/src/team_agent/message_store/leader_notification_log.py +132 -0
- package/src/team_agent/message_store/result_watchers.py +144 -1
- package/src/team_agent/message_store/schema.py +31 -2
- package/src/team_agent/messaging/delivery.py +293 -1
- package/src/team_agent/messaging/idle_alerts.py +109 -9
- package/src/team_agent/messaging/leader.py +179 -10
- package/src/team_agent/messaging/leader_api_errors.py +216 -0
- package/src/team_agent/messaging/leader_panes.py +393 -23
- package/src/team_agent/messaging/result_delivery.py +219 -4
- package/src/team_agent/messaging/results.py +12 -21
- package/src/team_agent/messaging/scheduler.py +24 -2
- package/src/team_agent/messaging/send.py +21 -26
- package/src/team_agent/messaging/tmux_io.py +153 -23
- package/src/team_agent/messaging/tmux_prompt.py +87 -0
- package/src/team_agent/messaging/trust_auto_answer.py +44 -0
- package/src/team_agent/restart/orchestration.py +207 -4
- package/src/team_agent/runtime.py +7 -7
- package/src/team_agent/rust_core.py +157 -3
- package/src/team_agent/sessions/capture.py +65 -15
- package/src/team_agent/spec.py +59 -0
- package/src/team_agent/state.py +153 -10
- package/src/team_agent/status/inbox.py +33 -3
- package/src/team_agent/status/queries.py +32 -1
- package/src/team_agent/watch/__init__.py +145 -0
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
"""Stage 14 (Gap 37a) — `team-agent doctor --cleanup-orphans` implementation.
|
|
2
|
+
|
|
3
|
+
Scans `ps` for processes matching `team_agent.coordinator --workspace <path>` and
|
|
4
|
+
classifies any whose workspace path no longer exists (or matches the test-tempdir
|
|
5
|
+
pattern) as an orphan. Dry-run by default; --confirm sends SIGTERM and escalates to SIGKILL if the process is still alive after the grace period.
|
|
6
|
+
|
|
7
|
+
Mac mini 2026-05-26 evidence: 35 orphan coordinator processes alive simultaneously
|
|
8
|
+
pointing at /var/folders/.../T/team-agent-watcher-dedupe-* paths that had been removed
|
|
9
|
+
hours earlier. Each holds a long-lived Python interpreter + SQLite connection.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import signal
|
|
16
|
+
import subprocess
|
|
17
|
+
import time
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
# Pattern: argv contains "team_agent.coordinator ... --workspace <path>" anywhere.
# Both the space-separated form ("--workspace /p") and the "=" form
# ("--workspace=/p") are recognised. The captured path stops at the first
# whitespace because `ps` prints argv joined by spaces with no quoting.
_COORDINATOR_ARGV_RE = re.compile(
    r"team_agent\.coordinator(?:\.__main__)?(?:\s+|.*?)\s--workspace(?:\s+|=)(\S+)"
)
# Test-tempdir patterns that indicate the workspace is ephemeral and almost certainly orphan.
# Matching is by plain substring against the parsed workspace path.
_EPHEMERAL_PATH_HINTS = (
    "team-agent-watcher-dedupe-",
    "team-agent-gap",
    "team-agent-stage",
    "team-agent-orchestrator-",
    "team-agent-rm-",
    "team-agent-claim-",
    "team-agent-hotfix",
    "team-agent-multi",
    "team-agent-progress-",
    "team-agent-fanout-",
    "team-agent-in-flight-",
    "team-agent-test-",
)
# Grace period (seconds) granted after SIGTERM before escalating to SIGKILL.
_SIGTERM_WAIT_SECONDS = 3.0
# Time (seconds) to wait after SIGKILL before reporting the process as still alive.
_SIGKILL_WAIT_SECONDS = 2.0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def find_coordinator_processes(*, runner=subprocess.run) -> list[dict[str, Any]]:
    """List every running ``team_agent.coordinator`` process reported by ``ps``.

    Args:
        runner: Callable with the ``subprocess.run`` calling convention; it is
            injectable so callers can supply canned ``ps`` output.

    Returns:
        One ``{"pid", "etime", "cmdline", "workspace"}`` dict per matching
        process. ``workspace`` is ``None`` when no ``--workspace`` argument can
        be parsed from the command line. An empty list is returned when ``ps``
        cannot be run, times out, exits non-zero, or prints nothing.
    """
    ps_argv = ["ps", "-Awwo", "pid=,etime=,command="]
    try:
        completed = runner(
            ps_argv, text=True, capture_output=True, timeout=5, check=False
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return []
    if completed.returncode != 0 or not completed.stdout:
        return []

    own_pid = os.getpid()
    found: list[dict[str, Any]] = []
    for raw in completed.stdout.splitlines():
        # Columns are "<pid> <etime> <command...>"; the command keeps its spaces.
        fields = raw.strip().split(None, 2)
        if len(fields) != 3:
            continue
        pid_text, elapsed, command = fields
        # Skip unrelated processes and the `ps` invocation itself.
        if "team_agent.coordinator" not in command or "ps -Awwo" in command:
            continue
        try:
            pid = int(pid_text)
        except ValueError:
            continue
        # Never report the calling process.
        if pid == own_pid:
            continue
        hit = _COORDINATOR_ARGV_RE.search(command)
        found.append(
            {
                "pid": pid,
                "etime": elapsed,
                "cmdline": command,
                "workspace": hit.group(1) if hit else None,
            }
        )
    return found
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def classify_orphan(entry: dict[str, Any]) -> tuple[bool, str]:
    """Decide whether a scanned coordinator entry is an orphan.

    Args:
        entry: A dict as produced by the process scan. ``workspace`` is the
            parsed ``--workspace`` value (may be missing/None) and ``cmdline``
            (optional) is the full command line it was parsed from.

    Returns:
        ``(is_orphan, reason)`` where reason is one of ``"cmdline_unparsed"``,
        ``"workspace_path_missing"``, ``"ephemeral_tempdir_pattern:<hint>"`` or
        ``"workspace_alive"``.

    An entry is an orphan when its workspace path no longer exists on disk OR
    the path contains a known ephemeral-tempdir hint.
    """
    workspace = entry.get("workspace")
    if not workspace:
        return False, "cmdline_unparsed"
    if not os.path.exists(workspace):
        # `ps` prints argv unquoted, so a workspace containing spaces is parsed
        # only up to the first space. Before condemning a possibly healthy
        # process, check whether the path continues across whitespace in the
        # recorded command line and exists in its longer form.
        resolved = _existing_path_across_spaces(workspace, entry.get("cmdline") or "")
        if resolved is None:
            return True, "workspace_path_missing"
        workspace = resolved
    for hint in _EPHEMERAL_PATH_HINTS:
        if hint in workspace:
            return True, f"ephemeral_tempdir_pattern:{hint}"
    return False, "workspace_alive"


def _existing_path_across_spaces(workspace: str, cmdline: str) -> str | None:
    """Return an existing path that starts with ``workspace`` and continues over
    whitespace-separated tokens of ``cmdline``, or None when there is none."""
    flag_at = cmdline.find("--workspace")
    start = cmdline.find(workspace, flag_at) if flag_at >= 0 else -1
    if start < 0:
        return None
    candidate = workspace
    # Re-attach the following tokens one at a time, keeping the original
    # whitespace, and stop at the first prefix that exists on disk.
    for piece in re.finditer(r"\s+\S+", cmdline[start + len(workspace):]):
        candidate += piece.group(0)
        if os.path.exists(candidate):
            return candidate
    return None
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def cleanup_orphan_coordinators(
    *,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    pg_killer=None,
    pgid_getter=None,
    sleeper=time.sleep,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Scan for orphan coordinators and, when confirmed, terminate them.

    Without ``confirm`` this is a dry run: every coordinator process is
    classified and the orphans are reported, but no signal is sent. With
    ``confirm`` each orphan is handed to ``_terminate_orphan``, which performs
    the SIGTERM / SIGKILL escalation using the supplied wait times.

    Args:
        confirm: Actually terminate orphans instead of only reporting them.
        runner: ``subprocess.run``-compatible callable used for the ``ps`` scan.
        killer: ``os.kill``-compatible callable.
        pg_killer: ``os.killpg``-compatible callable; defaults to ``os.killpg``
            when the platform provides it.
        pgid_getter: ``os.getpgid``-compatible callable; defaults to
            ``os.getpgid`` when the platform provides it.
        sleeper: ``time.sleep``-compatible callable used while polling.
        sigterm_wait_seconds: Grace period after SIGTERM.
        sigkill_wait_seconds: Wait after SIGKILL.

    Returns:
        Dry run: ``{ok, scanned, orphans, dry_run=True, scanned_at,
        action_required}``. Confirmed run: ``{ok, scanned, orphans, killed,
        failed, dry_run=False, scanned_at}``. Orphans whose termination outcome
        is ``"killed"`` or ``"missing"`` are listed under ``killed``; all others
        under ``failed``. ``ok`` is always True here.
    """
    scanned_at = datetime.now(timezone.utc).isoformat()
    # Resolve the platform defaults lazily; getattr keeps this importable on
    # platforms that lack the process-group functions.
    pg_killer = getattr(os, "killpg", None) if pg_killer is None else pg_killer
    pgid_getter = getattr(os, "getpgid", None) if pgid_getter is None else pgid_getter

    classified: list[dict[str, Any]] = []
    for entry in find_coordinator_processes(runner=runner):
        is_orphan, reason = classify_orphan(entry)
        classified.append({**entry, "is_orphan": is_orphan, "reason": reason})
    orphans = [item for item in classified if item["is_orphan"]]

    if not confirm:
        return {
            "ok": True,
            "scanned": len(classified),
            "orphans": orphans,
            "dry_run": True,
            "scanned_at": scanned_at,
            "action_required": "re-run with --confirm to send SIGTERM",
        }

    killed: list[dict[str, Any]] = []
    failed: list[dict[str, Any]] = []
    for orphan in orphans:
        outcome = _terminate_orphan(
            orphan["pid"],
            killer=killer,
            pg_killer=pg_killer,
            pgid_getter=pgid_getter,
            sleeper=sleeper,
            sigterm_wait_seconds=sigterm_wait_seconds,
            sigkill_wait_seconds=sigkill_wait_seconds,
        )
        # A process that vanished before it could be signalled counts as cleaned up.
        bucket = killed if outcome.get("status") in ("killed", "missing") else failed
        bucket.append({**orphan, **outcome})

    return {
        "ok": True,
        "scanned": len(classified),
        "orphans": orphans,
        "killed": killed,
        "failed": failed,
        "dry_run": False,
        "scanned_at": scanned_at,
    }
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _terminate_orphan(
    pid: int,
    *,
    killer,
    pg_killer,
    pgid_getter,
    sleeper,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Terminate one process: SIGTERM, wait, then SIGKILL and wait again.

    The signal is sent to the whole process group when ``pg_killer`` is
    available, ``pgid_getter`` reports a group id greater than 1 that differs
    from ``pid``, and that group is not the caller's own process group.
    Otherwise it is sent to ``pid`` alone. Liveness is always polled on ``pid``.

    Args:
        pid: Process to terminate.
        killer: ``os.kill``-compatible callable (also used for liveness probes).
        pg_killer: ``os.killpg``-compatible callable, or None.
        pgid_getter: ``os.getpgid``-compatible callable, or None.
        sleeper: ``time.sleep``-compatible callable used while polling.
        sigterm_wait_seconds: Grace period after SIGTERM.
        sigkill_wait_seconds: Wait after SIGKILL.

    Returns one of:
        {status: 'killed', sigkill_required: False, signaled, pgid, pgid_error}
            exited after SIGTERM, or vanished before SIGKILL could be sent
        {status: 'killed', sigkill_required: True, signaled, pgid, pgid_error}
            exited after SIGKILL
        {status: 'missing', signaled, pgid}
            process was already gone when SIGTERM was sent
        {status: 'failed', error: '<exc>', signaled, pgid}
            SIGTERM could not be sent (permission denied / OS error)
        {status: 'failed', error: '<exc>', signaled, pgid, sigkill_attempted: True}
            SIGKILL could not be sent
        {status: 'failed', error: 'alive_after_sigkill', signaled, pgid,
         sigkill_required: True}
            process survived both signals
        where ``signaled`` is 'pid' or 'pgid'.
    """
    pgid, pgid_error = _safe_getpgid(pid, pgid_getter)
    # Never signal our own process group: when this command and the target were
    # started from the same non-interactive script they share a group, and a
    # group-wide SIGTERM/SIGKILL would take down the cleanup command itself.
    get_own_group = getattr(os, "getpgrp", None)
    try:
        own_pgid = get_own_group() if get_own_group is not None else None
    except OSError:
        own_pgid = None
    # NOTE(review): group signalling is selected when the target is NOT its own
    # group leader (pgid != pid), so other members of that group are signalled
    # too — confirm this is the intended policy.
    use_group = bool(
        pg_killer
        and pgid is not None
        and pgid > 1
        and pgid != pid
        and pgid != own_pgid
    )
    signaled = "pgid" if use_group else "pid"

    def send(sig: int) -> tuple[bool, str | None]:
        """Deliver ``sig``; return (sent, error) with a stable marker for 'gone'."""
        try:
            if use_group:
                pg_killer(pgid, sig)
            else:
                killer(pid, sig)
        except ProcessLookupError:
            return False, "process_lookup_error"
        except OSError as exc:  # includes PermissionError
            return False, str(exc)
        return True, None

    ok, err = send(signal.SIGTERM)
    if not ok:
        if err == "process_lookup_error":
            return {"status": "missing", "signaled": signaled, "pgid": pgid}
        return {"status": "failed", "error": err, "signaled": signaled, "pgid": pgid}
    if _wait_for_exit(pid, sigterm_wait_seconds, killer=killer, sleeper=sleeper):
        return {
            "status": "killed",
            "sigkill_required": False,
            "signaled": signaled,
            "pgid": pgid,
            "pgid_error": pgid_error,
        }
    # SIGTERM did not work — escalate.
    ok, err = send(signal.SIGKILL)
    if not ok:
        if err == "process_lookup_error":
            # Race: died between the last liveness probe and the SIGKILL.
            return {
                "status": "killed",
                "sigkill_required": False,
                "signaled": signaled,
                "pgid": pgid,
                "pgid_error": pgid_error,
            }
        return {
            "status": "failed",
            "error": err,
            "signaled": signaled,
            "pgid": pgid,
            "sigkill_attempted": True,
        }
    if _wait_for_exit(pid, sigkill_wait_seconds, killer=killer, sleeper=sleeper):
        return {
            "status": "killed",
            "sigkill_required": True,
            "signaled": signaled,
            "pgid": pgid,
            "pgid_error": pgid_error,
        }
    return {
        "status": "failed",
        "error": "alive_after_sigkill",
        "signaled": signaled,
        "pgid": pgid,
        "sigkill_required": True,
    }
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _safe_getpgid(pid: int, pgid_getter) -> tuple[int | None, str | None]:
    """Look up the process-group id of ``pid`` without raising.

    Returns:
        ``(pgid, None)`` on success; ``(None, "getpgid_unavailable")`` when no
        getter is supplied; ``(None, str(exc))`` when the getter raises an
        ``OSError`` (which covers ``ProcessLookupError`` and ``PermissionError``).
    """
    if pgid_getter is None:
        return None, "getpgid_unavailable"
    try:
        pgid = pgid_getter(pid)
    except OSError as exc:
        return None, str(exc)
    return pgid, None
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _wait_for_exit(pid: int, timeout: float, *, killer, sleeper) -> bool:
    """Poll until ``pid`` can no longer be probed or ``timeout`` seconds elapse.

    The probe is ``killer(pid, 0)``. Any ``OSError`` from the probe — including
    ``ProcessLookupError`` and ``PermissionError`` — is treated as "exited".
    Between probes ``sleeper(0.1)`` is called. One final probe is made after
    the deadline, so a negative or zero timeout still probes once.

    Returns:
        True when a probe raised, False when the process was still probeable
        after the deadline.
    """

    def probe_failed() -> bool:
        try:
            killer(pid, 0)
        except OSError:
            return True
        return False

    deadline = time.monotonic() + max(timeout, 0.0)
    while time.monotonic() < deadline:
        if probe_failed():
            return True
        sleeper(0.1)
    # Final check after the deadline elapses.
    return probe_failed()
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def orphan_gate(
    *,
    fix: bool = False,
    confirm: bool = False,
    runner=subprocess.run,
    killer=os.kill,
    pg_killer=None,
    pgid_getter=None,
    sleeper=time.sleep,
    sigterm_wait_seconds: float = _SIGTERM_WAIT_SECONDS,
    sigkill_wait_seconds: float = _SIGKILL_WAIT_SECONDS,
) -> dict[str, Any]:
    """Run the orphan scan as a pass/fail gate.

    Args:
        fix: Terminate orphans instead of only checking for them.
        confirm: Required together with ``fix``; ``fix`` without ``confirm`` is
            refused without scanning.
        runner, killer, pg_killer, pgid_getter, sleeper, sigterm_wait_seconds,
        sigkill_wait_seconds: Forwarded unchanged to
            ``cleanup_orphan_coordinators``.

    Returns:
        The cleanup result extended with ``gate="orphans"``, ``fix``,
        ``status`` ("passed"/"failed"/"refused") and ``ok``. In check mode the
        gate passes when no orphans were found; in fix mode it passes when no
        termination failed. ``action_required`` is present only when check mode
        found orphans.
    """
    if fix and not confirm:
        return {
            "ok": False,
            "gate": "orphans",
            "status": "refused",
            "reason": "fix_requires_confirm",
            "action": "re-run with --gate orphans --fix --confirm",
        }
    result = cleanup_orphan_coordinators(
        confirm=fix and confirm,
        runner=runner,
        killer=killer,
        pg_killer=pg_killer,
        pgid_getter=pgid_getter,
        sleeper=sleeper,
        sigterm_wait_seconds=sigterm_wait_seconds,
        sigkill_wait_seconds=sigkill_wait_seconds,
    )
    orphans = result.get("orphans") or []
    failed = result.get("failed") or []
    passed = not failed if fix else not orphans
    envelope = {
        **result,
        "ok": passed,
        "gate": "orphans",
        "status": "passed" if passed else "failed",
        "fix": bool(fix),
    }
    if not fix:
        if orphans:
            envelope["action_required"] = "re-run with --gate orphans --fix --confirm"
        else:
            # The dry-run result always carries a generic "re-run with --confirm"
            # hint; on a clean system there is nothing to do, so do not let a
            # passing gate advertise a follow-up action.
            envelope.pop("action_required", None)
    return envelope
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def format_cleanup_orphans(result: dict[str, Any]) -> str:
    """Render a scan/cleanup result dict as a multi-line, human-readable report.

    The report has a three-line header (timestamp, scanned count, orphan
    count), then either a dry-run notice or the killed/failed tallies, then one
    line per orphan. Missing keys fall back to ``None``, ``0`` or an empty list.
    """
    orphans = result.get("orphans") or []
    report = [
        f"Coordinator orphan scan @ {result.get('scanned_at')}",
        f" scanned: {result.get('scanned', 0)} coordinator processes",
        f" orphans: {len(orphans)}",
    ]
    if result.get("dry_run"):
        report.append(" mode: DRY-RUN (no SIGTERM sent; re-run with --confirm)")
    else:
        killed = result.get("killed") or []
        failed = result.get("failed") or []
        # Count how many terminations needed the SIGKILL escalation.
        escalations = len([item for item in killed if item.get("sigkill_required")])
        report += [
            f" killed: {len(killed)} (sigkill_required: {escalations})",
            f" failed: {len(failed)}",
        ]
    report.extend(
        f" PID {item['pid']} etime={item['etime']} "
        f"workspace={item.get('workspace') or '?'} reason={item.get('reason')}"
        for item in orphans
    )
    return "\n".join(report)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# Public API of this module; helpers prefixed with "_" are intentionally excluded.
__all__ = [
    "cleanup_orphan_coordinators",
    "classify_orphan",
    "find_coordinator_processes",
    "format_cleanup_orphans",
    "orphan_gate",
]
|
package/src/team_agent/events.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import os
|
|
4
5
|
from datetime import datetime, timezone
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import Any
|
|
@@ -8,6 +9,15 @@ from typing import Any
|
|
|
8
9
|
from team_agent.paths import logs_dir
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
# Stage 14 (Gap 36a) — bounded retention. The live segment is rotated once it reaches
# 5 MB and up to 5 archives are kept, so the on-disk footprint is roughly
# 5 MB live + 5 × 5 MB archived ≈ 30 MB. A segment can overshoot the cap by one
# event because the size check runs before each append.
# Mac mini 2026-05-26 evidence: events.jsonl grew to 28 MB / 128k lines in one day with
# unbounded retention; coordinator's tick-time scan over the file was a ~22% CPU hot path.
# Rotation keeps the current segment small so reads are cheap; archives preserve forensic
# history but are NOT consulted by hot-path scans.
EVENT_LOG_ROTATE_BYTES = 5 * 1024 * 1024
EVENT_LOG_ARCHIVE_KEEP = 5
|
|
19
|
+
|
|
20
|
+
|
|
11
21
|
class EventLog:
|
|
12
22
|
def __init__(self, workspace: Path):
|
|
13
23
|
self.workspace = workspace
|
|
@@ -20,11 +30,14 @@ class EventLog:
|
|
|
20
30
|
"event": event_type,
|
|
21
31
|
**fields,
|
|
22
32
|
}
|
|
33
|
+
self._maybe_rotate()
|
|
23
34
|
with self.path.open("a", encoding="utf-8") as f:
|
|
24
35
|
f.write(json.dumps(event, ensure_ascii=False, sort_keys=True) + "\n")
|
|
25
36
|
return event
|
|
26
37
|
|
|
27
38
|
def tail(self, limit: int = 20) -> list[dict[str, Any]]:
|
|
39
|
+
# Hot-path scan reads only the current segment. Archives are forensic; if a
|
|
40
|
+
# caller genuinely needs longer history it can iterate _archive_paths explicitly.
|
|
28
41
|
if not self.path.exists():
|
|
29
42
|
return []
|
|
30
43
|
lines = self.path.read_text(encoding="utf-8").splitlines()[-limit:]
|
|
@@ -35,3 +48,37 @@ class EventLog:
|
|
|
35
48
|
except json.JSONDecodeError:
|
|
36
49
|
out.append({"raw": line})
|
|
37
50
|
return out
|
|
51
|
+
|
|
52
|
+
def _maybe_rotate(self) -> None:
    """Rotate the live event log into numbered archives once it reaches the cap.

    Does nothing when the log file does not exist or is smaller than
    ``EVENT_LOG_ROTATE_BYTES``. Otherwise the highest-numbered archive is
    removed, every remaining archive is shifted up by one (``.4`` → ``.5``,
    …, ``.1`` → ``.2``) and the live file becomes ``.1``. Each filesystem step
    is best-effort: an ``OSError`` from unlink/replace is ignored.
    """
    try:
        current_size = self.path.stat().st_size
    except FileNotFoundError:
        return
    if current_size < EVENT_LOG_ROTATE_BYTES:
        return

    def quietly(action, *args) -> None:
        # Rotation must never break event writing, so failures are swallowed.
        try:
            action(*args)
        except OSError:
            pass

    # Drop the oldest archive so the shift below stays within the keep budget.
    last = self._archive_path(EVENT_LOG_ARCHIVE_KEEP)
    if last.exists():
        quietly(last.unlink)
    # Shift from the highest slot down so no archive is overwritten prematurely.
    for slot in reversed(range(1, EVENT_LOG_ARCHIVE_KEEP)):
        older = self._archive_path(slot)
        if older.exists():
            quietly(os.replace, older, self._archive_path(slot + 1))
    quietly(os.replace, self.path, self._archive_path(1))
|
|
79
|
+
|
|
80
|
+
def _archive_path(self, index: int) -> Path:
    """Return the path of archive number ``index``: the live log's file name
    with ``.<index>`` appended, in the same directory as the live log."""
    live = self.path
    archive_name = f"{live.name}.{index}"
    return live.with_name(archive_name)
|
|
82
|
+
|
|
83
|
+
def _archive_paths(self) -> list[Path]:
    """Return the paths of every archive slot, from ``.1`` up to
    ``.EVENT_LOG_ARCHIVE_KEEP``, whether or not the files exist."""
    slots = range(1, EVENT_LOG_ARCHIVE_KEEP + 1)
    return list(map(self._archive_path, slots))
|
|
@@ -215,7 +215,7 @@ def launch(
|
|
|
215
215
|
stdout=proc.stdout,
|
|
216
216
|
)
|
|
217
217
|
raise RuntimeError(f"Failed to start agent {agent['id']}: {proc.stderr.strip()}")
|
|
218
|
-
handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=
|
|
218
|
+
handled_prompts = adapter.handle_startup_prompts(session_name, agent["id"], checks=20, sleep_s=0.5)
|
|
219
219
|
for prompt_event in handled_prompts:
|
|
220
220
|
event_log.write(
|
|
221
221
|
"launch.startup_prompt_handled",
|
|
@@ -261,6 +261,7 @@ def launch(
|
|
|
261
261
|
event_log,
|
|
262
262
|
timeout_s=1.5,
|
|
263
263
|
exclude_session_ids=known_session_ids,
|
|
264
|
+
raise_on_missed=False,
|
|
264
265
|
)
|
|
265
266
|
if state.get("display_backend") in GHOSTTY_DISPLAY_BACKENDS:
|
|
266
267
|
display_jobs.append((agent["id"], agent))
|