cli-agent-runner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_runner/__init__.py +3 -0
- agent_runner/_docgen.py +200 -0
- agent_runner/_version.py +24 -0
- agent_runner/agent_runtime.py +127 -0
- agent_runner/api.py +331 -0
- agent_runner/api_types.py +111 -0
- agent_runner/cli/__init__.py +76 -0
- agent_runner/cli/__main__.py +3 -0
- agent_runner/cli/common.py +78 -0
- agent_runner/cli/init_cmd.py +31 -0
- agent_runner/cli/install_cmd.py +44 -0
- agent_runner/cli/monitor_cmd.py +48 -0
- agent_runner/cli/peek_cmd.py +81 -0
- agent_runner/cli/round_cmd.py +17 -0
- agent_runner/cli/serve_cmd.py +60 -0
- agent_runner/cli/service_cmd.py +54 -0
- agent_runner/config.py +92 -0
- agent_runner/context_store.py +117 -0
- agent_runner/critic.py +33 -0
- agent_runner/defenses.py +111 -0
- agent_runner/events.py +53 -0
- agent_runner/lifecycle.py +67 -0
- agent_runner/metrics.py +69 -0
- agent_runner/monitor.py +515 -0
- agent_runner/prompt_loader.py +44 -0
- agent_runner/round_view.py +86 -0
- agent_runner/runner.py +236 -0
- agent_runner/scaffold.py +124 -0
- agent_runner/service_unit.py +74 -0
- agent_runner/startup_check.py +132 -0
- agent_runner/vcs_state.py +222 -0
- cli_agent_runner-0.1.0.dist-info/METADATA +150 -0
- cli_agent_runner-0.1.0.dist-info/RECORD +36 -0
- cli_agent_runner-0.1.0.dist-info/WHEEL +4 -0
- cli_agent_runner-0.1.0.dist-info/entry_points.txt +2 -0
- cli_agent_runner-0.1.0.dist-info/licenses/LICENSE +202 -0
agent_runner/monitor.py
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
"""Monitor — anomaly detectors over events + metrics + log tails.
|
|
2
|
+
|
|
3
|
+
Phase 2 ships 9 detectors. Two trigger ``auto_action="stop_service"``:
|
|
4
|
+
* oauth_fail — auth pattern in short-exit logs (retrying burns API quota)
|
|
5
|
+
* disk_critical — disk_used_pct >= 95% (writing more risks corruption)
|
|
6
|
+
|
|
7
|
+
The detectors are pure functions; the loop, ssh fetch, and auto-stop wiring
|
|
8
|
+
live further down (Tasks 3.2 / 3.3).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
from collections.abc import Iterable
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from datetime import UTC, datetime
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, Protocol
|
|
20
|
+
|
|
21
|
+
from agent_runner.api_types import (
|
|
22
|
+
Alert,
|
|
23
|
+
ProjectState,
|
|
24
|
+
ServiceMode,
|
|
25
|
+
ServiceStatus,
|
|
26
|
+
SystemMetrics,
|
|
27
|
+
)
|
|
28
|
+
from agent_runner.context_store import read_json
|
|
29
|
+
from agent_runner.events import emit as emit_event
|
|
30
|
+
from agent_runner.events import now_iso_ms
|
|
31
|
+
|
|
32
|
+
# Every detector name an Alert may carry; _alert() asserts membership.
KNOWN_ALERT_KINDS: frozenset[str] = frozenset(
    (
        "timeout_rate",
        "hung",
        "orphan_chain",
        "disk_warning",
        "disk_critical",
        "mem_pressure",
        "smoke_fail_rate",
        "oauth_fail",
        "network_fail",
    )
)

# Subset of KNOWN_ALERT_KINDS whose detectors set auto_action="stop_service".
# Continuing in either state actively harms the host (burning API quota / writing
# to a near-full disk), so monitor.on_alert calls api.stop on these.
AUTO_STOP_ALERTS: frozenset[str] = frozenset(("oauth_fail", "disk_critical"))

# An agent_exit shorter than this many seconds counts as a "short exit".
SHORT_EXIT_THRESHOLD_S = 60

# Case-insensitive markers of an authentication failure in a round's log tail.
_AUTH_PATTERNS = re.compile(
    r"\b(oauth|unauthorized|401|api[_ ]key|"
    r"auth(entication)?[_ -]?(failed|error|expired)|session.*expired)\b",
    flags=re.IGNORECASE,
)

# Case-insensitive markers of a connectivity / upstream failure in a round's log tail.
_NETWORK_PATTERNS = re.compile(
    r"\b(connection refused|econnrefused|dns|"
    r"name or service not known|connect(ion)? timed out|"
    r"nodename nor servname|network unreachable|"
    r"50[023] (service unavailable|bad gateway|gateway timeout)|"
    r"connection reset)\b",
    flags=re.IGNORECASE,
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _alert(
    detector: str, severity: str, message: str, context: dict[str, Any], auto_action: str = "none"
) -> Alert:
    """Build an ``Alert`` for ``detector`` stamped with ``now_iso_ms()``.

    ``detector`` must be a member of ``KNOWN_ALERT_KINDS``; an unknown kind
    trips the assertion below rather than producing an alert.
    """
    assert detector in KNOWN_ALERT_KINDS, f"unknown alert kind: {detector!r}"
    fields: dict[str, Any] = {
        "severity": severity,
        "detector": detector,
        "message": message,
        "context": context,
        "ts": now_iso_ms(),
        "auto_action": auto_action,
    }
    return Alert(**fields)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _last_n_round_exits(events: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
|
|
83
|
+
exits = [e for e in events if e.get("event") == "agent_exit"]
|
|
84
|
+
return exits[-n:]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def detect_timeout_rate(
    events: list[dict[str, Any]], *, window: int = 10, threshold: float = 0.2
) -> Alert | None:
    """Warn when the share of timed-out rounds among the last ``window`` exits is too high.

    Returns ``None`` until ``window`` ``agent_exit`` events exist, or while the
    timed-out fraction stays below ``threshold``.
    """
    recent = _last_n_round_exits(events, window)
    sample_size = len(recent)
    if sample_size < window:
        return None
    timed = len([e for e in recent if e.get("timed_out")])
    rate = timed / sample_size
    if rate < threshold:
        return None
    message = f"{timed}/{sample_size} recent rounds timed out (>{threshold:.0%})"
    details = {"rate": rate, "threshold": threshold, "window": window}
    return _alert("timeout_rate", "warning", message, details)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def detect_hung(
|
|
106
|
+
events: list[dict[str, Any]], *, now: datetime, factor: float = 1.5, round_timeout_s: int = 1800
|
|
107
|
+
) -> Alert | None:
|
|
108
|
+
"""A round_start without a matching round_end after round_timeout_s * factor."""
|
|
109
|
+
open_rounds: dict[int, str] = {}
|
|
110
|
+
for e in events:
|
|
111
|
+
kind = e.get("event")
|
|
112
|
+
rn = e.get("round_num")
|
|
113
|
+
if kind == "round_start" and rn is not None:
|
|
114
|
+
open_rounds[rn] = e["ts"]
|
|
115
|
+
elif kind == "round_end" and rn in open_rounds:
|
|
116
|
+
del open_rounds[rn]
|
|
117
|
+
for rn, started_ts in open_rounds.items():
|
|
118
|
+
started = datetime.fromisoformat(started_ts.replace("Z", "+00:00"))
|
|
119
|
+
elapsed = (now - started).total_seconds()
|
|
120
|
+
if elapsed > round_timeout_s * factor:
|
|
121
|
+
return _alert(
|
|
122
|
+
"hung",
|
|
123
|
+
"warning",
|
|
124
|
+
f"Round {rn} started {elapsed:.0f}s ago with no round_end",
|
|
125
|
+
{"round_num": rn, "elapsed_s": elapsed, "threshold_s": round_timeout_s * factor},
|
|
126
|
+
)
|
|
127
|
+
return None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def detect_orphan_chain(events: list[dict[str, Any]], *, threshold: int = 3) -> Alert | None:
    """Warn when ``orphan_stashed`` events pile up without a clean round in between.

    The streak grows by one per ``orphan_stashed`` event and resets on any
    ``round_end`` whose ``round_num`` has no ``orphan_stashed`` event anywhere
    in ``events``. An alert is returned when the streak left at the end of the
    list is at least ``threshold``.
    """
    relevant = [e for e in events if e.get("event") in ("round_end", "orphan_stashed")]
    # Collected once up front so each round_end is a set lookup, not a rescan.
    orphaned_rounds = {
        e.get("round_num") for e in relevant if e.get("event") == "orphan_stashed"
    }
    streak = 0
    last_round_with_orphan: int | None = None
    for e in relevant:
        if e.get("event") == "orphan_stashed":
            streak += 1
            last_round_with_orphan = e.get("round_num")
        elif e.get("round_num") not in orphaned_rounds:
            # A round_end for a round that never stashed an orphan breaks the chain.
            streak = 0
    if streak >= threshold:
        return _alert(
            "orphan_chain",
            "warning",
            f"{streak} consecutive rounds with orphan_stashed (>= {threshold})",
            {"streak": streak, "threshold": threshold, "last_round": last_round_with_orphan},
        )
    return None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _latest(metrics: list[dict[str, Any]], key: str) -> Any:
|
|
157
|
+
for m in reversed(metrics):
|
|
158
|
+
if key in m:
|
|
159
|
+
return m[key]
|
|
160
|
+
return None
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def detect_disk_warning(
    metrics: list[dict[str, Any]], *, threshold_pct: float = 90.0, critical_pct: float = 95.0
) -> Alert | None:
    """Warn when the latest ``disk_used_pct`` is in ``[threshold_pct, critical_pct)``.

    Args:
        metrics: Parsed metric samples, oldest first.
        threshold_pct: Usage at or above which the warning fires.
        critical_pct: Usage at or above which this detector stays silent and
            leaves reporting to ``detect_disk_critical``. The default matches
            that detector's default threshold; pass the same value to both
            when overriding it.

    Returns:
        A ``disk_warning`` alert, or ``None`` when no sample carries the metric
        or the value is outside the warning band.
    """
    val = _latest(metrics, "disk_used_pct")
    if val is None or val < threshold_pct:
        return None
    if val >= critical_pct:  # leave the critical case to detect_disk_critical
        return None
    return _alert(
        "disk_warning",
        "warning",
        f"disk_used_pct {val} >= {threshold_pct}",
        {
            "value": val,
            "threshold": threshold_pct,
            "hint": "Free space soon — clean ~/.agent-runner/<project>/logs/",
        },
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def detect_disk_critical(
    metrics: list[dict[str, Any]], *, threshold_pct: float = 95.0
) -> Alert | None:
    """Raise a critical, auto-stopping alert once the latest ``disk_used_pct`` reaches ``threshold_pct``.

    Returns ``None`` when no sample carries ``disk_used_pct`` or the value is
    below the threshold.
    """
    val = _latest(metrics, "disk_used_pct")
    if val is None:
        return None
    if val < threshold_pct:
        return None
    message = f"disk_used_pct {val} >= {threshold_pct} — auto-stopping service"
    details = {"value": val, "threshold": threshold_pct, "hint": "Stop and clean disk before resuming"}
    return _alert("disk_critical", "critical", message, details, auto_action="stop_service")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def detect_mem_pressure(metrics: list[dict[str, Any]], *, threshold_mb: int = 200) -> Alert | None:
    """Warn when the latest ``mem_available_mb`` sample is below ``threshold_mb``.

    Returns ``None`` when no sample carries the metric or enough memory is available.
    """
    val = _latest(metrics, "mem_available_mb")
    if val is None:
        return None
    if val >= threshold_mb:
        return None
    details = {
        "value": val,
        "threshold": threshold_mb,
        "hint": "Investigate memory leak or move to a larger host",
    }
    return _alert("mem_pressure", "warning", f"mem_available_mb {val} < {threshold_mb}", details)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def detect_smoke_fail_rate(
    events: list[dict[str, Any]], *, window: int = 10, threshold: float = 0.1
) -> Alert | None:
    """Warn when too many of the last ``window`` finished rounds failed a smoke check.

    Args:
        events: Parsed event records, in chronological order.
        window: Number of most recent ``round_end`` events to consider.
        threshold: Fraction of those rounds that must have failed to alert.

    Returns:
        A ``smoke_fail_rate`` warning, or ``None`` when fewer than ``window``
        rounds have ended, ``window`` is not positive, or the rate is below
        ``threshold``.
    """
    if window <= 0:
        # Nothing to sample; also avoids dividing by zero below.
        return None
    ends = [e for e in events if e.get("event") == "round_end"]
    if len(ends) < window:
        return None
    recent_round_nums = {e.get("round_num") for e in ends[-window:]}
    # Count distinct rounds, not events: a round that logs several
    # smoke_check_failed events is still a single failed round.
    failed_rounds = {
        e.get("round_num")
        for e in events
        if e.get("event") == "smoke_check_failed" and e.get("round_num") in recent_round_nums
    }
    fails = len(failed_rounds)
    rate = fails / window
    if rate < threshold:
        return None
    return _alert(
        "smoke_fail_rate",
        "warning",
        f"{fails}/{window} recent rounds had smoke_check_failed",
        {"rate": rate, "threshold": threshold, "hint": "Inspect events.jsonl for failure reasons"},
    )
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _short_exit_with_pattern(
    events: list[dict[str, Any]], log_tails: dict[int, str], pattern: re.Pattern[str], window: int
) -> tuple[int, int]:
    """Count recent short, failed exits whose log tail matches ``pattern``.

    Looks at the last ``window`` ``agent_exit`` events. An exit qualifies when
    it ran for less than ``SHORT_EXIT_THRESHOLD_S`` seconds, has a non-zero
    ``exit_code`` and did not time out. Returns ``(matches, exits_examined)``.
    """
    recent = _last_n_round_exits(events, window)

    def _is_short_failure(exit_event: dict[str, Any]) -> bool:
        # A missing or null duration is treated as 0 seconds.
        duration = exit_event.get("duration_s") or 0.0
        return bool(
            duration < SHORT_EXIT_THRESHOLD_S
            and exit_event.get("exit_code", 0) != 0
            and not exit_event.get("timed_out", False)
        )

    hits = sum(
        1
        for exit_event in recent
        if _is_short_failure(exit_event)
        and pattern.search(log_tails.get(exit_event.get("round_num"), ""))
    )
    return hits, len(recent)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def detect_oauth_fail(
    events: list[dict[str, Any]],
    log_tails: dict[int, str],
    *,
    window: int = 10,
    threshold: float = 0.2,
) -> Alert | None:
    """Raise a critical, auto-stopping alert when recent short exits look like auth failures.

    Needs at least ``window`` ``agent_exit`` events; alerts when the fraction
    whose log tail matches ``_AUTH_PATTERNS`` reaches ``threshold``.
    """
    matches, total = _short_exit_with_pattern(events, log_tails, _AUTH_PATTERNS, window)
    if total < window:
        return None
    if matches / total < threshold:
        return None
    details = {
        "matches": matches,
        "window": total,
        "threshold": threshold,
        "hint": "Run `claude /login` on the supervisor host or refresh ANTHROPIC_API_KEY",
    }
    message = f"{matches}/{total} recent rounds short-exited with auth failure pattern"
    return _alert("oauth_fail", "critical", message, details, auto_action="stop_service")
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def detect_network_fail(
    events: list[dict[str, Any]],
    log_tails: dict[int, str],
    *,
    window: int = 10,
    threshold: float = 0.2,
) -> Alert | None:
    """Warn when recent short exits look like network failures.

    Needs at least ``window`` ``agent_exit`` events; alerts when the fraction
    whose log tail matches ``_NETWORK_PATTERNS`` reaches ``threshold``.
    """
    matches, total = _short_exit_with_pattern(events, log_tails, _NETWORK_PATTERNS, window)
    if total < window:
        return None
    if matches / total < threshold:
        return None
    details = {
        "matches": matches,
        "window": total,
        "threshold": threshold,
        "hint": "Check upstream Anthropic status or local DNS / VPN",
    }
    message = f"{matches}/{total} recent rounds short-exited with network error pattern"
    return _alert("network_fail", "warning", message, details)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
# State-tree assembly (Task 3.2)
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
class StateSource(Protocol):
    """Local or remote source — returns paths to read."""

    def events_files(self) -> list[Path]:
        """Return the paths of the event log files."""
        ...

    def metrics_files(self) -> list[Path]:
        """Return the paths of the metric log files."""
        ...

    def rounds_dir(self) -> Path:
        """Return the directory that holds per-round logs."""
        ...

    def status_path(self) -> Path:
        """Return the path of the status JSON file."""
        ...

    def orphan_path(self) -> Path:
        """Return the path of the orphan-state JSON file."""
        ...
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@dataclass(frozen=True)
class LocalSource:
    """State source backed by a log directory on the local filesystem."""

    # Directory that holds the events/metrics files, rounds/ and the JSON state files.
    log_dir: Path

    def _sorted_matches(self, pattern: str) -> list[Path]:
        """Return the entries of ``log_dir`` matching ``pattern`` in sorted order."""
        found = list(self.log_dir.glob(pattern))
        found.sort()
        return found

    def events_files(self) -> list[Path]:
        """Return all ``events-*.jsonl`` files, sorted by path."""
        return self._sorted_matches("events-*.jsonl")

    def metrics_files(self) -> list[Path]:
        """Return all ``metrics-*.jsonl`` files, sorted by path."""
        return self._sorted_matches("metrics-*.jsonl")

    def rounds_dir(self) -> Path:
        """Return ``<log_dir>/rounds``."""
        return self.log_dir.joinpath("rounds")

    def status_path(self) -> Path:
        """Return ``<log_dir>/status.json``."""
        return self.log_dir.joinpath("status.json")

    def orphan_path(self) -> Path:
        """Return ``<log_dir>/orphan-state.json``."""
        return self.log_dir.joinpath("orphan-state.json")
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def parse_events_from_jsonl_files(files: Iterable[Path]) -> list[dict[str, Any]]:
|
|
337
|
+
out: list[dict[str, Any]] = []
|
|
338
|
+
for f in files:
|
|
339
|
+
try:
|
|
340
|
+
text = f.read_text(encoding="utf-8")
|
|
341
|
+
except FileNotFoundError:
|
|
342
|
+
continue
|
|
343
|
+
for line in text.splitlines():
|
|
344
|
+
line = line.strip()
|
|
345
|
+
if not line:
|
|
346
|
+
continue
|
|
347
|
+
try:
|
|
348
|
+
out.append(json.loads(line))
|
|
349
|
+
except json.JSONDecodeError:
|
|
350
|
+
continue
|
|
351
|
+
return out
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def load_round_log_tails(rounds_dir: Path, *, tail_lines: int = 50) -> dict[int, str]:
    """Return the last ``tail_lines`` lines of every ``R<N>-*.log``, keyed by round number.

    Args:
        rounds_dir: Directory holding the per-round logs; a missing directory
            yields an empty mapping.
        tail_lines: Number of trailing lines to keep per log. Zero or a
            negative value yields an empty tail.

    Returns:
        Mapping of round number to the joined tail text. Files whose name has
        no integer round number, or that vanish before being read, are
        skipped. When several logs share a round number, the one sorting last
        by path wins.
    """
    tails: dict[int, str] = {}
    if not rounds_dir.is_dir():
        return tails
    # Sorted so duplicate round numbers resolve the same way on every run.
    for f in sorted(rounds_dir.glob("R*-*.log")):
        try:
            num = int(f.name.split("-", 1)[0][1:])
        except ValueError:
            continue
        try:
            # Undecodable bytes in a log must not abort the whole scan.
            lines = f.read_text(encoding="utf-8", errors="replace").splitlines()
        except FileNotFoundError:
            continue
        # Guard explicitly: lines[-0:] would be the entire file, not nothing.
        tails[num] = "\n".join(lines[-tail_lines:]) if tail_lines > 0 else ""
    return tails
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def _latest_metric_dict(metrics: list[dict[str, Any]]) -> dict[str, Any]:
|
|
372
|
+
return metrics[-1] if metrics else {}
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def assemble_project_state(source: StateSource, *, project: str) -> ProjectState:
    """Build a ``ProjectState`` snapshot from the files ``source`` points at.

    System metrics come from the last metrics record, with missing numeric
    fields defaulting to zero. Status and orphan state are read with
    ``read_json``. ``defenses``, ``current_round`` and ``recent_rounds`` are
    left empty here, and the service is reported as ``ServiceMode.NONE`` /
    inactive.
    """
    metric_samples = parse_events_from_jsonl_files(source.metrics_files())
    status = read_json(source.status_path()) or {}
    orphan = read_json(source.orphan_path())
    newest = _latest_metric_dict(metric_samples)

    mem_total = int(newest.get("mem_total_mb", 0))
    mem_available = int(newest.get("mem_available_mb", 0))
    disk_used = float(newest.get("disk_used_pct", 0.0))
    disk_free = float(newest.get("disk_free_gb", 0.0))
    system = SystemMetrics(
        mem_total_mb=mem_total,
        mem_available_mb=mem_available,
        disk_used_pct=disk_used,
        disk_free_gb=disk_free,
        load_1m=newest.get("load_1m"),
        cpu_pct=newest.get("cpu_pct"),
    )
    service = ServiceStatus(mode=ServiceMode.NONE, active=False)
    return ProjectState(
        project=project,
        status=status,
        defenses=[],
        current_round=None,
        recent_rounds=[],
        orphan=orphan,
        system=system,
        service=service,
    )
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def run_all_detectors(
    *,
    events: list[dict[str, Any]],
    metrics: list[dict[str, Any]],
    log_tails: dict[int, str],
    round_timeout_s: int = 1800,
    now: datetime | None = None,
) -> list[Alert]:
    """Run all 9 detectors; returns alerts (empty = healthy).

    ``now`` defaults to the current UTC time and is only used by the hung-round
    detector. Every detector runs with its default thresholds.
    """
    reference_time = datetime.now(UTC) if now is None else now
    outcomes = (
        detect_timeout_rate(events),
        detect_hung(events, now=reference_time, round_timeout_s=round_timeout_s),
        detect_orphan_chain(events),
        detect_disk_warning(metrics),
        detect_disk_critical(metrics),
        detect_mem_pressure(metrics),
        detect_smoke_fail_rate(events),
        detect_oauth_fail(events, log_tails),
        detect_network_fail(events, log_tails),
    )
    alerts: list[Alert] = []
    for outcome in outcomes:
        if outcome is not None:
            alerts.append(outcome)
    return alerts
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# ---------------------------------------------------------------------------
|
|
426
|
+
# Remote source + auto-stop dispatch (Task 3.3)
|
|
427
|
+
# ---------------------------------------------------------------------------
|
|
428
|
+
|
|
429
|
+
import subprocess # noqa: TID251, E402 — monitor needs ssh + local stop subprocess
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def run_remote_command(host: str, cmd: str, *, timeout: int = 30) -> tuple[int, str]:
    """Run a single shell command over ssh; returns (returncode, stdout).

    Callers decide whether to treat non-zero as fatal. ``RemoteSource._list``
    tolerates non-zero (missing files glob to empty), but ``on_alert`` remote
    stop should not silently swallow ssh failures.

    stderr is captured but not returned. ``timeout`` is passed to
    ``subprocess.run`` unchanged.
    """
    argv = ["ssh", host, cmd]
    completed = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout,
        check=False,
    )
    return (completed.returncode, completed.stdout)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
@dataclass(frozen=True)
class RemoteSource:
    """Mirrors LocalSource for a project on a remote host.

    File lists come from ``ls -1`` run over ssh. The single-path accessors
    build a ``Path`` under ``~/.agent-runner/<project>/logs`` without
    contacting the host.
    """

    # ssh destination handed to run_remote_command.
    host: str
    # Project name used to build the remote log directory.
    project: str

    def _remote_log_dir(self) -> str:
        """Return the remote log directory as an unexpanded ``~``-relative string."""
        return f"~/.agent-runner/{self.project}/logs"

    def _remote_path(self, leaf: str) -> Path:
        """Return ``leaf`` placed under the remote log directory."""
        return Path(f"{self._remote_log_dir()}/{leaf}")

    def _list(self, glob: str) -> list[Path]:
        """List remote files matching ``glob``; a failed ``ls`` yields an empty list."""
        listing_cmd = f"ls -1 {self._remote_log_dir()}/{glob} 2>/dev/null"
        _rc, out = run_remote_command(self.host, listing_cmd)
        paths: list[Path] = []
        for raw in out.splitlines():
            entry = raw.strip()
            if entry:
                paths.append(Path(entry))
        return paths

    def events_files(self) -> list[Path]:
        """Return the remote ``events-*.jsonl`` paths."""
        return self._list("events-*.jsonl")

    def metrics_files(self) -> list[Path]:
        """Return the remote ``metrics-*.jsonl`` paths."""
        return self._list("metrics-*.jsonl")

    def rounds_dir(self) -> Path:
        """Return the remote ``rounds`` directory path."""
        return self._remote_path("rounds")

    def status_path(self) -> Path:
        """Return the remote ``status.json`` path."""
        return self._remote_path("status.json")

    def orphan_path(self) -> Path:
        """Return the remote ``orphan-state.json`` path."""
        return self._remote_path("orphan-state.json")
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def _call_local_stop(project: str) -> None:
    """Stop ``project``'s service on this machine via ``agent_runner.api.stop``."""
    # Late import: api imports monitor for peek, so we defer the reverse direction.
    from agent_runner.api import stop as stop_service

    stop_service(project)
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def on_alert(alert: Alert, *, project: str, host: str | None, log_dir: Path) -> None:
    """Record the alert to events.jsonl and, if auto_action==stop_service, stop the service.

    Args:
        alert: The alert produced by a detector.
        project: Project whose service is stopped on an auto-stop alert.
        host: ssh destination of the supervisor, or ``None`` to stop locally.
        log_dir: Directory events are written to; events are skipped when it
            does not exist.

    Raises:
        RuntimeError: The remote stop command exited non-zero. An auto-stop
            that did not take effect must be visible to the caller, because
            the condition that triggered it is still harming the host.
    """
    if log_dir.is_dir():
        emit_event(
            log_dir,
            "monitor_alert_emitted",
            detector=alert.detector,
            severity=alert.severity,
            message=alert.message,
            auto_action=alert.auto_action,
        )
    if alert.auto_action != "stop_service":
        return
    if log_dir.is_dir():
        emit_event(
            log_dir,
            "monitor_auto_stop_triggered",
            detector=alert.detector,
            host=host,
        )
    if host is None:
        _call_local_stop(project)
        return
    returncode, _stdout = run_remote_command(
        host,
        f"agent-runner stop --config ~/.agent-runner/{project}/agent-runner.toml",
        timeout=30,
    )
    if returncode != 0:
        # Previously the exit status was discarded, so a failed ssh or stop
        # looked identical to a successful auto-stop.
        raise RuntimeError(
            f"remote auto-stop of {project!r} on {host} failed with exit code {returncode}"
        )
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Prompt loader — knows prompt is a .md file, optionally injects round-context block.
|
|
2
|
+
|
|
3
|
+
R721 defense: strip YAML frontmatter before passing to claude CLI argv. A prompt
|
|
4
|
+
starting with `---\\n` is rejected by claude's arg parser as an unknown flag.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_prompt(prompt_file: Path) -> str:
    """Return the UTF-8 text of ``prompt_file``.

    Raises:
        FileNotFoundError: The file does not exist; the message names the path.
    """
    try:
        text = prompt_file.read_text(encoding="utf-8")
    except FileNotFoundError as e:
        raise FileNotFoundError(f"prompt file not found: {prompt_file}") from e
    else:
        return text
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def strip_yaml_frontmatter(text: str) -> str:
    """Remove a leading YAML frontmatter block from ``text``.

    Frontmatter is a first line of exactly ``---`` followed, on some later
    line, by another line of exactly ``---``. LF and CRLF line endings are both
    recognised, the block may be empty, and the closing fence may be the last
    line of the text with no trailing newline.

    Returns:
        The text after the closing fence with leading whitespace stripped, or
        ``text`` unchanged when there is no complete frontmatter block.
    """
    lines = text.split("\n")
    if len(lines) < 2 or lines[0].rstrip("\r") != "---":
        return text
    for idx in range(1, len(lines)):
        # Starting at the line right after the opening fence also catches an
        # empty block ("---\n---\n"), which must not reach the CLI as a flag.
        if lines[idx].rstrip("\r") == "---":
            return "\n".join(lines[idx + 1 :]).lstrip()
    return text
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _format_context_block(context: dict[str, Any]) -> str:
|
|
31
|
+
body = json.dumps(context, indent=2, ensure_ascii=False)
|
|
32
|
+
return f"```json round-context\n{body}\n```\n\n"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def assemble_prompt(
    prompt_file: Path,
    *,
    context: dict[str, Any] | None,
    inject_context: bool,
) -> str:
    """Load the prompt, drop its YAML frontmatter and optionally prepend round context.

    The context block is prepended only when ``inject_context`` is true and
    ``context`` is not ``None``.
    """
    body = strip_yaml_frontmatter(load_prompt(prompt_file))
    wants_context = inject_context and context is not None
    prefix = _format_context_block(context) if wants_context else ""
    return prefix + body
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Build RoundView snapshots for peek's --round / --log / --events drill-down.
|
|
2
|
+
|
|
3
|
+
Kept separate from api.py and monitor.py so neither approaches its LOC cap.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from agent_runner.api_types import RoundView
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def resolve_round_arg(arg: int | str | None, log_dir: Path) -> int | None:
    """Resolve --round value (int, int-string, or 'latest') to a concrete number.

    ``None`` and ints are returned unchanged. ``'latest'`` resolves to the
    highest round number among ``<log_dir>/rounds/R<N>-*.log``, or ``None``
    when the directory or any numbered log is missing.

    Raises:
        KeyError: ``arg`` is a string that is neither ``'latest'`` nor an integer.
    """
    if arg is None or isinstance(arg, int):
        return arg
    if arg != "latest":
        try:
            return int(arg)
        except ValueError as e:
            raise KeyError(f"--round expects int or 'latest', got {arg!r}") from e

    rounds_dir = log_dir / "rounds"
    if not rounds_dir.is_dir():
        return None
    highest: int | None = None
    for log_file in rounds_dir.glob("R*-*.log"):
        round_tag = log_file.name.split("-", 1)[0]
        try:
            number = int(round_tag[1:])
        except (ValueError, IndexError):
            continue
        if highest is None or number > highest:
            highest = number
    return highest
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def build_round_view(
|
|
38
|
+
log_dir: Path,
|
|
39
|
+
round_num: int,
|
|
40
|
+
events: list[dict[str, Any]],
|
|
41
|
+
*,
|
|
42
|
+
want_log: bool = False,
|
|
43
|
+
tail_lines: int = 50,
|
|
44
|
+
) -> RoundView | None:
|
|
45
|
+
"""Construct a RoundView for ``round_num`` by reading rounds/R{N}-*.log + events.
|
|
46
|
+
|
|
47
|
+
Returns None when the round log file is absent (round never started or already pruned).
|
|
48
|
+
"""
|
|
49
|
+
rounds_dir = log_dir / "rounds"
|
|
50
|
+
log_path = next(rounds_dir.glob(f"R{round_num}-*.log"), None)
|
|
51
|
+
if log_path is None:
|
|
52
|
+
return None
|
|
53
|
+
started_at = ""
|
|
54
|
+
phase: str | None = None
|
|
55
|
+
duration: float | None = None
|
|
56
|
+
exit_code: int | None = None
|
|
57
|
+
timed_out: bool | None = None
|
|
58
|
+
for e in events:
|
|
59
|
+
if e.get("round_num") != round_num:
|
|
60
|
+
continue
|
|
61
|
+
kind = e.get("event")
|
|
62
|
+
if kind == "round_start":
|
|
63
|
+
started_at = e.get("ts", "")
|
|
64
|
+
phase = e.get("phase")
|
|
65
|
+
elif kind == "agent_exit":
|
|
66
|
+
duration = e.get("duration_s")
|
|
67
|
+
exit_code = e.get("exit_code")
|
|
68
|
+
timed_out = e.get("timed_out")
|
|
69
|
+
log_tail: str | None = None
|
|
70
|
+
if want_log:
|
|
71
|
+
try:
|
|
72
|
+
lines = log_path.read_text(encoding="utf-8").splitlines()
|
|
73
|
+
log_tail = "\n".join(lines[-tail_lines:])
|
|
74
|
+
except FileNotFoundError:
|
|
75
|
+
log_tail = None
|
|
76
|
+
return RoundView(
|
|
77
|
+
round_num=round_num,
|
|
78
|
+
phase=phase,
|
|
79
|
+
started_at=started_at,
|
|
80
|
+
duration_so_far_s=duration,
|
|
81
|
+
pid=None,
|
|
82
|
+
exit_code=exit_code,
|
|
83
|
+
timed_out=timed_out,
|
|
84
|
+
log_path=log_path,
|
|
85
|
+
log_tail=log_tail,
|
|
86
|
+
)
|