@team-agent/installer 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +34 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/adapters.rs +234 -26
- package/crates/team-agent/src/cli/diagnose.rs +144 -10
- package/crates/team-agent/src/cli/emit.rs +289 -54
- package/crates/team-agent/src/cli/leader.rs +37 -8
- package/crates/team-agent/src/cli/mod.rs +1281 -196
- package/crates/team-agent/src/cli/status_port.rs +195 -46
- package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
- package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
- package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
- package/crates/team-agent/src/cli/tests/run_delegation.rs +59 -3
- package/crates/team-agent/src/cli/types.rs +18 -0
- package/crates/team-agent/src/compiler.rs +15 -5
- package/crates/team-agent/src/coordinator/health.rs +95 -17
- package/crates/team-agent/src/coordinator/mod.rs +4 -0
- package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
- package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
- package/crates/team-agent/src/coordinator/tick.rs +222 -69
- package/crates/team-agent/src/coordinator/types.rs +15 -3
- package/crates/team-agent/src/db/schema.rs +37 -2
- package/crates/team-agent/src/diagnose/comms.rs +226 -0
- package/crates/team-agent/src/diagnose/mod.rs +45 -0
- package/crates/team-agent/src/diagnose/orphans.rs +658 -0
- package/crates/team-agent/src/fake_worker.rs +146 -3
- package/crates/team-agent/src/leader/start.rs +121 -23
- package/crates/team-agent/src/leader/types.rs +44 -1
- package/crates/team-agent/src/lib.rs +3 -0
- package/crates/team-agent/src/lifecycle/display.rs +645 -47
- package/crates/team-agent/src/lifecycle/launch.rs +1061 -146
- package/crates/team-agent/src/lifecycle/mod.rs +2 -0
- package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
- package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
- package/crates/team-agent/src/lifecycle/restart/common.rs +183 -24
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +498 -22
- package/crates/team-agent/src/lifecycle/restart/remove.rs +27 -7
- package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
- package/crates/team-agent/src/lifecycle/restart.rs +24 -1
- package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
- package/crates/team-agent/src/lifecycle/types.rs +19 -0
- package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
- package/crates/team-agent/src/mcp_server/mod.rs +3 -74
- package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
- package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
- package/crates/team-agent/src/mcp_server/tools.rs +312 -111
- package/crates/team-agent/src/mcp_server/types.rs +6 -4
- package/crates/team-agent/src/mcp_server/wire.rs +19 -7
- package/crates/team-agent/src/message_store.rs +21 -4
- package/crates/team-agent/src/messaging/delivery.rs +470 -59
- package/crates/team-agent/src/messaging/mod.rs +9 -6
- package/crates/team-agent/src/messaging/results.rs +353 -63
- package/crates/team-agent/src/messaging/selftest.rs +199 -12
- package/crates/team-agent/src/messaging/send.rs +35 -3
- package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
- package/crates/team-agent/src/messaging/types.rs +11 -3
- package/crates/team-agent/src/os_probe.rs +119 -0
- package/crates/team-agent/src/packaging/migrate.rs +10 -2
- package/crates/team-agent/src/packaging/tests.rs +23 -0
- package/crates/team-agent/src/provider/adapter.rs +564 -63
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
- package/crates/team-agent/src/provider/classify.rs +51 -4
- package/crates/team-agent/src/provider/helpers.rs +10 -1
- package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
- package/crates/team-agent/src/provider/types.rs +47 -0
- package/crates/team-agent/src/session_capture.rs +616 -0
- package/crates/team-agent/src/state/persist.rs +170 -1
- package/crates/team-agent/src/state/projection.rs +141 -8
- package/crates/team-agent/src/state/selector.rs +5 -2
- package/crates/team-agent/src/tmux_backend.rs +161 -64
- package/crates/team-agent/src/transport/test_support.rs +9 -0
- package/crates/team-agent/src/transport/tests/wire.rs +4 -0
- package/crates/team-agent/src/transport.rs +13 -2
- package/package.json +4 -4
|
@@ -138,15 +138,16 @@ pub fn stop_coordinator(workspace: &WorkspacePath) -> Result<StopReport, StopErr
|
|
|
138
138
|
pid: None,
|
|
139
139
|
});
|
|
140
140
|
};
|
|
141
|
-
|
|
141
|
+
if pid_is_running(pid).ok() == Some(false) {
|
|
142
|
+
remove_file_if_exists(&pid_path)?;
|
|
143
|
+
remove_file_if_exists(&coordinator_meta_path(workspace))?;
|
|
142
144
|
return Ok(StopReport {
|
|
143
|
-
ok:
|
|
144
|
-
status: StopOutcome::
|
|
145
|
+
ok: true,
|
|
146
|
+
status: StopOutcome::Missing,
|
|
145
147
|
pid: Some(pid),
|
|
146
148
|
});
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
if rc != 0 {
|
|
149
|
+
}
|
|
150
|
+
if !terminate_pid(pid) {
|
|
150
151
|
return Ok(StopReport {
|
|
151
152
|
ok: false,
|
|
152
153
|
status: StopOutcome::KillFailed,
|
|
@@ -197,9 +198,11 @@ fn stop_discovered_coordinators(
|
|
|
197
198
|
}
|
|
198
199
|
|
|
199
200
|
fn discover_coordinator_pids(workspace: &WorkspacePath) -> Vec<Pid> {
|
|
200
|
-
let output = match
|
|
201
|
-
.args(["-axo", "pid=,command="])
|
|
202
|
-
|
|
201
|
+
let output = match crate::os_probe::bounded_command_output_with_probe(
|
|
202
|
+
Command::new("ps").args(["-axo", "pid=,command="]),
|
|
203
|
+
"ps_table",
|
|
204
|
+
None,
|
|
205
|
+
)
|
|
203
206
|
{
|
|
204
207
|
Ok(output) if output.status.success() => output,
|
|
205
208
|
_ => return Vec::new(),
|
|
@@ -250,13 +253,86 @@ fn terminate_pid(pid: Pid) -> bool {
|
|
|
250
253
|
if pid_is_running(pid).ok() == Some(false) {
|
|
251
254
|
return true;
|
|
252
255
|
}
|
|
253
|
-
|
|
254
|
-
|
|
256
|
+
let pids = process_tree_pids(pid);
|
|
257
|
+
for child in pids.iter().rev() {
|
|
258
|
+
let _ = send_signal(*child, libc::SIGTERM);
|
|
255
259
|
}
|
|
256
|
-
if
|
|
257
|
-
|
|
260
|
+
if !wait_until_all_not_running(&pids, Duration::from_secs(5)) {
|
|
261
|
+
for child in pids.iter().rev() {
|
|
262
|
+
let _ = send_signal(*child, libc::SIGKILL);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
wait_until_all_not_running(&pids, Duration::from_secs(5))
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
/// Public wrapper for diagnostic cleanup paths that must reuse coordinator
|
|
269
|
+
/// shutdown's SIGTERM-then-SIGKILL semantics.
|
|
270
|
+
pub fn terminate_pid_tree(pid: Pid) -> bool {
|
|
271
|
+
terminate_pid(pid)
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
fn process_tree_pids(root: Pid) -> Vec<Pid> {
|
|
275
|
+
let root_pid = root.get();
|
|
276
|
+
let pairs = crate::os_probe::bounded_command_output_with_probe(
|
|
277
|
+
Command::new("ps").args(["-axo", "pid=,ppid="]),
|
|
278
|
+
"ps_parent",
|
|
279
|
+
None,
|
|
280
|
+
)
|
|
281
|
+
.ok()
|
|
282
|
+
.map(|out| String::from_utf8_lossy(&out.stdout).to_string())
|
|
283
|
+
.unwrap_or_default()
|
|
284
|
+
.lines()
|
|
285
|
+
.filter_map(|line| {
|
|
286
|
+
let mut parts = line.split_whitespace();
|
|
287
|
+
let pid = parts.next()?.parse::<u32>().ok()?;
|
|
288
|
+
let ppid = parts.next()?.parse::<u32>().ok()?;
|
|
289
|
+
Some((pid, ppid))
|
|
290
|
+
})
|
|
291
|
+
.collect::<Vec<_>>();
|
|
292
|
+
let mut out = Vec::new();
|
|
293
|
+
collect_child_pids(root_pid, &pairs, &mut out);
|
|
294
|
+
out.push(root_pid);
|
|
295
|
+
out.sort_unstable();
|
|
296
|
+
out.dedup();
|
|
297
|
+
out.into_iter().map(Pid::new).collect()
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
/// Appends every descendant of `parent` to `out`, depth first.
///
/// `pairs` holds `(pid, parent pid)` rows. A pid already present in `out` is
/// never added or descended into again, which also keeps the recursion finite
/// when the rows contain a parent/child cycle.
fn collect_child_pids(parent: u32, pairs: &[(u32, u32)], out: &mut Vec<u32>) {
    for &(child, owner) in pairs {
        // Skip rows owned by another process and pids we have already recorded.
        if owner != parent || out.contains(&child) {
            continue;
        }
        out.push(child);
        collect_child_pids(child, pairs, out);
    }
}
|
|
308
|
+
|
|
309
|
+
fn wait_until_all_not_running(pids: &[Pid], timeout: Duration) -> bool {
|
|
310
|
+
let start = std::time::Instant::now();
|
|
311
|
+
loop {
|
|
312
|
+
for pid in pids {
|
|
313
|
+
reap_child_if_possible(*pid);
|
|
314
|
+
}
|
|
315
|
+
if pids
|
|
316
|
+
.iter()
|
|
317
|
+
.all(|pid| pid_is_running(*pid).ok() != Some(true))
|
|
318
|
+
{
|
|
319
|
+
return true;
|
|
320
|
+
}
|
|
321
|
+
if start.elapsed() >= timeout {
|
|
322
|
+
return false;
|
|
323
|
+
}
|
|
324
|
+
std::thread::sleep(Duration::from_millis(25));
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
fn reap_child_if_possible(pid: Pid) {
|
|
329
|
+
let Ok(pid_t) = libc::pid_t::try_from(pid.get()) else {
|
|
330
|
+
return;
|
|
331
|
+
};
|
|
332
|
+
let mut status = 0;
|
|
333
|
+
unsafe {
|
|
334
|
+
libc::waitpid(pid_t, &mut status, libc::WNOHANG);
|
|
258
335
|
}
|
|
259
|
-
send_signal(pid, libc::SIGKILL) && wait_until_not_running(pid, Duration::from_millis(750))
|
|
260
336
|
}
|
|
261
337
|
|
|
262
338
|
fn send_signal(pid: Pid, signal: libc::c_int) -> bool {
|
|
@@ -297,9 +373,11 @@ pub fn pid_is_running(pid: Pid) -> Result<bool, std::io::Error> {
|
|
|
297
373
|
_ => Err(err),
|
|
298
374
|
};
|
|
299
375
|
}
|
|
300
|
-
let out =
|
|
301
|
-
.args(["-p", &pid.to_string(), "-o", "stat="])
|
|
302
|
-
|
|
376
|
+
let out = crate::os_probe::bounded_command_output_with_probe(
|
|
377
|
+
Command::new("ps").args(["-p", &pid.to_string(), "-o", "stat="]),
|
|
378
|
+
"ps_table",
|
|
379
|
+
Some(pid.get()),
|
|
380
|
+
)?;
|
|
303
381
|
if !out.status.success() {
|
|
304
382
|
return Ok(false);
|
|
305
383
|
}
|
|
@@ -64,6 +64,8 @@ use serde_json::Value;
|
|
|
64
64
|
pub mod backoff;
|
|
65
65
|
pub mod health;
|
|
66
66
|
pub mod orphan;
|
|
67
|
+
pub mod runtime_detectors;
|
|
68
|
+
pub mod runtime_observation;
|
|
67
69
|
pub mod tick;
|
|
68
70
|
pub mod types;
|
|
69
71
|
|
|
@@ -75,6 +77,8 @@ pub use tick::*;
|
|
|
75
77
|
pub use backoff::*;
|
|
76
78
|
pub use orphan::*;
|
|
77
79
|
pub use health::*;
|
|
80
|
+
pub use runtime_detectors::*;
|
|
81
|
+
pub use runtime_observation::*;
|
|
78
82
|
|
|
79
83
|
#[cfg(test)]
|
|
80
84
|
mod tests;
|
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
use std::collections::BTreeMap;
|
|
2
|
+
use std::path::Path;
|
|
3
|
+
|
|
4
|
+
use serde_json::{json, Map, Value};
|
|
5
|
+
|
|
6
|
+
use crate::event_log::EventLog;
|
|
7
|
+
use crate::model::enums::Provider;
|
|
8
|
+
use crate::model::ids::AgentId;
|
|
9
|
+
|
|
10
|
+
use super::runtime_observation::{
|
|
11
|
+
CapturedRuntimeFact, LeaderCaptureFact, RuntimeObservationResults,
|
|
12
|
+
};
|
|
13
|
+
use super::types::{CompactionResult, LeaderApiError, SessionDriftResult};
|
|
14
|
+
|
|
15
|
+
const COMPACTION_RESET_THRESHOLD_DEFAULT: i64 = 3;
|
|
16
|
+
|
|
17
|
+
pub fn observe_runtime(
|
|
18
|
+
workspace: &Path,
|
|
19
|
+
state: &mut Value,
|
|
20
|
+
captures_by_agent: BTreeMap<AgentId, CapturedRuntimeFact>,
|
|
21
|
+
leader_capture: Option<LeaderCaptureFact>,
|
|
22
|
+
) -> RuntimeObservationResults {
|
|
23
|
+
let event_log = EventLog::new(workspace);
|
|
24
|
+
let mut compaction = Vec::new();
|
|
25
|
+
let mut session_drift = Vec::new();
|
|
26
|
+
for fact in captures_by_agent.values() {
|
|
27
|
+
if let Some(result) = detect_compaction(state, &event_log, fact) {
|
|
28
|
+
compaction.push(result);
|
|
29
|
+
}
|
|
30
|
+
if let Some(result) = detect_session_drift(state, &event_log, fact) {
|
|
31
|
+
session_drift.push(result);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
let api_errors = detect_leader_api_error(state, &event_log, leader_capture.as_ref());
|
|
35
|
+
RuntimeObservationResults {
|
|
36
|
+
captures_by_agent,
|
|
37
|
+
compaction,
|
|
38
|
+
session_drift,
|
|
39
|
+
api_errors,
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn detect_compaction(
|
|
44
|
+
state: &mut Value,
|
|
45
|
+
event_log: &EventLog,
|
|
46
|
+
fact: &CapturedRuntimeFact,
|
|
47
|
+
) -> Option<CompactionResult> {
|
|
48
|
+
let count = count_compaction_markers(&fact.scrollback_tail);
|
|
49
|
+
if count <= 0 {
|
|
50
|
+
return None;
|
|
51
|
+
}
|
|
52
|
+
let team = fact
|
|
53
|
+
.team_key
|
|
54
|
+
.as_ref()
|
|
55
|
+
.map(|team| team.as_str().to_string())
|
|
56
|
+
.unwrap_or_else(|| crate::state::projection::team_state_key(state));
|
|
57
|
+
let current = update_compaction_count(state, &team, &fact.agent_id, count);
|
|
58
|
+
let provider = fact.provider;
|
|
59
|
+
let _ = event_log.write(
|
|
60
|
+
"coordinator.compaction_observed",
|
|
61
|
+
json!({
|
|
62
|
+
"agent_id": fact.agent_id.as_str(),
|
|
63
|
+
"provider": provider.map(provider_name),
|
|
64
|
+
"team": team,
|
|
65
|
+
"compaction_count": current,
|
|
66
|
+
"stuck_loop": false,
|
|
67
|
+
}),
|
|
68
|
+
);
|
|
69
|
+
let threshold = compaction_reset_threshold(state);
|
|
70
|
+
let recommendation = if provider == Some(Provider::Codex) && current >= threshold {
|
|
71
|
+
let message = format!(
|
|
72
|
+
"agent {} crossed Codex compaction threshold; run team-agent reset-agent {} --discard-session",
|
|
73
|
+
fact.agent_id.as_str(),
|
|
74
|
+
fact.agent_id.as_str()
|
|
75
|
+
);
|
|
76
|
+
let _ = event_log.write(
|
|
77
|
+
"compaction_threshold_crossed.recommend_reset",
|
|
78
|
+
json!({
|
|
79
|
+
"agent_id": fact.agent_id.as_str(),
|
|
80
|
+
"provider": provider.map(provider_name),
|
|
81
|
+
"team": team,
|
|
82
|
+
"compaction_count": current,
|
|
83
|
+
"threshold": threshold,
|
|
84
|
+
"leader_visible_message": message,
|
|
85
|
+
}),
|
|
86
|
+
);
|
|
87
|
+
Some(message)
|
|
88
|
+
} else {
|
|
89
|
+
None
|
|
90
|
+
};
|
|
91
|
+
Some(CompactionResult {
|
|
92
|
+
agent_id: fact.agent_id.clone(),
|
|
93
|
+
provider,
|
|
94
|
+
observed: true,
|
|
95
|
+
reason: Some("compaction_observed".to_string()),
|
|
96
|
+
recommendation,
|
|
97
|
+
})
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
fn detect_session_drift(
|
|
101
|
+
state: &mut Value,
|
|
102
|
+
event_log: &EventLog,
|
|
103
|
+
fact: &CapturedRuntimeFact,
|
|
104
|
+
) -> Option<SessionDriftResult> {
|
|
105
|
+
if fact.provider != Some(Provider::Codex) {
|
|
106
|
+
return None;
|
|
107
|
+
}
|
|
108
|
+
let stored = fact
|
|
109
|
+
.stored_session_id
|
|
110
|
+
.as_deref()
|
|
111
|
+
.filter(|s| !s.trim().is_empty())?;
|
|
112
|
+
let actual = extract_thread_id_from_scrollback(&fact.scrollback_tail)?;
|
|
113
|
+
if actual.eq_ignore_ascii_case(stored) {
|
|
114
|
+
return None;
|
|
115
|
+
}
|
|
116
|
+
if fact
|
|
117
|
+
.agent_state_snapshot
|
|
118
|
+
.get("status")
|
|
119
|
+
.and_then(Value::as_str)
|
|
120
|
+
== Some("session_drift")
|
|
121
|
+
{
|
|
122
|
+
return None;
|
|
123
|
+
}
|
|
124
|
+
let detected_at = chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, false);
|
|
125
|
+
let remediation = "team-agent reset-agent --discard-session <agent>";
|
|
126
|
+
let _ = event_log.write(
|
|
127
|
+
"coordinator.session_drift_detected",
|
|
128
|
+
json!({
|
|
129
|
+
"agent_id": fact.agent_id.as_str(),
|
|
130
|
+
"stored_session_id": stored,
|
|
131
|
+
"actual_thread_id": actual,
|
|
132
|
+
"status": "session_drift",
|
|
133
|
+
"provider": "codex",
|
|
134
|
+
"ts": detected_at,
|
|
135
|
+
"remediation": remediation,
|
|
136
|
+
}),
|
|
137
|
+
);
|
|
138
|
+
mark_agent_session_drift(
|
|
139
|
+
state,
|
|
140
|
+
&fact.agent_id,
|
|
141
|
+
stored,
|
|
142
|
+
&actual,
|
|
143
|
+
&detected_at,
|
|
144
|
+
remediation,
|
|
145
|
+
);
|
|
146
|
+
Some(SessionDriftResult {
|
|
147
|
+
agent_id: fact.agent_id.clone(),
|
|
148
|
+
stored_session_id: Some(stored.to_string()),
|
|
149
|
+
observed_session_id: Some(actual),
|
|
150
|
+
status: "session_drift".to_string(),
|
|
151
|
+
})
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
fn detect_leader_api_error(
|
|
155
|
+
state: &mut Value,
|
|
156
|
+
event_log: &EventLog,
|
|
157
|
+
leader_capture: Option<&LeaderCaptureFact>,
|
|
158
|
+
) -> Vec<LeaderApiError> {
|
|
159
|
+
let Some(capture) = leader_capture else {
|
|
160
|
+
return Vec::new();
|
|
161
|
+
};
|
|
162
|
+
let Some((error_class, snippet)) = match_api_error(&capture.scrollback_tail) else {
|
|
163
|
+
clear_last_api_error_fingerprint(state);
|
|
164
|
+
return Vec::new();
|
|
165
|
+
};
|
|
166
|
+
let fingerprint = format!("{error_class}::{}", tail_chars(&snippet, 120));
|
|
167
|
+
if get_coordinator(state)
|
|
168
|
+
.and_then(|c| c.get("last_api_error_fingerprint"))
|
|
169
|
+
.and_then(Value::as_str)
|
|
170
|
+
== Some(fingerprint.as_str())
|
|
171
|
+
{
|
|
172
|
+
return Vec::new();
|
|
173
|
+
}
|
|
174
|
+
let Some(coordinator) = coordinator_object_mut(state) else {
|
|
175
|
+
return Vec::new();
|
|
176
|
+
};
|
|
177
|
+
coordinator.insert(
|
|
178
|
+
"last_api_error_fingerprint".to_string(),
|
|
179
|
+
Value::String(fingerprint.clone()),
|
|
180
|
+
);
|
|
181
|
+
let provider = leader_receiver_provider(capture.leader_receiver.as_ref())
|
|
182
|
+
.or_else(|| leader_receiver_provider(state.get("leader_receiver")));
|
|
183
|
+
let pane_id = capture
|
|
184
|
+
.pane_id
|
|
185
|
+
.as_ref()
|
|
186
|
+
.map(|pane| pane.as_str().to_string())
|
|
187
|
+
.or_else(|| {
|
|
188
|
+
capture
|
|
189
|
+
.leader_receiver
|
|
190
|
+
.as_ref()
|
|
191
|
+
.and_then(|r| r.get("pane_id"))
|
|
192
|
+
.and_then(Value::as_str)
|
|
193
|
+
.map(ToString::to_string)
|
|
194
|
+
});
|
|
195
|
+
let leader_session_uuid = state
|
|
196
|
+
.get("team_owner")
|
|
197
|
+
.and_then(|owner| owner.get("leader_session_uuid"))
|
|
198
|
+
.or_else(|| {
|
|
199
|
+
capture
|
|
200
|
+
.leader_receiver
|
|
201
|
+
.as_ref()
|
|
202
|
+
.and_then(|receiver| receiver.get("leader_session_uuid"))
|
|
203
|
+
})
|
|
204
|
+
.and_then(Value::as_str)
|
|
205
|
+
.map(ToString::to_string);
|
|
206
|
+
let partial_response_streamed =
|
|
207
|
+
scrollback_has_partial_response(&capture.scrollback_tail, &snippet);
|
|
208
|
+
let _ = event_log.write(
|
|
209
|
+
"leader.api_error",
|
|
210
|
+
json!({
|
|
211
|
+
"leader_session_uuid": leader_session_uuid,
|
|
212
|
+
"error_class": error_class,
|
|
213
|
+
"provider": provider.map(provider_name),
|
|
214
|
+
"partial_response_streamed": partial_response_streamed,
|
|
215
|
+
"worker_dispatch_just_before": [],
|
|
216
|
+
"retry_count": 0,
|
|
217
|
+
"matched_pattern_snippet": snippet.chars().take(160).collect::<String>(),
|
|
218
|
+
}),
|
|
219
|
+
);
|
|
220
|
+
vec![LeaderApiError {
|
|
221
|
+
provider,
|
|
222
|
+
pane_id,
|
|
223
|
+
fingerprint,
|
|
224
|
+
message: snippet,
|
|
225
|
+
}]
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/// Counts compaction notices in `scrollback`, ignoring ASCII letter case.
///
/// Both the phrase "context compacted" and the phrase "compaction occurred"
/// are counted; the result is the sum of their non-overlapping occurrences.
fn count_compaction_markers(scrollback: &str) -> i64 {
    const MARKERS: [&str; 2] = ["context compacted", "compaction occurred"];

    let haystack = scrollback.to_ascii_lowercase();
    MARKERS
        .iter()
        .map(|&marker| haystack.matches(marker).count() as i64)
        .sum()
}
|
|
233
|
+
|
|
234
|
+
fn update_compaction_count(state: &mut Value, team: &str, agent_id: &AgentId, count: i64) -> i64 {
|
|
235
|
+
let Some(coordinator) = coordinator_object_mut(state) else {
|
|
236
|
+
return count;
|
|
237
|
+
};
|
|
238
|
+
let Some(counts) = object_field_mut(coordinator, "compaction_counts") else {
|
|
239
|
+
return count;
|
|
240
|
+
};
|
|
241
|
+
let Some(team_counts) = object_field_mut(counts, team) else {
|
|
242
|
+
return count;
|
|
243
|
+
};
|
|
244
|
+
let previous = team_counts
|
|
245
|
+
.get(agent_id.as_str())
|
|
246
|
+
.and_then(Value::as_i64)
|
|
247
|
+
.unwrap_or(0);
|
|
248
|
+
let current = previous.max(count);
|
|
249
|
+
team_counts.insert(agent_id.as_str().to_string(), json!(current));
|
|
250
|
+
current
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
fn compaction_reset_threshold(state: &Value) -> i64 {
|
|
254
|
+
state
|
|
255
|
+
.get("runtime")
|
|
256
|
+
.and_then(|runtime| runtime.get("compaction_reset_threshold"))
|
|
257
|
+
.and_then(Value::as_i64)
|
|
258
|
+
.filter(|value| *value > 0)
|
|
259
|
+
.unwrap_or(COMPACTION_RESET_THRESHOLD_DEFAULT)
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
fn extract_thread_id_from_scrollback(scrollback: &str) -> Option<String> {
|
|
263
|
+
let mut found = None;
|
|
264
|
+
let lower = scrollback.to_ascii_lowercase();
|
|
265
|
+
for needle in ["switched to thread", "resume", "thread"] {
|
|
266
|
+
let mut offset = 0;
|
|
267
|
+
while let Some(pos) = lower.get(offset..).and_then(|tail| tail.find(needle)) {
|
|
268
|
+
let start = offset + pos + needle.len();
|
|
269
|
+
if let Some(token) = first_token(scrollback.get(start..).unwrap_or_default()) {
|
|
270
|
+
found = Some(token.to_ascii_lowercase());
|
|
271
|
+
}
|
|
272
|
+
offset = start;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
found
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/// Extracts the identifier-like token at the start of `text`.
///
/// Leading whitespace and the separators `:`, `=`, `#` are skipped. An
/// optional `id` label is then dropped together with the separators after it.
/// The label is matched in any ASCII letter case: callers pass original-case
/// scrollback, so `ID: abc` must yield `abc` rather than the label itself.
///
/// The token is the longest run of ASCII alphanumerics plus `-`, `_`, `:`
/// and `.`. Returns `None` when that run is empty.
fn first_token(text: &str) -> Option<String> {
    // Characters allowed between a keyword or label and the identifier.
    fn is_separator(c: char) -> bool {
        c.is_whitespace() || matches!(c, ':' | '=' | '#')
    }

    let trimmed = text.trim_start_matches(is_separator);
    // `get(..2)` is `None` for short input or when byte 2 is not a char
    // boundary, so slicing at 2 below cannot panic.
    let trimmed = match trimmed.get(..2) {
        Some(label) if label.eq_ignore_ascii_case("id") => {
            trimmed[2..].trim_start_matches(is_separator)
        }
        _ => trimmed,
    };
    let token: String = trimmed
        .chars()
        .take_while(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | ':' | '.'))
        .collect();
    (!token.is_empty()).then_some(token)
}
|
|
293
|
+
|
|
294
|
+
fn mark_agent_session_drift(
|
|
295
|
+
state: &mut Value,
|
|
296
|
+
agent_id: &AgentId,
|
|
297
|
+
stored: &str,
|
|
298
|
+
actual: &str,
|
|
299
|
+
detected_at: &str,
|
|
300
|
+
remediation: &str,
|
|
301
|
+
) {
|
|
302
|
+
let drift = json!({
|
|
303
|
+
"stored_session_id": stored,
|
|
304
|
+
"actual_thread_id": actual,
|
|
305
|
+
"detected_at": detected_at,
|
|
306
|
+
"remediation": remediation,
|
|
307
|
+
});
|
|
308
|
+
if let Some(agent) = agent_object_mut(state, agent_id) {
|
|
309
|
+
agent.insert(
|
|
310
|
+
"status".to_string(),
|
|
311
|
+
Value::String("session_drift".to_string()),
|
|
312
|
+
);
|
|
313
|
+
agent.insert("session_drift".to_string(), drift.clone());
|
|
314
|
+
}
|
|
315
|
+
if let Some(teams) = state.get_mut("teams").and_then(Value::as_object_mut) {
|
|
316
|
+
for team in teams.values_mut() {
|
|
317
|
+
if let Some(agent) = team
|
|
318
|
+
.get_mut("agents")
|
|
319
|
+
.and_then(Value::as_object_mut)
|
|
320
|
+
.and_then(|agents| agents.get_mut(agent_id.as_str()))
|
|
321
|
+
.and_then(Value::as_object_mut)
|
|
322
|
+
{
|
|
323
|
+
agent.insert(
|
|
324
|
+
"status".to_string(),
|
|
325
|
+
Value::String("session_drift".to_string()),
|
|
326
|
+
);
|
|
327
|
+
agent.insert("session_drift".to_string(), drift.clone());
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
fn match_api_error(scrollback: &str) -> Option<(String, String)> {
|
|
334
|
+
let lines: Vec<String> = scrollback
|
|
335
|
+
.lines()
|
|
336
|
+
.rev()
|
|
337
|
+
.take(100)
|
|
338
|
+
.map(str::trim)
|
|
339
|
+
.map(ToString::to_string)
|
|
340
|
+
.collect::<Vec<_>>()
|
|
341
|
+
.into_iter()
|
|
342
|
+
.rev()
|
|
343
|
+
.collect();
|
|
344
|
+
let mut best = None;
|
|
345
|
+
for start in 0..lines.len() {
|
|
346
|
+
for size in 1..=3 {
|
|
347
|
+
if start + size > lines.len() {
|
|
348
|
+
break;
|
|
349
|
+
}
|
|
350
|
+
let mut window = lines[start..start + size]
|
|
351
|
+
.iter()
|
|
352
|
+
.filter(|line| !line.is_empty())
|
|
353
|
+
.cloned()
|
|
354
|
+
.collect::<Vec<_>>()
|
|
355
|
+
.join(" ");
|
|
356
|
+
if window.len() > 400 {
|
|
357
|
+
window = tail_chars(&window, 400);
|
|
358
|
+
}
|
|
359
|
+
let lower = window.to_ascii_lowercase();
|
|
360
|
+
let class = if lower.contains("api error: overloaded") {
|
|
361
|
+
Some("Overloaded")
|
|
362
|
+
} else if lower.contains("429 too many requests")
|
|
363
|
+
|| (has_api_context(&lower) && lower.contains("429"))
|
|
364
|
+
{
|
|
365
|
+
Some("RateLimit")
|
|
366
|
+
} else if lower.contains("etimedout")
|
|
367
|
+
|| (has_api_context(&lower)
|
|
368
|
+
&& (lower.contains("request timed out")
|
|
369
|
+
|| lower.contains("request timeout")
|
|
370
|
+
|| lower.contains("connection timed out")
|
|
371
|
+
|| lower.contains("connection timeout")))
|
|
372
|
+
{
|
|
373
|
+
Some("Timeout")
|
|
374
|
+
} else if has_api_context(&lower)
|
|
375
|
+
&& (lower.contains("500")
|
|
376
|
+
|| lower.contains("502")
|
|
377
|
+
|| lower.contains("503")
|
|
378
|
+
|| lower.contains("504")
|
|
379
|
+
|| lower.contains("fetch failed"))
|
|
380
|
+
{
|
|
381
|
+
Some("NetworkError")
|
|
382
|
+
} else {
|
|
383
|
+
None
|
|
384
|
+
};
|
|
385
|
+
if let Some(class) = class {
|
|
386
|
+
best = Some((
|
|
387
|
+
start,
|
|
388
|
+
class.to_string(),
|
|
389
|
+
window.chars().take(240).collect::<String>(),
|
|
390
|
+
));
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
best.map(|(_, class, snippet)| (class, snippet))
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
/// Tells whether `lower` mentions anything that marks it as API-related.
///
/// The comparison is a plain substring check against lower-case hints, so the
/// caller must pass text that has already been lower-cased.
fn has_api_context(lower: &str) -> bool {
    const CONTEXT_HINTS: [&str; 9] = [
        "api error",
        "http error",
        "httperror",
        "request failed",
        "codex",
        "claude",
        "anthropic",
        "openai",
        "typeerror",
    ];
    CONTEXT_HINTS.iter().any(|&hint| lower.contains(hint))
}
|
|
408
|
+
|
|
409
|
+
/// Guesses whether a response had started streaming before the error text.
///
/// Locates the last verbatim occurrence of `snippet` in `scrollback` and
/// scans up to 4000 bytes before it (ASCII case-insensitively) for any of a
/// few hints such as "assistant", "let me " or "> ".
///
/// Returns `false` when `snippet` does not occur verbatim, or when no hint is
/// found in that window.
fn scrollback_has_partial_response(scrollback: &str, snippet: &str) -> bool {
    const RESPONSE_HINTS: [&str; 7] = [
        "assistant",
        "i'll ",
        "i will ",
        "i'm ",
        "i am ",
        "let me ",
        "> ",
    ];

    let Some(idx) = scrollback.rfind(snippet) else {
        return false;
    };
    // A fixed byte offset can fall inside a multi-byte character, where
    // slicing fails and would silently empty the window. Advance to the next
    // character boundary instead; `idx` is itself a boundary, so this stops.
    let mut start = idx.saturating_sub(4000);
    while !scrollback.is_char_boundary(start) {
        start += 1;
    }
    let head = scrollback
        .get(start..idx)
        .unwrap_or_default()
        .to_ascii_lowercase();
    RESPONSE_HINTS.iter().any(|&hint| head.contains(hint))
}
|
|
430
|
+
|
|
431
|
+
fn clear_last_api_error_fingerprint(state: &mut Value) {
|
|
432
|
+
if let Some(coordinator) = get_coordinator_mut(state) {
|
|
433
|
+
if coordinator.get("last_api_error_fingerprint").is_some() {
|
|
434
|
+
coordinator.insert("last_api_error_fingerprint".to_string(), Value::Null);
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
fn leader_receiver_provider(receiver: Option<&Value>) -> Option<Provider> {
|
|
440
|
+
let raw = receiver
|
|
441
|
+
.and_then(|receiver| receiver.get("provider"))
|
|
442
|
+
.and_then(Value::as_str)?;
|
|
443
|
+
serde_json::from_value(Value::String(raw.to_string())).ok()
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
fn provider_name(provider: Provider) -> &'static str {
|
|
447
|
+
match provider {
|
|
448
|
+
Provider::Claude => "claude",
|
|
449
|
+
Provider::ClaudeCode => "claude_code",
|
|
450
|
+
Provider::Codex => "codex",
|
|
451
|
+
Provider::GeminiCli => "gemini_cli",
|
|
452
|
+
Provider::Fake => "fake",
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
fn coordinator_object_mut(state: &mut Value) -> Option<&mut Map<String, Value>> {
|
|
457
|
+
if !state.is_object() {
|
|
458
|
+
*state = json!({});
|
|
459
|
+
}
|
|
460
|
+
let obj = state.as_object_mut()?;
|
|
461
|
+
if !obj.get("coordinator").is_some_and(Value::is_object) {
|
|
462
|
+
obj.insert("coordinator".to_string(), json!({}));
|
|
463
|
+
}
|
|
464
|
+
obj.get_mut("coordinator").and_then(Value::as_object_mut)
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
fn get_coordinator(state: &Value) -> Option<&Map<String, Value>> {
|
|
468
|
+
state.get("coordinator").and_then(Value::as_object)
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
fn get_coordinator_mut(state: &mut Value) -> Option<&mut Map<String, Value>> {
|
|
472
|
+
state.get_mut("coordinator").and_then(Value::as_object_mut)
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
fn object_field_mut<'a>(
|
|
476
|
+
obj: &'a mut Map<String, Value>,
|
|
477
|
+
key: &str,
|
|
478
|
+
) -> Option<&'a mut Map<String, Value>> {
|
|
479
|
+
if !obj.get(key).is_some_and(Value::is_object) {
|
|
480
|
+
obj.insert(key.to_string(), json!({}));
|
|
481
|
+
}
|
|
482
|
+
obj.get_mut(key).and_then(Value::as_object_mut)
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
fn agent_object_mut<'a>(
|
|
486
|
+
state: &'a mut Value,
|
|
487
|
+
agent_id: &AgentId,
|
|
488
|
+
) -> Option<&'a mut Map<String, Value>> {
|
|
489
|
+
state
|
|
490
|
+
.get_mut("agents")
|
|
491
|
+
.and_then(Value::as_object_mut)
|
|
492
|
+
.and_then(|agents| agents.get_mut(agent_id.as_str()))
|
|
493
|
+
.and_then(Value::as_object_mut)
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
/// Returns the last `max_chars` characters of `text` as an owned string.
///
/// Counts Unicode scalar values rather than bytes. Text with `max_chars`
/// characters or fewer is returned whole; `max_chars == 0` yields an empty
/// string.
fn tail_chars(text: &str, max_chars: usize) -> String {
    if max_chars == 0 {
        return String::new();
    }
    // Walk backwards to the character that should open the tail.
    match text.char_indices().rev().nth(max_chars - 1) {
        Some((offset, _)) => text[offset..].to_string(),
        None => text.to_string(),
    }
}
|