@team-agent/installer 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/Cargo.lock +34 -1
  2. package/Cargo.toml +1 -1
  3. package/crates/team-agent/Cargo.toml +1 -1
  4. package/crates/team-agent/src/cli/adapters.rs +234 -26
  5. package/crates/team-agent/src/cli/diagnose.rs +144 -10
  6. package/crates/team-agent/src/cli/emit.rs +289 -54
  7. package/crates/team-agent/src/cli/leader.rs +37 -8
  8. package/crates/team-agent/src/cli/mod.rs +1281 -196
  9. package/crates/team-agent/src/cli/status_port.rs +195 -46
  10. package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
  11. package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
  12. package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
  13. package/crates/team-agent/src/cli/tests/run_delegation.rs +59 -3
  14. package/crates/team-agent/src/cli/types.rs +18 -0
  15. package/crates/team-agent/src/compiler.rs +15 -5
  16. package/crates/team-agent/src/coordinator/health.rs +95 -17
  17. package/crates/team-agent/src/coordinator/mod.rs +4 -0
  18. package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
  19. package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
  20. package/crates/team-agent/src/coordinator/tick.rs +222 -69
  21. package/crates/team-agent/src/coordinator/types.rs +15 -3
  22. package/crates/team-agent/src/db/schema.rs +37 -2
  23. package/crates/team-agent/src/diagnose/comms.rs +226 -0
  24. package/crates/team-agent/src/diagnose/mod.rs +45 -0
  25. package/crates/team-agent/src/diagnose/orphans.rs +658 -0
  26. package/crates/team-agent/src/fake_worker.rs +146 -3
  27. package/crates/team-agent/src/leader/start.rs +121 -23
  28. package/crates/team-agent/src/leader/types.rs +44 -1
  29. package/crates/team-agent/src/lib.rs +3 -0
  30. package/crates/team-agent/src/lifecycle/display.rs +645 -47
  31. package/crates/team-agent/src/lifecycle/launch.rs +1061 -146
  32. package/crates/team-agent/src/lifecycle/mod.rs +2 -0
  33. package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
  34. package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
  35. package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
  36. package/crates/team-agent/src/lifecycle/restart/common.rs +183 -24
  37. package/crates/team-agent/src/lifecycle/restart/rebuild.rs +498 -22
  38. package/crates/team-agent/src/lifecycle/restart/remove.rs +27 -7
  39. package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
  40. package/crates/team-agent/src/lifecycle/restart.rs +24 -1
  41. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
  42. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
  43. package/crates/team-agent/src/lifecycle/types.rs +19 -0
  44. package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
  45. package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
  46. package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
  47. package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
  48. package/crates/team-agent/src/mcp_server/mod.rs +3 -74
  49. package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
  50. package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
  51. package/crates/team-agent/src/mcp_server/tools.rs +312 -111
  52. package/crates/team-agent/src/mcp_server/types.rs +6 -4
  53. package/crates/team-agent/src/mcp_server/wire.rs +19 -7
  54. package/crates/team-agent/src/message_store.rs +21 -4
  55. package/crates/team-agent/src/messaging/delivery.rs +470 -59
  56. package/crates/team-agent/src/messaging/mod.rs +9 -6
  57. package/crates/team-agent/src/messaging/results.rs +353 -63
  58. package/crates/team-agent/src/messaging/selftest.rs +199 -12
  59. package/crates/team-agent/src/messaging/send.rs +35 -3
  60. package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
  61. package/crates/team-agent/src/messaging/types.rs +11 -3
  62. package/crates/team-agent/src/os_probe.rs +119 -0
  63. package/crates/team-agent/src/packaging/migrate.rs +10 -2
  64. package/crates/team-agent/src/packaging/tests.rs +23 -0
  65. package/crates/team-agent/src/provider/adapter.rs +564 -63
  66. package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
  67. package/crates/team-agent/src/provider/classify.rs +51 -4
  68. package/crates/team-agent/src/provider/helpers.rs +10 -1
  69. package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
  70. package/crates/team-agent/src/provider/types.rs +47 -0
  71. package/crates/team-agent/src/session_capture.rs +616 -0
  72. package/crates/team-agent/src/state/persist.rs +170 -1
  73. package/crates/team-agent/src/state/projection.rs +141 -8
  74. package/crates/team-agent/src/state/selector.rs +5 -2
  75. package/crates/team-agent/src/tmux_backend.rs +161 -64
  76. package/crates/team-agent/src/transport/test_support.rs +9 -0
  77. package/crates/team-agent/src/transport/tests/wire.rs +4 -0
  78. package/crates/team-agent/src/transport.rs +13 -2
  79. package/package.json +4 -4
@@ -138,15 +138,16 @@ pub fn stop_coordinator(workspace: &WorkspacePath) -> Result<StopReport, StopErr
138
138
  pid: None,
139
139
  });
140
140
  };
141
- let Ok(pid_t) = libc::pid_t::try_from(pid.get()) else {
141
+ if pid_is_running(pid).ok() == Some(false) {
142
+ remove_file_if_exists(&pid_path)?;
143
+ remove_file_if_exists(&coordinator_meta_path(workspace))?;
142
144
  return Ok(StopReport {
143
- ok: false,
144
- status: StopOutcome::KillFailed,
145
+ ok: true,
146
+ status: StopOutcome::Missing,
145
147
  pid: Some(pid),
146
148
  });
147
- };
148
- let rc = unsafe { libc::kill(pid_t, libc::SIGTERM) };
149
- if rc != 0 {
149
+ }
150
+ if !terminate_pid(pid) {
150
151
  return Ok(StopReport {
151
152
  ok: false,
152
153
  status: StopOutcome::KillFailed,
@@ -197,9 +198,11 @@ fn stop_discovered_coordinators(
197
198
  }
198
199
 
199
200
  fn discover_coordinator_pids(workspace: &WorkspacePath) -> Vec<Pid> {
200
- let output = match Command::new("ps")
201
- .args(["-axo", "pid=,command="])
202
- .output()
201
+ let output = match crate::os_probe::bounded_command_output_with_probe(
202
+ Command::new("ps").args(["-axo", "pid=,command="]),
203
+ "ps_table",
204
+ None,
205
+ )
203
206
  {
204
207
  Ok(output) if output.status.success() => output,
205
208
  _ => return Vec::new(),
@@ -250,13 +253,86 @@ fn terminate_pid(pid: Pid) -> bool {
250
253
  if pid_is_running(pid).ok() == Some(false) {
251
254
  return true;
252
255
  }
253
- if !send_signal(pid, libc::SIGTERM) {
254
- return false;
256
+ let pids = process_tree_pids(pid);
257
+ for child in pids.iter().rev() {
258
+ let _ = send_signal(*child, libc::SIGTERM);
255
259
  }
256
- if wait_until_not_running(pid, Duration::from_millis(750)) {
257
- return true;
260
+ if !wait_until_all_not_running(&pids, Duration::from_secs(5)) {
261
+ for child in pids.iter().rev() {
262
+ let _ = send_signal(*child, libc::SIGKILL);
263
+ }
264
+ }
265
+ wait_until_all_not_running(&pids, Duration::from_secs(5))
266
+ }
267
+
268
+ /// Public wrapper for diagnostic cleanup paths that must reuse coordinator
269
+ /// shutdown's SIGTERM-then-SIGKILL semantics.
270
+ pub fn terminate_pid_tree(pid: Pid) -> bool {
271
+ terminate_pid(pid)
272
+ }
273
+
274
+ fn process_tree_pids(root: Pid) -> Vec<Pid> {
275
+ let root_pid = root.get();
276
+ let pairs = crate::os_probe::bounded_command_output_with_probe(
277
+ Command::new("ps").args(["-axo", "pid=,ppid="]),
278
+ "ps_parent",
279
+ None,
280
+ )
281
+ .ok()
282
+ .map(|out| String::from_utf8_lossy(&out.stdout).to_string())
283
+ .unwrap_or_default()
284
+ .lines()
285
+ .filter_map(|line| {
286
+ let mut parts = line.split_whitespace();
287
+ let pid = parts.next()?.parse::<u32>().ok()?;
288
+ let ppid = parts.next()?.parse::<u32>().ok()?;
289
+ Some((pid, ppid))
290
+ })
291
+ .collect::<Vec<_>>();
292
+ let mut out = Vec::new();
293
+ collect_child_pids(root_pid, &pairs, &mut out);
294
+ out.push(root_pid);
295
+ out.sort_unstable();
296
+ out.dedup();
297
+ out.into_iter().map(Pid::new).collect()
298
+ }
299
+
300
/// Appends every descendant of `parent` found in `pairs` to `out`.
///
/// `pairs` holds `(pid, ppid)` rows. Descendants are appended in depth-first
/// pre-order following the row order of `pairs`. A PID already present in
/// `out` is never appended again, which also stops the walk on cyclic
/// parent/child data.
fn collect_child_pids(parent: u32, pairs: &[(u32, u32)], out: &mut Vec<u32>) {
    // Each frame is (pid whose children are being scanned, next row to look at),
    // which mirrors the call stack of a recursive walk.
    let mut frames: Vec<(u32, usize)> = vec![(parent, 0)];
    while let Some((current, cursor)) = frames.pop() {
        let hit = pairs[cursor..]
            .iter()
            .position(|&(pid, ppid)| ppid == current && !out.contains(&pid));
        if let Some(offset) = hit {
            let row = cursor + offset;
            let child = pairs[row].0;
            out.push(child);
            // Resume scanning the current parent after the child's subtree is done.
            frames.push((current, row + 1));
            frames.push((child, 0));
        }
    }
}
308
+
309
+ fn wait_until_all_not_running(pids: &[Pid], timeout: Duration) -> bool {
310
+ let start = std::time::Instant::now();
311
+ loop {
312
+ for pid in pids {
313
+ reap_child_if_possible(*pid);
314
+ }
315
+ if pids
316
+ .iter()
317
+ .all(|pid| pid_is_running(*pid).ok() != Some(true))
318
+ {
319
+ return true;
320
+ }
321
+ if start.elapsed() >= timeout {
322
+ return false;
323
+ }
324
+ std::thread::sleep(Duration::from_millis(25));
325
+ }
326
+ }
327
+
328
+ fn reap_child_if_possible(pid: Pid) {
329
+ let Ok(pid_t) = libc::pid_t::try_from(pid.get()) else {
330
+ return;
331
+ };
332
+ let mut status = 0;
333
+ unsafe {
334
+ libc::waitpid(pid_t, &mut status, libc::WNOHANG);
258
335
  }
259
- send_signal(pid, libc::SIGKILL) && wait_until_not_running(pid, Duration::from_millis(750))
260
336
  }
261
337
 
262
338
  fn send_signal(pid: Pid, signal: libc::c_int) -> bool {
@@ -297,9 +373,11 @@ pub fn pid_is_running(pid: Pid) -> Result<bool, std::io::Error> {
297
373
  _ => Err(err),
298
374
  };
299
375
  }
300
- let out = Command::new("ps")
301
- .args(["-p", &pid.to_string(), "-o", "stat="])
302
- .output()?;
376
+ let out = crate::os_probe::bounded_command_output_with_probe(
377
+ Command::new("ps").args(["-p", &pid.to_string(), "-o", "stat="]),
378
+ "ps_table",
379
+ Some(pid.get()),
380
+ )?;
303
381
  if !out.status.success() {
304
382
  return Ok(false);
305
383
  }
@@ -64,6 +64,8 @@ use serde_json::Value;
64
64
  pub mod backoff;
65
65
  pub mod health;
66
66
  pub mod orphan;
67
+ pub mod runtime_detectors;
68
+ pub mod runtime_observation;
67
69
  pub mod tick;
68
70
  pub mod types;
69
71
 
@@ -75,6 +77,8 @@ pub use tick::*;
75
77
  pub use backoff::*;
76
78
  pub use orphan::*;
77
79
  pub use health::*;
80
+ pub use runtime_detectors::*;
81
+ pub use runtime_observation::*;
78
82
 
79
83
  #[cfg(test)]
80
84
  mod tests;
@@ -0,0 +1,500 @@
1
+ use std::collections::BTreeMap;
2
+ use std::path::Path;
3
+
4
+ use serde_json::{json, Map, Value};
5
+
6
+ use crate::event_log::EventLog;
7
+ use crate::model::enums::Provider;
8
+ use crate::model::ids::AgentId;
9
+
10
+ use super::runtime_observation::{
11
+ CapturedRuntimeFact, LeaderCaptureFact, RuntimeObservationResults,
12
+ };
13
+ use super::types::{CompactionResult, LeaderApiError, SessionDriftResult};
14
+
15
+ const COMPACTION_RESET_THRESHOLD_DEFAULT: i64 = 3;
16
+
17
+ pub fn observe_runtime(
18
+ workspace: &Path,
19
+ state: &mut Value,
20
+ captures_by_agent: BTreeMap<AgentId, CapturedRuntimeFact>,
21
+ leader_capture: Option<LeaderCaptureFact>,
22
+ ) -> RuntimeObservationResults {
23
+ let event_log = EventLog::new(workspace);
24
+ let mut compaction = Vec::new();
25
+ let mut session_drift = Vec::new();
26
+ for fact in captures_by_agent.values() {
27
+ if let Some(result) = detect_compaction(state, &event_log, fact) {
28
+ compaction.push(result);
29
+ }
30
+ if let Some(result) = detect_session_drift(state, &event_log, fact) {
31
+ session_drift.push(result);
32
+ }
33
+ }
34
+ let api_errors = detect_leader_api_error(state, &event_log, leader_capture.as_ref());
35
+ RuntimeObservationResults {
36
+ captures_by_agent,
37
+ compaction,
38
+ session_drift,
39
+ api_errors,
40
+ }
41
+ }
42
+
43
+ fn detect_compaction(
44
+ state: &mut Value,
45
+ event_log: &EventLog,
46
+ fact: &CapturedRuntimeFact,
47
+ ) -> Option<CompactionResult> {
48
+ let count = count_compaction_markers(&fact.scrollback_tail);
49
+ if count <= 0 {
50
+ return None;
51
+ }
52
+ let team = fact
53
+ .team_key
54
+ .as_ref()
55
+ .map(|team| team.as_str().to_string())
56
+ .unwrap_or_else(|| crate::state::projection::team_state_key(state));
57
+ let current = update_compaction_count(state, &team, &fact.agent_id, count);
58
+ let provider = fact.provider;
59
+ let _ = event_log.write(
60
+ "coordinator.compaction_observed",
61
+ json!({
62
+ "agent_id": fact.agent_id.as_str(),
63
+ "provider": provider.map(provider_name),
64
+ "team": team,
65
+ "compaction_count": current,
66
+ "stuck_loop": false,
67
+ }),
68
+ );
69
+ let threshold = compaction_reset_threshold(state);
70
+ let recommendation = if provider == Some(Provider::Codex) && current >= threshold {
71
+ let message = format!(
72
+ "agent {} crossed Codex compaction threshold; run team-agent reset-agent {} --discard-session",
73
+ fact.agent_id.as_str(),
74
+ fact.agent_id.as_str()
75
+ );
76
+ let _ = event_log.write(
77
+ "compaction_threshold_crossed.recommend_reset",
78
+ json!({
79
+ "agent_id": fact.agent_id.as_str(),
80
+ "provider": provider.map(provider_name),
81
+ "team": team,
82
+ "compaction_count": current,
83
+ "threshold": threshold,
84
+ "leader_visible_message": message,
85
+ }),
86
+ );
87
+ Some(message)
88
+ } else {
89
+ None
90
+ };
91
+ Some(CompactionResult {
92
+ agent_id: fact.agent_id.clone(),
93
+ provider,
94
+ observed: true,
95
+ reason: Some("compaction_observed".to_string()),
96
+ recommendation,
97
+ })
98
+ }
99
+
100
+ fn detect_session_drift(
101
+ state: &mut Value,
102
+ event_log: &EventLog,
103
+ fact: &CapturedRuntimeFact,
104
+ ) -> Option<SessionDriftResult> {
105
+ if fact.provider != Some(Provider::Codex) {
106
+ return None;
107
+ }
108
+ let stored = fact
109
+ .stored_session_id
110
+ .as_deref()
111
+ .filter(|s| !s.trim().is_empty())?;
112
+ let actual = extract_thread_id_from_scrollback(&fact.scrollback_tail)?;
113
+ if actual.eq_ignore_ascii_case(stored) {
114
+ return None;
115
+ }
116
+ if fact
117
+ .agent_state_snapshot
118
+ .get("status")
119
+ .and_then(Value::as_str)
120
+ == Some("session_drift")
121
+ {
122
+ return None;
123
+ }
124
+ let detected_at = chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Micros, false);
125
+ let remediation = "team-agent reset-agent --discard-session <agent>";
126
+ let _ = event_log.write(
127
+ "coordinator.session_drift_detected",
128
+ json!({
129
+ "agent_id": fact.agent_id.as_str(),
130
+ "stored_session_id": stored,
131
+ "actual_thread_id": actual,
132
+ "status": "session_drift",
133
+ "provider": "codex",
134
+ "ts": detected_at,
135
+ "remediation": remediation,
136
+ }),
137
+ );
138
+ mark_agent_session_drift(
139
+ state,
140
+ &fact.agent_id,
141
+ stored,
142
+ &actual,
143
+ &detected_at,
144
+ remediation,
145
+ );
146
+ Some(SessionDriftResult {
147
+ agent_id: fact.agent_id.clone(),
148
+ stored_session_id: Some(stored.to_string()),
149
+ observed_session_id: Some(actual),
150
+ status: "session_drift".to_string(),
151
+ })
152
+ }
153
+
154
+ fn detect_leader_api_error(
155
+ state: &mut Value,
156
+ event_log: &EventLog,
157
+ leader_capture: Option<&LeaderCaptureFact>,
158
+ ) -> Vec<LeaderApiError> {
159
+ let Some(capture) = leader_capture else {
160
+ return Vec::new();
161
+ };
162
+ let Some((error_class, snippet)) = match_api_error(&capture.scrollback_tail) else {
163
+ clear_last_api_error_fingerprint(state);
164
+ return Vec::new();
165
+ };
166
+ let fingerprint = format!("{error_class}::{}", tail_chars(&snippet, 120));
167
+ if get_coordinator(state)
168
+ .and_then(|c| c.get("last_api_error_fingerprint"))
169
+ .and_then(Value::as_str)
170
+ == Some(fingerprint.as_str())
171
+ {
172
+ return Vec::new();
173
+ }
174
+ let Some(coordinator) = coordinator_object_mut(state) else {
175
+ return Vec::new();
176
+ };
177
+ coordinator.insert(
178
+ "last_api_error_fingerprint".to_string(),
179
+ Value::String(fingerprint.clone()),
180
+ );
181
+ let provider = leader_receiver_provider(capture.leader_receiver.as_ref())
182
+ .or_else(|| leader_receiver_provider(state.get("leader_receiver")));
183
+ let pane_id = capture
184
+ .pane_id
185
+ .as_ref()
186
+ .map(|pane| pane.as_str().to_string())
187
+ .or_else(|| {
188
+ capture
189
+ .leader_receiver
190
+ .as_ref()
191
+ .and_then(|r| r.get("pane_id"))
192
+ .and_then(Value::as_str)
193
+ .map(ToString::to_string)
194
+ });
195
+ let leader_session_uuid = state
196
+ .get("team_owner")
197
+ .and_then(|owner| owner.get("leader_session_uuid"))
198
+ .or_else(|| {
199
+ capture
200
+ .leader_receiver
201
+ .as_ref()
202
+ .and_then(|receiver| receiver.get("leader_session_uuid"))
203
+ })
204
+ .and_then(Value::as_str)
205
+ .map(ToString::to_string);
206
+ let partial_response_streamed =
207
+ scrollback_has_partial_response(&capture.scrollback_tail, &snippet);
208
+ let _ = event_log.write(
209
+ "leader.api_error",
210
+ json!({
211
+ "leader_session_uuid": leader_session_uuid,
212
+ "error_class": error_class,
213
+ "provider": provider.map(provider_name),
214
+ "partial_response_streamed": partial_response_streamed,
215
+ "worker_dispatch_just_before": [],
216
+ "retry_count": 0,
217
+ "matched_pattern_snippet": snippet.chars().take(160).collect::<String>(),
218
+ }),
219
+ );
220
+ vec![LeaderApiError {
221
+ provider,
222
+ pane_id,
223
+ fingerprint,
224
+ message: snippet,
225
+ }]
226
+ }
227
+
228
/// Counts compaction markers in `scrollback`, ignoring ASCII case.
///
/// The result is the number of non-overlapping occurrences of
/// "context compacted" plus those of "compaction occurred".
fn count_compaction_markers(scrollback: &str) -> i64 {
    let haystack = scrollback.to_ascii_lowercase();
    ["context compacted", "compaction occurred"]
        .iter()
        .map(|marker| haystack.matches(*marker).count() as i64)
        .sum()
}
233
+
234
+ fn update_compaction_count(state: &mut Value, team: &str, agent_id: &AgentId, count: i64) -> i64 {
235
+ let Some(coordinator) = coordinator_object_mut(state) else {
236
+ return count;
237
+ };
238
+ let Some(counts) = object_field_mut(coordinator, "compaction_counts") else {
239
+ return count;
240
+ };
241
+ let Some(team_counts) = object_field_mut(counts, team) else {
242
+ return count;
243
+ };
244
+ let previous = team_counts
245
+ .get(agent_id.as_str())
246
+ .and_then(Value::as_i64)
247
+ .unwrap_or(0);
248
+ let current = previous.max(count);
249
+ team_counts.insert(agent_id.as_str().to_string(), json!(current));
250
+ current
251
+ }
252
+
253
+ fn compaction_reset_threshold(state: &Value) -> i64 {
254
+ state
255
+ .get("runtime")
256
+ .and_then(|runtime| runtime.get("compaction_reset_threshold"))
257
+ .and_then(Value::as_i64)
258
+ .filter(|value| *value > 0)
259
+ .unwrap_or(COMPACTION_RESET_THRESHOLD_DEFAULT)
260
+ }
261
+
262
+ fn extract_thread_id_from_scrollback(scrollback: &str) -> Option<String> {
263
+ let mut found = None;
264
+ let lower = scrollback.to_ascii_lowercase();
265
+ for needle in ["switched to thread", "resume", "thread"] {
266
+ let mut offset = 0;
267
+ while let Some(pos) = lower.get(offset..).and_then(|tail| tail.find(needle)) {
268
+ let start = offset + pos + needle.len();
269
+ if let Some(token) = first_token(scrollback.get(start..).unwrap_or_default()) {
270
+ found = Some(token.to_ascii_lowercase());
271
+ }
272
+ offset = start;
273
+ }
274
+ }
275
+ found
276
+ }
277
+
278
/// Reads the identifier-like token at the start of `text`.
///
/// Leading whitespace and the separators `:`, `=`, `#` are skipped. An
/// optional `id` label is then dropped together with the separators after it.
/// The label is matched ASCII case-insensitively: the caller finds needles in
/// a lower-cased copy but passes the original-case text here, so "ID: <value>"
/// must be treated like "id: <value>" rather than yielding "ID:" as the token.
/// The token is the longest following run of ASCII alphanumerics and
/// `-`, `_`, `:`, `.`. Returns `None` when that run is empty.
fn first_token(text: &str) -> Option<String> {
    fn is_separator(c: char) -> bool {
        c.is_whitespace() || matches!(c, ':' | '=' | '#')
    }

    let trimmed = text.trim_start_matches(is_separator);
    // `get(..2)` is `None` for short input or when byte 2 is not a char
    // boundary, so the `[2..]` slice below is always valid.
    let trimmed = match trimmed.get(..2) {
        Some(label) if label.eq_ignore_ascii_case("id") => {
            trimmed[2..].trim_start_matches(is_separator)
        }
        _ => trimmed,
    };
    let token: String = trimmed
        .chars()
        .take_while(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | ':' | '.'))
        .collect();
    (!token.is_empty()).then_some(token)
}
293
+
294
+ fn mark_agent_session_drift(
295
+ state: &mut Value,
296
+ agent_id: &AgentId,
297
+ stored: &str,
298
+ actual: &str,
299
+ detected_at: &str,
300
+ remediation: &str,
301
+ ) {
302
+ let drift = json!({
303
+ "stored_session_id": stored,
304
+ "actual_thread_id": actual,
305
+ "detected_at": detected_at,
306
+ "remediation": remediation,
307
+ });
308
+ if let Some(agent) = agent_object_mut(state, agent_id) {
309
+ agent.insert(
310
+ "status".to_string(),
311
+ Value::String("session_drift".to_string()),
312
+ );
313
+ agent.insert("session_drift".to_string(), drift.clone());
314
+ }
315
+ if let Some(teams) = state.get_mut("teams").and_then(Value::as_object_mut) {
316
+ for team in teams.values_mut() {
317
+ if let Some(agent) = team
318
+ .get_mut("agents")
319
+ .and_then(Value::as_object_mut)
320
+ .and_then(|agents| agents.get_mut(agent_id.as_str()))
321
+ .and_then(Value::as_object_mut)
322
+ {
323
+ agent.insert(
324
+ "status".to_string(),
325
+ Value::String("session_drift".to_string()),
326
+ );
327
+ agent.insert("session_drift".to_string(), drift.clone());
328
+ }
329
+ }
330
+ }
331
+ }
332
+
333
+ fn match_api_error(scrollback: &str) -> Option<(String, String)> {
334
+ let lines: Vec<String> = scrollback
335
+ .lines()
336
+ .rev()
337
+ .take(100)
338
+ .map(str::trim)
339
+ .map(ToString::to_string)
340
+ .collect::<Vec<_>>()
341
+ .into_iter()
342
+ .rev()
343
+ .collect();
344
+ let mut best = None;
345
+ for start in 0..lines.len() {
346
+ for size in 1..=3 {
347
+ if start + size > lines.len() {
348
+ break;
349
+ }
350
+ let mut window = lines[start..start + size]
351
+ .iter()
352
+ .filter(|line| !line.is_empty())
353
+ .cloned()
354
+ .collect::<Vec<_>>()
355
+ .join(" ");
356
+ if window.len() > 400 {
357
+ window = tail_chars(&window, 400);
358
+ }
359
+ let lower = window.to_ascii_lowercase();
360
+ let class = if lower.contains("api error: overloaded") {
361
+ Some("Overloaded")
362
+ } else if lower.contains("429 too many requests")
363
+ || (has_api_context(&lower) && lower.contains("429"))
364
+ {
365
+ Some("RateLimit")
366
+ } else if lower.contains("etimedout")
367
+ || (has_api_context(&lower)
368
+ && (lower.contains("request timed out")
369
+ || lower.contains("request timeout")
370
+ || lower.contains("connection timed out")
371
+ || lower.contains("connection timeout")))
372
+ {
373
+ Some("Timeout")
374
+ } else if has_api_context(&lower)
375
+ && (lower.contains("500")
376
+ || lower.contains("502")
377
+ || lower.contains("503")
378
+ || lower.contains("504")
379
+ || lower.contains("fetch failed"))
380
+ {
381
+ Some("NetworkError")
382
+ } else {
383
+ None
384
+ };
385
+ if let Some(class) = class {
386
+ best = Some((
387
+ start,
388
+ class.to_string(),
389
+ window.chars().take(240).collect::<String>(),
390
+ ));
391
+ }
392
+ }
393
+ }
394
+ best.map(|(_, class, snippet)| (class, snippet))
395
+ }
396
+
397
/// Tells whether `lower` mentions something that ties it to an API call.
///
/// The check is a case-sensitive substring search against lower-case markers,
/// so the caller is expected to pass already lower-cased text.
fn has_api_context(lower: &str) -> bool {
    const CONTEXT_MARKERS: [&str; 9] = [
        "api error",
        "http error",
        "httperror",
        "request failed",
        "codex",
        "claude",
        "anthropic",
        "openai",
        "typeerror",
    ];
    CONTEXT_MARKERS.iter().any(|marker| lower.contains(*marker))
}
408
+
409
/// Guesses whether the assistant had started answering before the error appeared.
///
/// Locates the last occurrence of `snippet` in `scrollback` and inspects up to
/// 4000 bytes of text immediately before it (lower-cased) for any of a small
/// set of response hints. Returns `false` when `snippet` does not occur.
fn scrollback_has_partial_response(scrollback: &str, snippet: &str) -> bool {
    let Some(idx) = scrollback.rfind(snippet) else {
        return false;
    };
    // Stepping back a fixed number of bytes can land inside a multi-byte
    // character (terminal scrollback often contains non-ASCII glyphs). Slicing
    // there would fail and make the check silently report `false`, so move
    // forward to the next boundary. `idx` is itself a boundary, which bounds
    // the loop.
    let mut start = idx.saturating_sub(4000);
    while !scrollback.is_char_boundary(start) {
        start += 1;
    }
    let head = scrollback
        .get(start..idx)
        .unwrap_or_default()
        .to_ascii_lowercase();
    [
        "assistant",
        "i'll ",
        "i will ",
        "i'm ",
        "i am ",
        "let me ",
        "> ",
    ]
    .iter()
    .any(|hint| head.contains(*hint))
}
430
+
431
+ fn clear_last_api_error_fingerprint(state: &mut Value) {
432
+ if let Some(coordinator) = get_coordinator_mut(state) {
433
+ if coordinator.get("last_api_error_fingerprint").is_some() {
434
+ coordinator.insert("last_api_error_fingerprint".to_string(), Value::Null);
435
+ }
436
+ }
437
+ }
438
+
439
+ fn leader_receiver_provider(receiver: Option<&Value>) -> Option<Provider> {
440
+ let raw = receiver
441
+ .and_then(|receiver| receiver.get("provider"))
442
+ .and_then(Value::as_str)?;
443
+ serde_json::from_value(Value::String(raw.to_string())).ok()
444
+ }
445
+
446
+ fn provider_name(provider: Provider) -> &'static str {
447
+ match provider {
448
+ Provider::Claude => "claude",
449
+ Provider::ClaudeCode => "claude_code",
450
+ Provider::Codex => "codex",
451
+ Provider::GeminiCli => "gemini_cli",
452
+ Provider::Fake => "fake",
453
+ }
454
+ }
455
+
456
+ fn coordinator_object_mut(state: &mut Value) -> Option<&mut Map<String, Value>> {
457
+ if !state.is_object() {
458
+ *state = json!({});
459
+ }
460
+ let obj = state.as_object_mut()?;
461
+ if !obj.get("coordinator").is_some_and(Value::is_object) {
462
+ obj.insert("coordinator".to_string(), json!({}));
463
+ }
464
+ obj.get_mut("coordinator").and_then(Value::as_object_mut)
465
+ }
466
+
467
+ fn get_coordinator(state: &Value) -> Option<&Map<String, Value>> {
468
+ state.get("coordinator").and_then(Value::as_object)
469
+ }
470
+
471
+ fn get_coordinator_mut(state: &mut Value) -> Option<&mut Map<String, Value>> {
472
+ state.get_mut("coordinator").and_then(Value::as_object_mut)
473
+ }
474
+
475
+ fn object_field_mut<'a>(
476
+ obj: &'a mut Map<String, Value>,
477
+ key: &str,
478
+ ) -> Option<&'a mut Map<String, Value>> {
479
+ if !obj.get(key).is_some_and(Value::is_object) {
480
+ obj.insert(key.to_string(), json!({}));
481
+ }
482
+ obj.get_mut(key).and_then(Value::as_object_mut)
483
+ }
484
+
485
+ fn agent_object_mut<'a>(
486
+ state: &'a mut Value,
487
+ agent_id: &AgentId,
488
+ ) -> Option<&'a mut Map<String, Value>> {
489
+ state
490
+ .get_mut("agents")
491
+ .and_then(Value::as_object_mut)
492
+ .and_then(|agents| agents.get_mut(agent_id.as_str()))
493
+ .and_then(Value::as_object_mut)
494
+ }
495
+
496
/// Returns the last `max_chars` characters of `text`.
///
/// Counts Unicode scalar values rather than bytes; the whole string is
/// returned when it has `max_chars` characters or fewer.
fn tail_chars(text: &str, max_chars: usize) -> String {
    let total = text.chars().count();
    text.chars()
        .skip(total.saturating_sub(max_chars))
        .collect()
}