@team-agent/installer 0.3.9 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.lock CHANGED
@@ -566,7 +566,7 @@ dependencies = [
566
566
 
567
567
  [[package]]
568
568
  name = "team-agent"
569
- version = "0.3.9"
569
+ version = "0.3.10"
570
570
  dependencies = [
571
571
  "anyhow",
572
572
  "chrono",
package/Cargo.toml CHANGED
@@ -9,7 +9,7 @@ members = ["crates/team-agent"]
9
9
 
10
10
  [workspace.package]
11
11
  edition = "2021"
12
- version = "0.3.9"
12
+ version = "0.3.10"
13
13
  license = "AGPL-3.0"
14
14
  rust-version = "1.95"
15
15
 
@@ -258,4 +258,10 @@ fn spine_tick_session_missing_emits_event() {
258
258
  .any(|e| e.get("event").and_then(|v| v.as_str()) == Some("coordinator.session_missing")),
259
259
  "the tmux-missing gate must emit a coordinator.session_missing event before the stop report; got {events:?}"
260
260
  );
261
+ assert!(
262
+ events
263
+ .iter()
264
+ .any(|e| e.get("event").and_then(|v| v.as_str()) == Some("coordinator.session_missing_alert")),
265
+ "the tmux-missing gate must emit an explicit leader-visible alert before stopping; got {events:?}"
266
+ );
261
267
  }
@@ -198,6 +198,7 @@ impl Coordinator {
198
198
  "coordinator.session_missing",
199
199
  serde_json::json!({"session": session_name}),
200
200
  )?;
201
+ notify_session_missing(self.workspace.as_path(), &state, &event_log, session_name)?;
201
202
  return Ok(empty_tick_report(
202
203
  false,
203
204
  true,
@@ -971,7 +972,8 @@ impl Coordinator {
971
972
  "runtime_approval.auto_approved",
972
973
  serde_json::json!({
973
974
  "agent_id": agent_id,
974
- "tool": prompt.tool,
975
+ "server": prompt.server.as_deref(),
976
+ "tool": prompt.tool.as_deref(),
975
977
  "choice": choice,
976
978
  "cleared": cleared,
977
979
  "policy_source": approval_policy.source,
@@ -980,6 +982,23 @@ impl Coordinator {
980
982
  "worker_capability_above_leader": approval_policy.worker_capability_above_leader,
981
983
  }),
982
984
  )?;
985
+ event_log.write(
986
+ "mcp.tool.auto_approved",
987
+ serde_json::json!({
988
+ "agent_id": agent_id,
989
+ "server": prompt.server.as_deref(),
990
+ "tool": prompt.tool.as_deref(),
991
+ "choice": choice,
992
+ "cleared": cleared,
993
+ "inherit_reason": approval_policy.inherit_reason(),
994
+ "bypass_source": approval_policy.source,
995
+ "provider": approval_policy.provider,
996
+ "flag": approval_policy.flag,
997
+ "inherited": approval_policy.inherited,
998
+ "explicit_yes_confirmed": approval_policy.explicit_yes_confirmed,
999
+ "worker_capability_above_leader": approval_policy.worker_capability_above_leader,
1000
+ }),
1001
+ )?;
983
1002
  }
984
1003
  RuntimeApprovalDecision::AwaitingHumanConfirm => {
985
1004
  let Some(reason) = awaiting_human_confirm_reason(&prompt, auto_answer_allowed) else {
@@ -2110,6 +2129,8 @@ struct RuntimeApprovalPolicy {
2110
2129
  source: String,
2111
2130
  inherited: bool,
2112
2131
  explicit_yes_confirmed: bool,
2132
+ provider: Option<String>,
2133
+ flag: Option<String>,
2113
2134
  worker_capability_above_leader: bool,
2114
2135
  }
2115
2136
 
@@ -2127,6 +2148,14 @@ impl RuntimeApprovalPolicy {
2127
2148
  && (!self.worker_capability_above_leader
2128
2149
  || (self.source == "runtime_config" && self.explicit_yes_confirmed))
2129
2150
  }
2151
+
2152
+ fn inherit_reason(&self) -> &'static str {
2153
+ match self.source.as_str() {
2154
+ "leader_process" if self.inherited => "leader_bypass",
2155
+ "runtime_config" if self.explicit_yes_confirmed => "runtime_config_explicit_yes",
2156
+ _ => "none",
2157
+ }
2158
+ }
2130
2159
  }
2131
2160
 
2132
2161
  fn runtime_approval_policy_from_agent(agent: &Value) -> RuntimeApprovalPolicy {
@@ -2151,6 +2180,14 @@ fn runtime_approval_policy_from_agent(agent: &Value) -> RuntimeApprovalPolicy {
2151
2180
  .and_then(|p| p.get("explicit_yes_confirmed"))
2152
2181
  .and_then(Value::as_bool)
2153
2182
  .unwrap_or(false),
2183
+ provider: policy
2184
+ .and_then(|p| p.get("provider"))
2185
+ .and_then(Value::as_str)
2186
+ .map(str::to_string),
2187
+ flag: policy
2188
+ .and_then(|p| p.get("flag"))
2189
+ .and_then(Value::as_str)
2190
+ .map(str::to_string),
2154
2191
  worker_capability_above_leader: policy
2155
2192
  .and_then(|p| p.get("worker_capability_above_leader"))
2156
2193
  .and_then(Value::as_bool)
@@ -2423,3 +2460,48 @@ fn remove_file_if_exists(path: &Path) -> Result<(), std::io::Error> {
2423
2460
  Err(e) => Err(e),
2424
2461
  }
2425
2462
  }
2463
+
2464
+ fn notify_session_missing(
2465
+ workspace: &Path,
2466
+ state: &Value,
2467
+ event_log: &EventLog,
2468
+ session_name: &str,
2469
+ ) -> Result<(), TickError> {
2470
+ let content = format!(
2471
+ "coordinator.session_missing\nerror: tmux session {session_name} is missing; coordinator is stopping\naction: restart the team or recover the missing tmux session\nlog: .team/logs/events.jsonl"
2472
+ );
2473
+ let dedupe_key = format!("coordinator.session_missing:{session_name}");
2474
+ match crate::messaging::send_to_leader_receiver(
2475
+ workspace,
2476
+ state,
2477
+ "leader",
2478
+ &content,
2479
+ None,
2480
+ "coordinator",
2481
+ false,
2482
+ Some(&dedupe_key),
2483
+ event_log,
2484
+ ) {
2485
+ Ok(outcome) => {
2486
+ event_log.write(
2487
+ "coordinator.session_missing_alert",
2488
+ serde_json::json!({
2489
+ "session": session_name,
2490
+ "leader_notification_status": crate::messaging::helpers::status_wire(outcome.status),
2491
+ "message_id": outcome.message_id,
2492
+ }),
2493
+ )?;
2494
+ }
2495
+ Err(error) => {
2496
+ event_log.write(
2497
+ "coordinator.session_missing_alert_failed",
2498
+ serde_json::json!({
2499
+ "session": session_name,
2500
+ "error": error.to_string(),
2501
+ "action": "inspect .team/logs/events.jsonl and restart the team",
2502
+ }),
2503
+ )?;
2504
+ }
2505
+ }
2506
+ Ok(())
2507
+ }
@@ -654,6 +654,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
654
654
  Some(Provider::ClaudeCode)
655
655
  } else if lower.contains("codex") {
656
656
  Some(Provider::Codex)
657
+ } else if lower.contains("copilot") {
658
+ Some(Provider::Copilot)
657
659
  } else if lower.contains("fake") {
658
660
  Some(Provider::Fake)
659
661
  } else {
@@ -1082,3 +1084,20 @@ pub fn detect_dual_state_divergence(
1082
1084
  "team_owner_epoch": team_epoch,
1083
1085
  })))
1084
1086
  }
1087
+
1088
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare `copilot` command and an absolute path ending in `copilot`
    /// must both be recognised as the Copilot provider.
    #[test]
    fn leader_command_provider_recognizes_copilot() {
        for command in ["copilot --allow-all-tools", "/usr/local/bin/copilot"] {
            assert_eq!(leader_command_provider(command), Some(Provider::Copilot));
        }
    }
}
@@ -80,6 +80,18 @@ fn event_named(events: &[Value], name: &str) -> Value {
80
80
  .unwrap_or_else(|| panic!("missing event {name}; got {events:?}"))
81
81
  }
82
82
 
83
/// Copilot must be detected both from a command with flags and from an
/// absolute binary path.
#[test]
fn leader_command_provider_recognizes_copilot() {
    let expected = Some(Provider::Copilot);
    assert_eq!(leader_command_provider("copilot --allow-all-tools"), expected);
    assert_eq!(leader_command_provider("/usr/local/bin/copilot"), expected);
}
94
+
83
95
  fn last_event_named(events: &[Value], name: &str) -> Value {
84
96
  events
85
97
  .iter()
@@ -590,6 +590,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
590
590
  Some(Provider::ClaudeCode)
591
591
  } else if lower.contains("codex") {
592
592
  Some(Provider::Codex)
593
+ } else if lower.contains("copilot") {
594
+ Some(Provider::Copilot)
593
595
  } else if lower.contains("fake") {
594
596
  Some(Provider::Fake)
595
597
  } else {
@@ -288,6 +288,7 @@ fn spawn_agents(
288
288
  let mut env =
289
289
  inherited_env_with_team_overrides(workspace, agent_id_raw, Some(&mcp_team_id));
290
290
  apply_profile_launch_env(&mut env, &profile_launch);
291
+ apply_mcp_auto_approval_env(&mut env, &safety);
291
292
  // Python providers.py:145 + launch/core.py:253 — fresh launch runs the worker
292
293
  // with cwd=workspace, same as the RS fork/add and restart paths.
293
294
  let env_unset: Vec<String> = profile_launch.env_unset.iter().cloned().collect();
@@ -1390,6 +1391,39 @@ pub(crate) fn inherited_env_with_team_overrides(
1390
1391
  env
1391
1392
  }
1392
1393
 
1394
+ pub(crate) fn apply_mcp_auto_approval_env(
1395
+ env: &mut BTreeMap<String, String>,
1396
+ safety: &DangerousApproval,
1397
+ ) {
1398
+ for key in [
1399
+ "TEAM_AGENT_LEADER_BYPASS",
1400
+ "TEAM_AGENT_LEADER_BYPASS_SOURCE",
1401
+ "TEAM_AGENT_LEADER_BYPASS_PROVIDER",
1402
+ "TEAM_AGENT_LEADER_BYPASS_FLAG",
1403
+ "TEAM_AGENT_MCP_AUTO_APPROVE",
1404
+ "TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE",
1405
+ ] {
1406
+ env.remove(key);
1407
+ }
1408
+ if safety.enabled
1409
+ && matches!(safety.source, DangerousApprovalSource::LeaderProcess)
1410
+ && safety.inherited
1411
+ {
1412
+ env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "1".to_string());
1413
+ env.insert("TEAM_AGENT_LEADER_BYPASS_SOURCE".to_string(), "leader_process".to_string());
1414
+ if let Some(provider) = safety.provider.as_deref() {
1415
+ env.insert("TEAM_AGENT_LEADER_BYPASS_PROVIDER".to_string(), provider.to_string());
1416
+ }
1417
+ if let Some(flag) = safety.flag.as_deref() {
1418
+ env.insert("TEAM_AGENT_LEADER_BYPASS_FLAG".to_string(), flag.to_string());
1419
+ }
1420
+ env.insert("TEAM_AGENT_MCP_AUTO_APPROVE".to_string(), "team_orchestrator".to_string());
1421
+ env.insert("TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE".to_string(), "leader_bypass".to_string());
1422
+ } else {
1423
+ env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "0".to_string());
1424
+ }
1425
+ }
1426
+
1393
1427
  /// BUG / B2 灵魂件 + C-1-2 + C-6-1 cr verdict — Copilot per-worker AGENTS.md
1394
1428
  /// 写入 + `COPILOT_CUSTOM_INSTRUCTIONS_DIRS` 注入。
1395
1429
  ///
@@ -2971,6 +3005,7 @@ pub fn fork_agent_with_transport(
2971
3005
  let mut env =
2972
3006
  inherited_env_with_team_overrides(&workspace, as_agent_id.as_str(), Some(&fork_team));
2973
3007
  apply_profile_launch_env(&mut env, &profile_launch);
3008
+ apply_mcp_auto_approval_env(&mut env, &safety);
2974
3009
  // golden operations.py:336 -> _tmux_start_command_for_agent_window (runtime.py:1017-1020): branch on
2975
3010
  // _tmux_session_exists — an ABSENT session => new-session (spawn_first), present => new-window
2976
3011
  // (spawn_into). The Rust restart seam (restart.rs spawn_agent_window) uses the same branch.
@@ -108,17 +108,31 @@ pub(crate) fn start_agent_at_paths(
108
108
  let provider = agent_provider(&agent);
109
109
  let session_id = agent_session_id(&agent);
110
110
  let rollout_path = agent_rollout_path(&agent);
111
- let rollout_exists = rollout_path
111
+ let resume_backing_exists = session_id
112
112
  .as_ref()
113
- .map(|p| p.as_path().exists())
113
+ .map(|session| {
114
+ resume_backing_exists_for_agent(
115
+ workspace,
116
+ agent_id,
117
+ &agent,
118
+ provider,
119
+ session,
120
+ rollout_path.as_ref(),
121
+ )
122
+ })
114
123
  .unwrap_or(false);
115
124
  let start_mode = decide_start_mode(
116
125
  provider_wire(provider),
117
126
  session_id.as_ref(),
118
127
  rollout_path.as_ref(),
119
- rollout_exists,
128
+ resume_backing_exists,
120
129
  allow_fresh,
121
130
  );
131
+ if matches!(start_mode, StartMode::Noop) {
132
+ return Err(LifecycleError::RequirementUnmet(format!(
133
+ "resume_not_ready: session backing store missing for agent {agent_id}; rerun with --allow-fresh to start fresh"
134
+ )));
135
+ }
122
136
  let spawn_session_id = if matches!(start_mode, StartMode::Resumed) {
123
137
  session_id.as_ref()
124
138
  } else {
@@ -122,6 +122,7 @@ pub(super) fn spawn_agent_window(
122
122
  team_id.as_deref(),
123
123
  );
124
124
  crate::lifecycle::launch::apply_profile_launch_env(&mut env, &profile_launch);
125
+ crate::lifecycle::launch::apply_mcp_auto_approval_env(&mut env, safety);
125
126
  let spawn_cwd = spawn_cwd_override
126
127
  .or_else(|| {
127
128
  agent
@@ -242,6 +243,80 @@ pub(super) fn agent_rollout_path(agent: &serde_json::Value) -> Option<RolloutPat
242
243
  .map(RolloutPath::new)
243
244
  }
244
245
 
246
+ pub(super) fn resume_backing_exists_for_agent(
247
+ workspace: &Path,
248
+ agent_id: &AgentId,
249
+ agent: &serde_json::Value,
250
+ provider: Provider,
251
+ session_id: &SessionId,
252
+ rollout_path: Option<&RolloutPath>,
253
+ ) -> bool {
254
+ match provider {
255
+ Provider::Codex => rollout_path_exists(rollout_path),
256
+ Provider::Claude | Provider::ClaudeCode => {
257
+ rollout_path_exists(rollout_path)
258
+ || event_log_transcript_exists(workspace, agent_id.as_str(), session_id.as_str())
259
+ }
260
+ Provider::Copilot => copilot_session_store_has_session(session_id.as_str()),
261
+ Provider::GeminiCli | Provider::Fake => {
262
+ let _ = agent;
263
+ true
264
+ }
265
+ }
266
+ }
267
+
268
+ fn rollout_path_exists(rollout_path: Option<&RolloutPath>) -> bool {
269
+ rollout_path
270
+ .as_ref()
271
+ .is_some_and(|path| path.as_path().exists())
272
+ }
273
+
274
+ fn event_log_transcript_exists(workspace: &Path, agent_id: &str, session_id: &str) -> bool {
275
+ let Ok(events) = crate::event_log::EventLog::new(workspace).tail(0) else {
276
+ return false;
277
+ };
278
+ events.iter().rev().any(|event| {
279
+ event.get("event").and_then(serde_json::Value::as_str) == Some("session.captured")
280
+ && ["agent_id", "worker_id"]
281
+ .iter()
282
+ .any(|key| event.get(*key).and_then(serde_json::Value::as_str) == Some(agent_id))
283
+ && event.get("session_id").and_then(serde_json::Value::as_str) == Some(session_id)
284
+ && event_transcript_path(event).is_some_and(|path| path.exists())
285
+ })
286
+ }
287
+
288
+ fn event_transcript_path(event: &serde_json::Value) -> Option<PathBuf> {
289
+ event
290
+ .get("rollout_path")
291
+ .or_else(|| event.get("transcript_path"))
292
+ .and_then(serde_json::Value::as_str)
293
+ .filter(|path| !path.is_empty())
294
+ .map(PathBuf::from)
295
+ }
296
+
297
+ fn copilot_session_store_has_session(session_id: &str) -> bool {
298
+ let Some(home) = std::env::var_os("HOME").map(PathBuf::from) else {
299
+ return false;
300
+ };
301
+ let db_path = home.join(".copilot").join("session-store.db");
302
+ if !db_path.exists() {
303
+ return false;
304
+ }
305
+ let Ok(conn) = rusqlite::Connection::open_with_flags(
306
+ db_path,
307
+ rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_NO_MUTEX,
308
+ ) else {
309
+ return false;
310
+ };
311
+ conn
312
+ .query_row(
313
+ "select 1 from sessions where id = ?1 limit 1",
314
+ [session_id],
315
+ |_| Ok(()),
316
+ )
317
+ .is_ok()
318
+ }
319
+
245
320
  pub(crate) fn refresh_missing_provider_sessions(
246
321
  state: &mut serde_json::Value,
247
322
  ) -> Result<bool, LifecycleError> {
@@ -1,5 +1,5 @@
1
1
  use super::common::*;
2
- use super::selection::classify_restart_plan;
2
+ use super::selection::classify_restart_plan_with_resume_validation;
3
3
  use super::*;
4
4
 
5
5
  // ── lifecycle::restart —— 整队 Route B resume-or-fresh 重建 ──────────────────
@@ -29,6 +29,7 @@ pub fn restart_with_session_convergence_deadline(
29
29
  team,
30
30
  &crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
31
31
  session_converge_deadline_ms,
32
+ None,
32
33
  )
33
34
  }
34
35
 
@@ -40,6 +41,16 @@ pub fn restart_with_transport(
40
41
  allow_fresh: bool,
41
42
  team: Option<&str>,
42
43
  transport: &dyn crate::transport::Transport,
44
+ ) -> Result<RestartReport, LifecycleError> {
45
+ restart_with_transport_with_readiness_deadline(workspace, allow_fresh, team, transport, None)
46
+ }
47
+
48
+ pub fn restart_with_transport_with_readiness_deadline(
49
+ workspace: &Path,
50
+ allow_fresh: bool,
51
+ team: Option<&str>,
52
+ transport: &dyn crate::transport::Transport,
53
+ readiness_deadline_ms: Option<u64>,
43
54
  ) -> Result<RestartReport, LifecycleError> {
44
55
  match restart_with_transport_with_session_convergence_deadline(
45
56
  workspace,
@@ -47,6 +58,7 @@ pub fn restart_with_transport(
47
58
  team,
48
59
  transport,
49
60
  None,
61
+ readiness_deadline_ms,
50
62
  )? {
51
63
  RestartReport::RefusedResumeNotReady {
52
64
  missing,
@@ -76,6 +88,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
76
88
  team: Option<&str>,
77
89
  transport: &dyn crate::transport::Transport,
78
90
  session_converge_deadline_ms: Option<u64>,
91
+ readiness_deadline_ms: Option<u64>,
79
92
  ) -> Result<RestartReport, LifecycleError> {
80
93
  // RED-2-STILL(P0):入口门必须在 canonical_run_workspace 解析后的路径上判,不用 raw workspace。
81
94
  // 根因:quick-start <dir> 把 .team/runtime/spec 落在 team_workspace(dir)=**parent**/.team;
@@ -166,7 +179,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
166
179
  convergence.missing.iter().cloned().collect()
167
180
  };
168
181
  let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
169
- let plan = classify_restart_plan(&state, allow_fresh)?;
182
+ let plan = classify_restart_plan_with_resume_validation(Some(&selected.run_workspace), &state, allow_fresh)?;
170
183
  write_restart_resume_decision_events(
171
184
  &selected.run_workspace,
172
185
  &state,
@@ -186,7 +199,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
186
199
  return Ok(RestartReport::RefusedResumeAtomicity {
187
200
  unresumable: plan.unresumable,
188
201
  allow_fresh,
189
- error: "restart requires resumable workers before live spawn".to_string(),
202
+ error: "restart requires resumable workers before live spawn; rerun with --allow-fresh to start fresh".to_string(),
190
203
  });
191
204
  }
192
205
  let session_name = state_session_name(&state);
@@ -253,6 +266,15 @@ pub fn restart_with_transport_with_session_convergence_deadline(
253
266
  crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
254
267
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
255
268
  let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
269
+ wait_restart_readiness_or_timeout(
270
+ &selected.run_workspace,
271
+ &state,
272
+ &session_name,
273
+ &plan.decisions,
274
+ transport,
275
+ restart_readiness_deadline(readiness_deadline_ms),
276
+ restart_readiness_poll_interval(),
277
+ )?;
256
278
  let attach_commands = crate::tmux_backend::attach_commands_for_windows(
257
279
  &selected.run_workspace,
258
280
  &session_name,
@@ -455,6 +477,182 @@ fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
455
477
  }
456
478
  }
457
479
 
480
+ fn restart_readiness_deadline(requested_ms: Option<u64>) -> std::time::Duration {
481
+ requested_ms.map(std::time::Duration::from_millis).unwrap_or_else(|| {
482
+ env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_DEADLINE_MS"], 30_000)
483
+ })
484
+ }
485
+
486
+ fn restart_readiness_poll_interval() -> std::time::Duration {
487
+ env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_POLL_MS"], 200)
488
+ }
489
+
490
/// Snapshot of the three conditions a restart waits for.
///
/// Derives `PartialEq`/`Eq` so two snapshots can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RestartReadiness {
    /// The tmux session was observed as live.
    session_created: bool,
    /// Every restarted worker's pane could be addressed.
    worker_pane_addressable: bool,
    /// The coordinator reported healthy.
    coordinator_alive: bool,
}

impl RestartReadiness {
    /// Returns `true` only when all three conditions hold.
    fn ready(self) -> bool {
        self.session_created && self.worker_pane_addressable && self.coordinator_alive
    }
}
502
+
503
+ fn wait_restart_readiness_or_timeout(
504
+ workspace: &Path,
505
+ state: &serde_json::Value,
506
+ session_name: &SessionName,
507
+ decisions: &[RestartedAgent],
508
+ transport: &dyn crate::transport::Transport,
509
+ deadline: std::time::Duration,
510
+ poll_interval: std::time::Duration,
511
+ ) -> Result<(), LifecycleError> {
512
+ let started = std::time::Instant::now();
513
+ loop {
514
+ let readiness = restart_readiness(workspace, state, session_name, decisions, transport);
515
+ if readiness.ready() {
516
+ return Ok(());
517
+ }
518
+ let elapsed = started.elapsed();
519
+ if elapsed >= deadline {
520
+ write_restart_readiness_timeout_event(workspace, readiness, deadline, elapsed)?;
521
+ return Err(LifecycleError::RequirementUnmet(restart_readiness_timeout_message(
522
+ workspace, readiness, deadline,
523
+ )));
524
+ }
525
+ std::thread::sleep(std::cmp::min(poll_interval, deadline.saturating_sub(elapsed)));
526
+ }
527
+ }
528
+
529
+ fn restart_readiness(
530
+ workspace: &Path,
531
+ state: &serde_json::Value,
532
+ session_name: &SessionName,
533
+ decisions: &[RestartedAgent],
534
+ transport: &dyn crate::transport::Transport,
535
+ ) -> RestartReadiness {
536
+ let session_created = session_live_or_default(transport, session_name, false);
537
+ let worker_pane_addressable = restart_worker_panes_addressable(state, decisions, transport);
538
+ let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
539
+ let coordinator_alive =
540
+ crate::coordinator::coordinator_health(&coordinator_workspace).ok && session_created;
541
+ RestartReadiness { session_created, worker_pane_addressable, coordinator_alive }
542
+ }
543
+
544
+ fn restart_worker_panes_addressable(
545
+ state: &serde_json::Value,
546
+ decisions: &[RestartedAgent],
547
+ transport: &dyn crate::transport::Transport,
548
+ ) -> bool {
549
+ if decisions.is_empty() {
550
+ return true;
551
+ }
552
+ decisions.iter().all(|decision| {
553
+ let Some(pane_id) = state
554
+ .get("agents")
555
+ .and_then(|agents| agents.get(decision.agent_id.as_str()))
556
+ .and_then(|agent| agent.get("pane_id"))
557
+ .and_then(serde_json::Value::as_str)
558
+ .filter(|pane| !pane.is_empty())
559
+ .map(crate::transport::PaneId::new)
560
+ else {
561
+ return false;
562
+ };
563
+ pane_addressable(transport, &pane_id)
564
+ })
565
+ }
566
+
567
+ fn pane_addressable(
568
+ transport: &dyn crate::transport::Transport,
569
+ pane_id: &crate::transport::PaneId,
570
+ ) -> bool {
571
+ match transport.has_pane(pane_id) {
572
+ Ok(Some(present)) => present,
573
+ Ok(None) | Err(_) => {
574
+ transport
575
+ .list_targets()
576
+ .map(|targets| targets.iter().any(|pane| pane.pane_id == *pane_id))
577
+ .unwrap_or(false)
578
+ || transport
579
+ .liveness(pane_id)
580
+ .map(|state| state == crate::transport::PaneLiveness::Live)
581
+ .unwrap_or(false)
582
+ }
583
+ }
584
+ }
585
+
586
+ fn write_restart_readiness_timeout_event(
587
+ workspace: &Path,
588
+ readiness: RestartReadiness,
589
+ deadline: std::time::Duration,
590
+ elapsed: std::time::Duration,
591
+ ) -> Result<(), LifecycleError> {
592
+ crate::event_log::EventLog::new(workspace)
593
+ .write(
594
+ "restart.readiness_timeout",
595
+ serde_json::json!({
596
+ "tmux_session_created": readiness.session_created,
597
+ "worker_pane_addressable": readiness.worker_pane_addressable,
598
+ "coordinator_alive": readiness.coordinator_alive,
599
+ "deadline_ms": deadline.as_millis(),
600
+ "elapsed_ms": elapsed.as_millis(),
601
+ "coordinator_log": crate::coordinator::coordinator_log_path(
602
+ &crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
603
+ ).display().to_string(),
604
+ "state_path": crate::state::persist::runtime_state_path(workspace).display().to_string(),
605
+ "pid_path": crate::coordinator::coordinator_pid_path(
606
+ &crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
607
+ ).display().to_string(),
608
+ }),
609
+ )
610
+ .map(|_| ())
611
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))
612
+ }
613
+
614
+ fn restart_readiness_timeout_message(
615
+ workspace: &Path,
616
+ readiness: RestartReadiness,
617
+ deadline: std::time::Duration,
618
+ ) -> String {
619
+ let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
620
+ let deadline_s = deadline.as_secs_f64();
621
+ format!(
622
+ "restart not ready within {deadline_s:.1}s: {missing}\n\
623
+ - tmux session created: {session}\n\
624
+ - worker pane addressable: {pane}\n\
625
+ - coordinator alive: {coordinator}\n\
626
+ Action: check coordinator log {log}, then `team-agent restart <agent> --allow-fresh` or `team-agent diagnose`\n\
627
+ Log: coordinator_log={log} state={state} pid_file={pid}",
628
+ missing = restart_readiness_missing_summary(readiness),
629
+ session = yes_no(readiness.session_created),
630
+ pane = yes_no(readiness.worker_pane_addressable),
631
+ coordinator = yes_no(readiness.coordinator_alive),
632
+ log = crate::coordinator::coordinator_log_path(&coordinator_workspace).display(),
633
+ state = crate::state::persist::runtime_state_path(workspace).display(),
634
+ pid = crate::coordinator::coordinator_pid_path(&coordinator_workspace).display(),
635
+ )
636
+ }
637
+
638
+ fn restart_readiness_missing_summary(readiness: RestartReadiness) -> String {
639
+ let mut missing = Vec::new();
640
+ if !readiness.session_created {
641
+ missing.push("tmux session created");
642
+ }
643
+ if !readiness.worker_pane_addressable {
644
+ missing.push("worker pane addressable");
645
+ }
646
+ if !readiness.coordinator_alive {
647
+ missing.push("coordinator alive");
648
+ }
649
+ missing.join(", ")
650
+ }
651
+
652
/// Renders a boolean as the literal `"yes"` or `"no"`.
fn yes_no(value: bool) -> &'static str {
    match value {
        true => "yes",
        false => "no",
    }
}
655
+
458
656
  fn verify_spawned_agent_live(
459
657
  _agent_id: &AgentId,
460
658
  _spawn: &SpawnedAgentWindow,