@team-agent/installer 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/Cargo.lock +34 -1
  2. package/Cargo.toml +1 -1
  3. package/crates/team-agent/Cargo.toml +1 -1
  4. package/crates/team-agent/src/cli/adapters.rs +196 -19
  5. package/crates/team-agent/src/cli/diagnose.rs +145 -11
  6. package/crates/team-agent/src/cli/emit.rs +287 -53
  7. package/crates/team-agent/src/cli/leader.rs +37 -8
  8. package/crates/team-agent/src/cli/mod.rs +807 -316
  9. package/crates/team-agent/src/cli/status_port.rs +25 -2
  10. package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
  11. package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
  12. package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
  13. package/crates/team-agent/src/cli/tests/run_delegation.rs +57 -3
  14. package/crates/team-agent/src/cli/types.rs +17 -0
  15. package/crates/team-agent/src/compiler/tests.rs +2 -2
  16. package/crates/team-agent/src/compiler.rs +16 -6
  17. package/crates/team-agent/src/coordinator/health.rs +89 -20
  18. package/crates/team-agent/src/coordinator/mod.rs +4 -0
  19. package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
  20. package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
  21. package/crates/team-agent/src/coordinator/tests/watch.rs +4 -2
  22. package/crates/team-agent/src/coordinator/tick.rs +222 -69
  23. package/crates/team-agent/src/coordinator/types.rs +15 -3
  24. package/crates/team-agent/src/db/schema.rs +37 -2
  25. package/crates/team-agent/src/diagnose/comms.rs +226 -0
  26. package/crates/team-agent/src/diagnose/mod.rs +45 -0
  27. package/crates/team-agent/src/diagnose/orphans.rs +658 -0
  28. package/crates/team-agent/src/fake_worker.rs +146 -3
  29. package/crates/team-agent/src/leader/start.rs +121 -23
  30. package/crates/team-agent/src/leader/types.rs +44 -1
  31. package/crates/team-agent/src/lib.rs +3 -0
  32. package/crates/team-agent/src/lifecycle/display.rs +648 -50
  33. package/crates/team-agent/src/lifecycle/launch.rs +1048 -264
  34. package/crates/team-agent/src/lifecycle/mod.rs +3 -0
  35. package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
  36. package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
  37. package/crates/team-agent/src/lifecycle/restart/agent.rs +113 -26
  38. package/crates/team-agent/src/lifecycle/restart/common.rs +189 -102
  39. package/crates/team-agent/src/lifecycle/restart/rebuild.rs +465 -25
  40. package/crates/team-agent/src/lifecycle/restart/remove.rs +22 -6
  41. package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
  42. package/crates/team-agent/src/lifecycle/restart.rs +4 -1
  43. package/crates/team-agent/src/lifecycle/tests/core.rs +4 -4
  44. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
  45. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +39 -9
  46. package/crates/team-agent/src/lifecycle/types.rs +23 -0
  47. package/crates/team-agent/src/lifecycle/worker_command_context.rs +326 -0
  48. package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
  49. package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
  50. package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
  51. package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
  52. package/crates/team-agent/src/mcp_server/mod.rs +3 -74
  53. package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
  54. package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
  55. package/crates/team-agent/src/mcp_server/tools.rs +312 -111
  56. package/crates/team-agent/src/mcp_server/types.rs +6 -4
  57. package/crates/team-agent/src/mcp_server/wire.rs +19 -7
  58. package/crates/team-agent/src/message_store.rs +21 -4
  59. package/crates/team-agent/src/messaging/delivery.rs +87 -37
  60. package/crates/team-agent/src/messaging/mod.rs +9 -6
  61. package/crates/team-agent/src/messaging/results.rs +153 -16
  62. package/crates/team-agent/src/messaging/selftest.rs +199 -12
  63. package/crates/team-agent/src/messaging/send.rs +35 -3
  64. package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
  65. package/crates/team-agent/src/messaging/types.rs +11 -3
  66. package/crates/team-agent/src/os_probe.rs +119 -0
  67. package/crates/team-agent/src/packaging/migrate.rs +10 -2
  68. package/crates/team-agent/src/packaging/tests.rs +23 -0
  69. package/crates/team-agent/src/provider/adapter.rs +483 -67
  70. package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
  71. package/crates/team-agent/src/provider/classify.rs +51 -4
  72. package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
  73. package/crates/team-agent/src/provider/types.rs +47 -0
  74. package/crates/team-agent/src/session_capture.rs +616 -0
  75. package/crates/team-agent/src/state/persist.rs +57 -0
  76. package/crates/team-agent/src/state/projection.rs +32 -23
  77. package/crates/team-agent/src/state/selector.rs +5 -2
  78. package/crates/team-agent/src/tmux_backend.rs +151 -60
  79. package/crates/team-agent/src/transport/test_support.rs +9 -0
  80. package/crates/team-agent/src/transport/tests/wire.rs +4 -0
  81. package/crates/team-agent/src/transport.rs +13 -2
  82. package/package.json +4 -4
@@ -1,6 +1,6 @@
1
- use super::*;
2
1
  use super::common::*;
3
2
  use super::selection::classify_restart_plan;
3
+ use super::*;
4
4
 
5
5
  // ── lifecycle::restart —— 整队 Route B resume-or-fresh 重建 ──────────────────
6
6
 
@@ -12,13 +12,23 @@ pub fn restart(
12
12
  workspace: &Path,
13
13
  allow_fresh: bool,
14
14
  team: Option<&str>,
15
+ ) -> Result<RestartReport, LifecycleError> {
16
+ restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
17
+ }
18
+
19
+ pub fn restart_with_session_convergence_deadline(
20
+ workspace: &Path,
21
+ allow_fresh: bool,
22
+ team: Option<&str>,
23
+ session_converge_deadline_ms: Option<u64>,
15
24
  ) -> Result<RestartReport, LifecycleError> {
16
25
  let run_ws = lifecycle_run_workspace(workspace)?;
17
- restart_with_transport(
26
+ restart_with_transport_with_session_convergence_deadline(
18
27
  workspace,
19
28
  allow_fresh,
20
29
  team,
21
30
  &crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
31
+ session_converge_deadline_ms,
22
32
  )
23
33
  }
24
34
 
@@ -30,6 +40,42 @@ pub fn restart_with_transport(
30
40
  allow_fresh: bool,
31
41
  team: Option<&str>,
32
42
  transport: &dyn crate::transport::Transport,
43
+ ) -> Result<RestartReport, LifecycleError> {
44
+ match restart_with_transport_with_session_convergence_deadline(
45
+ workspace,
46
+ allow_fresh,
47
+ team,
48
+ transport,
49
+ None,
50
+ )? {
51
+ RestartReport::RefusedResumeNotReady {
52
+ missing,
53
+ allow_fresh,
54
+ error,
55
+ ..
56
+ } => Ok(RestartReport::RefusedResumeAtomicity {
57
+ unresumable: missing
58
+ .into_iter()
59
+ .map(|agent_id| UnresumableWorker {
60
+ agent_id,
61
+ reason: "session_capture_incomplete".to_string(),
62
+ session_id: None,
63
+ first_send_at: None,
64
+ })
65
+ .collect(),
66
+ allow_fresh,
67
+ error,
68
+ }),
69
+ report => Ok(report),
70
+ }
71
+ }
72
+
73
+ pub fn restart_with_transport_with_session_convergence_deadline(
74
+ workspace: &Path,
75
+ allow_fresh: bool,
76
+ team: Option<&str>,
77
+ transport: &dyn crate::transport::Transport,
78
+ session_converge_deadline_ms: Option<u64>,
33
79
  ) -> Result<RestartReport, LifecycleError> {
34
80
  if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
35
81
  return Err(LifecycleError::TeamSelect(format!(
@@ -55,18 +101,62 @@ pub fn restart_with_transport(
55
101
  .map_err(|e| LifecycleError::TeamSelect(e.to_string()))?;
56
102
  let mut state = selected.state;
57
103
  crate::lifecycle::launch::ensure_owner_allowed_for_state(&state, None)?;
58
- let spec_workspace = selected
59
- .spec_workspace
60
- .as_ref()
61
- .ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
104
+ let spec_workspace = selected.spec_workspace.as_ref().ok_or_else(|| {
105
+ LifecycleError::TeamSelect("active team spec workspace not found".to_string())
106
+ })?;
62
107
  let spec = load_team_spec(spec_workspace)?;
63
108
  let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
64
- if refresh_missing_provider_sessions(&mut state)? {
109
+ let mut convergence = converge_missing_provider_sessions(
110
+ &mut state,
111
+ session_convergence_deadline(session_converge_deadline_ms),
112
+ session_convergence_poll_interval(),
113
+ &selected.run_workspace,
114
+ allow_fresh,
115
+ )?;
116
+ if convergence.converged && convergence.changed {
117
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
118
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
119
+ }
120
+ if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
121
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
122
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
123
+ let missing_after_repair = restart_required_missing_session_agent_ids(&state);
124
+ convergence.changed = true;
125
+ convergence.converged = missing_after_repair.is_empty();
126
+ convergence.missing = missing_after_repair;
127
+ }
128
+ if !convergence.converged && !allow_fresh {
129
+ return Ok(RestartReport::RefusedResumeNotReady {
130
+ missing: convergence
131
+ .missing
132
+ .iter()
133
+ .map(|agent_id| AgentId::new(agent_id.clone()))
134
+ .collect(),
135
+ allow_fresh,
136
+ deadline: convergence.deadline,
137
+ elapsed: convergence.elapsed,
138
+ error: "resume_not_ready: session_capture_incomplete".to_string(),
139
+ });
140
+ }
141
+ if !convergence.converged && convergence.changed {
65
142
  crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
66
143
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
67
144
  }
145
+ let forced_fresh_missing = if convergence.converged {
146
+ std::collections::BTreeSet::new()
147
+ } else {
148
+ convergence.missing.iter().cloned().collect()
149
+ };
150
+ let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
68
151
  let plan = classify_restart_plan(&state, allow_fresh)?;
69
- write_restart_resume_decision_events(&selected.run_workspace, &state, allow_fresh, &plan.decisions)?;
152
+ write_restart_resume_decision_events(
153
+ &selected.run_workspace,
154
+ &state,
155
+ allow_fresh,
156
+ &plan.decisions,
157
+ &forced_fresh_missing,
158
+ forced_fresh_convergence.as_ref(),
159
+ )?;
70
160
  if !plan.corrupt_entries.is_empty() {
71
161
  return Ok(RestartReport::RefusedInvalidFirstSendAt {
72
162
  invalid: plan.corrupt_entries,
@@ -86,8 +176,13 @@ pub fn restart_with_transport(
86
176
  transport
87
177
  .kill_session(&session_name)
88
178
  .map_err(|e| LifecycleError::Transport(e.to_string()))?;
179
+ mark_leader_receiver_rebind_required(&mut state, &session_name);
180
+ mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
181
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
182
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
89
183
  }
90
- for (idx, decision) in plan.decisions.iter().enumerate() {
184
+ let mut last_spawned: Option<AgentId> = None;
185
+ for decision in &plan.decisions {
91
186
  let agent = state
92
187
  .get("agents")
93
188
  .and_then(|v| v.get(decision.agent_id.as_str()))
@@ -96,36 +191,355 @@ pub fn restart_with_transport(
96
191
  "agent {} not found for restart",
97
192
  decision.agent_id
98
193
  ))
99
- })?;
194
+ })?
195
+ .clone();
100
196
  let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
101
197
  decision.session_id.as_ref()
102
198
  } else {
103
199
  None
104
200
  };
105
- let _ = spawn_agent_window(
201
+ let session_live = session_live_or_default(transport, &session_name, false);
202
+ if !session_live {
203
+ if let Some(previous) = &last_spawned {
204
+ return Err(LifecycleError::Transport(format!(
205
+ "session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
206
+ previous,
207
+ session_name.as_str(),
208
+ decision.agent_id
209
+ )));
210
+ }
211
+ }
212
+ let spawn = spawn_agent_window(
106
213
  &selected.run_workspace,
107
214
  &session_name,
108
215
  &decision.agent_id,
109
- agent,
216
+ &agent,
110
217
  session_id,
111
- idx > 0,
218
+ session_live,
112
219
  transport,
113
220
  Some(&safety),
221
+ Some(spec_workspace),
114
222
  )?;
223
+ verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
224
+ mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
225
+ last_spawned = Some(decision.agent_id.clone());
226
+ if let Some(agent) = state
227
+ .get_mut("agents")
228
+ .and_then(serde_json::Value::as_object_mut)
229
+ .and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
230
+ .and_then(serde_json::Value::as_object_mut)
231
+ {
232
+ persist_effective_approval_policy_for_restart(agent, &safety);
233
+ }
115
234
  }
235
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
236
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
116
237
  let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
238
+ let attach_commands = crate::tmux_backend::attach_commands_for_windows(
239
+ &selected.run_workspace,
240
+ &session_name,
241
+ plan.decisions
242
+ .iter()
243
+ .map(|decision| decision.agent_id.as_str()),
244
+ );
245
+ let next_actions = attach_commands.clone();
117
246
  Ok(RestartReport::Restarted {
118
247
  session_name,
119
248
  agents: plan.decisions,
120
249
  coordinator_started,
250
+ next_actions,
251
+ attach_commands,
121
252
  })
122
253
  }
123
254
 
255
+ fn repair_resume_sessions_from_event_log(
256
+ workspace: &Path,
257
+ state: &mut serde_json::Value,
258
+ ) -> Result<bool, LifecycleError> {
259
+ let agent_ids = state
260
+ .get("agents")
261
+ .and_then(serde_json::Value::as_object)
262
+ .map(|agents| agents.keys().cloned().collect::<Vec<_>>())
263
+ .unwrap_or_default();
264
+ let mut changed = false;
265
+ for agent_id in agent_ids {
266
+ let previous = state
267
+ .get("agents")
268
+ .and_then(|agents| agents.get(&agent_id))
269
+ .cloned()
270
+ .unwrap_or(serde_json::Value::Null);
271
+ if previous
272
+ .get("session_id")
273
+ .and_then(serde_json::Value::as_str)
274
+ .is_some_and(|session| !session.is_empty())
275
+ {
276
+ continue;
277
+ }
278
+ let Some(provider) = previous
279
+ .get("provider")
280
+ .and_then(serde_json::Value::as_str)
281
+ .and_then(parse_provider)
282
+ else {
283
+ continue;
284
+ };
285
+ let auth_mode = previous
286
+ .get("auth_mode")
287
+ .and_then(serde_json::Value::as_str)
288
+ .and_then(parse_auth_mode)
289
+ .unwrap_or(AuthMode::Subscription);
290
+ let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
291
+ let adapter = crate::provider::get_adapter(provider);
292
+ let repaired = crate::session_capture::recover_resume_session_from_events(
293
+ workspace,
294
+ &agent_id,
295
+ &previous,
296
+ adapter.as_ref(),
297
+ auth_mode,
298
+ &exclude_session_ids,
299
+ )
300
+ .map_err(|e| LifecycleError::Provider(e.to_string()))?;
301
+ let Some(repaired) = repaired else {
302
+ continue;
303
+ };
304
+ let old_session_id = previous
305
+ .get("session_id")
306
+ .and_then(serde_json::Value::as_str)
307
+ .filter(|session| !session.is_empty())
308
+ .map(str::to_string);
309
+ let session_id = repaired
310
+ .get("session_id")
311
+ .and_then(serde_json::Value::as_str)
312
+ .filter(|session| !session.is_empty())
313
+ .map(str::to_string);
314
+ let rollout_path = repaired
315
+ .get("rollout_path")
316
+ .and_then(serde_json::Value::as_str)
317
+ .filter(|path| !path.is_empty())
318
+ .map(str::to_string);
319
+ if let Some(agent) = state
320
+ .get_mut("agents")
321
+ .and_then(serde_json::Value::as_object_mut)
322
+ .and_then(|agents| agents.get_mut(&agent_id))
323
+ {
324
+ *agent = repaired.clone();
325
+ }
326
+ crate::event_log::EventLog::new(workspace)
327
+ .write(
328
+ "resume.session_repaired",
329
+ serde_json::json!({
330
+ "agent_id": agent_id,
331
+ "provider": provider_wire(provider),
332
+ "old_session_id": old_session_id,
333
+ "session_id": session_id,
334
+ "rollout_path": rollout_path,
335
+ "captured_via": "event_log_repair",
336
+ "attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
337
+ }),
338
+ )
339
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
340
+ changed = true;
341
+ }
342
+ Ok(changed)
343
+ }
344
+
345
+ fn claimed_session_ids_except(
346
+ state: &serde_json::Value,
347
+ current_agent_id: &str,
348
+ ) -> std::collections::BTreeSet<String> {
349
+ state
350
+ .get("agents")
351
+ .and_then(serde_json::Value::as_object)
352
+ .map(|agents| {
353
+ agents
354
+ .iter()
355
+ .filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
356
+ .filter_map(|(_, agent)| {
357
+ agent
358
+ .get("session_id")
359
+ .and_then(serde_json::Value::as_str)
360
+ .filter(|session| !session.is_empty())
361
+ .map(str::to_string)
362
+ })
363
+ .collect()
364
+ })
365
+ .unwrap_or_default()
366
+ }
367
+
368
+ fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
369
+ if let Some(ms) = requested_ms {
370
+ return std::time::Duration::from_millis(ms);
371
+ }
372
+ env_duration_ms(
373
+ &[
374
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
375
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
376
+ "TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
377
+ "TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
378
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
379
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
380
+ "TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
381
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
382
+ "TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
383
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
384
+ "TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
385
+ "TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
386
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
387
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
388
+ ],
389
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
390
+ )
391
+ }
392
+
393
+ fn session_convergence_poll_interval() -> std::time::Duration {
394
+ env_duration_ms(
395
+ &[
396
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
397
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
398
+ "TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
399
+ "TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
400
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
401
+ "TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
402
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
403
+ ],
404
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
405
+ )
406
+ }
407
+
408
+ fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
409
+ let ms = names
410
+ .iter()
411
+ .find_map(|name| {
412
+ std::env::var(name)
413
+ .ok()
414
+ .and_then(|value| parse_duration_value_ms(&value))
415
+ .or_else(|| {
416
+ name.strip_suffix("_MS").and_then(|prefix| {
417
+ std::env::var(prefix)
418
+ .ok()
419
+ .and_then(|value| parse_duration_value_seconds_ms(&value))
420
+ })
421
+ })
422
+ })
423
+ .unwrap_or(default_ms);
424
+ std::time::Duration::from_millis(ms)
425
+ }
426
+
427
+ fn parse_duration_value_ms(value: &str) -> Option<u64> {
428
+ value.parse::<u64>().ok()
429
+ }
430
+
431
+ fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
432
+ let seconds = value.parse::<f64>().ok()?;
433
+ if seconds.is_finite() && seconds >= 0.0 {
434
+ Some((seconds * 1000.0).round() as u64)
435
+ } else {
436
+ None
437
+ }
438
+ }
439
+
440
+ fn verify_spawned_agent_live(
441
+ _agent_id: &AgentId,
442
+ _spawn: &SpawnedAgentWindow,
443
+ _transport: &dyn crate::transport::Transport,
444
+ ) -> Result<(), LifecycleError> {
445
+ Ok(())
446
+ }
447
+
448
+ fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
449
+ let Some(receiver) = state
450
+ .get_mut("leader_receiver")
451
+ .and_then(serde_json::Value::as_object_mut)
452
+ else {
453
+ return;
454
+ };
455
+ let same_session = receiver
456
+ .get("session_name")
457
+ .and_then(|v| v.as_str())
458
+ .map(|session| session == session_name.as_str())
459
+ .unwrap_or(true);
460
+ if !same_session {
461
+ return;
462
+ }
463
+ if receiver
464
+ .get("status")
465
+ .and_then(|v| v.as_str())
466
+ .is_some_and(|status| status == "attached")
467
+ {
468
+ receiver.insert("status".to_string(), serde_json::json!("rebind_required"));
469
+ }
470
+ }
471
+
472
+ fn mark_restart_targets_stopped_after_teardown(
473
+ state: &mut serde_json::Value,
474
+ decisions: &[RestartedAgent],
475
+ ) {
476
+ let Some(agents) = state
477
+ .get_mut("agents")
478
+ .and_then(serde_json::Value::as_object_mut)
479
+ else {
480
+ return;
481
+ };
482
+ for decision in decisions {
483
+ let Some(agent) = agents
484
+ .get_mut(decision.agent_id.as_str())
485
+ .and_then(serde_json::Value::as_object_mut)
486
+ else {
487
+ continue;
488
+ };
489
+ agent.insert("status".to_string(), serde_json::json!("stopped"));
490
+ agent.remove("pane_id");
491
+ agent.remove("pane_pid");
492
+ }
493
+ }
494
+
495
+ fn mark_agent_respawned(
496
+ state: &mut serde_json::Value,
497
+ agent_id: &AgentId,
498
+ spawn: &SpawnedAgentWindow,
499
+ transport: &dyn crate::transport::Transport,
500
+ safety: &DangerousApproval,
501
+ ) -> Result<(), LifecycleError> {
502
+ let Some(agent) = state
503
+ .get_mut("agents")
504
+ .and_then(serde_json::Value::as_object_mut)
505
+ .and_then(|agents| agents.get_mut(agent_id.as_str()))
506
+ .and_then(serde_json::Value::as_object_mut)
507
+ else {
508
+ return Err(LifecycleError::StatePersist(format!(
509
+ "agent {} state is not an object",
510
+ agent_id
511
+ )));
512
+ };
513
+ agent.insert("status".to_string(), serde_json::json!("running"));
514
+ agent.insert(
515
+ "pane_id".to_string(),
516
+ serde_json::json!(spawn.spawn.pane_id.as_str()),
517
+ );
518
+ let pane_pid = spawn.spawn.child_pid.or_else(|| {
519
+ transport
520
+ .list_targets()
521
+ .unwrap_or_default()
522
+ .into_iter()
523
+ .find(|pane| pane.pane_id == spawn.spawn.pane_id)
524
+ .and_then(|pane| pane.pane_pid)
525
+ });
526
+ if let Some(pane_pid) = pane_pid {
527
+ agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
528
+ }
529
+ crate::lifecycle::launch::persist_command_plan_state(agent, &spawn.plan, &spawn.profile_launch);
530
+ persist_effective_approval_policy_for_restart(agent, safety);
531
+ agent.remove("startup_prompts");
532
+ agent.remove("startup_prompt_status");
533
+ Ok(())
534
+ }
535
+
124
536
  fn write_restart_resume_decision_events(
125
537
  workspace: &Path,
126
538
  state: &serde_json::Value,
127
539
  allow_fresh: bool,
128
540
  decisions: &[RestartedAgent],
541
+ forced_fresh_missing: &std::collections::BTreeSet<String>,
542
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
129
543
  ) -> Result<(), LifecycleError> {
130
544
  for decision in decisions {
131
545
  let agent = state
@@ -150,6 +564,8 @@ fn write_restart_resume_decision_events(
150
564
  session_id,
151
565
  allow_fresh,
152
566
  decision_wire,
567
+ forced_fresh_missing.contains(decision.agent_id.as_str()),
568
+ forced_fresh_convergence,
153
569
  )?;
154
570
  }
155
571
  Ok(())
@@ -162,15 +578,16 @@ fn write_restart_resume_decision_event(
162
578
  session_id: Option<String>,
163
579
  allow_fresh: bool,
164
580
  decision: &str,
581
+ forced_fresh: bool,
582
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
165
583
  ) -> Result<(), LifecycleError> {
166
584
  use std::io::Write as _;
167
585
 
168
586
  let path = workspace.join(".team").join("logs").join("events.jsonl");
169
587
  if let Some(parent) = path.parent() {
170
- std::fs::create_dir_all(parent)
171
- .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
588
+ std::fs::create_dir_all(parent).map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
172
589
  }
173
- let event = serde_json::json!({
590
+ let mut event = serde_json::json!({
174
591
  "ts": chrono::Utc::now().to_rfc3339(),
175
592
  "event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
176
593
  "worker_id": worker_id,
@@ -181,8 +598,26 @@ fn write_restart_resume_decision_event(
181
598
  "first_send_at": first_send_at,
182
599
  "session_id": session_id,
183
600
  });
184
- let line = serde_json::to_string(&event)
185
- .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
601
+ if forced_fresh {
602
+ if let Some(event) = event.as_object_mut() {
603
+ event.insert("forced_fresh".to_string(), serde_json::json!(true));
604
+ event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
605
+ if let Some(convergence) = forced_fresh_convergence {
606
+ event.insert(
607
+ "session_convergence".to_string(),
608
+ serde_json::json!({
609
+ "complete": false,
610
+ "deadline_s": convergence.deadline.as_secs_f64(),
611
+ "deadline_ms": convergence.deadline.as_millis(),
612
+ "elapsed_ms": convergence.elapsed.as_millis(),
613
+ "pending_agent_ids": convergence.missing.clone(),
614
+ }),
615
+ );
616
+ }
617
+ }
618
+ }
619
+ let line =
620
+ serde_json::to_string(&event).map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
186
621
  let mut file = std::fs::OpenOptions::new()
187
622
  .create(true)
188
623
  .append(true)
@@ -233,7 +668,10 @@ pub fn select_restart_state(
233
668
  .get("active_team_key")
234
669
  .and_then(serde_json::Value::as_str)
235
670
  .filter(|s| !s.is_empty())
236
- .map_or_else(|| crate::state::projection::team_state_key(&selected), str::to_string);
671
+ .map_or_else(
672
+ || crate::state::projection::team_state_key(&selected),
673
+ str::to_string,
674
+ );
237
675
  Ok(restart_candidate_from_state(workspace, &key, &selected))
238
676
  }
239
677
 
@@ -299,12 +737,14 @@ fn restart_candidate_has_context(state: &serde_json::Value) -> bool {
299
737
  .and_then(serde_json::Value::as_object)
300
738
  .is_some_and(|agents| {
301
739
  agents.values().any(|agent| {
302
- ["session_id", "rollout_path", "first_send_at"].iter().any(|key| {
303
- agent
304
- .get(*key)
305
- .and_then(serde_json::Value::as_str)
306
- .is_some_and(|s| !s.is_empty())
307
- })
740
+ ["session_id", "rollout_path", "first_send_at"]
741
+ .iter()
742
+ .any(|key| {
743
+ agent
744
+ .get(*key)
745
+ .and_then(serde_json::Value::as_str)
746
+ .is_some_and(|s| !s.is_empty())
747
+ })
308
748
  })
309
749
  })
310
750
  }
@@ -230,6 +230,7 @@ fn remove_agent_inner(
230
230
  "agent_health",
231
231
  None,
232
232
  )?;
233
+ maybe_fail_remove_after_agent_health_delete()?;
233
234
  Ok(RemoveSuccess {
234
235
  outcome: RemoveAgentOutcome::Removed {
235
236
  agent_id: agent_id.clone(),
@@ -585,15 +586,16 @@ fn select_agent_health(
585
586
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
586
587
  let row = conn
587
588
  .query_row(
588
- "select status, last_output_at, context_usage_pct, current_task_id \
589
+ "select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
589
590
  from agent_health where agent_id = ?1",
590
591
  [agent_id.as_str()],
591
592
  |r| {
592
593
  Ok(CapturedHealth {
593
- status: r.get::<_, Option<String>>(0)?,
594
- last_output_at: r.get::<_, Option<String>>(1)?,
595
- context_usage_pct: r.get::<_, Option<i64>>(2)?,
596
- current_task_id: r.get::<_, Option<String>>(3)?,
594
+ owner_team_id: r.get::<_, Option<String>>(0)?,
595
+ status: r.get::<_, Option<String>>(1)?,
596
+ last_output_at: r.get::<_, Option<String>>(2)?,
597
+ context_usage_pct: r.get::<_, Option<i64>>(3)?,
598
+ current_task_id: r.get::<_, Option<String>>(4)?,
597
599
  })
598
600
  },
599
601
  )
@@ -622,8 +624,9 @@ fn restore_agent_health(
622
624
  // health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
623
625
  conn.execute(
624
626
  "insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
625
- values (null, ?1, ?2, ?3, ?4, ?5, ?6)",
627
+ values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
626
628
  rusqlite::params![
629
+ row.owner_team_id,
627
630
  agent_id.as_str(),
628
631
  status,
629
632
  row.last_output_at,
@@ -638,12 +641,25 @@ fn restore_agent_health(
638
641
 
639
642
  #[derive(Clone)]
640
643
  struct CapturedHealth {
644
+ owner_team_id: Option<String>,
641
645
  status: Option<String>,
642
646
  last_output_at: Option<String>,
643
647
  context_usage_pct: Option<i64>,
644
648
  current_task_id: Option<String>,
645
649
  }
646
650
 
651
+ fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
652
+ let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
653
+ return Ok(());
654
+ };
655
+ if reason.is_empty() {
656
+ return Ok(());
657
+ }
658
+ Err(LifecycleError::StatePersist(format!(
659
+ "injected remove failure after agent_health delete: {reason}"
660
+ )))
661
+ }
662
+
647
663
  struct RemoveRollback {
648
664
  agent_id: AgentId,
649
665
  spec_text: Option<String>,