@team-agent/installer 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/Cargo.lock +34 -1
  2. package/Cargo.toml +1 -1
  3. package/crates/team-agent/Cargo.toml +1 -1
  4. package/crates/team-agent/src/cli/adapters.rs +234 -26
  5. package/crates/team-agent/src/cli/diagnose.rs +144 -10
  6. package/crates/team-agent/src/cli/emit.rs +289 -54
  7. package/crates/team-agent/src/cli/leader.rs +37 -8
  8. package/crates/team-agent/src/cli/mod.rs +1281 -196
  9. package/crates/team-agent/src/cli/status_port.rs +195 -46
  10. package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
  11. package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
  12. package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
  13. package/crates/team-agent/src/cli/tests/run_delegation.rs +59 -3
  14. package/crates/team-agent/src/cli/types.rs +18 -0
  15. package/crates/team-agent/src/compiler.rs +15 -5
  16. package/crates/team-agent/src/coordinator/health.rs +95 -17
  17. package/crates/team-agent/src/coordinator/mod.rs +4 -0
  18. package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
  19. package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
  20. package/crates/team-agent/src/coordinator/tick.rs +222 -69
  21. package/crates/team-agent/src/coordinator/types.rs +15 -3
  22. package/crates/team-agent/src/db/schema.rs +37 -2
  23. package/crates/team-agent/src/diagnose/comms.rs +226 -0
  24. package/crates/team-agent/src/diagnose/mod.rs +45 -0
  25. package/crates/team-agent/src/diagnose/orphans.rs +658 -0
  26. package/crates/team-agent/src/fake_worker.rs +146 -3
  27. package/crates/team-agent/src/leader/start.rs +121 -23
  28. package/crates/team-agent/src/leader/types.rs +44 -1
  29. package/crates/team-agent/src/lib.rs +3 -0
  30. package/crates/team-agent/src/lifecycle/display.rs +645 -47
  31. package/crates/team-agent/src/lifecycle/launch.rs +1061 -146
  32. package/crates/team-agent/src/lifecycle/mod.rs +2 -0
  33. package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
  34. package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
  35. package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
  36. package/crates/team-agent/src/lifecycle/restart/common.rs +183 -24
  37. package/crates/team-agent/src/lifecycle/restart/rebuild.rs +498 -22
  38. package/crates/team-agent/src/lifecycle/restart/remove.rs +27 -7
  39. package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
  40. package/crates/team-agent/src/lifecycle/restart.rs +24 -1
  41. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
  42. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
  43. package/crates/team-agent/src/lifecycle/types.rs +19 -0
  44. package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
  45. package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
  46. package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
  47. package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
  48. package/crates/team-agent/src/mcp_server/mod.rs +3 -74
  49. package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
  50. package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
  51. package/crates/team-agent/src/mcp_server/tools.rs +312 -111
  52. package/crates/team-agent/src/mcp_server/types.rs +6 -4
  53. package/crates/team-agent/src/mcp_server/wire.rs +19 -7
  54. package/crates/team-agent/src/message_store.rs +21 -4
  55. package/crates/team-agent/src/messaging/delivery.rs +470 -59
  56. package/crates/team-agent/src/messaging/mod.rs +9 -6
  57. package/crates/team-agent/src/messaging/results.rs +353 -63
  58. package/crates/team-agent/src/messaging/selftest.rs +199 -12
  59. package/crates/team-agent/src/messaging/send.rs +35 -3
  60. package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
  61. package/crates/team-agent/src/messaging/types.rs +11 -3
  62. package/crates/team-agent/src/os_probe.rs +119 -0
  63. package/crates/team-agent/src/packaging/migrate.rs +10 -2
  64. package/crates/team-agent/src/packaging/tests.rs +23 -0
  65. package/crates/team-agent/src/provider/adapter.rs +564 -63
  66. package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
  67. package/crates/team-agent/src/provider/classify.rs +51 -4
  68. package/crates/team-agent/src/provider/helpers.rs +10 -1
  69. package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
  70. package/crates/team-agent/src/provider/types.rs +47 -0
  71. package/crates/team-agent/src/session_capture.rs +616 -0
  72. package/crates/team-agent/src/state/persist.rs +170 -1
  73. package/crates/team-agent/src/state/projection.rs +141 -8
  74. package/crates/team-agent/src/state/selector.rs +5 -2
  75. package/crates/team-agent/src/tmux_backend.rs +161 -64
  76. package/crates/team-agent/src/transport/test_support.rs +9 -0
  77. package/crates/team-agent/src/transport/tests/wire.rs +4 -0
  78. package/crates/team-agent/src/transport.rs +13 -2
  79. package/package.json +4 -4
@@ -12,13 +12,23 @@ pub fn restart(
12
12
  workspace: &Path,
13
13
  allow_fresh: bool,
14
14
  team: Option<&str>,
15
+ ) -> Result<RestartReport, LifecycleError> {
16
+ restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
17
+ }
18
+
19
+ pub fn restart_with_session_convergence_deadline(
20
+ workspace: &Path,
21
+ allow_fresh: bool,
22
+ team: Option<&str>,
23
+ session_converge_deadline_ms: Option<u64>,
15
24
  ) -> Result<RestartReport, LifecycleError> {
16
25
  let run_ws = lifecycle_run_workspace(workspace)?;
17
- restart_with_transport(
26
+ restart_with_transport_with_session_convergence_deadline(
18
27
  workspace,
19
28
  allow_fresh,
20
29
  team,
21
30
  &crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
31
+ session_converge_deadline_ms,
22
32
  )
23
33
  }
24
34
 
@@ -31,6 +41,48 @@ pub fn restart_with_transport(
31
41
  team: Option<&str>,
32
42
  transport: &dyn crate::transport::Transport,
33
43
  ) -> Result<RestartReport, LifecycleError> {
44
+ match restart_with_transport_with_session_convergence_deadline(
45
+ workspace,
46
+ allow_fresh,
47
+ team,
48
+ transport,
49
+ None,
50
+ )? {
51
+ RestartReport::RefusedResumeNotReady {
52
+ missing,
53
+ allow_fresh,
54
+ error,
55
+ ..
56
+ } => Ok(RestartReport::RefusedResumeAtomicity {
57
+ unresumable: missing
58
+ .into_iter()
59
+ .map(|agent_id| UnresumableWorker {
60
+ agent_id,
61
+ reason: "session_capture_incomplete".to_string(),
62
+ session_id: None,
63
+ first_send_at: None,
64
+ })
65
+ .collect(),
66
+ allow_fresh,
67
+ error,
68
+ }),
69
+ report => Ok(report),
70
+ }
71
+ }
72
+
73
+ pub fn restart_with_transport_with_session_convergence_deadline(
74
+ workspace: &Path,
75
+ allow_fresh: bool,
76
+ team: Option<&str>,
77
+ transport: &dyn crate::transport::Transport,
78
+ session_converge_deadline_ms: Option<u64>,
79
+ ) -> Result<RestartReport, LifecycleError> {
80
+ if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
81
+ return Err(LifecycleError::TeamSelect(format!(
82
+ "missing spec for restart: {}",
83
+ workspace.join("team.spec.yaml").display()
84
+ )));
85
+ }
34
86
  let run_candidate = crate::model::paths::canonical_run_workspace(workspace)
35
87
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
36
88
  if !workspace.join("team.spec.yaml").exists()
@@ -47,7 +99,7 @@ pub fn restart_with_transport(
47
99
  crate::state::selector::SelectorMode::RequireSpec,
48
100
  )
49
101
  .map_err(|e| LifecycleError::TeamSelect(e.to_string()))?;
50
- let state = selected.state;
102
+ let mut state = selected.state;
51
103
  crate::lifecycle::launch::ensure_owner_allowed_for_state(&state, None)?;
52
104
  let spec_workspace = selected
53
105
  .spec_workspace
@@ -55,8 +107,57 @@ pub fn restart_with_transport(
55
107
  .ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
56
108
  let spec = load_team_spec(spec_workspace)?;
57
109
  let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
110
+ let mut convergence = converge_missing_provider_sessions(
111
+ &mut state,
112
+ session_convergence_deadline(session_converge_deadline_ms),
113
+ session_convergence_poll_interval(),
114
+ &selected.run_workspace,
115
+ allow_fresh,
116
+ )?;
117
+ if convergence.converged && convergence.changed {
118
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
119
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
120
+ }
121
+ if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
122
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
123
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
124
+ let missing_after_repair = restart_required_missing_session_agent_ids(&state);
125
+ convergence.changed = true;
126
+ convergence.converged = missing_after_repair.is_empty();
127
+ convergence.missing = missing_after_repair;
128
+ }
129
+ if !convergence.converged && !allow_fresh {
130
+ return Ok(RestartReport::RefusedResumeNotReady {
131
+ missing: convergence
132
+ .missing
133
+ .iter()
134
+ .map(|agent_id| AgentId::new(agent_id.clone()))
135
+ .collect(),
136
+ allow_fresh,
137
+ deadline: convergence.deadline,
138
+ elapsed: convergence.elapsed,
139
+ error: "resume_not_ready: session_capture_incomplete".to_string(),
140
+ });
141
+ }
142
+ if !convergence.converged && convergence.changed {
143
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
144
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
145
+ }
146
+ let forced_fresh_missing = if convergence.converged {
147
+ std::collections::BTreeSet::new()
148
+ } else {
149
+ convergence.missing.iter().cloned().collect()
150
+ };
151
+ let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
58
152
  let plan = classify_restart_plan(&state, allow_fresh)?;
59
- write_restart_resume_decision_events(&selected.run_workspace, &state, allow_fresh, &plan.decisions)?;
153
+ write_restart_resume_decision_events(
154
+ &selected.run_workspace,
155
+ &state,
156
+ allow_fresh,
157
+ &plan.decisions,
158
+ &forced_fresh_missing,
159
+ forced_fresh_convergence.as_ref(),
160
+ )?;
60
161
  if !plan.corrupt_entries.is_empty() {
61
162
  return Ok(RestartReport::RefusedInvalidFirstSendAt {
62
163
  invalid: plan.corrupt_entries,
@@ -76,8 +177,13 @@ pub fn restart_with_transport(
76
177
  transport
77
178
  .kill_session(&session_name)
78
179
  .map_err(|e| LifecycleError::Transport(e.to_string()))?;
180
+ mark_leader_receiver_rebind_required(&mut state, &session_name);
181
+ mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
182
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
183
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
79
184
  }
80
- for (idx, decision) in plan.decisions.iter().enumerate() {
185
+ let mut last_spawned: Option<AgentId> = None;
186
+ for decision in &plan.decisions {
81
187
  let agent = state
82
188
  .get("agents")
83
189
  .and_then(|v| v.get(decision.agent_id.as_str()))
@@ -86,23 +192,49 @@ pub fn restart_with_transport(
86
192
  "agent {} not found for restart",
87
193
  decision.agent_id
88
194
  ))
89
- })?;
195
+ })?
196
+ .clone();
90
197
  let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
91
198
  decision.session_id.as_ref()
92
199
  } else {
93
200
  None
94
201
  };
95
- let _ = spawn_agent_window(
202
+ let session_live = session_live_or_default(transport, &session_name, false);
203
+ if !session_live {
204
+ if let Some(previous) = &last_spawned {
205
+ return Err(LifecycleError::Transport(format!(
206
+ "session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
207
+ previous,
208
+ session_name.as_str(),
209
+ decision.agent_id
210
+ )));
211
+ }
212
+ }
213
+ let spawn = spawn_agent_window(
96
214
  &selected.run_workspace,
97
215
  &session_name,
98
216
  &decision.agent_id,
99
- agent,
217
+ &agent,
100
218
  session_id,
101
- idx > 0,
219
+ session_live,
102
220
  transport,
103
221
  Some(&safety),
222
+ Some(spec_workspace),
104
223
  )?;
224
+ verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
225
+ mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
226
+ last_spawned = Some(decision.agent_id.clone());
227
+ if let Some(agent) = state
228
+ .get_mut("agents")
229
+ .and_then(serde_json::Value::as_object_mut)
230
+ .and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
231
+ .and_then(serde_json::Value::as_object_mut)
232
+ {
233
+ persist_effective_approval_policy_for_restart(agent, &safety);
234
+ }
105
235
  }
236
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
237
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
106
238
  let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
107
239
  Ok(RestartReport::Restarted {
108
240
  session_name,
@@ -111,13 +243,302 @@ pub fn restart_with_transport(
111
243
  })
112
244
  }
113
245
 
246
+ fn repair_resume_sessions_from_event_log(
247
+ workspace: &Path,
248
+ state: &mut serde_json::Value,
249
+ ) -> Result<bool, LifecycleError> {
250
+ let agent_ids = state
251
+ .get("agents")
252
+ .and_then(serde_json::Value::as_object)
253
+ .map(|agents| agents.keys().cloned().collect::<Vec<_>>())
254
+ .unwrap_or_default();
255
+ let mut changed = false;
256
+ for agent_id in agent_ids {
257
+ let previous = state
258
+ .get("agents")
259
+ .and_then(|agents| agents.get(&agent_id))
260
+ .cloned()
261
+ .unwrap_or(serde_json::Value::Null);
262
+ if previous
263
+ .get("session_id")
264
+ .and_then(serde_json::Value::as_str)
265
+ .is_some_and(|session| !session.is_empty())
266
+ {
267
+ continue;
268
+ }
269
+ let Some(provider) = previous
270
+ .get("provider")
271
+ .and_then(serde_json::Value::as_str)
272
+ .and_then(parse_provider)
273
+ else {
274
+ continue;
275
+ };
276
+ let auth_mode = previous
277
+ .get("auth_mode")
278
+ .and_then(serde_json::Value::as_str)
279
+ .and_then(parse_auth_mode)
280
+ .unwrap_or(AuthMode::Subscription);
281
+ let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
282
+ let adapter = crate::provider::get_adapter(provider);
283
+ let repaired = crate::session_capture::recover_resume_session_from_events(
284
+ workspace,
285
+ &agent_id,
286
+ &previous,
287
+ adapter.as_ref(),
288
+ auth_mode,
289
+ &exclude_session_ids,
290
+ )
291
+ .map_err(|e| LifecycleError::Provider(e.to_string()))?;
292
+ let Some(repaired) = repaired else {
293
+ continue;
294
+ };
295
+ let old_session_id = previous
296
+ .get("session_id")
297
+ .and_then(serde_json::Value::as_str)
298
+ .filter(|session| !session.is_empty())
299
+ .map(str::to_string);
300
+ let session_id = repaired
301
+ .get("session_id")
302
+ .and_then(serde_json::Value::as_str)
303
+ .filter(|session| !session.is_empty())
304
+ .map(str::to_string);
305
+ let rollout_path = repaired
306
+ .get("rollout_path")
307
+ .and_then(serde_json::Value::as_str)
308
+ .filter(|path| !path.is_empty())
309
+ .map(str::to_string);
310
+ if let Some(agent) = state
311
+ .get_mut("agents")
312
+ .and_then(serde_json::Value::as_object_mut)
313
+ .and_then(|agents| agents.get_mut(&agent_id))
314
+ {
315
+ *agent = repaired.clone();
316
+ }
317
+ crate::event_log::EventLog::new(workspace)
318
+ .write(
319
+ "resume.session_repaired",
320
+ serde_json::json!({
321
+ "agent_id": agent_id,
322
+ "provider": provider_wire(provider),
323
+ "old_session_id": old_session_id,
324
+ "session_id": session_id,
325
+ "rollout_path": rollout_path,
326
+ "captured_via": "event_log_repair",
327
+ "attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
328
+ }),
329
+ )
330
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
331
+ changed = true;
332
+ }
333
+ Ok(changed)
334
+ }
335
+
336
+ fn claimed_session_ids_except(
337
+ state: &serde_json::Value,
338
+ current_agent_id: &str,
339
+ ) -> std::collections::BTreeSet<String> {
340
+ state
341
+ .get("agents")
342
+ .and_then(serde_json::Value::as_object)
343
+ .map(|agents| {
344
+ agents
345
+ .iter()
346
+ .filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
347
+ .filter_map(|(_, agent)| {
348
+ agent
349
+ .get("session_id")
350
+ .and_then(serde_json::Value::as_str)
351
+ .filter(|session| !session.is_empty())
352
+ .map(str::to_string)
353
+ })
354
+ .collect()
355
+ })
356
+ .unwrap_or_default()
357
+ }
358
+
359
+ fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
360
+ if let Some(ms) = requested_ms {
361
+ return std::time::Duration::from_millis(ms);
362
+ }
363
+ env_duration_ms(
364
+ &[
365
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
366
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
367
+ "TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
368
+ "TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
369
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
370
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
371
+ "TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
372
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
373
+ "TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
374
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
375
+ "TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
376
+ "TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
377
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
378
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
379
+ ],
380
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
381
+ )
382
+ }
383
+
384
+ fn session_convergence_poll_interval() -> std::time::Duration {
385
+ env_duration_ms(
386
+ &[
387
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
388
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
389
+ "TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
390
+ "TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
391
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
392
+ "TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
393
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
394
+ ],
395
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
396
+ )
397
+ }
398
+
399
+ fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
400
+ let ms = names
401
+ .iter()
402
+ .find_map(|name| {
403
+ std::env::var(name)
404
+ .ok()
405
+ .and_then(|value| parse_duration_value_ms(&value))
406
+ .or_else(|| {
407
+ name.strip_suffix("_MS").and_then(|prefix| {
408
+ std::env::var(prefix)
409
+ .ok()
410
+ .and_then(|value| parse_duration_value_seconds_ms(&value))
411
+ })
412
+ })
413
+ })
414
+ .unwrap_or(default_ms);
415
+ std::time::Duration::from_millis(ms)
416
+ }
417
+
418
/// Parses `value` as a whole number of milliseconds.
///
/// Leading and trailing whitespace is ignored, so a value such as `"250\n"`
/// (trailing newline from a config file or shell substitution) is accepted
/// instead of being silently discarded. Returns `None` for anything that is
/// not a valid `u64` after trimming, including the empty string, negative
/// numbers and fractional values.
fn parse_duration_value_ms(value: &str) -> Option<u64> {
    value.trim().parse::<u64>().ok()
}
421
+
422
/// Parses `value` as a number of seconds and converts it to milliseconds.
///
/// Leading and trailing whitespace is ignored, so padded values are accepted
/// instead of being silently discarded. Fractional seconds are allowed; the
/// result is rounded to the nearest millisecond. Returns `None` when the text
/// is not a number, or when the number is negative, NaN or infinite.
fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
    let seconds = value.trim().parse::<f64>().ok()?;
    if !seconds.is_finite() || seconds < 0.0 {
        return None;
    }
    // Float-to-integer `as` casts saturate, so an absurdly large input clamps
    // to `u64::MAX` rather than wrapping.
    Some((seconds * 1000.0).round() as u64)
}
430
+
431
+ fn verify_spawned_agent_live(
432
+ _agent_id: &AgentId,
433
+ _spawn: &SpawnedAgentWindow,
434
+ _transport: &dyn crate::transport::Transport,
435
+ ) -> Result<(), LifecycleError> {
436
+ Ok(())
437
+ }
438
+
439
+ fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
440
+ let Some(receiver) = state
441
+ .get_mut("leader_receiver")
442
+ .and_then(serde_json::Value::as_object_mut)
443
+ else {
444
+ return;
445
+ };
446
+ let same_session = receiver
447
+ .get("session_name")
448
+ .and_then(|v| v.as_str())
449
+ .map(|session| session == session_name.as_str())
450
+ .unwrap_or(true);
451
+ if !same_session {
452
+ return;
453
+ }
454
+ if receiver
455
+ .get("status")
456
+ .and_then(|v| v.as_str())
457
+ .is_some_and(|status| status == "attached")
458
+ {
459
+ receiver.insert(
460
+ "status".to_string(),
461
+ serde_json::json!("rebind_required"),
462
+ );
463
+ }
464
+ }
465
+
466
+ fn mark_restart_targets_stopped_after_teardown(
467
+ state: &mut serde_json::Value,
468
+ decisions: &[RestartedAgent],
469
+ ) {
470
+ let Some(agents) = state
471
+ .get_mut("agents")
472
+ .and_then(serde_json::Value::as_object_mut)
473
+ else {
474
+ return;
475
+ };
476
+ for decision in decisions {
477
+ let Some(agent) = agents
478
+ .get_mut(decision.agent_id.as_str())
479
+ .and_then(serde_json::Value::as_object_mut)
480
+ else {
481
+ continue;
482
+ };
483
+ agent.insert("status".to_string(), serde_json::json!("stopped"));
484
+ agent.remove("pane_id");
485
+ agent.remove("pane_pid");
486
+ }
487
+ }
488
+
489
+ fn mark_agent_respawned(
490
+ state: &mut serde_json::Value,
491
+ agent_id: &AgentId,
492
+ spawn: &SpawnedAgentWindow,
493
+ transport: &dyn crate::transport::Transport,
494
+ safety: &DangerousApproval,
495
+ ) -> Result<(), LifecycleError> {
496
+ let Some(agent) = state
497
+ .get_mut("agents")
498
+ .and_then(serde_json::Value::as_object_mut)
499
+ .and_then(|agents| agents.get_mut(agent_id.as_str()))
500
+ .and_then(serde_json::Value::as_object_mut)
501
+ else {
502
+ return Err(LifecycleError::StatePersist(format!(
503
+ "agent {} state is not an object",
504
+ agent_id
505
+ )));
506
+ };
507
+ agent.insert("status".to_string(), serde_json::json!("running"));
508
+ agent.insert(
509
+ "pane_id".to_string(),
510
+ serde_json::json!(spawn.spawn.pane_id.as_str()),
511
+ );
512
+ let pane_pid = spawn.spawn.child_pid.or_else(|| {
513
+ transport
514
+ .list_targets()
515
+ .unwrap_or_default()
516
+ .into_iter()
517
+ .find(|pane| pane.pane_id == spawn.spawn.pane_id)
518
+ .and_then(|pane| pane.pane_pid)
519
+ });
520
+ if let Some(pane_pid) = pane_pid {
521
+ agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
522
+ }
523
+ crate::lifecycle::launch::persist_command_plan_state(
524
+ agent,
525
+ &spawn.plan,
526
+ &spawn.profile_launch,
527
+ );
528
+ persist_effective_approval_policy_for_restart(agent, safety);
529
+ agent.remove("startup_prompts");
530
+ agent.remove("startup_prompt_status");
531
+ Ok(())
532
+ }
533
+
114
534
  fn write_restart_resume_decision_events(
115
535
  workspace: &Path,
116
536
  state: &serde_json::Value,
117
537
  allow_fresh: bool,
118
538
  decisions: &[RestartedAgent],
539
+ forced_fresh_missing: &std::collections::BTreeSet<String>,
540
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
119
541
  ) -> Result<(), LifecycleError> {
120
- let log = crate::event_log::EventLog::new(workspace);
121
542
  for decision in decisions {
122
543
  let agent = state
123
544
  .get("agents")
@@ -134,23 +555,78 @@ fn write_restart_resume_decision_events(
134
555
  ResumeDecision::FreshStart => "fresh_start",
135
556
  ResumeDecision::Refuse => "refuse",
136
557
  };
137
- log.write(
138
- crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
139
- serde_json::json!({
140
- "worker_id": decision.agent_id.as_str(),
141
- "has_first_send_at": first_send_at.is_some(),
142
- "has_session_id": session_id.is_some(),
143
- "allow_fresh": allow_fresh,
144
- "decision": decision_wire,
145
- "first_send_at": first_send_at,
146
- "session_id": session_id,
147
- }),
148
- )
149
- .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
558
+ write_restart_resume_decision_event(
559
+ workspace,
560
+ decision.agent_id.as_str(),
561
+ first_send_at,
562
+ session_id,
563
+ allow_fresh,
564
+ decision_wire,
565
+ forced_fresh_missing.contains(decision.agent_id.as_str()),
566
+ forced_fresh_convergence,
567
+ )?;
150
568
  }
151
569
  Ok(())
152
570
  }
153
571
 
572
+ fn write_restart_resume_decision_event(
573
+ workspace: &Path,
574
+ worker_id: &str,
575
+ first_send_at: Option<String>,
576
+ session_id: Option<String>,
577
+ allow_fresh: bool,
578
+ decision: &str,
579
+ forced_fresh: bool,
580
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
581
+ ) -> Result<(), LifecycleError> {
582
+ use std::io::Write as _;
583
+
584
+ let path = workspace.join(".team").join("logs").join("events.jsonl");
585
+ if let Some(parent) = path.parent() {
586
+ std::fs::create_dir_all(parent)
587
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
588
+ }
589
+ let mut event = serde_json::json!({
590
+ "ts": chrono::Utc::now().to_rfc3339(),
591
+ "event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
592
+ "worker_id": worker_id,
593
+ "has_first_send_at": first_send_at.is_some(),
594
+ "has_session_id": session_id.is_some(),
595
+ "allow_fresh": allow_fresh,
596
+ "decision": decision,
597
+ "first_send_at": first_send_at,
598
+ "session_id": session_id,
599
+ });
600
+ if forced_fresh {
601
+ if let Some(event) = event.as_object_mut() {
602
+ event.insert("forced_fresh".to_string(), serde_json::json!(true));
603
+ event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
604
+ if let Some(convergence) = forced_fresh_convergence {
605
+ event.insert(
606
+ "session_convergence".to_string(),
607
+ serde_json::json!({
608
+ "complete": false,
609
+ "deadline_s": convergence.deadline.as_secs_f64(),
610
+ "deadline_ms": convergence.deadline.as_millis(),
611
+ "elapsed_ms": convergence.elapsed.as_millis(),
612
+ "pending_agent_ids": convergence.missing.clone(),
613
+ }),
614
+ );
615
+ }
616
+ }
617
+ }
618
+ let line = serde_json::to_string(&event)
619
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
620
+ let mut file = std::fs::OpenOptions::new()
621
+ .create(true)
622
+ .append(true)
623
+ .open(&path)
624
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
625
+ file.write_all(line.as_bytes())
626
+ .and_then(|_| file.write_all(b"\n"))
627
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))
628
+ }
629
+
154
630
  /// `restart_candidates(workspace)`(`restart/selection.py:12`)。从 snapshot + active
155
631
  /// state 收集可重启 team。
156
632
  pub fn restart_candidates(workspace: &Path) -> Result<Vec<RestartCandidate>, LifecycleError> {
@@ -169,7 +169,11 @@ fn remove_agent_inner(
169
169
  // (team projection) — NOT a raw save, so other teams in a multi-team workspace are preserved.
170
170
  let mut removed_state = working_state;
171
171
  remove_agent_from_state(&mut removed_state, agent_id)?;
172
- crate::state::projection::save_team_scoped_state(paths.run_workspace, &removed_state)
172
+ crate::state::projection::save_team_scoped_state_with_deleted_agents(
173
+ paths.run_workspace,
174
+ &removed_state,
175
+ &[agent_id.as_str()],
176
+ )
173
177
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
174
178
  cleared_locations.push(serde_json::json!("state.json:agents"));
175
179
  write_remove_step_event(
@@ -226,6 +230,7 @@ fn remove_agent_inner(
226
230
  "agent_health",
227
231
  None,
228
232
  )?;
233
+ maybe_fail_remove_after_agent_health_delete()?;
229
234
  Ok(RemoveSuccess {
230
235
  outcome: RemoveAgentOutcome::Removed {
231
236
  agent_id: agent_id.clone(),
@@ -581,15 +586,16 @@ fn select_agent_health(
581
586
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
582
587
  let row = conn
583
588
  .query_row(
584
- "select status, last_output_at, context_usage_pct, current_task_id \
589
+ "select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
585
590
  from agent_health where agent_id = ?1",
586
591
  [agent_id.as_str()],
587
592
  |r| {
588
593
  Ok(CapturedHealth {
589
- status: r.get::<_, Option<String>>(0)?,
590
- last_output_at: r.get::<_, Option<String>>(1)?,
591
- context_usage_pct: r.get::<_, Option<i64>>(2)?,
592
- current_task_id: r.get::<_, Option<String>>(3)?,
594
+ owner_team_id: r.get::<_, Option<String>>(0)?,
595
+ status: r.get::<_, Option<String>>(1)?,
596
+ last_output_at: r.get::<_, Option<String>>(2)?,
597
+ context_usage_pct: r.get::<_, Option<i64>>(3)?,
598
+ current_task_id: r.get::<_, Option<String>>(4)?,
593
599
  })
594
600
  },
595
601
  )
@@ -618,8 +624,9 @@ fn restore_agent_health(
618
624
  // health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
619
625
  conn.execute(
620
626
  "insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
621
- values (null, ?1, ?2, ?3, ?4, ?5, ?6)",
627
+ values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
622
628
  rusqlite::params![
629
+ row.owner_team_id,
623
630
  agent_id.as_str(),
624
631
  status,
625
632
  row.last_output_at,
@@ -634,12 +641,25 @@ fn restore_agent_health(
634
641
 
635
642
/// Snapshot of one `agent_health` row, with each field holding the column of
/// the same name.
///
/// Every column is read as nullable, hence the `Option` on each field.
#[derive(Clone, Debug)]
struct CapturedHealth {
    /// `owner_team_id` column.
    owner_team_id: Option<String>,
    /// `status` column.
    status: Option<String>,
    /// `last_output_at` column.
    last_output_at: Option<String>,
    /// `context_usage_pct` column.
    context_usage_pct: Option<i64>,
    /// `current_task_id` column.
    current_task_id: Option<String>,
}
  }
642
650
 
651
+ fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
652
+ let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
653
+ return Ok(());
654
+ };
655
+ if reason.is_empty() {
656
+ return Ok(());
657
+ }
658
+ Err(LifecycleError::StatePersist(format!(
659
+ "injected remove failure after agent_health delete: {reason}"
660
+ )))
661
+ }
662
+
643
663
  struct RemoveRollback {
644
664
  agent_id: AgentId,
645
665
  spec_text: Option<String>,