@team-agent/installer 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/Cargo.lock +34 -1
  2. package/Cargo.toml +1 -1
  3. package/crates/team-agent/Cargo.toml +1 -1
  4. package/crates/team-agent/src/cli/adapters.rs +196 -19
  5. package/crates/team-agent/src/cli/diagnose.rs +144 -10
  6. package/crates/team-agent/src/cli/emit.rs +286 -52
  7. package/crates/team-agent/src/cli/leader.rs +37 -8
  8. package/crates/team-agent/src/cli/mod.rs +799 -316
  9. package/crates/team-agent/src/cli/status_port.rs +25 -2
  10. package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
  11. package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
  12. package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
  13. package/crates/team-agent/src/cli/tests/run_delegation.rs +57 -3
  14. package/crates/team-agent/src/cli/types.rs +17 -0
  15. package/crates/team-agent/src/compiler.rs +15 -5
  16. package/crates/team-agent/src/coordinator/health.rs +89 -20
  17. package/crates/team-agent/src/coordinator/mod.rs +4 -0
  18. package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
  19. package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
  20. package/crates/team-agent/src/coordinator/tick.rs +222 -69
  21. package/crates/team-agent/src/coordinator/types.rs +15 -3
  22. package/crates/team-agent/src/db/schema.rs +37 -2
  23. package/crates/team-agent/src/diagnose/comms.rs +226 -0
  24. package/crates/team-agent/src/diagnose/mod.rs +45 -0
  25. package/crates/team-agent/src/diagnose/orphans.rs +658 -0
  26. package/crates/team-agent/src/fake_worker.rs +146 -3
  27. package/crates/team-agent/src/leader/start.rs +121 -23
  28. package/crates/team-agent/src/leader/types.rs +44 -1
  29. package/crates/team-agent/src/lib.rs +3 -0
  30. package/crates/team-agent/src/lifecycle/display.rs +645 -47
  31. package/crates/team-agent/src/lifecycle/launch.rs +818 -116
  32. package/crates/team-agent/src/lifecycle/mod.rs +2 -0
  33. package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
  34. package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
  35. package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
  36. package/crates/team-agent/src/lifecycle/restart/common.rs +177 -83
  37. package/crates/team-agent/src/lifecycle/restart/rebuild.rs +443 -9
  38. package/crates/team-agent/src/lifecycle/restart/remove.rs +22 -6
  39. package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
  40. package/crates/team-agent/src/lifecycle/restart.rs +4 -1
  41. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
  42. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
  43. package/crates/team-agent/src/lifecycle/types.rs +19 -0
  44. package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
  45. package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
  46. package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
  47. package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
  48. package/crates/team-agent/src/mcp_server/mod.rs +3 -74
  49. package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
  50. package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
  51. package/crates/team-agent/src/mcp_server/tools.rs +312 -111
  52. package/crates/team-agent/src/mcp_server/types.rs +6 -4
  53. package/crates/team-agent/src/mcp_server/wire.rs +19 -7
  54. package/crates/team-agent/src/message_store.rs +21 -4
  55. package/crates/team-agent/src/messaging/delivery.rs +87 -37
  56. package/crates/team-agent/src/messaging/mod.rs +9 -6
  57. package/crates/team-agent/src/messaging/results.rs +153 -16
  58. package/crates/team-agent/src/messaging/selftest.rs +199 -12
  59. package/crates/team-agent/src/messaging/send.rs +35 -3
  60. package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
  61. package/crates/team-agent/src/messaging/types.rs +11 -3
  62. package/crates/team-agent/src/os_probe.rs +119 -0
  63. package/crates/team-agent/src/packaging/migrate.rs +10 -2
  64. package/crates/team-agent/src/packaging/tests.rs +23 -0
  65. package/crates/team-agent/src/provider/adapter.rs +483 -67
  66. package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
  67. package/crates/team-agent/src/provider/classify.rs +51 -4
  68. package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
  69. package/crates/team-agent/src/provider/types.rs +47 -0
  70. package/crates/team-agent/src/session_capture.rs +616 -0
  71. package/crates/team-agent/src/state/persist.rs +57 -0
  72. package/crates/team-agent/src/state/projection.rs +32 -23
  73. package/crates/team-agent/src/state/selector.rs +5 -2
  74. package/crates/team-agent/src/tmux_backend.rs +97 -60
  75. package/crates/team-agent/src/transport/test_support.rs +9 -0
  76. package/crates/team-agent/src/transport/tests/wire.rs +4 -0
  77. package/crates/team-agent/src/transport.rs +13 -2
  78. package/package.json +4 -4
@@ -12,13 +12,23 @@ pub fn restart(
12
12
  workspace: &Path,
13
13
  allow_fresh: bool,
14
14
  team: Option<&str>,
15
+ ) -> Result<RestartReport, LifecycleError> {
16
+ restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
17
+ }
18
+
19
+ pub fn restart_with_session_convergence_deadline(
20
+ workspace: &Path,
21
+ allow_fresh: bool,
22
+ team: Option<&str>,
23
+ session_converge_deadline_ms: Option<u64>,
15
24
  ) -> Result<RestartReport, LifecycleError> {
16
25
  let run_ws = lifecycle_run_workspace(workspace)?;
17
- restart_with_transport(
26
+ restart_with_transport_with_session_convergence_deadline(
18
27
  workspace,
19
28
  allow_fresh,
20
29
  team,
21
30
  &crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
31
+ session_converge_deadline_ms,
22
32
  )
23
33
  }
24
34
 
@@ -30,6 +40,42 @@ pub fn restart_with_transport(
30
40
  allow_fresh: bool,
31
41
  team: Option<&str>,
32
42
  transport: &dyn crate::transport::Transport,
43
+ ) -> Result<RestartReport, LifecycleError> {
44
+ match restart_with_transport_with_session_convergence_deadline(
45
+ workspace,
46
+ allow_fresh,
47
+ team,
48
+ transport,
49
+ None,
50
+ )? {
51
+ RestartReport::RefusedResumeNotReady {
52
+ missing,
53
+ allow_fresh,
54
+ error,
55
+ ..
56
+ } => Ok(RestartReport::RefusedResumeAtomicity {
57
+ unresumable: missing
58
+ .into_iter()
59
+ .map(|agent_id| UnresumableWorker {
60
+ agent_id,
61
+ reason: "session_capture_incomplete".to_string(),
62
+ session_id: None,
63
+ first_send_at: None,
64
+ })
65
+ .collect(),
66
+ allow_fresh,
67
+ error,
68
+ }),
69
+ report => Ok(report),
70
+ }
71
+ }
72
+
73
+ pub fn restart_with_transport_with_session_convergence_deadline(
74
+ workspace: &Path,
75
+ allow_fresh: bool,
76
+ team: Option<&str>,
77
+ transport: &dyn crate::transport::Transport,
78
+ session_converge_deadline_ms: Option<u64>,
33
79
  ) -> Result<RestartReport, LifecycleError> {
34
80
  if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
35
81
  return Err(LifecycleError::TeamSelect(format!(
@@ -61,12 +107,57 @@ pub fn restart_with_transport(
61
107
  .ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
62
108
  let spec = load_team_spec(spec_workspace)?;
63
109
  let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
64
- if refresh_missing_provider_sessions(&mut state)? {
110
+ let mut convergence = converge_missing_provider_sessions(
111
+ &mut state,
112
+ session_convergence_deadline(session_converge_deadline_ms),
113
+ session_convergence_poll_interval(),
114
+ &selected.run_workspace,
115
+ allow_fresh,
116
+ )?;
117
+ if convergence.converged && convergence.changed {
118
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
119
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
120
+ }
121
+ if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
122
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
123
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
124
+ let missing_after_repair = restart_required_missing_session_agent_ids(&state);
125
+ convergence.changed = true;
126
+ convergence.converged = missing_after_repair.is_empty();
127
+ convergence.missing = missing_after_repair;
128
+ }
129
+ if !convergence.converged && !allow_fresh {
130
+ return Ok(RestartReport::RefusedResumeNotReady {
131
+ missing: convergence
132
+ .missing
133
+ .iter()
134
+ .map(|agent_id| AgentId::new(agent_id.clone()))
135
+ .collect(),
136
+ allow_fresh,
137
+ deadline: convergence.deadline,
138
+ elapsed: convergence.elapsed,
139
+ error: "resume_not_ready: session_capture_incomplete".to_string(),
140
+ });
141
+ }
142
+ if !convergence.converged && convergence.changed {
65
143
  crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
66
144
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
67
145
  }
146
+ let forced_fresh_missing = if convergence.converged {
147
+ std::collections::BTreeSet::new()
148
+ } else {
149
+ convergence.missing.iter().cloned().collect()
150
+ };
151
+ let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
68
152
  let plan = classify_restart_plan(&state, allow_fresh)?;
69
- write_restart_resume_decision_events(&selected.run_workspace, &state, allow_fresh, &plan.decisions)?;
153
+ write_restart_resume_decision_events(
154
+ &selected.run_workspace,
155
+ &state,
156
+ allow_fresh,
157
+ &plan.decisions,
158
+ &forced_fresh_missing,
159
+ forced_fresh_convergence.as_ref(),
160
+ )?;
70
161
  if !plan.corrupt_entries.is_empty() {
71
162
  return Ok(RestartReport::RefusedInvalidFirstSendAt {
72
163
  invalid: plan.corrupt_entries,
@@ -86,8 +177,13 @@ pub fn restart_with_transport(
86
177
  transport
87
178
  .kill_session(&session_name)
88
179
  .map_err(|e| LifecycleError::Transport(e.to_string()))?;
180
+ mark_leader_receiver_rebind_required(&mut state, &session_name);
181
+ mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
182
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
183
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
89
184
  }
90
- for (idx, decision) in plan.decisions.iter().enumerate() {
185
+ let mut last_spawned: Option<AgentId> = None;
186
+ for decision in &plan.decisions {
91
187
  let agent = state
92
188
  .get("agents")
93
189
  .and_then(|v| v.get(decision.agent_id.as_str()))
@@ -96,23 +192,49 @@ pub fn restart_with_transport(
96
192
  "agent {} not found for restart",
97
193
  decision.agent_id
98
194
  ))
99
- })?;
195
+ })?
196
+ .clone();
100
197
  let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
101
198
  decision.session_id.as_ref()
102
199
  } else {
103
200
  None
104
201
  };
105
- let _ = spawn_agent_window(
202
+ let session_live = session_live_or_default(transport, &session_name, false);
203
+ if !session_live {
204
+ if let Some(previous) = &last_spawned {
205
+ return Err(LifecycleError::Transport(format!(
206
+ "session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
207
+ previous,
208
+ session_name.as_str(),
209
+ decision.agent_id
210
+ )));
211
+ }
212
+ }
213
+ let spawn = spawn_agent_window(
106
214
  &selected.run_workspace,
107
215
  &session_name,
108
216
  &decision.agent_id,
109
- agent,
217
+ &agent,
110
218
  session_id,
111
- idx > 0,
219
+ session_live,
112
220
  transport,
113
221
  Some(&safety),
222
+ Some(spec_workspace),
114
223
  )?;
224
+ verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
225
+ mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
226
+ last_spawned = Some(decision.agent_id.clone());
227
+ if let Some(agent) = state
228
+ .get_mut("agents")
229
+ .and_then(serde_json::Value::as_object_mut)
230
+ .and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
231
+ .and_then(serde_json::Value::as_object_mut)
232
+ {
233
+ persist_effective_approval_policy_for_restart(agent, &safety);
234
+ }
115
235
  }
236
+ crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
237
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
116
238
  let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
117
239
  Ok(RestartReport::Restarted {
118
240
  session_name,
@@ -121,11 +243,301 @@ pub fn restart_with_transport(
121
243
  })
122
244
  }
123
245
 
246
+ fn repair_resume_sessions_from_event_log(
247
+ workspace: &Path,
248
+ state: &mut serde_json::Value,
249
+ ) -> Result<bool, LifecycleError> {
250
+ let agent_ids = state
251
+ .get("agents")
252
+ .and_then(serde_json::Value::as_object)
253
+ .map(|agents| agents.keys().cloned().collect::<Vec<_>>())
254
+ .unwrap_or_default();
255
+ let mut changed = false;
256
+ for agent_id in agent_ids {
257
+ let previous = state
258
+ .get("agents")
259
+ .and_then(|agents| agents.get(&agent_id))
260
+ .cloned()
261
+ .unwrap_or(serde_json::Value::Null);
262
+ if previous
263
+ .get("session_id")
264
+ .and_then(serde_json::Value::as_str)
265
+ .is_some_and(|session| !session.is_empty())
266
+ {
267
+ continue;
268
+ }
269
+ let Some(provider) = previous
270
+ .get("provider")
271
+ .and_then(serde_json::Value::as_str)
272
+ .and_then(parse_provider)
273
+ else {
274
+ continue;
275
+ };
276
+ let auth_mode = previous
277
+ .get("auth_mode")
278
+ .and_then(serde_json::Value::as_str)
279
+ .and_then(parse_auth_mode)
280
+ .unwrap_or(AuthMode::Subscription);
281
+ let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
282
+ let adapter = crate::provider::get_adapter(provider);
283
+ let repaired = crate::session_capture::recover_resume_session_from_events(
284
+ workspace,
285
+ &agent_id,
286
+ &previous,
287
+ adapter.as_ref(),
288
+ auth_mode,
289
+ &exclude_session_ids,
290
+ )
291
+ .map_err(|e| LifecycleError::Provider(e.to_string()))?;
292
+ let Some(repaired) = repaired else {
293
+ continue;
294
+ };
295
+ let old_session_id = previous
296
+ .get("session_id")
297
+ .and_then(serde_json::Value::as_str)
298
+ .filter(|session| !session.is_empty())
299
+ .map(str::to_string);
300
+ let session_id = repaired
301
+ .get("session_id")
302
+ .and_then(serde_json::Value::as_str)
303
+ .filter(|session| !session.is_empty())
304
+ .map(str::to_string);
305
+ let rollout_path = repaired
306
+ .get("rollout_path")
307
+ .and_then(serde_json::Value::as_str)
308
+ .filter(|path| !path.is_empty())
309
+ .map(str::to_string);
310
+ if let Some(agent) = state
311
+ .get_mut("agents")
312
+ .and_then(serde_json::Value::as_object_mut)
313
+ .and_then(|agents| agents.get_mut(&agent_id))
314
+ {
315
+ *agent = repaired.clone();
316
+ }
317
+ crate::event_log::EventLog::new(workspace)
318
+ .write(
319
+ "resume.session_repaired",
320
+ serde_json::json!({
321
+ "agent_id": agent_id,
322
+ "provider": provider_wire(provider),
323
+ "old_session_id": old_session_id,
324
+ "session_id": session_id,
325
+ "rollout_path": rollout_path,
326
+ "captured_via": "event_log_repair",
327
+ "attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
328
+ }),
329
+ )
330
+ .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
331
+ changed = true;
332
+ }
333
+ Ok(changed)
334
+ }
335
+
336
+ fn claimed_session_ids_except(
337
+ state: &serde_json::Value,
338
+ current_agent_id: &str,
339
+ ) -> std::collections::BTreeSet<String> {
340
+ state
341
+ .get("agents")
342
+ .and_then(serde_json::Value::as_object)
343
+ .map(|agents| {
344
+ agents
345
+ .iter()
346
+ .filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
347
+ .filter_map(|(_, agent)| {
348
+ agent
349
+ .get("session_id")
350
+ .and_then(serde_json::Value::as_str)
351
+ .filter(|session| !session.is_empty())
352
+ .map(str::to_string)
353
+ })
354
+ .collect()
355
+ })
356
+ .unwrap_or_default()
357
+ }
358
+
359
+ fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
360
+ if let Some(ms) = requested_ms {
361
+ return std::time::Duration::from_millis(ms);
362
+ }
363
+ env_duration_ms(
364
+ &[
365
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
366
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
367
+ "TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
368
+ "TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
369
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
370
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
371
+ "TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
372
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
373
+ "TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
374
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
375
+ "TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
376
+ "TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
377
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
378
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
379
+ ],
380
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
381
+ )
382
+ }
383
+
384
+ fn session_convergence_poll_interval() -> std::time::Duration {
385
+ env_duration_ms(
386
+ &[
387
+ "TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
388
+ "TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
389
+ "TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
390
+ "TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
391
+ "TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
392
+ "TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
393
+ "TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
394
+ ],
395
+ crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
396
+ )
397
+ }
398
+
399
+ fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
400
+ let ms = names
401
+ .iter()
402
+ .find_map(|name| {
403
+ std::env::var(name)
404
+ .ok()
405
+ .and_then(|value| parse_duration_value_ms(&value))
406
+ .or_else(|| {
407
+ name.strip_suffix("_MS").and_then(|prefix| {
408
+ std::env::var(prefix)
409
+ .ok()
410
+ .and_then(|value| parse_duration_value_seconds_ms(&value))
411
+ })
412
+ })
413
+ })
414
+ .unwrap_or(default_ms);
415
+ std::time::Duration::from_millis(ms)
416
+ }
417
+
418
+ fn parse_duration_value_ms(value: &str) -> Option<u64> {
419
+ value.parse::<u64>().ok()
420
+ }
421
+
422
+ fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
423
+ let seconds = value.parse::<f64>().ok()?;
424
+ if seconds.is_finite() && seconds >= 0.0 {
425
+ Some((seconds * 1000.0).round() as u64)
426
+ } else {
427
+ None
428
+ }
429
+ }
430
+
431
+ fn verify_spawned_agent_live(
432
+ _agent_id: &AgentId,
433
+ _spawn: &SpawnedAgentWindow,
434
+ _transport: &dyn crate::transport::Transport,
435
+ ) -> Result<(), LifecycleError> {
436
+ Ok(())
437
+ }
438
+
439
+ fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
440
+ let Some(receiver) = state
441
+ .get_mut("leader_receiver")
442
+ .and_then(serde_json::Value::as_object_mut)
443
+ else {
444
+ return;
445
+ };
446
+ let same_session = receiver
447
+ .get("session_name")
448
+ .and_then(|v| v.as_str())
449
+ .map(|session| session == session_name.as_str())
450
+ .unwrap_or(true);
451
+ if !same_session {
452
+ return;
453
+ }
454
+ if receiver
455
+ .get("status")
456
+ .and_then(|v| v.as_str())
457
+ .is_some_and(|status| status == "attached")
458
+ {
459
+ receiver.insert(
460
+ "status".to_string(),
461
+ serde_json::json!("rebind_required"),
462
+ );
463
+ }
464
+ }
465
+
466
+ fn mark_restart_targets_stopped_after_teardown(
467
+ state: &mut serde_json::Value,
468
+ decisions: &[RestartedAgent],
469
+ ) {
470
+ let Some(agents) = state
471
+ .get_mut("agents")
472
+ .and_then(serde_json::Value::as_object_mut)
473
+ else {
474
+ return;
475
+ };
476
+ for decision in decisions {
477
+ let Some(agent) = agents
478
+ .get_mut(decision.agent_id.as_str())
479
+ .and_then(serde_json::Value::as_object_mut)
480
+ else {
481
+ continue;
482
+ };
483
+ agent.insert("status".to_string(), serde_json::json!("stopped"));
484
+ agent.remove("pane_id");
485
+ agent.remove("pane_pid");
486
+ }
487
+ }
488
+
489
+ fn mark_agent_respawned(
490
+ state: &mut serde_json::Value,
491
+ agent_id: &AgentId,
492
+ spawn: &SpawnedAgentWindow,
493
+ transport: &dyn crate::transport::Transport,
494
+ safety: &DangerousApproval,
495
+ ) -> Result<(), LifecycleError> {
496
+ let Some(agent) = state
497
+ .get_mut("agents")
498
+ .and_then(serde_json::Value::as_object_mut)
499
+ .and_then(|agents| agents.get_mut(agent_id.as_str()))
500
+ .and_then(serde_json::Value::as_object_mut)
501
+ else {
502
+ return Err(LifecycleError::StatePersist(format!(
503
+ "agent {} state is not an object",
504
+ agent_id
505
+ )));
506
+ };
507
+ agent.insert("status".to_string(), serde_json::json!("running"));
508
+ agent.insert(
509
+ "pane_id".to_string(),
510
+ serde_json::json!(spawn.spawn.pane_id.as_str()),
511
+ );
512
+ let pane_pid = spawn.spawn.child_pid.or_else(|| {
513
+ transport
514
+ .list_targets()
515
+ .unwrap_or_default()
516
+ .into_iter()
517
+ .find(|pane| pane.pane_id == spawn.spawn.pane_id)
518
+ .and_then(|pane| pane.pane_pid)
519
+ });
520
+ if let Some(pane_pid) = pane_pid {
521
+ agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
522
+ }
523
+ crate::lifecycle::launch::persist_command_plan_state(
524
+ agent,
525
+ &spawn.plan,
526
+ &spawn.profile_launch,
527
+ );
528
+ persist_effective_approval_policy_for_restart(agent, safety);
529
+ agent.remove("startup_prompts");
530
+ agent.remove("startup_prompt_status");
531
+ Ok(())
532
+ }
533
+
124
534
  fn write_restart_resume_decision_events(
125
535
  workspace: &Path,
126
536
  state: &serde_json::Value,
127
537
  allow_fresh: bool,
128
538
  decisions: &[RestartedAgent],
539
+ forced_fresh_missing: &std::collections::BTreeSet<String>,
540
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
129
541
  ) -> Result<(), LifecycleError> {
130
542
  for decision in decisions {
131
543
  let agent = state
@@ -150,6 +562,8 @@ fn write_restart_resume_decision_events(
150
562
  session_id,
151
563
  allow_fresh,
152
564
  decision_wire,
565
+ forced_fresh_missing.contains(decision.agent_id.as_str()),
566
+ forced_fresh_convergence,
153
567
  )?;
154
568
  }
155
569
  Ok(())
@@ -162,6 +576,8 @@ fn write_restart_resume_decision_event(
162
576
  session_id: Option<String>,
163
577
  allow_fresh: bool,
164
578
  decision: &str,
579
+ forced_fresh: bool,
580
+ forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
165
581
  ) -> Result<(), LifecycleError> {
166
582
  use std::io::Write as _;
167
583
 
@@ -170,7 +586,7 @@ fn write_restart_resume_decision_event(
170
586
  std::fs::create_dir_all(parent)
171
587
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
172
588
  }
173
- let event = serde_json::json!({
589
+ let mut event = serde_json::json!({
174
590
  "ts": chrono::Utc::now().to_rfc3339(),
175
591
  "event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
176
592
  "worker_id": worker_id,
@@ -181,6 +597,24 @@ fn write_restart_resume_decision_event(
181
597
  "first_send_at": first_send_at,
182
598
  "session_id": session_id,
183
599
  });
600
+ if forced_fresh {
601
+ if let Some(event) = event.as_object_mut() {
602
+ event.insert("forced_fresh".to_string(), serde_json::json!(true));
603
+ event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
604
+ if let Some(convergence) = forced_fresh_convergence {
605
+ event.insert(
606
+ "session_convergence".to_string(),
607
+ serde_json::json!({
608
+ "complete": false,
609
+ "deadline_s": convergence.deadline.as_secs_f64(),
610
+ "deadline_ms": convergence.deadline.as_millis(),
611
+ "elapsed_ms": convergence.elapsed.as_millis(),
612
+ "pending_agent_ids": convergence.missing.clone(),
613
+ }),
614
+ );
615
+ }
616
+ }
617
+ }
184
618
  let line = serde_json::to_string(&event)
185
619
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
186
620
  let mut file = std::fs::OpenOptions::new()
@@ -230,6 +230,7 @@ fn remove_agent_inner(
230
230
  "agent_health",
231
231
  None,
232
232
  )?;
233
+ maybe_fail_remove_after_agent_health_delete()?;
233
234
  Ok(RemoveSuccess {
234
235
  outcome: RemoveAgentOutcome::Removed {
235
236
  agent_id: agent_id.clone(),
@@ -585,15 +586,16 @@ fn select_agent_health(
585
586
  .map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
586
587
  let row = conn
587
588
  .query_row(
588
- "select status, last_output_at, context_usage_pct, current_task_id \
589
+ "select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
589
590
  from agent_health where agent_id = ?1",
590
591
  [agent_id.as_str()],
591
592
  |r| {
592
593
  Ok(CapturedHealth {
593
- status: r.get::<_, Option<String>>(0)?,
594
- last_output_at: r.get::<_, Option<String>>(1)?,
595
- context_usage_pct: r.get::<_, Option<i64>>(2)?,
596
- current_task_id: r.get::<_, Option<String>>(3)?,
594
+ owner_team_id: r.get::<_, Option<String>>(0)?,
595
+ status: r.get::<_, Option<String>>(1)?,
596
+ last_output_at: r.get::<_, Option<String>>(2)?,
597
+ context_usage_pct: r.get::<_, Option<i64>>(3)?,
598
+ current_task_id: r.get::<_, Option<String>>(4)?,
597
599
  })
598
600
  },
599
601
  )
@@ -622,8 +624,9 @@ fn restore_agent_health(
622
624
  // health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
623
625
  conn.execute(
624
626
  "insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
625
- values (null, ?1, ?2, ?3, ?4, ?5, ?6)",
627
+ values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
626
628
  rusqlite::params![
629
+ row.owner_team_id,
627
630
  agent_id.as_str(),
628
631
  status,
629
632
  row.last_output_at,
@@ -638,12 +641,25 @@ fn restore_agent_health(
638
641
 
639
642
  #[derive(Clone)]
640
643
  struct CapturedHealth {
644
+ owner_team_id: Option<String>,
641
645
  status: Option<String>,
642
646
  last_output_at: Option<String>,
643
647
  context_usage_pct: Option<i64>,
644
648
  current_task_id: Option<String>,
645
649
  }
646
650
 
651
+ fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
652
+ let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
653
+ return Ok(());
654
+ };
655
+ if reason.is_empty() {
656
+ return Ok(());
657
+ }
658
+ Err(LifecycleError::StatePersist(format!(
659
+ "injected remove failure after agent_health delete: {reason}"
660
+ )))
661
+ }
662
+
647
663
  struct RemoveRollback {
648
664
  agent_id: AgentId,
649
665
  spec_text: Option<String>,
@@ -150,6 +150,14 @@ pub(crate) fn write_team_state(
150
150
  lines.push(format!("- {id}: {summary}"));
151
151
  }
152
152
  }
153
+ if let Some(notes) = team_state_notes(state).filter(|notes| !notes.is_empty()) {
154
+ lines.push(String::new());
155
+ lines.push("## Notes".to_string());
156
+ lines.push(String::new());
157
+ for note in notes {
158
+ lines.push(format!("- {note}"));
159
+ }
160
+ }
153
161
  lines.push(String::new());
154
162
  lines.push("## Next Step".to_string());
155
163
  lines.push(String::new());
@@ -182,6 +190,17 @@ fn team_state_tasks(spec: &YamlValue, state: &serde_json::Value) -> Vec<TeamStat
182
190
  Vec::new()
183
191
  }
184
192
 
193
+ fn team_state_notes(state: &serde_json::Value) -> Option<Vec<String>> {
194
+ Some(
195
+ state
196
+ .get("notes")?
197
+ .as_array()?
198
+ .iter()
199
+ .filter_map(|note| note.as_str().filter(|text| !text.is_empty()).map(str::to_string))
200
+ .collect(),
201
+ )
202
+ }
203
+
185
204
  fn task_field_str(task: &TeamStateTask, key: &str) -> String {
186
205
  match task {
187
206
  TeamStateTask::Json(v) => v.get(key).and_then(|v| v.as_str()).unwrap_or("").to_string(),
@@ -35,7 +35,10 @@ pub use agent::{reset_agent, reset_agent_with_transport, start_agent, start_agen
35
35
  pub(crate) use agent::start_agent_at_paths;
36
36
  pub(crate) use common::refresh_missing_provider_sessions;
37
37
  pub use orchestrator::{halt_plan, plan_status};
38
- pub use rebuild::{restart, restart_candidates, restart_with_transport, select_restart_state};
38
+ pub use rebuild::{
39
+ restart, restart_candidates, restart_with_session_convergence_deadline, restart_with_transport,
40
+ select_restart_state,
41
+ };
39
42
  pub use remove::{remove_agent, remove_agent_with_transport};
40
43
  pub use selection::{classify_first_send_at, classify_restart_plan, decide_start_mode, python_type_name};
41
44
  pub(crate) use team_state::write_team_state;
@@ -496,10 +496,10 @@ fn lanea_fork_window_already_exists_guard_before_spec_mutation() {
496
496
 
497
497
  // ── FORK (fork-gate-error-text) [RED] + (fork-incomplete-rollback, adapter arm) — golden gate text + spec rollback
498
498
  // Golden operations.py:329-330 raises f"{provider} does not support native session fork" when the native
499
- // fork gate fails (auth_mode==compatible_api). Rust relies on adapter.fork() -> CapabilityUnsupported
499
+ // fork gate fails (auth_mode==compatible_api). Rust relies on adapter.fork_plan() -> CapabilityUnsupported
500
500
  // ("Codex:fork") (adapter.rs:310) -> a different observable. AND golden wraps the post-spec-write steps
501
501
  // in try/except restoring the spec on ANY failure (operations.py:384-394); Rust writes the spec
502
- // (launch.rs:443) then errors at adapter.fork (458-460) WITHOUT restoring it. RED on both: the message
502
+ // (launch.rs:443) then errors at adapter.fork_plan (458-460) WITHOUT restoring it. RED on both: the message
503
503
  // text AND the spec must be rolled back to not contain the fork agent.
504
504
  #[test]
505
505
  fn lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm() {
@@ -516,7 +516,7 @@ fn lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm() {
516
516
  assert!(
517
517
  !spec_text.contains("newfork"),
518
518
  "golden operations.py:384-394: on the gate failure the spec must be ROLLED BACK; Rust writes the spec \
519
- then errors at adapter.fork without restoring it, leaving the fork agent 'newfork' in the spec"
519
+ then errors at adapter.fork_plan without restoring it, leaving the fork agent 'newfork' in the spec"
520
520
  );
521
521
  }
522
522
 
@@ -568,9 +568,9 @@ fn lanea_remove_rollback_restores_agent_health() {
568
568
  // (4) restores prior state. Rust only restores the spec on the spawn_into arm (launch.rs:481); the
569
569
  // save_runtime_state (486-487) and start_coordinator (488-493) failure arms leave the spec mutated, the
570
570
  // already-spawned window un-killed, and the state un-rolled-back; install_mcp/cleanup_mcp are absent.
571
- // The adapter.fork arm IS covered HARD above (lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm).
571
+ // The adapter.fork_plan arm IS covered HARD above (lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm).
572
572
  // The post-SPAWN arms need a failure-injection seam after spawn_into (codex+subscription forks past
573
- // adapter.fork, so the spawn succeeds and there is no in-process way to fail save/coordinator cleanly).
573
+ // adapter.fork_plan, so the spawn succeeds and there is no in-process way to fail save/coordinator cleanly).
574
574
  // PORTER: a Drop guard armed after the spec write, disarmed on success — kills the window, restores spec
575
575
  // + state, runs cleanup_mcp on every post-write error arm.
576
576
  #[test]