@team-agent/installer 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/Cargo.lock +1 -1
  2. package/Cargo.toml +1 -1
  3. package/crates/team-agent/src/cli/adapters.rs +8 -0
  4. package/crates/team-agent/src/cli/diagnose.rs +51 -10
  5. package/crates/team-agent/src/cli/emit.rs +2 -1
  6. package/crates/team-agent/src/cli/mod.rs +217 -80
  7. package/crates/team-agent/src/cli/send.rs +1 -0
  8. package/crates/team-agent/src/cli/status_port.rs +135 -7
  9. package/crates/team-agent/src/cli/tests/missing_subcommands.rs +8 -1
  10. package/crates/team-agent/src/cli/tests/mod.rs +1 -0
  11. package/crates/team-agent/src/cli/tests/shutdown_kill_plan.rs +39 -0
  12. package/crates/team-agent/src/cli/types.rs +5 -1
  13. package/crates/team-agent/src/coordinator/backoff.rs +57 -9
  14. package/crates/team-agent/src/coordinator/health.rs +65 -2
  15. package/crates/team-agent/src/coordinator/runtime_detectors.rs +28 -16
  16. package/crates/team-agent/src/coordinator/tests/a0_lostupdate.rs +87 -0
  17. package/crates/team-agent/src/coordinator/tests/mod.rs +1 -0
  18. package/crates/team-agent/src/coordinator/tick.rs +195 -43
  19. package/crates/team-agent/src/leader/helpers.rs +2 -0
  20. package/crates/team-agent/src/leader/rediscover.rs +1 -0
  21. package/crates/team-agent/src/leader/start.rs +9 -1
  22. package/crates/team-agent/src/leader/takeover.rs +18 -1
  23. package/crates/team-agent/src/lifecycle/launch.rs +434 -29
  24. package/crates/team-agent/src/lifecycle/profile_launch.rs +110 -4
  25. package/crates/team-agent/src/lifecycle/profile_smoke.rs +4 -1
  26. package/crates/team-agent/src/lifecycle/restart/common.rs +19 -2
  27. package/crates/team-agent/src/lifecycle/tests/agent_ops.rs +2 -2
  28. package/crates/team-agent/src/lifecycle/tests/core.rs +1 -1
  29. package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +4 -4
  30. package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +3 -1
  31. package/crates/team-agent/src/lifecycle/worker_command_context.rs +44 -9
  32. package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +2 -1
  33. package/crates/team-agent/src/mcp_server/tests/scoped.rs +14 -1
  34. package/crates/team-agent/src/mcp_server/tests/send.rs +15 -1
  35. package/crates/team-agent/src/mcp_server/tools.rs +65 -9
  36. package/crates/team-agent/src/mcp_server/wire.rs +2 -1
  37. package/crates/team-agent/src/message_store.rs +80 -0
  38. package/crates/team-agent/src/messaging/results.rs +76 -5
  39. package/crates/team-agent/src/messaging/send.rs +3 -1
  40. package/crates/team-agent/src/messaging/types.rs +15 -1
  41. package/crates/team-agent/src/messaging/watchers.rs +68 -30
  42. package/crates/team-agent/src/model/enums.rs +7 -1
  43. package/crates/team-agent/src/model/permissions.rs +7 -0
  44. package/crates/team-agent/src/model/spec.rs +3 -1
  45. package/crates/team-agent/src/provider/adapter.rs +472 -7
  46. package/crates/team-agent/src/provider/classify.rs +6 -2
  47. package/crates/team-agent/src/provider/faults.rs +3 -2
  48. package/crates/team-agent/src/provider/startup_prompt.rs +25 -7
  49. package/crates/team-agent/src/provider/types.rs +11 -0
  50. package/crates/team-agent/src/session_capture.rs +1 -0
  51. package/crates/team-agent/src/state/persist.rs +95 -19
  52. package/crates/team-agent/src/tmux_backend/tests.rs +8 -7
  53. package/crates/team-agent/src/tmux_backend.rs +80 -6
  54. package/crates/team-agent/src/transport.rs +32 -0
  55. package/package.json +4 -4
package/Cargo.lock CHANGED
@@ -566,7 +566,7 @@ dependencies = [
566
566
 
567
567
  [[package]]
568
568
  name = "team-agent"
569
- version = "0.3.4"
569
+ version = "0.3.5"
570
570
  dependencies = [
571
571
  "anyhow",
572
572
  "chrono",
package/Cargo.toml CHANGED
@@ -9,7 +9,7 @@ members = ["crates/team-agent"]
9
9
 
10
10
  [workspace.package]
11
11
  edition = "2021"
12
- version = "0.3.4"
12
+ version = "0.3.5"
13
13
  license = "AGPL-3.0"
14
14
  rust-version = "1.95"
15
15
 
@@ -1490,6 +1490,14 @@ pub fn cmd_doctor(args: &DoctorArgs) -> Result<CmdResult, CliError> {
1490
1490
  if args.fix && args.gate.is_none() {
1491
1491
  return Err(CliError::Runtime("--fix requires --gate".to_string()));
1492
1492
  }
1493
+ // swallow batch 3 ①: an unknown gate refuses explicitly (Python commands.py:234-235
1494
+ // `unknown doctor gate`), never an empty default-doctor green.
1495
+ if let Some(DoctorGate::Unknown(raw)) = &args.gate {
1496
+ return Ok(CmdResult::from_json(
1497
+ serde_json::json!({"ok": false, "status": "unknown_gate", "gate": raw}),
1498
+ args.json,
1499
+ ));
1500
+ }
1493
1501
  if args.comms || matches!(args.gate, Some(DoctorGate::Comms)) {
1494
1502
  let value = crate::diagnose::comms::doctor_comms_json(&args.workspace, args.team.as_deref(), Some("comms"))?;
1495
1503
  if !args.json {
@@ -289,21 +289,46 @@ fn copy_optional_field(from: &Value, to: &mut Value, key: &str) {
289
289
  }
290
290
 
291
291
  pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64) -> Result<Value, CliError> {
292
- let selected = crate::state::selector::resolve_active_team(
292
+ // swallow batch 3 ③: an unreadable runtime state must never read as "ready" — the
293
+ // read error is surfaced verbatim (state_read_error) with ready=false instead of
294
+ // silently degrading to an empty/stale state.
295
+ let selected = match crate::state::selector::resolve_active_team(
293
296
  workspace,
294
297
  None,
295
298
  crate::state::selector::SelectorMode::RuntimeOnly,
296
- )
297
- .map_err(|e| CliError::Runtime(e.to_string()))?;
299
+ ) {
300
+ Ok(selected) => selected,
301
+ Err(error) => {
302
+ return Ok(json!({
303
+ "ok": false,
304
+ "status": "error",
305
+ "reason": "state_read_error",
306
+ "state_read_error": error.to_string(),
307
+ "readiness": {"ready": false},
308
+ "summary": "runtime state could not be read",
309
+ "next_actions": [json!("inspect .team/runtime/state.json (corrupt or unreadable) and retry")],
310
+ }));
311
+ }
312
+ };
298
313
  let timeout = if timeout.is_finite() && timeout > 0.0 { timeout } else { 0.0 };
299
314
  let deadline = std::time::Instant::now() + std::time::Duration::from_secs_f64(timeout);
300
315
  let mut readiness;
316
+ let mut state_read_error: Option<String> = None;
301
317
  loop {
302
- let mut state = crate::state::projection::select_runtime_state(
318
+ let mut state = match crate::state::projection::select_runtime_state(
303
319
  &selected.run_workspace,
304
320
  Some(&selected.team_key),
305
- )
306
- .unwrap_or_else(|_| selected.state.clone());
321
+ ) {
322
+ Ok(state) => {
323
+ state_read_error = None;
324
+ state
325
+ }
326
+ Err(error) => {
327
+ state_read_error = Some(error.to_string());
328
+ readiness = json!({"ready": false, "state_read_error": error.to_string()});
329
+ break;
330
+ }
331
+ };
307
332
  inject_tmux_session_present(&selected.run_workspace, &mut state);
308
333
  inject_message_counts(&selected.run_workspace, &mut state)?;
309
334
  readiness = wait_readiness(&state);
@@ -322,7 +347,15 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
322
347
  .and_then(Value::as_bool)
323
348
  == Some(true);
324
349
  let ready = readiness.get("ready").and_then(Value::as_bool) == Some(true);
325
- let (ok, status, reason, summary, next_actions) = if awaiting_trust {
350
+ let (ok, status, reason, summary, next_actions) = if state_read_error.is_some() {
351
+ (
352
+ false,
353
+ "error",
354
+ "state_read_error",
355
+ "runtime state could not be read",
356
+ vec![json!("inspect .team/runtime/state.json (corrupt or unreadable) and retry")],
357
+ )
358
+ } else if awaiting_trust {
326
359
  (
327
360
  false,
328
361
  "pending",
@@ -360,7 +393,7 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
360
393
  "readiness": readiness,
361
394
  }),
362
395
  )?;
363
- Ok(json!({
396
+ let mut report = json!({
364
397
  "details_log": details_log.to_string_lossy().to_string(),
365
398
  "next_actions": next_actions,
366
399
  "ok": ok,
@@ -368,7 +401,11 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
368
401
  "readiness": readiness,
369
402
  "status": status,
370
403
  "summary": summary,
371
- }))
404
+ });
405
+ if let Some(error) = state_read_error {
406
+ report["state_read_error"] = json!(error);
407
+ }
408
+ Ok(report)
372
409
  }
373
410
 
374
411
  fn inject_tmux_session_present(workspace: &std::path::Path, state: &mut Value) {
@@ -392,10 +429,12 @@ pub(crate) fn wait_readiness(state: &Value) -> Value {
392
429
  let mut task_prompt_delivered = false;
393
430
  let mut awaiting_trust_prompt = false;
394
431
  let mut incomplete_sessions = Vec::new();
432
+ // A-5: a missing/unreadable leader_receiver must NOT count as attached —
433
+ // "unreadable is never ready" (doctor/wait-ready truthfulness rule).
395
434
  let all_attached_receiver = state
396
435
  .get("leader_receiver")
397
436
  .and_then(Value::as_object)
398
- .is_none_or(|receiver| {
437
+ .is_some_and(|receiver| {
399
438
  receiver
400
439
  .get("status")
401
440
  .and_then(Value::as_str)
@@ -673,6 +712,7 @@ fn provider_wire(provider: crate::provider::Provider) -> &'static str {
673
712
  crate::provider::Provider::Claude => "claude",
674
713
  crate::provider::Provider::ClaudeCode => "claude_code",
675
714
  crate::provider::Provider::Codex => "codex",
715
+ crate::provider::Provider::Copilot => "copilot",
676
716
  crate::provider::Provider::GeminiCli => "gemini_cli",
677
717
  crate::provider::Provider::Fake => "fake",
678
718
  }
@@ -682,6 +722,7 @@ fn provider_command(provider: crate::provider::Provider) -> &'static str {
682
722
  match provider {
683
723
  crate::provider::Provider::Claude | crate::provider::Provider::ClaudeCode => "claude",
684
724
  crate::provider::Provider::Codex => "codex",
725
+ crate::provider::Provider::Copilot => "copilot",
685
726
  crate::provider::Provider::GeminiCli => "gemini",
686
727
  crate::provider::Provider::Fake => "team-agent fake-worker",
687
728
  }
@@ -963,7 +963,8 @@ fn doctor_gate(raw: Option<&str>) -> Option<DoctorGate> {
963
963
  match raw {
964
964
  Some("orphans") => Some(DoctorGate::Orphans),
965
965
  Some("comms") => Some(DoctorGate::Comms),
966
- _ => None,
966
+ Some(other) => Some(DoctorGate::Unknown(other.to_string())),
967
+ None => None,
967
968
  }
968
969
  }
969
970
 
@@ -176,11 +176,60 @@ pub mod lifecycle_port {
176
176
  let result =
177
177
  shutdown_with_transport_and_state(workspace, keep_logs, team, &transport, Some(state));
178
178
  if team.is_none() {
179
- transport.kill_server();
179
+ // B5/F1: the leader terminal (`team-agent claude`) lives on this same
180
+ // workspace socket by design (leader/start.rs); a bare shutdown must not
181
+ // `kill-server` it away. Spare `team-agent-leader-*` sessions and clear the
182
+ // remaining non-leader sessions individually; only an empty-of-leader socket
183
+ // gets the whole-server teardown (the original leak-cleanup intent).
184
+ let transport_dyn: &dyn crate::transport::Transport = &transport;
185
+ let sessions = socket_session_names(transport_dyn);
186
+ match sessions_to_kill_sparing_leader(&sessions) {
187
+ None => transport.kill_server(),
188
+ Some(non_leader_sessions) => {
189
+ for session in &non_leader_sessions {
190
+ let _ = transport_dyn.kill_session(session);
191
+ }
192
+ }
193
+ }
180
194
  }
181
195
  result
182
196
  }
183
197
 
198
+ fn socket_session_names(
199
+ transport: &dyn crate::transport::Transport,
200
+ ) -> Vec<crate::transport::SessionName> {
201
+ let mut seen = std::collections::BTreeSet::new();
202
+ transport
203
+ .list_targets()
204
+ .unwrap_or_default()
205
+ .into_iter()
206
+ .map(|pane| pane.session)
207
+ .filter(|session| seen.insert(session.as_str().to_string()))
208
+ .collect()
209
+ }
210
+
211
+ /// B5/F1 pure kill decision for the bare-shutdown socket teardown.
212
+ /// `None` => no `team-agent-leader-*` session on the socket → safe to kill the whole
213
+ /// server. `Some(rest)` => leader present → kill only the non-leader sessions.
214
+ pub(crate) fn sessions_to_kill_sparing_leader(
215
+ sessions: &[crate::transport::SessionName],
216
+ ) -> Option<Vec<crate::transport::SessionName>> {
217
+ let leader_present = sessions
218
+ .iter()
219
+ .any(|session| session.as_str().starts_with(crate::leader::LEADER_SESSION_PREFIX));
220
+ leader_present.then(|| {
221
+ sessions
222
+ .iter()
223
+ .filter(|session| {
224
+ !session
225
+ .as_str()
226
+ .starts_with(crate::leader::LEADER_SESSION_PREFIX)
227
+ })
228
+ .cloned()
229
+ .collect()
230
+ })
231
+ }
232
+
184
233
  pub fn shutdown_with_transport(
185
234
  workspace: &Path,
186
235
  keep_logs: bool,
@@ -223,7 +272,14 @@ pub mod lifecycle_port {
223
272
  .and_then(Value::as_str)
224
273
  .filter(|s| !s.is_empty())
225
274
  .map(crate::transport::SessionName::new);
226
- let protected = shutdown_protection_set();
275
+ // PERF-6 C-①-1: ONE process-table snapshot for the whole happy path; the
276
+ // protected / pgid / kill / wait sets all derive from it (N39 same-source).
277
+ // A probe failure is observable, not a silent empty table (swallow batch 1).
278
+ let mut probe_degraded = false;
279
+ let entry_table = shutdown_table_snapshot(&run_workspace, &mut probe_degraded, "entry");
280
+ let mut protected = shutdown_protection_set(&entry_table);
281
+ extend_protection_with_leader_panes(&mut protected, transport, &entry_table);
282
+ let protected = protected;
227
283
  let reap_scope = if team.is_some() {
228
284
  ShutdownReapScope::ScopedTeam
229
285
  } else {
@@ -246,11 +302,9 @@ pub mod lifecycle_port {
246
302
  root_pids.extend(pane_pids);
247
303
  root_pids.sort_unstable();
248
304
  root_pids.dedup();
249
- let root_pgids = process_pgids(&root_pids, &protected);
305
+ let root_pgids = process_pgids(&root_pids, &protected, &entry_table);
250
306
  deadline.check("reap_process_tree")?;
251
- for pid in &root_pids {
252
- reap_process_tree(*pid, &protected);
253
- }
307
+ reap_process_tree(&root_pids, &protected, &entry_table);
254
308
  reap_process_groups(&root_pgids, &protected);
255
309
  let mut kill_error: Option<String> = None;
256
310
  deadline.check("kill_session")?;
@@ -267,8 +321,9 @@ pub mod lifecycle_port {
267
321
  &state,
268
322
  &root_pids,
269
323
  &root_pgids,
270
- &protected,
324
+ transport,
271
325
  reap_scope,
326
+ &mut probe_degraded,
272
327
  );
273
328
  deadline.check("session_residuals")?;
274
329
  let session_residuals = if let Some(session) = session_name.as_ref() {
@@ -286,6 +341,10 @@ pub mod lifecycle_port {
286
341
  Vec::new()
287
342
  };
288
343
  deadline.check("process_residuals")?;
344
+ // C-①: the post-verify gets ONE fresh verification snapshot (reaps changed
345
+ // the world; #248 post-verify facts must be current, not the entry view).
346
+ let verify_table =
347
+ shutdown_table_snapshot(&run_workspace, &mut probe_degraded, "post_verify");
289
348
  let process_residuals = process_residuals(
290
349
  &run_workspace,
291
350
  &state,
@@ -293,6 +352,7 @@ pub mod lifecycle_port {
293
352
  &root_pgids,
294
353
  &protected,
295
354
  reap_scope,
355
+ &verify_table,
296
356
  );
297
357
  deadline.check("stop_coordinator")?;
298
358
  let mut coordinator_timeout = false;
@@ -313,7 +373,9 @@ pub mod lifecycle_port {
313
373
  None
314
374
  };
315
375
  let probe_timeout = crate::os_probe::probe_timeout();
316
- let verification_degraded = probe_timeout.is_some();
376
+ // swallow batch 1: a failed ps probe degrades verification truthfully — the
377
+ // empty table must never read as a clean "no residual processes".
378
+ let verification_degraded = probe_timeout.is_some() || probe_degraded;
317
379
  let session_killed = session_name.is_some()
318
380
  && kill_error.is_none()
319
381
  && session_residuals.is_empty()
@@ -393,6 +455,7 @@ pub mod lifecycle_port {
393
455
  "status": status,
394
456
  "phase": phase,
395
457
  "verification_degraded": verification_degraded,
458
+ "probe_degraded": probe_degraded,
396
459
  "probe_timeout_kind": probe_timeout_kind,
397
460
  "probe_timeout": probe_timeout_value,
398
461
  "keep_logs": keep_logs,
@@ -574,11 +637,29 @@ pub mod lifecycle_port {
574
637
  .filter(|pid| *pid > 0)
575
638
  }
576
639
 
577
- fn reap_process_tree(root_pid: u32, protected: &ShutdownProtection) {
578
- let pids = process_tree_pids(root_pid)
579
- .into_iter()
580
- .filter(|pid| !protected.contains_pid(*pid))
581
- .collect::<Vec<_>>();
640
+ /// PERF-6 C-② batched signals: the UNION of all root trees gets SIGTERM, shares ONE
641
+ /// >=150ms grace window (no single pid's grace is shortened — the serial per-root
642
+ /// chain is what's removed), then the union gets SIGKILL (noop for already-dead
643
+ /// pids; Gap 37 escalation order TERM -> grace -> KILL preserved), then a single
644
+ /// bounded wait for the whole union. kill/wait sets derive from the SAME snapshot
645
+ /// as the protected set (N39).
646
+ fn reap_process_tree(
647
+ root_pids: &[u32],
648
+ protected: &ShutdownProtection,
649
+ table: &[ProcessInfo],
650
+ ) {
651
+ let mut pids = Vec::new();
652
+ let mut seen = std::collections::BTreeSet::new();
653
+ for root in root_pids {
654
+ for pid in process_tree_from_table(*root, table) {
655
+ if !protected.contains_pid(pid) && seen.insert(pid) {
656
+ pids.push(pid);
657
+ }
658
+ }
659
+ }
660
+ if pids.is_empty() {
661
+ return;
662
+ }
582
663
  for pid in pids.iter().rev() {
583
664
  send_process_signal(*pid, libc::SIGTERM);
584
665
  }
@@ -611,86 +692,83 @@ pub mod lifecycle_port {
611
692
  }
612
693
  }
613
694
 
695
+ /// PERF-6 C-①-2 + C-②-5: every residual round fetches ONE fresh snapshot (reap
696
+ /// changed the world) and re-derives the protected set from THAT snapshot; all
697
+ /// in-round consumers (match + tree walks) reuse it.
614
698
  fn reap_workspace_process_residuals(
615
699
  workspace: &Path,
616
700
  state: &Value,
617
701
  root_pids: &[u32],
618
702
  root_pgids: &[u32],
619
- protected: &ShutdownProtection,
703
+ transport: &dyn crate::transport::Transport,
620
704
  scope: ShutdownReapScope,
705
+ probe_degraded: &mut bool,
621
706
  ) {
622
707
  for _ in 0..5 {
623
- let residuals =
624
- matched_processes(workspace, state, root_pids, root_pgids, protected, scope);
708
+ let round_table = shutdown_table_snapshot(workspace, probe_degraded, "residual_round");
709
+ let mut protected = shutdown_protection_set(&round_table);
710
+ extend_protection_with_leader_panes(&mut protected, transport, &round_table);
711
+ let residuals = matched_processes(
712
+ workspace, state, root_pids, root_pgids, &protected, scope, &round_table,
713
+ );
625
714
  if residuals.is_empty() {
626
715
  return;
627
716
  }
628
- for process in &residuals {
629
- reap_process_tree(process.pid, protected);
630
- }
717
+ let residual_pids = residuals.iter().map(|process| process.pid).collect::<Vec<_>>();
718
+ reap_process_tree(&residual_pids, &protected, &round_table);
631
719
  let pgids = residuals
632
720
  .iter()
633
721
  .filter_map(|process| process.pgid)
634
722
  .collect::<Vec<_>>();
635
- reap_process_groups(&pgids, protected);
723
+ reap_process_groups(&pgids, &protected);
636
724
  std::thread::sleep(std::time::Duration::from_millis(100));
637
725
  }
638
726
  }
639
727
 
640
- fn process_tree_pids(root_pid: u32) -> Vec<u32> {
641
- if root_pid == 0 {
642
- return Vec::new();
643
- }
644
- let pairs = process_parent_pairs();
645
- let mut out = vec![root_pid];
646
- let mut seen = std::collections::BTreeSet::new();
647
- seen.insert(root_pid);
648
- let mut index = 0;
649
- while index < out.len() {
650
- let parent = out[index];
651
- for (pid, ppid) in &pairs {
652
- if *ppid == parent && seen.insert(*pid) {
653
- out.push(*pid);
654
- }
655
- }
656
- index += 1;
657
- }
658
- out
659
- }
660
-
661
- fn process_parent_pairs() -> Vec<(u32, u32)> {
662
- let output = match crate::os_probe::bounded_command_output_with_probe(
663
- std::process::Command::new("ps").args(["-axo", "pid=,ppid="]),
664
- "ps_parent",
728
+ /// swallow batch 1: the raw ps probe with an explicit error channel — a failed
729
+ /// probe must never masquerade as "no processes" (CLAUDE.md §5).
730
+ fn probed_process_table() -> Result<Vec<ProcessInfo>, String> {
731
+ match crate::os_probe::bounded_command_output_with_probe(
732
+ std::process::Command::new("ps").args(["-axo", "pid=,ppid=,pgid=,sess=,command="]),
733
+ "ps_table",
665
734
  None,
666
735
  ) {
667
- Ok(output) if output.status.success() => output,
668
- _ => return Vec::new(),
669
- };
670
- String::from_utf8_lossy(&output.stdout)
671
- .lines()
672
- .filter_map(|line| {
673
- let mut parts = line.split_whitespace();
674
- let pid = parts.next()?.parse::<u32>().ok()?;
675
- let ppid = parts.next()?.parse::<u32>().ok()?;
676
- Some((pid, ppid))
677
- })
678
- .collect()
736
+ Ok(output) if output.status.success() => Ok(String::from_utf8_lossy(&output.stdout)
737
+ .lines()
738
+ .filter_map(parse_process_info)
739
+ .collect()),
740
+ Ok(output) => Err(format!("ps exited with status {:?}", output.status.code())),
741
+ Err(error) => Err(error.to_string()),
742
+ }
679
743
  }
680
744
 
681
745
  fn process_table() -> Vec<ProcessInfo> {
682
- let output = match crate::os_probe::bounded_command_output_with_probe(
683
- std::process::Command::new("ps").args(["-axo", "pid=,ppid=,pgid=,sess=,command="]),
684
- "ps_table",
685
- None,
686
- ) {
687
- Ok(output) if output.status.success() => output,
688
- _ => return Vec::new(),
689
- };
690
- String::from_utf8_lossy(&output.stdout)
691
- .lines()
692
- .filter_map(parse_process_info)
693
- .collect()
746
+ probed_process_table().unwrap_or_default()
747
+ }
748
+
749
+ /// PERF-6 C-①-1 / swallow batch 1: the shutdown-scope snapshot fetch. A probe
750
+ /// failure writes a `shutdown.process_probe_failed` event (non-null error) and
751
+ /// marks the run degraded instead of silently treating it as "no processes".
752
+ fn shutdown_table_snapshot(
753
+ workspace: &Path,
754
+ probe_degraded: &mut bool,
755
+ phase: &str,
756
+ ) -> Vec<ProcessInfo> {
757
+ match probed_process_table() {
758
+ Ok(table) => table,
759
+ Err(error) => {
760
+ *probe_degraded = true;
761
+ let _ = crate::event_log::EventLog::new(workspace).write(
762
+ "shutdown.process_probe_failed",
763
+ json!({
764
+ "phase": phase,
765
+ "probe": "ps_table",
766
+ "error": error,
767
+ }),
768
+ );
769
+ Vec::new()
770
+ }
771
+ }
694
772
  }
695
773
 
696
774
  fn parse_process_info(line: &str) -> Option<ProcessInfo> {
@@ -739,8 +817,9 @@ pub mod lifecycle_port {
739
817
  }
740
818
  }
741
819
 
742
- fn shutdown_protection_set() -> ShutdownProtection {
743
- let table = process_table();
820
+ /// PERF-6 C-①-1/C-②-4 (N39): the protected set derives from the CALLER's snapshot —
821
+ /// the same table the kill/wait sets derive from.
822
+ fn shutdown_protection_set(table: &[ProcessInfo]) -> ShutdownProtection {
744
823
  let mut protected = ShutdownProtection::default();
745
824
  let current = std::process::id();
746
825
  protected.pids.insert(current);
@@ -765,6 +844,61 @@ pub mod lifecycle_port {
765
844
  protected
766
845
  }
767
846
 
847
+ /// B5/F2: the leader terminal's pane process tree joins the protected set (same
848
+ /// set, same mechanism as the invoker ancestry) so the workspace residual sweep's
849
+ /// cmdline/cwd matching cannot reap the leader — including when ANOTHER team's bare
850
+ /// shutdown runs, where the leader is never in the invoker's ancestry.
851
+ fn extend_protection_with_leader_panes(
852
+ protected: &mut ShutdownProtection,
853
+ transport: &dyn crate::transport::Transport,
854
+ table: &[ProcessInfo],
855
+ ) {
856
+ let leader_pane_pids: Vec<u32> = transport
857
+ .list_targets()
858
+ .unwrap_or_default()
859
+ .into_iter()
860
+ .filter(|pane| {
861
+ pane.session
862
+ .as_str()
863
+ .starts_with(crate::leader::LEADER_SESSION_PREFIX)
864
+ })
865
+ .filter_map(|pane| pane.pane_pid)
866
+ .collect();
867
+ if leader_pane_pids.is_empty() {
868
+ return;
869
+ }
870
+ for root in &leader_pane_pids {
871
+ for pid in process_tree_from_table(*root, table) {
872
+ protected.pids.insert(pid);
873
+ if let Some(pgid) = table
874
+ .iter()
875
+ .find(|process| process.pid == pid)
876
+ .and_then(|process| process.pgid)
877
+ {
878
+ protected.pgids.insert(pgid);
879
+ }
880
+ }
881
+ }
882
+ // The tmux SERVER carrying the leader pane must survive too: its command line
883
+ // contains the workspace path (it was started with the worker spawn command), so
884
+ // the residual sweep matches it, and killing the server SIGHUPs every pane —
885
+ // including the protected leader — bypassing per-pid protection. Protect the
886
+ // server pid itself (NOT its tree: worker panes must still die).
887
+ for pane_pid in &leader_pane_pids {
888
+ if let Some(server) = table
889
+ .iter()
890
+ .find(|process| process.pid == *pane_pid)
891
+ .and_then(|pane| table.iter().find(|process| process.pid == pane.ppid))
892
+ .filter(|server| server.pid > 1)
893
+ {
894
+ protected.pids.insert(server.pid);
895
+ if let Some(pgid) = server.pgid {
896
+ protected.pgids.insert(pgid);
897
+ }
898
+ }
899
+ }
900
+ }
901
+
768
902
  fn send_process_signal(pid: u32, signal: libc::c_int) {
769
903
  let Ok(pid_t) = libc::pid_t::try_from(pid) else {
770
904
  return;
@@ -815,8 +949,11 @@ pub mod lifecycle_port {
815
949
  err.raw_os_error() == Some(libc::EPERM)
816
950
  }
817
951
 
818
- fn process_pgids(pids: &[u32], protected: &ShutdownProtection) -> Vec<u32> {
819
- let table = process_table();
952
+ fn process_pgids(
953
+ pids: &[u32],
954
+ protected: &ShutdownProtection,
955
+ table: &[ProcessInfo],
956
+ ) -> Vec<u32> {
820
957
  let mut pgids = pids
821
958
  .iter()
822
959
  .filter_map(|pid| table.iter().find(|process| process.pid == *pid))
@@ -839,9 +976,10 @@ pub mod lifecycle_port {
839
976
  root_pgids: &[u32],
840
977
  protected: &ShutdownProtection,
841
978
  scope: ShutdownReapScope,
979
+ table: &[ProcessInfo],
842
980
  ) -> Vec<Value> {
843
981
  let mut residuals =
844
- matched_processes(workspace, state, root_pids, root_pgids, protected, scope);
982
+ matched_processes(workspace, state, root_pids, root_pgids, protected, scope, table);
845
983
  let mut seen = residuals
846
984
  .iter()
847
985
  .map(|process| process.pid)
@@ -878,11 +1016,11 @@ pub mod lifecycle_port {
878
1016
  root_pgids: &[u32],
879
1017
  protected: &ShutdownProtection,
880
1018
  scope: ShutdownReapScope,
1019
+ table: &[ProcessInfo],
881
1020
  ) -> Vec<ProcessInfo> {
882
- let table = process_table();
883
1021
  let root_tree = root_pids
884
1022
  .iter()
885
- .flat_map(|pid| process_tree_from_table(*pid, &table))
1023
+ .flat_map(|pid| process_tree_from_table(*pid, table))
886
1024
  .filter(|pid| !protected.contains_pid(*pid))
887
1025
  .collect::<std::collections::BTreeSet<_>>();
888
1026
  let root_pgids = root_pgids
@@ -899,7 +1037,7 @@ pub mod lifecycle_port {
899
1037
  }
900
1038
  let matches_workspace = scope == ShutdownReapScope::Workspace
901
1039
  && process_matches_workspace(
902
- &process,
1040
+ process,
903
1041
  &workspace_text,
904
1042
  &spawn_cwds,
905
1043
  &mut cwd_probe_budget,
@@ -908,7 +1046,7 @@ pub mod lifecycle_port {
908
1046
  || root_tree.contains(&process.pid)
909
1047
  || process.pgid.is_some_and(|pgid| root_pgids.contains(&pgid))
910
1048
  {
911
- out.push(process);
1049
+ out.push(process.clone());
912
1050
  }
913
1051
  }
914
1052
  out
@@ -1132,10 +1270,9 @@ pub mod lifecycle_port {
1132
1270
  open_display: bool,
1133
1271
  team: Option<&str>,
1134
1272
  ) -> Result<Value, CliError> {
1135
- let _ = label;
1136
1273
  let source = crate::model::ids::AgentId::new(source_agent);
1137
1274
  let dest = crate::model::ids::AgentId::new(as_agent_id);
1138
- match crate::lifecycle::fork_agent(workspace, &source, &dest, open_display, team) {
1275
+ match crate::lifecycle::fork_agent(workspace, &source, &dest, label, open_display, team) {
1139
1276
  Ok(report) => Ok(json!({
1140
1277
  "ok": true,
1141
1278
  "source_agent_id": report.source_agent_id.as_str(),
@@ -202,6 +202,7 @@ fn delivery_refusal_wire(reason: DeliveryRefusal) -> &'static str {
202
202
  DeliveryRefusal::SessionDrift => "session_drift",
203
203
  DeliveryRefusal::Duplicate => "duplicate",
204
204
  DeliveryRefusal::RoutingAmbiguous => "routing_ambiguous",
205
+ DeliveryRefusal::EmptyTargetList => "empty_target_list",
205
206
  }
206
207
  }
207
208