@team-agent/installer 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +1 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/adapters.rs +8 -0
- package/crates/team-agent/src/cli/diagnose.rs +51 -10
- package/crates/team-agent/src/cli/emit.rs +2 -1
- package/crates/team-agent/src/cli/mod.rs +217 -80
- package/crates/team-agent/src/cli/send.rs +1 -0
- package/crates/team-agent/src/cli/status_port.rs +135 -7
- package/crates/team-agent/src/cli/tests/missing_subcommands.rs +8 -1
- package/crates/team-agent/src/cli/tests/mod.rs +1 -0
- package/crates/team-agent/src/cli/tests/shutdown_kill_plan.rs +39 -0
- package/crates/team-agent/src/cli/types.rs +5 -1
- package/crates/team-agent/src/coordinator/backoff.rs +57 -9
- package/crates/team-agent/src/coordinator/health.rs +65 -2
- package/crates/team-agent/src/coordinator/runtime_detectors.rs +28 -16
- package/crates/team-agent/src/coordinator/tests/a0_lostupdate.rs +87 -0
- package/crates/team-agent/src/coordinator/tests/mod.rs +1 -0
- package/crates/team-agent/src/coordinator/tick.rs +195 -43
- package/crates/team-agent/src/leader/helpers.rs +2 -0
- package/crates/team-agent/src/leader/rediscover.rs +1 -0
- package/crates/team-agent/src/leader/start.rs +9 -1
- package/crates/team-agent/src/leader/takeover.rs +18 -1
- package/crates/team-agent/src/lifecycle/launch.rs +434 -29
- package/crates/team-agent/src/lifecycle/profile_launch.rs +110 -4
- package/crates/team-agent/src/lifecycle/profile_smoke.rs +4 -1
- package/crates/team-agent/src/lifecycle/restart/common.rs +19 -2
- package/crates/team-agent/src/lifecycle/tests/agent_ops.rs +2 -2
- package/crates/team-agent/src/lifecycle/tests/core.rs +1 -1
- package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +4 -4
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +3 -1
- package/crates/team-agent/src/lifecycle/worker_command_context.rs +44 -9
- package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +2 -1
- package/crates/team-agent/src/mcp_server/tests/scoped.rs +14 -1
- package/crates/team-agent/src/mcp_server/tests/send.rs +15 -1
- package/crates/team-agent/src/mcp_server/tools.rs +65 -9
- package/crates/team-agent/src/mcp_server/wire.rs +2 -1
- package/crates/team-agent/src/message_store.rs +80 -0
- package/crates/team-agent/src/messaging/results.rs +76 -5
- package/crates/team-agent/src/messaging/send.rs +3 -1
- package/crates/team-agent/src/messaging/types.rs +15 -1
- package/crates/team-agent/src/messaging/watchers.rs +68 -30
- package/crates/team-agent/src/model/enums.rs +7 -1
- package/crates/team-agent/src/model/permissions.rs +7 -0
- package/crates/team-agent/src/model/spec.rs +3 -1
- package/crates/team-agent/src/provider/adapter.rs +472 -7
- package/crates/team-agent/src/provider/classify.rs +6 -2
- package/crates/team-agent/src/provider/faults.rs +3 -2
- package/crates/team-agent/src/provider/startup_prompt.rs +25 -7
- package/crates/team-agent/src/provider/types.rs +11 -0
- package/crates/team-agent/src/session_capture.rs +1 -0
- package/crates/team-agent/src/state/persist.rs +95 -19
- package/crates/team-agent/src/tmux_backend/tests.rs +8 -7
- package/crates/team-agent/src/tmux_backend.rs +80 -6
- package/crates/team-agent/src/transport.rs +32 -0
- package/npm/install.mjs +21 -0
- package/package.json +4 -4
package/Cargo.lock
CHANGED
package/Cargo.toml
CHANGED
|
@@ -1490,6 +1490,14 @@ pub fn cmd_doctor(args: &DoctorArgs) -> Result<CmdResult, CliError> {
|
|
|
1490
1490
|
if args.fix && args.gate.is_none() {
|
|
1491
1491
|
return Err(CliError::Runtime("--fix requires --gate".to_string()));
|
|
1492
1492
|
}
|
|
1493
|
+
// swallow batch 3 ①: an unknown gate refuses explicitly (Python commands.py:234-235
|
|
1494
|
+
// `unknown doctor gate`), never an empty default-doctor green.
|
|
1495
|
+
if let Some(DoctorGate::Unknown(raw)) = &args.gate {
|
|
1496
|
+
return Ok(CmdResult::from_json(
|
|
1497
|
+
serde_json::json!({"ok": false, "status": "unknown_gate", "gate": raw}),
|
|
1498
|
+
args.json,
|
|
1499
|
+
));
|
|
1500
|
+
}
|
|
1493
1501
|
if args.comms || matches!(args.gate, Some(DoctorGate::Comms)) {
|
|
1494
1502
|
let value = crate::diagnose::comms::doctor_comms_json(&args.workspace, args.team.as_deref(), Some("comms"))?;
|
|
1495
1503
|
if !args.json {
|
|
@@ -289,21 +289,46 @@ fn copy_optional_field(from: &Value, to: &mut Value, key: &str) {
|
|
|
289
289
|
}
|
|
290
290
|
|
|
291
291
|
pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64) -> Result<Value, CliError> {
|
|
292
|
-
|
|
292
|
+
// swallow batch 3 ③: an unreadable runtime state must never read as "ready" — the
|
|
293
|
+
// read error is surfaced verbatim (state_read_error) with ready=false instead of
|
|
294
|
+
// silently degrading to an empty/stale state.
|
|
295
|
+
let selected = match crate::state::selector::resolve_active_team(
|
|
293
296
|
workspace,
|
|
294
297
|
None,
|
|
295
298
|
crate::state::selector::SelectorMode::RuntimeOnly,
|
|
296
|
-
)
|
|
297
|
-
|
|
299
|
+
) {
|
|
300
|
+
Ok(selected) => selected,
|
|
301
|
+
Err(error) => {
|
|
302
|
+
return Ok(json!({
|
|
303
|
+
"ok": false,
|
|
304
|
+
"status": "error",
|
|
305
|
+
"reason": "state_read_error",
|
|
306
|
+
"state_read_error": error.to_string(),
|
|
307
|
+
"readiness": {"ready": false},
|
|
308
|
+
"summary": "runtime state could not be read",
|
|
309
|
+
"next_actions": [json!("inspect .team/runtime/state.json (corrupt or unreadable) and retry")],
|
|
310
|
+
}));
|
|
311
|
+
}
|
|
312
|
+
};
|
|
298
313
|
let timeout = if timeout.is_finite() && timeout > 0.0 { timeout } else { 0.0 };
|
|
299
314
|
let deadline = std::time::Instant::now() + std::time::Duration::from_secs_f64(timeout);
|
|
300
315
|
let mut readiness;
|
|
316
|
+
let mut state_read_error: Option<String> = None;
|
|
301
317
|
loop {
|
|
302
|
-
let mut state = crate::state::projection::select_runtime_state(
|
|
318
|
+
let mut state = match crate::state::projection::select_runtime_state(
|
|
303
319
|
&selected.run_workspace,
|
|
304
320
|
Some(&selected.team_key),
|
|
305
|
-
)
|
|
306
|
-
|
|
321
|
+
) {
|
|
322
|
+
Ok(state) => {
|
|
323
|
+
state_read_error = None;
|
|
324
|
+
state
|
|
325
|
+
}
|
|
326
|
+
Err(error) => {
|
|
327
|
+
state_read_error = Some(error.to_string());
|
|
328
|
+
readiness = json!({"ready": false, "state_read_error": error.to_string()});
|
|
329
|
+
break;
|
|
330
|
+
}
|
|
331
|
+
};
|
|
307
332
|
inject_tmux_session_present(&selected.run_workspace, &mut state);
|
|
308
333
|
inject_message_counts(&selected.run_workspace, &mut state)?;
|
|
309
334
|
readiness = wait_readiness(&state);
|
|
@@ -322,7 +347,15 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
|
|
|
322
347
|
.and_then(Value::as_bool)
|
|
323
348
|
== Some(true);
|
|
324
349
|
let ready = readiness.get("ready").and_then(Value::as_bool) == Some(true);
|
|
325
|
-
let (ok, status, reason, summary, next_actions) = if
|
|
350
|
+
let (ok, status, reason, summary, next_actions) = if state_read_error.is_some() {
|
|
351
|
+
(
|
|
352
|
+
false,
|
|
353
|
+
"error",
|
|
354
|
+
"state_read_error",
|
|
355
|
+
"runtime state could not be read",
|
|
356
|
+
vec![json!("inspect .team/runtime/state.json (corrupt or unreadable) and retry")],
|
|
357
|
+
)
|
|
358
|
+
} else if awaiting_trust {
|
|
326
359
|
(
|
|
327
360
|
false,
|
|
328
361
|
"pending",
|
|
@@ -360,7 +393,7 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
|
|
|
360
393
|
"readiness": readiness,
|
|
361
394
|
}),
|
|
362
395
|
)?;
|
|
363
|
-
|
|
396
|
+
let mut report = json!({
|
|
364
397
|
"details_log": details_log.to_string_lossy().to_string(),
|
|
365
398
|
"next_actions": next_actions,
|
|
366
399
|
"ok": ok,
|
|
@@ -368,7 +401,11 @@ pub(crate) fn build_wait_ready_report(workspace: &std::path::Path, timeout: f64)
|
|
|
368
401
|
"readiness": readiness,
|
|
369
402
|
"status": status,
|
|
370
403
|
"summary": summary,
|
|
371
|
-
})
|
|
404
|
+
});
|
|
405
|
+
if let Some(error) = state_read_error {
|
|
406
|
+
report["state_read_error"] = json!(error);
|
|
407
|
+
}
|
|
408
|
+
Ok(report)
|
|
372
409
|
}
|
|
373
410
|
|
|
374
411
|
fn inject_tmux_session_present(workspace: &std::path::Path, state: &mut Value) {
|
|
@@ -392,10 +429,12 @@ pub(crate) fn wait_readiness(state: &Value) -> Value {
|
|
|
392
429
|
let mut task_prompt_delivered = false;
|
|
393
430
|
let mut awaiting_trust_prompt = false;
|
|
394
431
|
let mut incomplete_sessions = Vec::new();
|
|
432
|
+
// A-5: a missing/unreadable leader_receiver must NOT count as attached —
|
|
433
|
+
// "unreadable is never ready" (doctor/wait-ready truthfulness rule).
|
|
395
434
|
let all_attached_receiver = state
|
|
396
435
|
.get("leader_receiver")
|
|
397
436
|
.and_then(Value::as_object)
|
|
398
|
-
.
|
|
437
|
+
.is_some_and(|receiver| {
|
|
399
438
|
receiver
|
|
400
439
|
.get("status")
|
|
401
440
|
.and_then(Value::as_str)
|
|
@@ -673,6 +712,7 @@ fn provider_wire(provider: crate::provider::Provider) -> &'static str {
|
|
|
673
712
|
crate::provider::Provider::Claude => "claude",
|
|
674
713
|
crate::provider::Provider::ClaudeCode => "claude_code",
|
|
675
714
|
crate::provider::Provider::Codex => "codex",
|
|
715
|
+
crate::provider::Provider::Copilot => "copilot",
|
|
676
716
|
crate::provider::Provider::GeminiCli => "gemini_cli",
|
|
677
717
|
crate::provider::Provider::Fake => "fake",
|
|
678
718
|
}
|
|
@@ -682,6 +722,7 @@ fn provider_command(provider: crate::provider::Provider) -> &'static str {
|
|
|
682
722
|
match provider {
|
|
683
723
|
crate::provider::Provider::Claude | crate::provider::Provider::ClaudeCode => "claude",
|
|
684
724
|
crate::provider::Provider::Codex => "codex",
|
|
725
|
+
crate::provider::Provider::Copilot => "copilot",
|
|
685
726
|
crate::provider::Provider::GeminiCli => "gemini",
|
|
686
727
|
crate::provider::Provider::Fake => "team-agent fake-worker",
|
|
687
728
|
}
|
|
@@ -963,7 +963,8 @@ fn doctor_gate(raw: Option<&str>) -> Option<DoctorGate> {
|
|
|
963
963
|
match raw {
|
|
964
964
|
Some("orphans") => Some(DoctorGate::Orphans),
|
|
965
965
|
Some("comms") => Some(DoctorGate::Comms),
|
|
966
|
-
|
|
966
|
+
Some(other) => Some(DoctorGate::Unknown(other.to_string())),
|
|
967
|
+
None => None,
|
|
967
968
|
}
|
|
968
969
|
}
|
|
969
970
|
|
|
@@ -176,11 +176,60 @@ pub mod lifecycle_port {
|
|
|
176
176
|
let result =
|
|
177
177
|
shutdown_with_transport_and_state(workspace, keep_logs, team, &transport, Some(state));
|
|
178
178
|
if team.is_none() {
|
|
179
|
-
|
|
179
|
+
// B5/F1: the leader terminal (`team-agent claude`) lives on this same
|
|
180
|
+
// workspace socket by design (leader/start.rs); a bare shutdown must not
|
|
181
|
+
// `kill-server` it away. Spare `team-agent-leader-*` sessions and clear the
|
|
182
|
+
// remaining non-leader sessions individually; only an empty-of-leader socket
|
|
183
|
+
// gets the whole-server teardown (the original leak-cleanup intent).
|
|
184
|
+
let transport_dyn: &dyn crate::transport::Transport = &transport;
|
|
185
|
+
let sessions = socket_session_names(transport_dyn);
|
|
186
|
+
match sessions_to_kill_sparing_leader(&sessions) {
|
|
187
|
+
None => transport.kill_server(),
|
|
188
|
+
Some(non_leader_sessions) => {
|
|
189
|
+
for session in &non_leader_sessions {
|
|
190
|
+
let _ = transport_dyn.kill_session(session);
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
180
194
|
}
|
|
181
195
|
result
|
|
182
196
|
}
|
|
183
197
|
|
|
198
|
+
fn socket_session_names(
|
|
199
|
+
transport: &dyn crate::transport::Transport,
|
|
200
|
+
) -> Vec<crate::transport::SessionName> {
|
|
201
|
+
let mut seen = std::collections::BTreeSet::new();
|
|
202
|
+
transport
|
|
203
|
+
.list_targets()
|
|
204
|
+
.unwrap_or_default()
|
|
205
|
+
.into_iter()
|
|
206
|
+
.map(|pane| pane.session)
|
|
207
|
+
.filter(|session| seen.insert(session.as_str().to_string()))
|
|
208
|
+
.collect()
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/// B5/F1 pure kill decision for the bare-shutdown socket teardown.
|
|
212
|
+
/// `None` => no `team-agent-leader-*` session on the socket → safe to kill the whole
|
|
213
|
+
/// server. `Some(rest)` => leader present → kill only the non-leader sessions.
|
|
214
|
+
pub(crate) fn sessions_to_kill_sparing_leader(
|
|
215
|
+
sessions: &[crate::transport::SessionName],
|
|
216
|
+
) -> Option<Vec<crate::transport::SessionName>> {
|
|
217
|
+
let leader_present = sessions
|
|
218
|
+
.iter()
|
|
219
|
+
.any(|session| session.as_str().starts_with(crate::leader::LEADER_SESSION_PREFIX));
|
|
220
|
+
leader_present.then(|| {
|
|
221
|
+
sessions
|
|
222
|
+
.iter()
|
|
223
|
+
.filter(|session| {
|
|
224
|
+
!session
|
|
225
|
+
.as_str()
|
|
226
|
+
.starts_with(crate::leader::LEADER_SESSION_PREFIX)
|
|
227
|
+
})
|
|
228
|
+
.cloned()
|
|
229
|
+
.collect()
|
|
230
|
+
})
|
|
231
|
+
}
|
|
232
|
+
|
|
184
233
|
pub fn shutdown_with_transport(
|
|
185
234
|
workspace: &Path,
|
|
186
235
|
keep_logs: bool,
|
|
@@ -223,7 +272,14 @@ pub mod lifecycle_port {
|
|
|
223
272
|
.and_then(Value::as_str)
|
|
224
273
|
.filter(|s| !s.is_empty())
|
|
225
274
|
.map(crate::transport::SessionName::new);
|
|
226
|
-
|
|
275
|
+
// PERF-6 C-①-1: ONE process-table snapshot for the whole happy path; the
|
|
276
|
+
// protected / pgid / kill / wait sets all derive from it (N39 same-source).
|
|
277
|
+
// A probe failure is observable, not a silent empty table (swallow batch 1).
|
|
278
|
+
let mut probe_degraded = false;
|
|
279
|
+
let entry_table = shutdown_table_snapshot(&run_workspace, &mut probe_degraded, "entry");
|
|
280
|
+
let mut protected = shutdown_protection_set(&entry_table);
|
|
281
|
+
extend_protection_with_leader_panes(&mut protected, transport, &entry_table);
|
|
282
|
+
let protected = protected;
|
|
227
283
|
let reap_scope = if team.is_some() {
|
|
228
284
|
ShutdownReapScope::ScopedTeam
|
|
229
285
|
} else {
|
|
@@ -246,11 +302,9 @@ pub mod lifecycle_port {
|
|
|
246
302
|
root_pids.extend(pane_pids);
|
|
247
303
|
root_pids.sort_unstable();
|
|
248
304
|
root_pids.dedup();
|
|
249
|
-
let root_pgids = process_pgids(&root_pids, &protected);
|
|
305
|
+
let root_pgids = process_pgids(&root_pids, &protected, &entry_table);
|
|
250
306
|
deadline.check("reap_process_tree")?;
|
|
251
|
-
|
|
252
|
-
reap_process_tree(*pid, &protected);
|
|
253
|
-
}
|
|
307
|
+
reap_process_tree(&root_pids, &protected, &entry_table);
|
|
254
308
|
reap_process_groups(&root_pgids, &protected);
|
|
255
309
|
let mut kill_error: Option<String> = None;
|
|
256
310
|
deadline.check("kill_session")?;
|
|
@@ -267,8 +321,9 @@ pub mod lifecycle_port {
|
|
|
267
321
|
&state,
|
|
268
322
|
&root_pids,
|
|
269
323
|
&root_pgids,
|
|
270
|
-
|
|
324
|
+
transport,
|
|
271
325
|
reap_scope,
|
|
326
|
+
&mut probe_degraded,
|
|
272
327
|
);
|
|
273
328
|
deadline.check("session_residuals")?;
|
|
274
329
|
let session_residuals = if let Some(session) = session_name.as_ref() {
|
|
@@ -286,6 +341,10 @@ pub mod lifecycle_port {
|
|
|
286
341
|
Vec::new()
|
|
287
342
|
};
|
|
288
343
|
deadline.check("process_residuals")?;
|
|
344
|
+
// C-①: the post-verify gets ONE fresh verification snapshot (reaps changed
|
|
345
|
+
// the world; #248 post-verify facts must be current, not the entry view).
|
|
346
|
+
let verify_table =
|
|
347
|
+
shutdown_table_snapshot(&run_workspace, &mut probe_degraded, "post_verify");
|
|
289
348
|
let process_residuals = process_residuals(
|
|
290
349
|
&run_workspace,
|
|
291
350
|
&state,
|
|
@@ -293,6 +352,7 @@ pub mod lifecycle_port {
|
|
|
293
352
|
&root_pgids,
|
|
294
353
|
&protected,
|
|
295
354
|
reap_scope,
|
|
355
|
+
&verify_table,
|
|
296
356
|
);
|
|
297
357
|
deadline.check("stop_coordinator")?;
|
|
298
358
|
let mut coordinator_timeout = false;
|
|
@@ -313,7 +373,9 @@ pub mod lifecycle_port {
|
|
|
313
373
|
None
|
|
314
374
|
};
|
|
315
375
|
let probe_timeout = crate::os_probe::probe_timeout();
|
|
316
|
-
|
|
376
|
+
// swallow batch 1: a failed ps probe degrades verification truthfully — the
|
|
377
|
+
// empty table must never read as a clean "no residual processes".
|
|
378
|
+
let verification_degraded = probe_timeout.is_some() || probe_degraded;
|
|
317
379
|
let session_killed = session_name.is_some()
|
|
318
380
|
&& kill_error.is_none()
|
|
319
381
|
&& session_residuals.is_empty()
|
|
@@ -393,6 +455,7 @@ pub mod lifecycle_port {
|
|
|
393
455
|
"status": status,
|
|
394
456
|
"phase": phase,
|
|
395
457
|
"verification_degraded": verification_degraded,
|
|
458
|
+
"probe_degraded": probe_degraded,
|
|
396
459
|
"probe_timeout_kind": probe_timeout_kind,
|
|
397
460
|
"probe_timeout": probe_timeout_value,
|
|
398
461
|
"keep_logs": keep_logs,
|
|
@@ -574,11 +637,29 @@ pub mod lifecycle_port {
|
|
|
574
637
|
.filter(|pid| *pid > 0)
|
|
575
638
|
}
|
|
576
639
|
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
640
|
+
/// PERF-6 C-② batched signals: the UNION of all root trees gets SIGTERM, shares ONE
|
|
641
|
+
/// >=150ms grace window (no single pid's grace is shortened — the serial per-root
|
|
642
|
+
/// chain is what's removed), then the union gets SIGKILL (noop for already-dead
|
|
643
|
+
/// pids; Gap 37 escalation order TERM -> grace -> KILL preserved), then a single
|
|
644
|
+
/// bounded wait for the whole union. kill/wait sets derive from the SAME snapshot
|
|
645
|
+
/// as the protected set (N39).
|
|
646
|
+
fn reap_process_tree(
|
|
647
|
+
root_pids: &[u32],
|
|
648
|
+
protected: &ShutdownProtection,
|
|
649
|
+
table: &[ProcessInfo],
|
|
650
|
+
) {
|
|
651
|
+
let mut pids = Vec::new();
|
|
652
|
+
let mut seen = std::collections::BTreeSet::new();
|
|
653
|
+
for root in root_pids {
|
|
654
|
+
for pid in process_tree_from_table(*root, table) {
|
|
655
|
+
if !protected.contains_pid(pid) && seen.insert(pid) {
|
|
656
|
+
pids.push(pid);
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
if pids.is_empty() {
|
|
661
|
+
return;
|
|
662
|
+
}
|
|
582
663
|
for pid in pids.iter().rev() {
|
|
583
664
|
send_process_signal(*pid, libc::SIGTERM);
|
|
584
665
|
}
|
|
@@ -611,86 +692,83 @@ pub mod lifecycle_port {
|
|
|
611
692
|
}
|
|
612
693
|
}
|
|
613
694
|
|
|
695
|
+
/// PERF-6 C-①-2 + C-②-5: every residual round fetches ONE fresh snapshot (reap
|
|
696
|
+
/// changed the world) and re-derives the protected set from THAT snapshot; all
|
|
697
|
+
/// in-round consumers (match + tree walks) reuse it.
|
|
614
698
|
fn reap_workspace_process_residuals(
|
|
615
699
|
workspace: &Path,
|
|
616
700
|
state: &Value,
|
|
617
701
|
root_pids: &[u32],
|
|
618
702
|
root_pgids: &[u32],
|
|
619
|
-
|
|
703
|
+
transport: &dyn crate::transport::Transport,
|
|
620
704
|
scope: ShutdownReapScope,
|
|
705
|
+
probe_degraded: &mut bool,
|
|
621
706
|
) {
|
|
622
707
|
for _ in 0..5 {
|
|
623
|
-
let
|
|
624
|
-
|
|
708
|
+
let round_table = shutdown_table_snapshot(workspace, probe_degraded, "residual_round");
|
|
709
|
+
let mut protected = shutdown_protection_set(&round_table);
|
|
710
|
+
extend_protection_with_leader_panes(&mut protected, transport, &round_table);
|
|
711
|
+
let residuals = matched_processes(
|
|
712
|
+
workspace, state, root_pids, root_pgids, &protected, scope, &round_table,
|
|
713
|
+
);
|
|
625
714
|
if residuals.is_empty() {
|
|
626
715
|
return;
|
|
627
716
|
}
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
}
|
|
717
|
+
let residual_pids = residuals.iter().map(|process| process.pid).collect::<Vec<_>>();
|
|
718
|
+
reap_process_tree(&residual_pids, &protected, &round_table);
|
|
631
719
|
let pgids = residuals
|
|
632
720
|
.iter()
|
|
633
721
|
.filter_map(|process| process.pgid)
|
|
634
722
|
.collect::<Vec<_>>();
|
|
635
|
-
reap_process_groups(&pgids, protected);
|
|
723
|
+
reap_process_groups(&pgids, &protected);
|
|
636
724
|
std::thread::sleep(std::time::Duration::from_millis(100));
|
|
637
725
|
}
|
|
638
726
|
}
|
|
639
727
|
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
let mut seen = std::collections::BTreeSet::new();
|
|
647
|
-
seen.insert(root_pid);
|
|
648
|
-
let mut index = 0;
|
|
649
|
-
while index < out.len() {
|
|
650
|
-
let parent = out[index];
|
|
651
|
-
for (pid, ppid) in &pairs {
|
|
652
|
-
if *ppid == parent && seen.insert(*pid) {
|
|
653
|
-
out.push(*pid);
|
|
654
|
-
}
|
|
655
|
-
}
|
|
656
|
-
index += 1;
|
|
657
|
-
}
|
|
658
|
-
out
|
|
659
|
-
}
|
|
660
|
-
|
|
661
|
-
fn process_parent_pairs() -> Vec<(u32, u32)> {
|
|
662
|
-
let output = match crate::os_probe::bounded_command_output_with_probe(
|
|
663
|
-
std::process::Command::new("ps").args(["-axo", "pid=,ppid="]),
|
|
664
|
-
"ps_parent",
|
|
728
|
+
/// swallow batch 1: the raw ps probe with an explicit error channel — a failed
|
|
729
|
+
/// probe must never masquerade as "no processes" (CLAUDE.md §5).
|
|
730
|
+
fn probed_process_table() -> Result<Vec<ProcessInfo>, String> {
|
|
731
|
+
match crate::os_probe::bounded_command_output_with_probe(
|
|
732
|
+
std::process::Command::new("ps").args(["-axo", "pid=,ppid=,pgid=,sess=,command="]),
|
|
733
|
+
"ps_table",
|
|
665
734
|
None,
|
|
666
735
|
) {
|
|
667
|
-
Ok(output) if output.status.success() => output
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
.
|
|
672
|
-
.
|
|
673
|
-
|
|
674
|
-
let pid = parts.next()?.parse::<u32>().ok()?;
|
|
675
|
-
let ppid = parts.next()?.parse::<u32>().ok()?;
|
|
676
|
-
Some((pid, ppid))
|
|
677
|
-
})
|
|
678
|
-
.collect()
|
|
736
|
+
Ok(output) if output.status.success() => Ok(String::from_utf8_lossy(&output.stdout)
|
|
737
|
+
.lines()
|
|
738
|
+
.filter_map(parse_process_info)
|
|
739
|
+
.collect()),
|
|
740
|
+
Ok(output) => Err(format!("ps exited with status {:?}", output.status.code())),
|
|
741
|
+
Err(error) => Err(error.to_string()),
|
|
742
|
+
}
|
|
679
743
|
}
|
|
680
744
|
|
|
681
745
|
fn process_table() -> Vec<ProcessInfo> {
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
746
|
+
probed_process_table().unwrap_or_default()
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
/// PERF-6 C-①-1 / swallow batch 1: the shutdown-scope snapshot fetch. A probe
|
|
750
|
+
/// failure writes a `shutdown.process_probe_failed` event (non-null error) and
|
|
751
|
+
/// marks the run degraded instead of silently treating it as "no processes".
|
|
752
|
+
fn shutdown_table_snapshot(
|
|
753
|
+
workspace: &Path,
|
|
754
|
+
probe_degraded: &mut bool,
|
|
755
|
+
phase: &str,
|
|
756
|
+
) -> Vec<ProcessInfo> {
|
|
757
|
+
match probed_process_table() {
|
|
758
|
+
Ok(table) => table,
|
|
759
|
+
Err(error) => {
|
|
760
|
+
*probe_degraded = true;
|
|
761
|
+
let _ = crate::event_log::EventLog::new(workspace).write(
|
|
762
|
+
"shutdown.process_probe_failed",
|
|
763
|
+
json!({
|
|
764
|
+
"phase": phase,
|
|
765
|
+
"probe": "ps_table",
|
|
766
|
+
"error": error,
|
|
767
|
+
}),
|
|
768
|
+
);
|
|
769
|
+
Vec::new()
|
|
770
|
+
}
|
|
771
|
+
}
|
|
694
772
|
}
|
|
695
773
|
|
|
696
774
|
fn parse_process_info(line: &str) -> Option<ProcessInfo> {
|
|
@@ -739,8 +817,9 @@ pub mod lifecycle_port {
|
|
|
739
817
|
}
|
|
740
818
|
}
|
|
741
819
|
|
|
742
|
-
|
|
743
|
-
|
|
820
|
+
/// PERF-6 C-①-1/C-②-4 (N39): the protected set derives from the CALLER's snapshot —
|
|
821
|
+
/// the same table the kill/wait sets derive from.
|
|
822
|
+
fn shutdown_protection_set(table: &[ProcessInfo]) -> ShutdownProtection {
|
|
744
823
|
let mut protected = ShutdownProtection::default();
|
|
745
824
|
let current = std::process::id();
|
|
746
825
|
protected.pids.insert(current);
|
|
@@ -765,6 +844,61 @@ pub mod lifecycle_port {
|
|
|
765
844
|
protected
|
|
766
845
|
}
|
|
767
846
|
|
|
847
|
+
/// B5/F2: the leader terminal's pane process tree joins the protected set (same
|
|
848
|
+
/// set, same mechanism as the invoker ancestry) so the workspace residual sweep's
|
|
849
|
+
/// cmdline/cwd matching cannot reap the leader — including when ANOTHER team's bare
|
|
850
|
+
/// shutdown runs, where the leader is never in the invoker's ancestry.
|
|
851
|
+
fn extend_protection_with_leader_panes(
|
|
852
|
+
protected: &mut ShutdownProtection,
|
|
853
|
+
transport: &dyn crate::transport::Transport,
|
|
854
|
+
table: &[ProcessInfo],
|
|
855
|
+
) {
|
|
856
|
+
let leader_pane_pids: Vec<u32> = transport
|
|
857
|
+
.list_targets()
|
|
858
|
+
.unwrap_or_default()
|
|
859
|
+
.into_iter()
|
|
860
|
+
.filter(|pane| {
|
|
861
|
+
pane.session
|
|
862
|
+
.as_str()
|
|
863
|
+
.starts_with(crate::leader::LEADER_SESSION_PREFIX)
|
|
864
|
+
})
|
|
865
|
+
.filter_map(|pane| pane.pane_pid)
|
|
866
|
+
.collect();
|
|
867
|
+
if leader_pane_pids.is_empty() {
|
|
868
|
+
return;
|
|
869
|
+
}
|
|
870
|
+
for root in &leader_pane_pids {
|
|
871
|
+
for pid in process_tree_from_table(*root, table) {
|
|
872
|
+
protected.pids.insert(pid);
|
|
873
|
+
if let Some(pgid) = table
|
|
874
|
+
.iter()
|
|
875
|
+
.find(|process| process.pid == pid)
|
|
876
|
+
.and_then(|process| process.pgid)
|
|
877
|
+
{
|
|
878
|
+
protected.pgids.insert(pgid);
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
}
|
|
882
|
+
// The tmux SERVER carrying the leader pane must survive too: its command line
|
|
883
|
+
// contains the workspace path (it was started with the worker spawn command), so
|
|
884
|
+
// the residual sweep matches it, and killing the server SIGHUPs every pane —
|
|
885
|
+
// including the protected leader — bypassing per-pid protection. Protect the
|
|
886
|
+
// server pid itself (NOT its tree: worker panes must still die).
|
|
887
|
+
for pane_pid in &leader_pane_pids {
|
|
888
|
+
if let Some(server) = table
|
|
889
|
+
.iter()
|
|
890
|
+
.find(|process| process.pid == *pane_pid)
|
|
891
|
+
.and_then(|pane| table.iter().find(|process| process.pid == pane.ppid))
|
|
892
|
+
.filter(|server| server.pid > 1)
|
|
893
|
+
{
|
|
894
|
+
protected.pids.insert(server.pid);
|
|
895
|
+
if let Some(pgid) = server.pgid {
|
|
896
|
+
protected.pgids.insert(pgid);
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
|
|
768
902
|
fn send_process_signal(pid: u32, signal: libc::c_int) {
|
|
769
903
|
let Ok(pid_t) = libc::pid_t::try_from(pid) else {
|
|
770
904
|
return;
|
|
@@ -815,8 +949,11 @@ pub mod lifecycle_port {
|
|
|
815
949
|
err.raw_os_error() == Some(libc::EPERM)
|
|
816
950
|
}
|
|
817
951
|
|
|
818
|
-
fn process_pgids(
|
|
819
|
-
|
|
952
|
+
fn process_pgids(
|
|
953
|
+
pids: &[u32],
|
|
954
|
+
protected: &ShutdownProtection,
|
|
955
|
+
table: &[ProcessInfo],
|
|
956
|
+
) -> Vec<u32> {
|
|
820
957
|
let mut pgids = pids
|
|
821
958
|
.iter()
|
|
822
959
|
.filter_map(|pid| table.iter().find(|process| process.pid == *pid))
|
|
@@ -839,9 +976,10 @@ pub mod lifecycle_port {
|
|
|
839
976
|
root_pgids: &[u32],
|
|
840
977
|
protected: &ShutdownProtection,
|
|
841
978
|
scope: ShutdownReapScope,
|
|
979
|
+
table: &[ProcessInfo],
|
|
842
980
|
) -> Vec<Value> {
|
|
843
981
|
let mut residuals =
|
|
844
|
-
matched_processes(workspace, state, root_pids, root_pgids, protected, scope);
|
|
982
|
+
matched_processes(workspace, state, root_pids, root_pgids, protected, scope, table);
|
|
845
983
|
let mut seen = residuals
|
|
846
984
|
.iter()
|
|
847
985
|
.map(|process| process.pid)
|
|
@@ -878,11 +1016,11 @@ pub mod lifecycle_port {
|
|
|
878
1016
|
root_pgids: &[u32],
|
|
879
1017
|
protected: &ShutdownProtection,
|
|
880
1018
|
scope: ShutdownReapScope,
|
|
1019
|
+
table: &[ProcessInfo],
|
|
881
1020
|
) -> Vec<ProcessInfo> {
|
|
882
|
-
let table = process_table();
|
|
883
1021
|
let root_tree = root_pids
|
|
884
1022
|
.iter()
|
|
885
|
-
.flat_map(|pid| process_tree_from_table(*pid,
|
|
1023
|
+
.flat_map(|pid| process_tree_from_table(*pid, table))
|
|
886
1024
|
.filter(|pid| !protected.contains_pid(*pid))
|
|
887
1025
|
.collect::<std::collections::BTreeSet<_>>();
|
|
888
1026
|
let root_pgids = root_pgids
|
|
@@ -899,7 +1037,7 @@ pub mod lifecycle_port {
|
|
|
899
1037
|
}
|
|
900
1038
|
let matches_workspace = scope == ShutdownReapScope::Workspace
|
|
901
1039
|
&& process_matches_workspace(
|
|
902
|
-
|
|
1040
|
+
process,
|
|
903
1041
|
&workspace_text,
|
|
904
1042
|
&spawn_cwds,
|
|
905
1043
|
&mut cwd_probe_budget,
|
|
@@ -908,7 +1046,7 @@ pub mod lifecycle_port {
|
|
|
908
1046
|
|| root_tree.contains(&process.pid)
|
|
909
1047
|
|| process.pgid.is_some_and(|pgid| root_pgids.contains(&pgid))
|
|
910
1048
|
{
|
|
911
|
-
out.push(process);
|
|
1049
|
+
out.push(process.clone());
|
|
912
1050
|
}
|
|
913
1051
|
}
|
|
914
1052
|
out
|
|
@@ -1132,10 +1270,9 @@ pub mod lifecycle_port {
|
|
|
1132
1270
|
open_display: bool,
|
|
1133
1271
|
team: Option<&str>,
|
|
1134
1272
|
) -> Result<Value, CliError> {
|
|
1135
|
-
let _ = label;
|
|
1136
1273
|
let source = crate::model::ids::AgentId::new(source_agent);
|
|
1137
1274
|
let dest = crate::model::ids::AgentId::new(as_agent_id);
|
|
1138
|
-
match crate::lifecycle::fork_agent(workspace, &source, &dest, open_display, team) {
|
|
1275
|
+
match crate::lifecycle::fork_agent(workspace, &source, &dest, label, open_display, team) {
|
|
1139
1276
|
Ok(report) => Ok(json!({
|
|
1140
1277
|
"ok": true,
|
|
1141
1278
|
"source_agent_id": report.source_agent_id.as_str(),
|
|
@@ -202,6 +202,7 @@ fn delivery_refusal_wire(reason: DeliveryRefusal) -> &'static str {
|
|
|
202
202
|
DeliveryRefusal::SessionDrift => "session_drift",
|
|
203
203
|
DeliveryRefusal::Duplicate => "duplicate",
|
|
204
204
|
DeliveryRefusal::RoutingAmbiguous => "routing_ambiguous",
|
|
205
|
+
DeliveryRefusal::EmptyTargetList => "empty_target_list",
|
|
205
206
|
}
|
|
206
207
|
}
|
|
207
208
|
|