@team-agent/installer 0.3.9 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +1 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/src/coordinator/tests/spine.rs +6 -0
- package/crates/team-agent/src/coordinator/tick.rs +83 -1
- package/crates/team-agent/src/leader/lease.rs +19 -0
- package/crates/team-agent/src/leader/rediscover/tests.rs +12 -0
- package/crates/team-agent/src/leader/rediscover.rs +2 -0
- package/crates/team-agent/src/lifecycle/launch.rs +35 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +17 -3
- package/crates/team-agent/src/lifecycle/restart/common.rs +75 -0
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +201 -3
- package/crates/team-agent/src/lifecycle/restart/selection.rs +51 -14
- package/crates/team-agent/src/lifecycle/restart.rs +1 -1
- package/crates/team-agent/src/lifecycle/tests/core.rs +89 -15
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +68 -3
- package/crates/team-agent/src/lifecycle/tests/main_preserved.rs +3 -1
- package/crates/team-agent/src/messaging/delivery.rs +83 -2
- package/crates/team-agent/src/messaging/tests/runtime.rs +90 -0
- package/crates/team-agent/src/provider/approvals/parsing.rs +43 -14
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +12 -9
- package/crates/team-agent/src/transport/test_support.rs +12 -1
- package/package.json +4 -4
package/Cargo.lock
CHANGED
package/Cargo.toml
CHANGED
|
@@ -258,4 +258,10 @@ fn spine_tick_session_missing_emits_event() {
|
|
|
258
258
|
.any(|e| e.get("event").and_then(|v| v.as_str()) == Some("coordinator.session_missing")),
|
|
259
259
|
"the tmux-missing gate must emit a coordinator.session_missing event before the stop report; got {events:?}"
|
|
260
260
|
);
|
|
261
|
+
assert!(
|
|
262
|
+
events
|
|
263
|
+
.iter()
|
|
264
|
+
.any(|e| e.get("event").and_then(|v| v.as_str()) == Some("coordinator.session_missing_alert")),
|
|
265
|
+
"the tmux-missing gate must emit an explicit leader-visible alert before stopping; got {events:?}"
|
|
266
|
+
);
|
|
261
267
|
}
|
|
@@ -198,6 +198,7 @@ impl Coordinator {
|
|
|
198
198
|
"coordinator.session_missing",
|
|
199
199
|
serde_json::json!({"session": session_name}),
|
|
200
200
|
)?;
|
|
201
|
+
notify_session_missing(self.workspace.as_path(), &state, &event_log, session_name)?;
|
|
201
202
|
return Ok(empty_tick_report(
|
|
202
203
|
false,
|
|
203
204
|
true,
|
|
@@ -971,7 +972,8 @@ impl Coordinator {
|
|
|
971
972
|
"runtime_approval.auto_approved",
|
|
972
973
|
serde_json::json!({
|
|
973
974
|
"agent_id": agent_id,
|
|
974
|
-
"
|
|
975
|
+
"server": prompt.server.as_deref(),
|
|
976
|
+
"tool": prompt.tool.as_deref(),
|
|
975
977
|
"choice": choice,
|
|
976
978
|
"cleared": cleared,
|
|
977
979
|
"policy_source": approval_policy.source,
|
|
@@ -980,6 +982,23 @@ impl Coordinator {
|
|
|
980
982
|
"worker_capability_above_leader": approval_policy.worker_capability_above_leader,
|
|
981
983
|
}),
|
|
982
984
|
)?;
|
|
985
|
+
event_log.write(
|
|
986
|
+
"mcp.tool.auto_approved",
|
|
987
|
+
serde_json::json!({
|
|
988
|
+
"agent_id": agent_id,
|
|
989
|
+
"server": prompt.server.as_deref(),
|
|
990
|
+
"tool": prompt.tool.as_deref(),
|
|
991
|
+
"choice": choice,
|
|
992
|
+
"cleared": cleared,
|
|
993
|
+
"inherit_reason": approval_policy.inherit_reason(),
|
|
994
|
+
"bypass_source": approval_policy.source,
|
|
995
|
+
"provider": approval_policy.provider,
|
|
996
|
+
"flag": approval_policy.flag,
|
|
997
|
+
"inherited": approval_policy.inherited,
|
|
998
|
+
"explicit_yes_confirmed": approval_policy.explicit_yes_confirmed,
|
|
999
|
+
"worker_capability_above_leader": approval_policy.worker_capability_above_leader,
|
|
1000
|
+
}),
|
|
1001
|
+
)?;
|
|
983
1002
|
}
|
|
984
1003
|
RuntimeApprovalDecision::AwaitingHumanConfirm => {
|
|
985
1004
|
let Some(reason) = awaiting_human_confirm_reason(&prompt, auto_answer_allowed) else {
|
|
@@ -2110,6 +2129,8 @@ struct RuntimeApprovalPolicy {
|
|
|
2110
2129
|
source: String,
|
|
2111
2130
|
inherited: bool,
|
|
2112
2131
|
explicit_yes_confirmed: bool,
|
|
2132
|
+
provider: Option<String>,
|
|
2133
|
+
flag: Option<String>,
|
|
2113
2134
|
worker_capability_above_leader: bool,
|
|
2114
2135
|
}
|
|
2115
2136
|
|
|
@@ -2127,6 +2148,14 @@ impl RuntimeApprovalPolicy {
|
|
|
2127
2148
|
&& (!self.worker_capability_above_leader
|
|
2128
2149
|
|| (self.source == "runtime_config" && self.explicit_yes_confirmed))
|
|
2129
2150
|
}
|
|
2151
|
+
|
|
2152
|
+
fn inherit_reason(&self) -> &'static str {
|
|
2153
|
+
match self.source.as_str() {
|
|
2154
|
+
"leader_process" if self.inherited => "leader_bypass",
|
|
2155
|
+
"runtime_config" if self.explicit_yes_confirmed => "runtime_config_explicit_yes",
|
|
2156
|
+
_ => "none",
|
|
2157
|
+
}
|
|
2158
|
+
}
|
|
2130
2159
|
}
|
|
2131
2160
|
|
|
2132
2161
|
fn runtime_approval_policy_from_agent(agent: &Value) -> RuntimeApprovalPolicy {
|
|
@@ -2151,6 +2180,14 @@ fn runtime_approval_policy_from_agent(agent: &Value) -> RuntimeApprovalPolicy {
|
|
|
2151
2180
|
.and_then(|p| p.get("explicit_yes_confirmed"))
|
|
2152
2181
|
.and_then(Value::as_bool)
|
|
2153
2182
|
.unwrap_or(false),
|
|
2183
|
+
provider: policy
|
|
2184
|
+
.and_then(|p| p.get("provider"))
|
|
2185
|
+
.and_then(Value::as_str)
|
|
2186
|
+
.map(str::to_string),
|
|
2187
|
+
flag: policy
|
|
2188
|
+
.and_then(|p| p.get("flag"))
|
|
2189
|
+
.and_then(Value::as_str)
|
|
2190
|
+
.map(str::to_string),
|
|
2154
2191
|
worker_capability_above_leader: policy
|
|
2155
2192
|
.and_then(|p| p.get("worker_capability_above_leader"))
|
|
2156
2193
|
.and_then(Value::as_bool)
|
|
@@ -2423,3 +2460,48 @@ fn remove_file_if_exists(path: &Path) -> Result<(), std::io::Error> {
|
|
|
2423
2460
|
Err(e) => Err(e),
|
|
2424
2461
|
}
|
|
2425
2462
|
}
|
|
2463
|
+
|
|
2464
|
+
fn notify_session_missing(
|
|
2465
|
+
workspace: &Path,
|
|
2466
|
+
state: &Value,
|
|
2467
|
+
event_log: &EventLog,
|
|
2468
|
+
session_name: &str,
|
|
2469
|
+
) -> Result<(), TickError> {
|
|
2470
|
+
let content = format!(
|
|
2471
|
+
"coordinator.session_missing\nerror: tmux session {session_name} is missing; coordinator is stopping\naction: restart the team or recover the missing tmux session\nlog: .team/logs/events.jsonl"
|
|
2472
|
+
);
|
|
2473
|
+
let dedupe_key = format!("coordinator.session_missing:{session_name}");
|
|
2474
|
+
match crate::messaging::send_to_leader_receiver(
|
|
2475
|
+
workspace,
|
|
2476
|
+
state,
|
|
2477
|
+
"leader",
|
|
2478
|
+
&content,
|
|
2479
|
+
None,
|
|
2480
|
+
"coordinator",
|
|
2481
|
+
false,
|
|
2482
|
+
Some(&dedupe_key),
|
|
2483
|
+
event_log,
|
|
2484
|
+
) {
|
|
2485
|
+
Ok(outcome) => {
|
|
2486
|
+
event_log.write(
|
|
2487
|
+
"coordinator.session_missing_alert",
|
|
2488
|
+
serde_json::json!({
|
|
2489
|
+
"session": session_name,
|
|
2490
|
+
"leader_notification_status": crate::messaging::helpers::status_wire(outcome.status),
|
|
2491
|
+
"message_id": outcome.message_id,
|
|
2492
|
+
}),
|
|
2493
|
+
)?;
|
|
2494
|
+
}
|
|
2495
|
+
Err(error) => {
|
|
2496
|
+
event_log.write(
|
|
2497
|
+
"coordinator.session_missing_alert_failed",
|
|
2498
|
+
serde_json::json!({
|
|
2499
|
+
"session": session_name,
|
|
2500
|
+
"error": error.to_string(),
|
|
2501
|
+
"action": "inspect .team/logs/events.jsonl and restart the team",
|
|
2502
|
+
}),
|
|
2503
|
+
)?;
|
|
2504
|
+
}
|
|
2505
|
+
}
|
|
2506
|
+
Ok(())
|
|
2507
|
+
}
|
|
@@ -654,6 +654,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
|
|
|
654
654
|
Some(Provider::ClaudeCode)
|
|
655
655
|
} else if lower.contains("codex") {
|
|
656
656
|
Some(Provider::Codex)
|
|
657
|
+
} else if lower.contains("copilot") {
|
|
658
|
+
Some(Provider::Copilot)
|
|
657
659
|
} else if lower.contains("fake") {
|
|
658
660
|
Some(Provider::Fake)
|
|
659
661
|
} else {
|
|
@@ -1082,3 +1084,20 @@ pub fn detect_dual_state_divergence(
|
|
|
1082
1084
|
"team_owner_epoch": team_epoch,
|
|
1083
1085
|
})))
|
|
1084
1086
|
}
|
|
1087
|
+
|
|
1088
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Both a bare `copilot` invocation and an absolute path to the binary
    /// must be recognised as the Copilot provider.
    #[test]
    fn leader_command_provider_recognizes_copilot() {
        for command in ["copilot --allow-all-tools", "/usr/local/bin/copilot"] {
            assert_eq!(leader_command_provider(command), Some(Provider::Copilot));
        }
    }
}
|
|
@@ -80,6 +80,18 @@ fn event_named(events: &[Value], name: &str) -> Value {
|
|
|
80
80
|
.unwrap_or_else(|| panic!("missing event {name}; got {events:?}"))
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
+
/// A leader command naming `copilot`, with flags or as an absolute path,
/// resolves to the Copilot provider.
#[test]
fn leader_command_provider_recognizes_copilot() {
    let with_flags = leader_command_provider("copilot --allow-all-tools");
    assert_eq!(with_flags, Some(Provider::Copilot));

    let absolute_path = leader_command_provider("/usr/local/bin/copilot");
    assert_eq!(absolute_path, Some(Provider::Copilot));
}
|
|
94
|
+
|
|
83
95
|
fn last_event_named(events: &[Value], name: &str) -> Value {
|
|
84
96
|
events
|
|
85
97
|
.iter()
|
|
@@ -590,6 +590,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
|
|
|
590
590
|
Some(Provider::ClaudeCode)
|
|
591
591
|
} else if lower.contains("codex") {
|
|
592
592
|
Some(Provider::Codex)
|
|
593
|
+
} else if lower.contains("copilot") {
|
|
594
|
+
Some(Provider::Copilot)
|
|
593
595
|
} else if lower.contains("fake") {
|
|
594
596
|
Some(Provider::Fake)
|
|
595
597
|
} else {
|
|
@@ -288,6 +288,7 @@ fn spawn_agents(
|
|
|
288
288
|
let mut env =
|
|
289
289
|
inherited_env_with_team_overrides(workspace, agent_id_raw, Some(&mcp_team_id));
|
|
290
290
|
apply_profile_launch_env(&mut env, &profile_launch);
|
|
291
|
+
apply_mcp_auto_approval_env(&mut env, &safety);
|
|
291
292
|
// Python providers.py:145 + launch/core.py:253 — fresh launch runs the worker
|
|
292
293
|
// with cwd=workspace, same as the RS fork/add and restart paths.
|
|
293
294
|
let env_unset: Vec<String> = profile_launch.env_unset.iter().cloned().collect();
|
|
@@ -1390,6 +1391,39 @@ pub(crate) fn inherited_env_with_team_overrides(
|
|
|
1390
1391
|
env
|
|
1391
1392
|
}
|
|
1392
1393
|
|
|
1394
|
+
pub(crate) fn apply_mcp_auto_approval_env(
|
|
1395
|
+
env: &mut BTreeMap<String, String>,
|
|
1396
|
+
safety: &DangerousApproval,
|
|
1397
|
+
) {
|
|
1398
|
+
for key in [
|
|
1399
|
+
"TEAM_AGENT_LEADER_BYPASS",
|
|
1400
|
+
"TEAM_AGENT_LEADER_BYPASS_SOURCE",
|
|
1401
|
+
"TEAM_AGENT_LEADER_BYPASS_PROVIDER",
|
|
1402
|
+
"TEAM_AGENT_LEADER_BYPASS_FLAG",
|
|
1403
|
+
"TEAM_AGENT_MCP_AUTO_APPROVE",
|
|
1404
|
+
"TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE",
|
|
1405
|
+
] {
|
|
1406
|
+
env.remove(key);
|
|
1407
|
+
}
|
|
1408
|
+
if safety.enabled
|
|
1409
|
+
&& matches!(safety.source, DangerousApprovalSource::LeaderProcess)
|
|
1410
|
+
&& safety.inherited
|
|
1411
|
+
{
|
|
1412
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "1".to_string());
|
|
1413
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_SOURCE".to_string(), "leader_process".to_string());
|
|
1414
|
+
if let Some(provider) = safety.provider.as_deref() {
|
|
1415
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_PROVIDER".to_string(), provider.to_string());
|
|
1416
|
+
}
|
|
1417
|
+
if let Some(flag) = safety.flag.as_deref() {
|
|
1418
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_FLAG".to_string(), flag.to_string());
|
|
1419
|
+
}
|
|
1420
|
+
env.insert("TEAM_AGENT_MCP_AUTO_APPROVE".to_string(), "team_orchestrator".to_string());
|
|
1421
|
+
env.insert("TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE".to_string(), "leader_bypass".to_string());
|
|
1422
|
+
} else {
|
|
1423
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "0".to_string());
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
|
|
1393
1427
|
/// BUG / B2 灵魂件 + C-1-2 + C-6-1 cr verdict — Copilot per-worker AGENTS.md
|
|
1394
1428
|
/// 写入 + `COPILOT_CUSTOM_INSTRUCTIONS_DIRS` 注入。
|
|
1395
1429
|
///
|
|
@@ -2971,6 +3005,7 @@ pub fn fork_agent_with_transport(
|
|
|
2971
3005
|
let mut env =
|
|
2972
3006
|
inherited_env_with_team_overrides(&workspace, as_agent_id.as_str(), Some(&fork_team));
|
|
2973
3007
|
apply_profile_launch_env(&mut env, &profile_launch);
|
|
3008
|
+
apply_mcp_auto_approval_env(&mut env, &safety);
|
|
2974
3009
|
// golden operations.py:336 -> _tmux_start_command_for_agent_window (runtime.py:1017-1020): branch on
|
|
2975
3010
|
// _tmux_session_exists — an ABSENT session => new-session (spawn_first), present => new-window
|
|
2976
3011
|
// (spawn_into). The Rust restart seam (restart.rs spawn_agent_window) uses the same branch.
|
|
@@ -108,17 +108,31 @@ pub(crate) fn start_agent_at_paths(
|
|
|
108
108
|
let provider = agent_provider(&agent);
|
|
109
109
|
let session_id = agent_session_id(&agent);
|
|
110
110
|
let rollout_path = agent_rollout_path(&agent);
|
|
111
|
-
let
|
|
111
|
+
let resume_backing_exists = session_id
|
|
112
112
|
.as_ref()
|
|
113
|
-
.map(|
|
|
113
|
+
.map(|session| {
|
|
114
|
+
resume_backing_exists_for_agent(
|
|
115
|
+
workspace,
|
|
116
|
+
agent_id,
|
|
117
|
+
&agent,
|
|
118
|
+
provider,
|
|
119
|
+
session,
|
|
120
|
+
rollout_path.as_ref(),
|
|
121
|
+
)
|
|
122
|
+
})
|
|
114
123
|
.unwrap_or(false);
|
|
115
124
|
let start_mode = decide_start_mode(
|
|
116
125
|
provider_wire(provider),
|
|
117
126
|
session_id.as_ref(),
|
|
118
127
|
rollout_path.as_ref(),
|
|
119
|
-
|
|
128
|
+
resume_backing_exists,
|
|
120
129
|
allow_fresh,
|
|
121
130
|
);
|
|
131
|
+
if matches!(start_mode, StartMode::Noop) {
|
|
132
|
+
return Err(LifecycleError::RequirementUnmet(format!(
|
|
133
|
+
"resume_not_ready: session backing store missing for agent {agent_id}; rerun with --allow-fresh to start fresh"
|
|
134
|
+
)));
|
|
135
|
+
}
|
|
122
136
|
let spawn_session_id = if matches!(start_mode, StartMode::Resumed) {
|
|
123
137
|
session_id.as_ref()
|
|
124
138
|
} else {
|
|
@@ -122,6 +122,7 @@ pub(super) fn spawn_agent_window(
|
|
|
122
122
|
team_id.as_deref(),
|
|
123
123
|
);
|
|
124
124
|
crate::lifecycle::launch::apply_profile_launch_env(&mut env, &profile_launch);
|
|
125
|
+
crate::lifecycle::launch::apply_mcp_auto_approval_env(&mut env, safety);
|
|
125
126
|
let spawn_cwd = spawn_cwd_override
|
|
126
127
|
.or_else(|| {
|
|
127
128
|
agent
|
|
@@ -242,6 +243,80 @@ pub(super) fn agent_rollout_path(agent: &serde_json::Value) -> Option<RolloutPat
|
|
|
242
243
|
.map(RolloutPath::new)
|
|
243
244
|
}
|
|
244
245
|
|
|
246
|
+
pub(super) fn resume_backing_exists_for_agent(
|
|
247
|
+
workspace: &Path,
|
|
248
|
+
agent_id: &AgentId,
|
|
249
|
+
agent: &serde_json::Value,
|
|
250
|
+
provider: Provider,
|
|
251
|
+
session_id: &SessionId,
|
|
252
|
+
rollout_path: Option<&RolloutPath>,
|
|
253
|
+
) -> bool {
|
|
254
|
+
match provider {
|
|
255
|
+
Provider::Codex => rollout_path_exists(rollout_path),
|
|
256
|
+
Provider::Claude | Provider::ClaudeCode => {
|
|
257
|
+
rollout_path_exists(rollout_path)
|
|
258
|
+
|| event_log_transcript_exists(workspace, agent_id.as_str(), session_id.as_str())
|
|
259
|
+
}
|
|
260
|
+
Provider::Copilot => copilot_session_store_has_session(session_id.as_str()),
|
|
261
|
+
Provider::GeminiCli | Provider::Fake => {
|
|
262
|
+
let _ = agent;
|
|
263
|
+
true
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
fn rollout_path_exists(rollout_path: Option<&RolloutPath>) -> bool {
|
|
269
|
+
rollout_path
|
|
270
|
+
.as_ref()
|
|
271
|
+
.is_some_and(|path| path.as_path().exists())
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
fn event_log_transcript_exists(workspace: &Path, agent_id: &str, session_id: &str) -> bool {
|
|
275
|
+
let Ok(events) = crate::event_log::EventLog::new(workspace).tail(0) else {
|
|
276
|
+
return false;
|
|
277
|
+
};
|
|
278
|
+
events.iter().rev().any(|event| {
|
|
279
|
+
event.get("event").and_then(serde_json::Value::as_str) == Some("session.captured")
|
|
280
|
+
&& ["agent_id", "worker_id"]
|
|
281
|
+
.iter()
|
|
282
|
+
.any(|key| event.get(*key).and_then(serde_json::Value::as_str) == Some(agent_id))
|
|
283
|
+
&& event.get("session_id").and_then(serde_json::Value::as_str) == Some(session_id)
|
|
284
|
+
&& event_transcript_path(event).is_some_and(|path| path.exists())
|
|
285
|
+
})
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
fn event_transcript_path(event: &serde_json::Value) -> Option<PathBuf> {
|
|
289
|
+
event
|
|
290
|
+
.get("rollout_path")
|
|
291
|
+
.or_else(|| event.get("transcript_path"))
|
|
292
|
+
.and_then(serde_json::Value::as_str)
|
|
293
|
+
.filter(|path| !path.is_empty())
|
|
294
|
+
.map(PathBuf::from)
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
fn copilot_session_store_has_session(session_id: &str) -> bool {
|
|
298
|
+
let Some(home) = std::env::var_os("HOME").map(PathBuf::from) else {
|
|
299
|
+
return false;
|
|
300
|
+
};
|
|
301
|
+
let db_path = home.join(".copilot").join("session-store.db");
|
|
302
|
+
if !db_path.exists() {
|
|
303
|
+
return false;
|
|
304
|
+
}
|
|
305
|
+
let Ok(conn) = rusqlite::Connection::open_with_flags(
|
|
306
|
+
db_path,
|
|
307
|
+
rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_NO_MUTEX,
|
|
308
|
+
) else {
|
|
309
|
+
return false;
|
|
310
|
+
};
|
|
311
|
+
conn
|
|
312
|
+
.query_row(
|
|
313
|
+
"select 1 from sessions where id = ?1 limit 1",
|
|
314
|
+
[session_id],
|
|
315
|
+
|_| Ok(()),
|
|
316
|
+
)
|
|
317
|
+
.is_ok()
|
|
318
|
+
}
|
|
319
|
+
|
|
245
320
|
pub(crate) fn refresh_missing_provider_sessions(
|
|
246
321
|
state: &mut serde_json::Value,
|
|
247
322
|
) -> Result<bool, LifecycleError> {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
use super::common::*;
|
|
2
|
-
use super::selection::
|
|
2
|
+
use super::selection::classify_restart_plan_with_resume_validation;
|
|
3
3
|
use super::*;
|
|
4
4
|
|
|
5
5
|
// ── lifecycle::restart —— 整队 Route B resume-or-fresh 重建 ──────────────────
|
|
@@ -29,6 +29,7 @@ pub fn restart_with_session_convergence_deadline(
|
|
|
29
29
|
team,
|
|
30
30
|
&crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
|
|
31
31
|
session_converge_deadline_ms,
|
|
32
|
+
None,
|
|
32
33
|
)
|
|
33
34
|
}
|
|
34
35
|
|
|
@@ -40,6 +41,16 @@ pub fn restart_with_transport(
|
|
|
40
41
|
allow_fresh: bool,
|
|
41
42
|
team: Option<&str>,
|
|
42
43
|
transport: &dyn crate::transport::Transport,
|
|
44
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
45
|
+
restart_with_transport_with_readiness_deadline(workspace, allow_fresh, team, transport, None)
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
pub fn restart_with_transport_with_readiness_deadline(
|
|
49
|
+
workspace: &Path,
|
|
50
|
+
allow_fresh: bool,
|
|
51
|
+
team: Option<&str>,
|
|
52
|
+
transport: &dyn crate::transport::Transport,
|
|
53
|
+
readiness_deadline_ms: Option<u64>,
|
|
43
54
|
) -> Result<RestartReport, LifecycleError> {
|
|
44
55
|
match restart_with_transport_with_session_convergence_deadline(
|
|
45
56
|
workspace,
|
|
@@ -47,6 +58,7 @@ pub fn restart_with_transport(
|
|
|
47
58
|
team,
|
|
48
59
|
transport,
|
|
49
60
|
None,
|
|
61
|
+
readiness_deadline_ms,
|
|
50
62
|
)? {
|
|
51
63
|
RestartReport::RefusedResumeNotReady {
|
|
52
64
|
missing,
|
|
@@ -76,6 +88,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
76
88
|
team: Option<&str>,
|
|
77
89
|
transport: &dyn crate::transport::Transport,
|
|
78
90
|
session_converge_deadline_ms: Option<u64>,
|
|
91
|
+
readiness_deadline_ms: Option<u64>,
|
|
79
92
|
) -> Result<RestartReport, LifecycleError> {
|
|
80
93
|
// RED-2-STILL(P0):入口门必须在 canonical_run_workspace 解析后的路径上判,不用 raw workspace。
|
|
81
94
|
// 根因:quick-start <dir> 把 .team/runtime/spec 落在 team_workspace(dir)=**parent**/.team;
|
|
@@ -166,7 +179,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
166
179
|
convergence.missing.iter().cloned().collect()
|
|
167
180
|
};
|
|
168
181
|
let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
|
|
169
|
-
let plan =
|
|
182
|
+
let plan = classify_restart_plan_with_resume_validation(Some(&selected.run_workspace), &state, allow_fresh)?;
|
|
170
183
|
write_restart_resume_decision_events(
|
|
171
184
|
&selected.run_workspace,
|
|
172
185
|
&state,
|
|
@@ -186,7 +199,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
186
199
|
return Ok(RestartReport::RefusedResumeAtomicity {
|
|
187
200
|
unresumable: plan.unresumable,
|
|
188
201
|
allow_fresh,
|
|
189
|
-
error: "restart requires resumable workers before live spawn".to_string(),
|
|
202
|
+
error: "restart requires resumable workers before live spawn; rerun with --allow-fresh to start fresh".to_string(),
|
|
190
203
|
});
|
|
191
204
|
}
|
|
192
205
|
let session_name = state_session_name(&state);
|
|
@@ -253,6 +266,15 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
253
266
|
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
254
267
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
255
268
|
let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
|
|
269
|
+
wait_restart_readiness_or_timeout(
|
|
270
|
+
&selected.run_workspace,
|
|
271
|
+
&state,
|
|
272
|
+
&session_name,
|
|
273
|
+
&plan.decisions,
|
|
274
|
+
transport,
|
|
275
|
+
restart_readiness_deadline(readiness_deadline_ms),
|
|
276
|
+
restart_readiness_poll_interval(),
|
|
277
|
+
)?;
|
|
256
278
|
let attach_commands = crate::tmux_backend::attach_commands_for_windows(
|
|
257
279
|
&selected.run_workspace,
|
|
258
280
|
&session_name,
|
|
@@ -455,6 +477,182 @@ fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
|
|
|
455
477
|
}
|
|
456
478
|
}
|
|
457
479
|
|
|
480
|
+
fn restart_readiness_deadline(requested_ms: Option<u64>) -> std::time::Duration {
|
|
481
|
+
requested_ms.map(std::time::Duration::from_millis).unwrap_or_else(|| {
|
|
482
|
+
env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_DEADLINE_MS"], 30_000)
|
|
483
|
+
})
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
fn restart_readiness_poll_interval() -> std::time::Duration {
|
|
487
|
+
env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_POLL_MS"], 200)
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
/// Snapshot of the three conditions a restart waits on.
#[derive(Debug, Clone, Copy)]
struct RestartReadiness {
    // The tmux session was observed live.
    session_created: bool,
    // Every restarted worker has an addressable pane.
    worker_pane_addressable: bool,
    // The coordinator reported healthy.
    coordinator_alive: bool,
}

impl RestartReadiness {
    /// True only when all three conditions hold.
    fn ready(self) -> bool {
        [
            self.session_created,
            self.worker_pane_addressable,
            self.coordinator_alive,
        ]
        .iter()
        .all(|condition| *condition)
    }
}
|
|
502
|
+
|
|
503
|
+
fn wait_restart_readiness_or_timeout(
|
|
504
|
+
workspace: &Path,
|
|
505
|
+
state: &serde_json::Value,
|
|
506
|
+
session_name: &SessionName,
|
|
507
|
+
decisions: &[RestartedAgent],
|
|
508
|
+
transport: &dyn crate::transport::Transport,
|
|
509
|
+
deadline: std::time::Duration,
|
|
510
|
+
poll_interval: std::time::Duration,
|
|
511
|
+
) -> Result<(), LifecycleError> {
|
|
512
|
+
let started = std::time::Instant::now();
|
|
513
|
+
loop {
|
|
514
|
+
let readiness = restart_readiness(workspace, state, session_name, decisions, transport);
|
|
515
|
+
if readiness.ready() {
|
|
516
|
+
return Ok(());
|
|
517
|
+
}
|
|
518
|
+
let elapsed = started.elapsed();
|
|
519
|
+
if elapsed >= deadline {
|
|
520
|
+
write_restart_readiness_timeout_event(workspace, readiness, deadline, elapsed)?;
|
|
521
|
+
return Err(LifecycleError::RequirementUnmet(restart_readiness_timeout_message(
|
|
522
|
+
workspace, readiness, deadline,
|
|
523
|
+
)));
|
|
524
|
+
}
|
|
525
|
+
std::thread::sleep(std::cmp::min(poll_interval, deadline.saturating_sub(elapsed)));
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
fn restart_readiness(
|
|
530
|
+
workspace: &Path,
|
|
531
|
+
state: &serde_json::Value,
|
|
532
|
+
session_name: &SessionName,
|
|
533
|
+
decisions: &[RestartedAgent],
|
|
534
|
+
transport: &dyn crate::transport::Transport,
|
|
535
|
+
) -> RestartReadiness {
|
|
536
|
+
let session_created = session_live_or_default(transport, session_name, false);
|
|
537
|
+
let worker_pane_addressable = restart_worker_panes_addressable(state, decisions, transport);
|
|
538
|
+
let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
|
|
539
|
+
let coordinator_alive =
|
|
540
|
+
crate::coordinator::coordinator_health(&coordinator_workspace).ok && session_created;
|
|
541
|
+
RestartReadiness { session_created, worker_pane_addressable, coordinator_alive }
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
fn restart_worker_panes_addressable(
|
|
545
|
+
state: &serde_json::Value,
|
|
546
|
+
decisions: &[RestartedAgent],
|
|
547
|
+
transport: &dyn crate::transport::Transport,
|
|
548
|
+
) -> bool {
|
|
549
|
+
if decisions.is_empty() {
|
|
550
|
+
return true;
|
|
551
|
+
}
|
|
552
|
+
decisions.iter().all(|decision| {
|
|
553
|
+
let Some(pane_id) = state
|
|
554
|
+
.get("agents")
|
|
555
|
+
.and_then(|agents| agents.get(decision.agent_id.as_str()))
|
|
556
|
+
.and_then(|agent| agent.get("pane_id"))
|
|
557
|
+
.and_then(serde_json::Value::as_str)
|
|
558
|
+
.filter(|pane| !pane.is_empty())
|
|
559
|
+
.map(crate::transport::PaneId::new)
|
|
560
|
+
else {
|
|
561
|
+
return false;
|
|
562
|
+
};
|
|
563
|
+
pane_addressable(transport, &pane_id)
|
|
564
|
+
})
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
fn pane_addressable(
|
|
568
|
+
transport: &dyn crate::transport::Transport,
|
|
569
|
+
pane_id: &crate::transport::PaneId,
|
|
570
|
+
) -> bool {
|
|
571
|
+
match transport.has_pane(pane_id) {
|
|
572
|
+
Ok(Some(present)) => present,
|
|
573
|
+
Ok(None) | Err(_) => {
|
|
574
|
+
transport
|
|
575
|
+
.list_targets()
|
|
576
|
+
.map(|targets| targets.iter().any(|pane| pane.pane_id == *pane_id))
|
|
577
|
+
.unwrap_or(false)
|
|
578
|
+
|| transport
|
|
579
|
+
.liveness(pane_id)
|
|
580
|
+
.map(|state| state == crate::transport::PaneLiveness::Live)
|
|
581
|
+
.unwrap_or(false)
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
fn write_restart_readiness_timeout_event(
|
|
587
|
+
workspace: &Path,
|
|
588
|
+
readiness: RestartReadiness,
|
|
589
|
+
deadline: std::time::Duration,
|
|
590
|
+
elapsed: std::time::Duration,
|
|
591
|
+
) -> Result<(), LifecycleError> {
|
|
592
|
+
crate::event_log::EventLog::new(workspace)
|
|
593
|
+
.write(
|
|
594
|
+
"restart.readiness_timeout",
|
|
595
|
+
serde_json::json!({
|
|
596
|
+
"tmux_session_created": readiness.session_created,
|
|
597
|
+
"worker_pane_addressable": readiness.worker_pane_addressable,
|
|
598
|
+
"coordinator_alive": readiness.coordinator_alive,
|
|
599
|
+
"deadline_ms": deadline.as_millis(),
|
|
600
|
+
"elapsed_ms": elapsed.as_millis(),
|
|
601
|
+
"coordinator_log": crate::coordinator::coordinator_log_path(
|
|
602
|
+
&crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
|
|
603
|
+
).display().to_string(),
|
|
604
|
+
"state_path": crate::state::persist::runtime_state_path(workspace).display().to_string(),
|
|
605
|
+
"pid_path": crate::coordinator::coordinator_pid_path(
|
|
606
|
+
&crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
|
|
607
|
+
).display().to_string(),
|
|
608
|
+
}),
|
|
609
|
+
)
|
|
610
|
+
.map(|_| ())
|
|
611
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
fn restart_readiness_timeout_message(
|
|
615
|
+
workspace: &Path,
|
|
616
|
+
readiness: RestartReadiness,
|
|
617
|
+
deadline: std::time::Duration,
|
|
618
|
+
) -> String {
|
|
619
|
+
let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
|
|
620
|
+
let deadline_s = deadline.as_secs_f64();
|
|
621
|
+
format!(
|
|
622
|
+
"restart not ready within {deadline_s:.1}s: {missing}\n\
|
|
623
|
+
- tmux session created: {session}\n\
|
|
624
|
+
- worker pane addressable: {pane}\n\
|
|
625
|
+
- coordinator alive: {coordinator}\n\
|
|
626
|
+
Action: check coordinator log {log}, then `team-agent restart <agent> --allow-fresh` or `team-agent diagnose`\n\
|
|
627
|
+
Log: coordinator_log={log} state={state} pid_file={pid}",
|
|
628
|
+
missing = restart_readiness_missing_summary(readiness),
|
|
629
|
+
session = yes_no(readiness.session_created),
|
|
630
|
+
pane = yes_no(readiness.worker_pane_addressable),
|
|
631
|
+
coordinator = yes_no(readiness.coordinator_alive),
|
|
632
|
+
log = crate::coordinator::coordinator_log_path(&coordinator_workspace).display(),
|
|
633
|
+
state = crate::state::persist::runtime_state_path(workspace).display(),
|
|
634
|
+
pid = crate::coordinator::coordinator_pid_path(&coordinator_workspace).display(),
|
|
635
|
+
)
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
fn restart_readiness_missing_summary(readiness: RestartReadiness) -> String {
|
|
639
|
+
let mut missing = Vec::new();
|
|
640
|
+
if !readiness.session_created {
|
|
641
|
+
missing.push("tmux session created");
|
|
642
|
+
}
|
|
643
|
+
if !readiness.worker_pane_addressable {
|
|
644
|
+
missing.push("worker pane addressable");
|
|
645
|
+
}
|
|
646
|
+
if !readiness.coordinator_alive {
|
|
647
|
+
missing.push("coordinator alive");
|
|
648
|
+
}
|
|
649
|
+
missing.join(", ")
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
/// Renders a flag as `"yes"` or `"no"` for human-readable messages.
fn yes_no(value: bool) -> &'static str {
    match value {
        true => "yes",
        false => "no",
    }
}
|
|
655
|
+
|
|
458
656
|
fn verify_spawned_agent_live(
|
|
459
657
|
_agent_id: &AgentId,
|
|
460
658
|
_spawn: &SpawnedAgentWindow,
|