@team-agent/installer 0.3.9 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +1 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/send.rs +9 -2
- package/crates/team-agent/src/coordinator/backoff.rs +83 -2
- package/crates/team-agent/src/coordinator/tests/spine.rs +6 -0
- package/crates/team-agent/src/coordinator/tick.rs +410 -168
- package/crates/team-agent/src/leader/lease.rs +19 -0
- package/crates/team-agent/src/leader/rediscover/tests.rs +12 -0
- package/crates/team-agent/src/leader/rediscover.rs +2 -0
- package/crates/team-agent/src/lifecycle/launch.rs +35 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +17 -3
- package/crates/team-agent/src/lifecycle/restart/common.rs +75 -0
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +201 -3
- package/crates/team-agent/src/lifecycle/restart/selection.rs +51 -14
- package/crates/team-agent/src/lifecycle/restart.rs +1 -1
- package/crates/team-agent/src/lifecycle/tests/core.rs +89 -15
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +68 -3
- package/crates/team-agent/src/lifecycle/tests/main_preserved.rs +3 -1
- package/crates/team-agent/src/mcp_server/helpers.rs +24 -5
- package/crates/team-agent/src/mcp_server/normalize.rs +13 -6
- package/crates/team-agent/src/mcp_server/tests/send.rs +310 -212
- package/crates/team-agent/src/messaging/delivery.rs +83 -2
- package/crates/team-agent/src/messaging/helpers.rs +30 -10
- package/crates/team-agent/src/messaging/send.rs +71 -14
- package/crates/team-agent/src/messaging/tests/basic.rs +25 -7
- package/crates/team-agent/src/messaging/tests/runtime.rs +565 -111
- package/crates/team-agent/src/messaging/types.rs +19 -4
- package/crates/team-agent/src/provider/approvals/parsing.rs +43 -14
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +12 -9
- package/crates/team-agent/src/transport/test_support.rs +12 -1
- package/package.json +4 -4
|
@@ -654,6 +654,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
|
|
|
654
654
|
Some(Provider::ClaudeCode)
|
|
655
655
|
} else if lower.contains("codex") {
|
|
656
656
|
Some(Provider::Codex)
|
|
657
|
+
} else if lower.contains("copilot") {
|
|
658
|
+
Some(Provider::Copilot)
|
|
657
659
|
} else if lower.contains("fake") {
|
|
658
660
|
Some(Provider::Fake)
|
|
659
661
|
} else {
|
|
@@ -1082,3 +1084,20 @@ pub fn detect_dual_state_divergence(
|
|
|
1082
1084
|
"team_owner_epoch": team_epoch,
|
|
1083
1085
|
})))
|
|
1084
1086
|
}
|
|
1087
|
+
|
|
1088
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Both a bare `copilot` invocation with flags and an absolute path to the
    /// binary must be classified as the Copilot provider.
    #[test]
    fn leader_command_provider_recognizes_copilot() {
        for command in ["copilot --allow-all-tools", "/usr/local/bin/copilot"] {
            assert_eq!(leader_command_provider(command), Some(Provider::Copilot));
        }
    }
}
|
|
@@ -80,6 +80,18 @@ fn event_named(events: &[Value], name: &str) -> Value {
|
|
|
80
80
|
.unwrap_or_else(|| panic!("missing event {name}; got {events:?}"))
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
+
/// A bare `copilot` command line and an absolute path to the binary both
/// resolve to the Copilot provider.
#[test]
fn leader_command_provider_recognizes_copilot() {
    let bare_with_flag = leader_command_provider("copilot --allow-all-tools");
    let absolute_path = leader_command_provider("/usr/local/bin/copilot");

    assert_eq!(bare_with_flag, Some(Provider::Copilot));
    assert_eq!(absolute_path, Some(Provider::Copilot));
}
|
|
94
|
+
|
|
83
95
|
fn last_event_named(events: &[Value], name: &str) -> Value {
|
|
84
96
|
events
|
|
85
97
|
.iter()
|
|
@@ -590,6 +590,8 @@ fn leader_command_provider(command: &str) -> Option<Provider> {
|
|
|
590
590
|
Some(Provider::ClaudeCode)
|
|
591
591
|
} else if lower.contains("codex") {
|
|
592
592
|
Some(Provider::Codex)
|
|
593
|
+
} else if lower.contains("copilot") {
|
|
594
|
+
Some(Provider::Copilot)
|
|
593
595
|
} else if lower.contains("fake") {
|
|
594
596
|
Some(Provider::Fake)
|
|
595
597
|
} else {
|
|
@@ -288,6 +288,7 @@ fn spawn_agents(
|
|
|
288
288
|
let mut env =
|
|
289
289
|
inherited_env_with_team_overrides(workspace, agent_id_raw, Some(&mcp_team_id));
|
|
290
290
|
apply_profile_launch_env(&mut env, &profile_launch);
|
|
291
|
+
apply_mcp_auto_approval_env(&mut env, &safety);
|
|
291
292
|
// Python providers.py:145 + launch/core.py:253 — fresh launch runs the worker
|
|
292
293
|
// with cwd=workspace, same as the RS fork/add and restart paths.
|
|
293
294
|
let env_unset: Vec<String> = profile_launch.env_unset.iter().cloned().collect();
|
|
@@ -1390,6 +1391,39 @@ pub(crate) fn inherited_env_with_team_overrides(
|
|
|
1390
1391
|
env
|
|
1391
1392
|
}
|
|
1392
1393
|
|
|
1394
|
+
pub(crate) fn apply_mcp_auto_approval_env(
|
|
1395
|
+
env: &mut BTreeMap<String, String>,
|
|
1396
|
+
safety: &DangerousApproval,
|
|
1397
|
+
) {
|
|
1398
|
+
for key in [
|
|
1399
|
+
"TEAM_AGENT_LEADER_BYPASS",
|
|
1400
|
+
"TEAM_AGENT_LEADER_BYPASS_SOURCE",
|
|
1401
|
+
"TEAM_AGENT_LEADER_BYPASS_PROVIDER",
|
|
1402
|
+
"TEAM_AGENT_LEADER_BYPASS_FLAG",
|
|
1403
|
+
"TEAM_AGENT_MCP_AUTO_APPROVE",
|
|
1404
|
+
"TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE",
|
|
1405
|
+
] {
|
|
1406
|
+
env.remove(key);
|
|
1407
|
+
}
|
|
1408
|
+
if safety.enabled
|
|
1409
|
+
&& matches!(safety.source, DangerousApprovalSource::LeaderProcess)
|
|
1410
|
+
&& safety.inherited
|
|
1411
|
+
{
|
|
1412
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "1".to_string());
|
|
1413
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_SOURCE".to_string(), "leader_process".to_string());
|
|
1414
|
+
if let Some(provider) = safety.provider.as_deref() {
|
|
1415
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_PROVIDER".to_string(), provider.to_string());
|
|
1416
|
+
}
|
|
1417
|
+
if let Some(flag) = safety.flag.as_deref() {
|
|
1418
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS_FLAG".to_string(), flag.to_string());
|
|
1419
|
+
}
|
|
1420
|
+
env.insert("TEAM_AGENT_MCP_AUTO_APPROVE".to_string(), "team_orchestrator".to_string());
|
|
1421
|
+
env.insert("TEAM_AGENT_MCP_AUTO_APPROVE_SOURCE".to_string(), "leader_bypass".to_string());
|
|
1422
|
+
} else {
|
|
1423
|
+
env.insert("TEAM_AGENT_LEADER_BYPASS".to_string(), "0".to_string());
|
|
1424
|
+
}
|
|
1425
|
+
}
|
|
1426
|
+
|
|
1393
1427
|
/// BUG / B2 灵魂件 + C-1-2 + C-6-1 cr verdict — Copilot per-worker AGENTS.md
|
|
1394
1428
|
/// 写入 + `COPILOT_CUSTOM_INSTRUCTIONS_DIRS` 注入。
|
|
1395
1429
|
///
|
|
@@ -2971,6 +3005,7 @@ pub fn fork_agent_with_transport(
|
|
|
2971
3005
|
let mut env =
|
|
2972
3006
|
inherited_env_with_team_overrides(&workspace, as_agent_id.as_str(), Some(&fork_team));
|
|
2973
3007
|
apply_profile_launch_env(&mut env, &profile_launch);
|
|
3008
|
+
apply_mcp_auto_approval_env(&mut env, &safety);
|
|
2974
3009
|
// golden operations.py:336 -> _tmux_start_command_for_agent_window (runtime.py:1017-1020): branch on
|
|
2975
3010
|
// _tmux_session_exists — an ABSENT session => new-session (spawn_first), present => new-window
|
|
2976
3011
|
// (spawn_into). The Rust restart seam (restart.rs spawn_agent_window) uses the same branch.
|
|
@@ -108,17 +108,31 @@ pub(crate) fn start_agent_at_paths(
|
|
|
108
108
|
let provider = agent_provider(&agent);
|
|
109
109
|
let session_id = agent_session_id(&agent);
|
|
110
110
|
let rollout_path = agent_rollout_path(&agent);
|
|
111
|
-
let
|
|
111
|
+
let resume_backing_exists = session_id
|
|
112
112
|
.as_ref()
|
|
113
|
-
.map(|
|
|
113
|
+
.map(|session| {
|
|
114
|
+
resume_backing_exists_for_agent(
|
|
115
|
+
workspace,
|
|
116
|
+
agent_id,
|
|
117
|
+
&agent,
|
|
118
|
+
provider,
|
|
119
|
+
session,
|
|
120
|
+
rollout_path.as_ref(),
|
|
121
|
+
)
|
|
122
|
+
})
|
|
114
123
|
.unwrap_or(false);
|
|
115
124
|
let start_mode = decide_start_mode(
|
|
116
125
|
provider_wire(provider),
|
|
117
126
|
session_id.as_ref(),
|
|
118
127
|
rollout_path.as_ref(),
|
|
119
|
-
|
|
128
|
+
resume_backing_exists,
|
|
120
129
|
allow_fresh,
|
|
121
130
|
);
|
|
131
|
+
if matches!(start_mode, StartMode::Noop) {
|
|
132
|
+
return Err(LifecycleError::RequirementUnmet(format!(
|
|
133
|
+
"resume_not_ready: session backing store missing for agent {agent_id}; rerun with --allow-fresh to start fresh"
|
|
134
|
+
)));
|
|
135
|
+
}
|
|
122
136
|
let spawn_session_id = if matches!(start_mode, StartMode::Resumed) {
|
|
123
137
|
session_id.as_ref()
|
|
124
138
|
} else {
|
|
@@ -122,6 +122,7 @@ pub(super) fn spawn_agent_window(
|
|
|
122
122
|
team_id.as_deref(),
|
|
123
123
|
);
|
|
124
124
|
crate::lifecycle::launch::apply_profile_launch_env(&mut env, &profile_launch);
|
|
125
|
+
crate::lifecycle::launch::apply_mcp_auto_approval_env(&mut env, safety);
|
|
125
126
|
let spawn_cwd = spawn_cwd_override
|
|
126
127
|
.or_else(|| {
|
|
127
128
|
agent
|
|
@@ -242,6 +243,80 @@ pub(super) fn agent_rollout_path(agent: &serde_json::Value) -> Option<RolloutPat
|
|
|
242
243
|
.map(RolloutPath::new)
|
|
243
244
|
}
|
|
244
245
|
|
|
246
|
+
pub(super) fn resume_backing_exists_for_agent(
|
|
247
|
+
workspace: &Path,
|
|
248
|
+
agent_id: &AgentId,
|
|
249
|
+
agent: &serde_json::Value,
|
|
250
|
+
provider: Provider,
|
|
251
|
+
session_id: &SessionId,
|
|
252
|
+
rollout_path: Option<&RolloutPath>,
|
|
253
|
+
) -> bool {
|
|
254
|
+
match provider {
|
|
255
|
+
Provider::Codex => rollout_path_exists(rollout_path),
|
|
256
|
+
Provider::Claude | Provider::ClaudeCode => {
|
|
257
|
+
rollout_path_exists(rollout_path)
|
|
258
|
+
|| event_log_transcript_exists(workspace, agent_id.as_str(), session_id.as_str())
|
|
259
|
+
}
|
|
260
|
+
Provider::Copilot => copilot_session_store_has_session(session_id.as_str()),
|
|
261
|
+
Provider::GeminiCli | Provider::Fake => {
|
|
262
|
+
let _ = agent;
|
|
263
|
+
true
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
fn rollout_path_exists(rollout_path: Option<&RolloutPath>) -> bool {
|
|
269
|
+
rollout_path
|
|
270
|
+
.as_ref()
|
|
271
|
+
.is_some_and(|path| path.as_path().exists())
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
fn event_log_transcript_exists(workspace: &Path, agent_id: &str, session_id: &str) -> bool {
|
|
275
|
+
let Ok(events) = crate::event_log::EventLog::new(workspace).tail(0) else {
|
|
276
|
+
return false;
|
|
277
|
+
};
|
|
278
|
+
events.iter().rev().any(|event| {
|
|
279
|
+
event.get("event").and_then(serde_json::Value::as_str) == Some("session.captured")
|
|
280
|
+
&& ["agent_id", "worker_id"]
|
|
281
|
+
.iter()
|
|
282
|
+
.any(|key| event.get(*key).and_then(serde_json::Value::as_str) == Some(agent_id))
|
|
283
|
+
&& event.get("session_id").and_then(serde_json::Value::as_str) == Some(session_id)
|
|
284
|
+
&& event_transcript_path(event).is_some_and(|path| path.exists())
|
|
285
|
+
})
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
fn event_transcript_path(event: &serde_json::Value) -> Option<PathBuf> {
|
|
289
|
+
event
|
|
290
|
+
.get("rollout_path")
|
|
291
|
+
.or_else(|| event.get("transcript_path"))
|
|
292
|
+
.and_then(serde_json::Value::as_str)
|
|
293
|
+
.filter(|path| !path.is_empty())
|
|
294
|
+
.map(PathBuf::from)
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
fn copilot_session_store_has_session(session_id: &str) -> bool {
|
|
298
|
+
let Some(home) = std::env::var_os("HOME").map(PathBuf::from) else {
|
|
299
|
+
return false;
|
|
300
|
+
};
|
|
301
|
+
let db_path = home.join(".copilot").join("session-store.db");
|
|
302
|
+
if !db_path.exists() {
|
|
303
|
+
return false;
|
|
304
|
+
}
|
|
305
|
+
let Ok(conn) = rusqlite::Connection::open_with_flags(
|
|
306
|
+
db_path,
|
|
307
|
+
rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_NO_MUTEX,
|
|
308
|
+
) else {
|
|
309
|
+
return false;
|
|
310
|
+
};
|
|
311
|
+
conn
|
|
312
|
+
.query_row(
|
|
313
|
+
"select 1 from sessions where id = ?1 limit 1",
|
|
314
|
+
[session_id],
|
|
315
|
+
|_| Ok(()),
|
|
316
|
+
)
|
|
317
|
+
.is_ok()
|
|
318
|
+
}
|
|
319
|
+
|
|
245
320
|
pub(crate) fn refresh_missing_provider_sessions(
|
|
246
321
|
state: &mut serde_json::Value,
|
|
247
322
|
) -> Result<bool, LifecycleError> {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
use super::common::*;
|
|
2
|
-
use super::selection::
|
|
2
|
+
use super::selection::classify_restart_plan_with_resume_validation;
|
|
3
3
|
use super::*;
|
|
4
4
|
|
|
5
5
|
// ── lifecycle::restart —— 整队 Route B resume-or-fresh 重建 ──────────────────
|
|
@@ -29,6 +29,7 @@ pub fn restart_with_session_convergence_deadline(
|
|
|
29
29
|
team,
|
|
30
30
|
&crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
|
|
31
31
|
session_converge_deadline_ms,
|
|
32
|
+
None,
|
|
32
33
|
)
|
|
33
34
|
}
|
|
34
35
|
|
|
@@ -40,6 +41,16 @@ pub fn restart_with_transport(
|
|
|
40
41
|
allow_fresh: bool,
|
|
41
42
|
team: Option<&str>,
|
|
42
43
|
transport: &dyn crate::transport::Transport,
|
|
44
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
45
|
+
restart_with_transport_with_readiness_deadline(workspace, allow_fresh, team, transport, None)
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
pub fn restart_with_transport_with_readiness_deadline(
|
|
49
|
+
workspace: &Path,
|
|
50
|
+
allow_fresh: bool,
|
|
51
|
+
team: Option<&str>,
|
|
52
|
+
transport: &dyn crate::transport::Transport,
|
|
53
|
+
readiness_deadline_ms: Option<u64>,
|
|
43
54
|
) -> Result<RestartReport, LifecycleError> {
|
|
44
55
|
match restart_with_transport_with_session_convergence_deadline(
|
|
45
56
|
workspace,
|
|
@@ -47,6 +58,7 @@ pub fn restart_with_transport(
|
|
|
47
58
|
team,
|
|
48
59
|
transport,
|
|
49
60
|
None,
|
|
61
|
+
readiness_deadline_ms,
|
|
50
62
|
)? {
|
|
51
63
|
RestartReport::RefusedResumeNotReady {
|
|
52
64
|
missing,
|
|
@@ -76,6 +88,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
76
88
|
team: Option<&str>,
|
|
77
89
|
transport: &dyn crate::transport::Transport,
|
|
78
90
|
session_converge_deadline_ms: Option<u64>,
|
|
91
|
+
readiness_deadline_ms: Option<u64>,
|
|
79
92
|
) -> Result<RestartReport, LifecycleError> {
|
|
80
93
|
// RED-2-STILL(P0):入口门必须在 canonical_run_workspace 解析后的路径上判,不用 raw workspace。
|
|
81
94
|
// 根因:quick-start <dir> 把 .team/runtime/spec 落在 team_workspace(dir)=**parent**/.team;
|
|
@@ -166,7 +179,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
166
179
|
convergence.missing.iter().cloned().collect()
|
|
167
180
|
};
|
|
168
181
|
let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
|
|
169
|
-
let plan =
|
|
182
|
+
let plan = classify_restart_plan_with_resume_validation(Some(&selected.run_workspace), &state, allow_fresh)?;
|
|
170
183
|
write_restart_resume_decision_events(
|
|
171
184
|
&selected.run_workspace,
|
|
172
185
|
&state,
|
|
@@ -186,7 +199,7 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
186
199
|
return Ok(RestartReport::RefusedResumeAtomicity {
|
|
187
200
|
unresumable: plan.unresumable,
|
|
188
201
|
allow_fresh,
|
|
189
|
-
error: "restart requires resumable workers before live spawn".to_string(),
|
|
202
|
+
error: "restart requires resumable workers before live spawn; rerun with --allow-fresh to start fresh".to_string(),
|
|
190
203
|
});
|
|
191
204
|
}
|
|
192
205
|
let session_name = state_session_name(&state);
|
|
@@ -253,6 +266,15 @@ pub fn restart_with_transport_with_session_convergence_deadline(
|
|
|
253
266
|
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
254
267
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
255
268
|
let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
|
|
269
|
+
wait_restart_readiness_or_timeout(
|
|
270
|
+
&selected.run_workspace,
|
|
271
|
+
&state,
|
|
272
|
+
&session_name,
|
|
273
|
+
&plan.decisions,
|
|
274
|
+
transport,
|
|
275
|
+
restart_readiness_deadline(readiness_deadline_ms),
|
|
276
|
+
restart_readiness_poll_interval(),
|
|
277
|
+
)?;
|
|
256
278
|
let attach_commands = crate::tmux_backend::attach_commands_for_windows(
|
|
257
279
|
&selected.run_workspace,
|
|
258
280
|
&session_name,
|
|
@@ -455,6 +477,182 @@ fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
|
|
|
455
477
|
}
|
|
456
478
|
}
|
|
457
479
|
|
|
480
|
+
fn restart_readiness_deadline(requested_ms: Option<u64>) -> std::time::Duration {
|
|
481
|
+
requested_ms.map(std::time::Duration::from_millis).unwrap_or_else(|| {
|
|
482
|
+
env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_DEADLINE_MS"], 30_000)
|
|
483
|
+
})
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
fn restart_readiness_poll_interval() -> std::time::Duration {
|
|
487
|
+
env_duration_ms(&["TEAM_AGENT_RESTART_READINESS_POLL_MS"], 200)
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
/// Snapshot of the three conditions a restart must satisfy before it is reported as ready.
#[derive(Debug, Clone, Copy)]
struct RestartReadiness {
    /// The tmux session was observed as live.
    session_created: bool,
    /// Every restarted worker has a pane the transport can address.
    worker_pane_addressable: bool,
    /// The coordinator health check passed.
    coordinator_alive: bool,
}

impl RestartReadiness {
    /// `true` only when all three conditions hold.
    fn ready(self) -> bool {
        matches!(
            self,
            Self {
                session_created: true,
                worker_pane_addressable: true,
                coordinator_alive: true,
            }
        )
    }
}
|
|
502
|
+
|
|
503
|
+
fn wait_restart_readiness_or_timeout(
|
|
504
|
+
workspace: &Path,
|
|
505
|
+
state: &serde_json::Value,
|
|
506
|
+
session_name: &SessionName,
|
|
507
|
+
decisions: &[RestartedAgent],
|
|
508
|
+
transport: &dyn crate::transport::Transport,
|
|
509
|
+
deadline: std::time::Duration,
|
|
510
|
+
poll_interval: std::time::Duration,
|
|
511
|
+
) -> Result<(), LifecycleError> {
|
|
512
|
+
let started = std::time::Instant::now();
|
|
513
|
+
loop {
|
|
514
|
+
let readiness = restart_readiness(workspace, state, session_name, decisions, transport);
|
|
515
|
+
if readiness.ready() {
|
|
516
|
+
return Ok(());
|
|
517
|
+
}
|
|
518
|
+
let elapsed = started.elapsed();
|
|
519
|
+
if elapsed >= deadline {
|
|
520
|
+
write_restart_readiness_timeout_event(workspace, readiness, deadline, elapsed)?;
|
|
521
|
+
return Err(LifecycleError::RequirementUnmet(restart_readiness_timeout_message(
|
|
522
|
+
workspace, readiness, deadline,
|
|
523
|
+
)));
|
|
524
|
+
}
|
|
525
|
+
std::thread::sleep(std::cmp::min(poll_interval, deadline.saturating_sub(elapsed)));
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
fn restart_readiness(
|
|
530
|
+
workspace: &Path,
|
|
531
|
+
state: &serde_json::Value,
|
|
532
|
+
session_name: &SessionName,
|
|
533
|
+
decisions: &[RestartedAgent],
|
|
534
|
+
transport: &dyn crate::transport::Transport,
|
|
535
|
+
) -> RestartReadiness {
|
|
536
|
+
let session_created = session_live_or_default(transport, session_name, false);
|
|
537
|
+
let worker_pane_addressable = restart_worker_panes_addressable(state, decisions, transport);
|
|
538
|
+
let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
|
|
539
|
+
let coordinator_alive =
|
|
540
|
+
crate::coordinator::coordinator_health(&coordinator_workspace).ok && session_created;
|
|
541
|
+
RestartReadiness { session_created, worker_pane_addressable, coordinator_alive }
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
fn restart_worker_panes_addressable(
|
|
545
|
+
state: &serde_json::Value,
|
|
546
|
+
decisions: &[RestartedAgent],
|
|
547
|
+
transport: &dyn crate::transport::Transport,
|
|
548
|
+
) -> bool {
|
|
549
|
+
if decisions.is_empty() {
|
|
550
|
+
return true;
|
|
551
|
+
}
|
|
552
|
+
decisions.iter().all(|decision| {
|
|
553
|
+
let Some(pane_id) = state
|
|
554
|
+
.get("agents")
|
|
555
|
+
.and_then(|agents| agents.get(decision.agent_id.as_str()))
|
|
556
|
+
.and_then(|agent| agent.get("pane_id"))
|
|
557
|
+
.and_then(serde_json::Value::as_str)
|
|
558
|
+
.filter(|pane| !pane.is_empty())
|
|
559
|
+
.map(crate::transport::PaneId::new)
|
|
560
|
+
else {
|
|
561
|
+
return false;
|
|
562
|
+
};
|
|
563
|
+
pane_addressable(transport, &pane_id)
|
|
564
|
+
})
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
fn pane_addressable(
|
|
568
|
+
transport: &dyn crate::transport::Transport,
|
|
569
|
+
pane_id: &crate::transport::PaneId,
|
|
570
|
+
) -> bool {
|
|
571
|
+
match transport.has_pane(pane_id) {
|
|
572
|
+
Ok(Some(present)) => present,
|
|
573
|
+
Ok(None) | Err(_) => {
|
|
574
|
+
transport
|
|
575
|
+
.list_targets()
|
|
576
|
+
.map(|targets| targets.iter().any(|pane| pane.pane_id == *pane_id))
|
|
577
|
+
.unwrap_or(false)
|
|
578
|
+
|| transport
|
|
579
|
+
.liveness(pane_id)
|
|
580
|
+
.map(|state| state == crate::transport::PaneLiveness::Live)
|
|
581
|
+
.unwrap_or(false)
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
fn write_restart_readiness_timeout_event(
|
|
587
|
+
workspace: &Path,
|
|
588
|
+
readiness: RestartReadiness,
|
|
589
|
+
deadline: std::time::Duration,
|
|
590
|
+
elapsed: std::time::Duration,
|
|
591
|
+
) -> Result<(), LifecycleError> {
|
|
592
|
+
crate::event_log::EventLog::new(workspace)
|
|
593
|
+
.write(
|
|
594
|
+
"restart.readiness_timeout",
|
|
595
|
+
serde_json::json!({
|
|
596
|
+
"tmux_session_created": readiness.session_created,
|
|
597
|
+
"worker_pane_addressable": readiness.worker_pane_addressable,
|
|
598
|
+
"coordinator_alive": readiness.coordinator_alive,
|
|
599
|
+
"deadline_ms": deadline.as_millis(),
|
|
600
|
+
"elapsed_ms": elapsed.as_millis(),
|
|
601
|
+
"coordinator_log": crate::coordinator::coordinator_log_path(
|
|
602
|
+
&crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
|
|
603
|
+
).display().to_string(),
|
|
604
|
+
"state_path": crate::state::persist::runtime_state_path(workspace).display().to_string(),
|
|
605
|
+
"pid_path": crate::coordinator::coordinator_pid_path(
|
|
606
|
+
&crate::coordinator::WorkspacePath::new(workspace.to_path_buf())
|
|
607
|
+
).display().to_string(),
|
|
608
|
+
}),
|
|
609
|
+
)
|
|
610
|
+
.map(|_| ())
|
|
611
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
fn restart_readiness_timeout_message(
|
|
615
|
+
workspace: &Path,
|
|
616
|
+
readiness: RestartReadiness,
|
|
617
|
+
deadline: std::time::Duration,
|
|
618
|
+
) -> String {
|
|
619
|
+
let coordinator_workspace = crate::coordinator::WorkspacePath::new(workspace.to_path_buf());
|
|
620
|
+
let deadline_s = deadline.as_secs_f64();
|
|
621
|
+
format!(
|
|
622
|
+
"restart not ready within {deadline_s:.1}s: {missing}\n\
|
|
623
|
+
- tmux session created: {session}\n\
|
|
624
|
+
- worker pane addressable: {pane}\n\
|
|
625
|
+
- coordinator alive: {coordinator}\n\
|
|
626
|
+
Action: check coordinator log {log}, then `team-agent restart <agent> --allow-fresh` or `team-agent diagnose`\n\
|
|
627
|
+
Log: coordinator_log={log} state={state} pid_file={pid}",
|
|
628
|
+
missing = restart_readiness_missing_summary(readiness),
|
|
629
|
+
session = yes_no(readiness.session_created),
|
|
630
|
+
pane = yes_no(readiness.worker_pane_addressable),
|
|
631
|
+
coordinator = yes_no(readiness.coordinator_alive),
|
|
632
|
+
log = crate::coordinator::coordinator_log_path(&coordinator_workspace).display(),
|
|
633
|
+
state = crate::state::persist::runtime_state_path(workspace).display(),
|
|
634
|
+
pid = crate::coordinator::coordinator_pid_path(&coordinator_workspace).display(),
|
|
635
|
+
)
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
fn restart_readiness_missing_summary(readiness: RestartReadiness) -> String {
|
|
639
|
+
let mut missing = Vec::new();
|
|
640
|
+
if !readiness.session_created {
|
|
641
|
+
missing.push("tmux session created");
|
|
642
|
+
}
|
|
643
|
+
if !readiness.worker_pane_addressable {
|
|
644
|
+
missing.push("worker pane addressable");
|
|
645
|
+
}
|
|
646
|
+
if !readiness.coordinator_alive {
|
|
647
|
+
missing.push("coordinator alive");
|
|
648
|
+
}
|
|
649
|
+
missing.join(", ")
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
/// Renders a flag as the literal `"yes"` or `"no"` used in operator-facing messages.
fn yes_no(value: bool) -> &'static str {
    match value {
        true => "yes",
        false => "no",
    }
}
|
|
655
|
+
|
|
458
656
|
fn verify_spawned_agent_live(
|
|
459
657
|
_agent_id: &AgentId,
|
|
460
658
|
_spawn: &SpawnedAgentWindow,
|
|
@@ -4,32 +4,35 @@ use super::common::*;
|
|
|
4
4
|
/// bug-085 四象限 `start_mode` 决策(`start.py:179-188` + `_resume_rollout_missing` `start.py:66-69`),
|
|
5
5
|
/// **从 start_agent 的整条 lock+spawn 路径里分离出的纯函数**(gate gap:porter 需要单元级 RED
|
|
6
6
|
/// for `FreshAfterMissingRollout`,而 start_agent 全路径不可单测)。语义:
|
|
7
|
-
/// -
|
|
7
|
+
/// - resume backing 缺失时不可 resume:codex/claude 用 transcript/rollout 文件,
|
|
8
|
+
/// copilot 用 session-store 行存在性(由调用方折叠进 `rollout_exists`)。
|
|
8
9
|
/// - 初始 `start_mode = if session_id { Resumed } else { Fresh }`(`start.py:179`)。
|
|
9
|
-
/// -
|
|
10
|
-
///
|
|
11
|
-
/// - 非 codex:rollout 永不"缺失",直接看 session_id。
|
|
10
|
+
/// - `missing && allow_fresh` 升级为 `FreshAfterMissingRollout` 并清空 session_id。
|
|
11
|
+
/// - `missing && !allow_fresh` 返回 `Noop`,调用方据此诚实拒绝并提示 `--allow-fresh`。
|
|
12
12
|
pub fn decide_start_mode(
|
|
13
13
|
provider: &str,
|
|
14
14
|
session_id: Option<&SessionId>,
|
|
15
|
-
|
|
15
|
+
_rollout_path: Option<&RolloutPath>,
|
|
16
16
|
rollout_exists: bool,
|
|
17
17
|
allow_fresh: bool,
|
|
18
18
|
) -> StartMode {
|
|
19
19
|
match session_id {
|
|
20
20
|
None => StartMode::Fresh,
|
|
21
21
|
Some(_) => {
|
|
22
|
-
let
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
StartMode::
|
|
26
|
-
|
|
27
|
-
StartMode::Resumed
|
|
22
|
+
let missing_resume_backing = resumable_provider_requires_backing(provider) && !rollout_exists;
|
|
23
|
+
match (missing_resume_backing, allow_fresh) {
|
|
24
|
+
(true, true) => StartMode::FreshAfterMissingRollout,
|
|
25
|
+
(true, false) => StartMode::Noop,
|
|
26
|
+
(false, _) => StartMode::Resumed,
|
|
28
27
|
}
|
|
29
28
|
}
|
|
30
29
|
}
|
|
31
30
|
}
|
|
32
31
|
|
|
32
|
+
/// Returns `true` for the provider wire names whose sessions can only be resumed when a
/// backing store exists: `codex`, `claude`, `claude_code`, and `copilot`. The comparison is
/// exact and case-sensitive.
pub(crate) fn resumable_provider_requires_backing(provider: &str) -> bool {
    const BACKED_PROVIDERS: [&str; 4] = ["codex", "claude", "claude_code", "copilot"];
    BACKED_PROVIDERS.contains(&provider)
}
|
|
35
|
+
|
|
33
36
|
/// `first_send_at` 严格分类(`_classify_first_send_at`,`orchestration.py:399`)。
|
|
34
37
|
/// **绝不靠 truthiness**:`""`/`0`/`False`/`"null"`/非 ISO → `Corrupt`。
|
|
35
38
|
pub fn classify_first_send_at(raw: &serde_json::Value) -> FirstSendAtState {
|
|
@@ -129,6 +132,14 @@ pub fn python_type_name(value: &serde_json::Value) -> &'static str {
|
|
|
129
132
|
pub fn classify_restart_plan(
|
|
130
133
|
state: &serde_json::Value,
|
|
131
134
|
allow_fresh: bool,
|
|
135
|
+
) -> Result<RestartPlan, LifecycleError> {
|
|
136
|
+
classify_restart_plan_with_resume_validation(None, state, allow_fresh)
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
pub(crate) fn classify_restart_plan_with_resume_validation(
|
|
140
|
+
workspace: Option<&Path>,
|
|
141
|
+
state: &serde_json::Value,
|
|
142
|
+
allow_fresh: bool,
|
|
132
143
|
) -> Result<RestartPlan, LifecycleError> {
|
|
133
144
|
let mut decisions = Vec::new();
|
|
134
145
|
let mut corrupt_entries = Vec::new();
|
|
@@ -171,21 +182,47 @@ pub fn classify_restart_plan(
|
|
|
171
182
|
.and_then(|v| v.as_str())
|
|
172
183
|
.filter(|s| !s.is_empty())
|
|
173
184
|
.map(SessionId::new);
|
|
185
|
+
let agent_id = AgentId::new(worker_id.clone());
|
|
174
186
|
// E6 层2 (C2, 用户裁定"绝不静默 fresh"): null session 只有显式 --allow-fresh 才 fresh,
|
|
175
187
|
// 否则 Refuse(→ resume_not_ready + 指引)。删 `!interacted` 短路 —— 自启动 worker
|
|
176
188
|
// (leader 从未发消息 → first_send_at=null → interacted=false)会被它静默 fresh 丢上下文。
|
|
177
|
-
let
|
|
189
|
+
let provider = agent_provider(agent);
|
|
190
|
+
let provider_wire = provider_wire(provider);
|
|
191
|
+
let resume_backing_exists = match (workspace, session_id.as_ref()) {
|
|
192
|
+
(Some(workspace), Some(session)) => resume_backing_exists_for_agent(
|
|
193
|
+
workspace,
|
|
194
|
+
&agent_id,
|
|
195
|
+
agent,
|
|
196
|
+
provider,
|
|
197
|
+
session,
|
|
198
|
+
agent_rollout_path(agent).as_ref(),
|
|
199
|
+
),
|
|
200
|
+
(None, Some(_)) if resumable_provider_requires_backing(provider_wire) => {
|
|
201
|
+
agent_rollout_path(agent)
|
|
202
|
+
.as_ref()
|
|
203
|
+
.is_some_and(|path| path.as_path().exists())
|
|
204
|
+
}
|
|
205
|
+
_ => true,
|
|
206
|
+
};
|
|
207
|
+
let decision = if session_id.is_some() && resume_backing_exists {
|
|
178
208
|
ResumeDecision::Resume
|
|
209
|
+
} else if session_id.is_some() && allow_fresh {
|
|
210
|
+
ResumeDecision::FreshStart
|
|
211
|
+
} else if session_id.is_some() {
|
|
212
|
+
ResumeDecision::Refuse
|
|
179
213
|
} else if allow_fresh {
|
|
180
214
|
ResumeDecision::FreshStart
|
|
181
215
|
} else {
|
|
182
216
|
ResumeDecision::Refuse
|
|
183
217
|
};
|
|
184
|
-
let agent_id = AgentId::new(worker_id.clone());
|
|
185
218
|
if matches!(decision, ResumeDecision::Refuse) {
|
|
186
219
|
unresumable.push(UnresumableWorker {
|
|
187
220
|
agent_id: agent_id.clone(),
|
|
188
|
-
reason:
|
|
221
|
+
reason: if session_id.is_some() {
|
|
222
|
+
"session_unresumable".to_string()
|
|
223
|
+
} else {
|
|
224
|
+
"no_persisted_session_id".to_string()
|
|
225
|
+
},
|
|
189
226
|
session_id: session_id.clone(),
|
|
190
227
|
first_send_at: first_send_at_raw.as_str().map(|s| s.to_string()),
|
|
191
228
|
});
|
|
@@ -37,7 +37,7 @@ pub(crate) use common::refresh_missing_provider_sessions;
|
|
|
37
37
|
pub use orchestrator::{halt_plan, plan_status};
|
|
38
38
|
pub use rebuild::{
|
|
39
39
|
restart, restart_candidates, restart_with_session_convergence_deadline, restart_with_transport,
|
|
40
|
-
select_restart_state,
|
|
40
|
+
restart_with_transport_with_readiness_deadline, select_restart_state,
|
|
41
41
|
};
|
|
42
42
|
pub use remove::{remove_agent, remove_agent_with_transport};
|
|
43
43
|
pub use selection::{classify_first_send_at, classify_restart_plan, decide_start_mode, python_type_name};
|