@team-agent/installer 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +34 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/adapters.rs +196 -19
- package/crates/team-agent/src/cli/diagnose.rs +145 -11
- package/crates/team-agent/src/cli/emit.rs +287 -53
- package/crates/team-agent/src/cli/leader.rs +37 -8
- package/crates/team-agent/src/cli/mod.rs +807 -316
- package/crates/team-agent/src/cli/status_port.rs +25 -2
- package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
- package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
- package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
- package/crates/team-agent/src/cli/tests/run_delegation.rs +57 -3
- package/crates/team-agent/src/cli/types.rs +17 -0
- package/crates/team-agent/src/compiler/tests.rs +2 -2
- package/crates/team-agent/src/compiler.rs +16 -6
- package/crates/team-agent/src/coordinator/health.rs +89 -20
- package/crates/team-agent/src/coordinator/mod.rs +4 -0
- package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
- package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
- package/crates/team-agent/src/coordinator/tests/watch.rs +4 -2
- package/crates/team-agent/src/coordinator/tick.rs +222 -69
- package/crates/team-agent/src/coordinator/types.rs +15 -3
- package/crates/team-agent/src/db/schema.rs +37 -2
- package/crates/team-agent/src/diagnose/comms.rs +226 -0
- package/crates/team-agent/src/diagnose/mod.rs +45 -0
- package/crates/team-agent/src/diagnose/orphans.rs +658 -0
- package/crates/team-agent/src/fake_worker.rs +146 -3
- package/crates/team-agent/src/leader/start.rs +121 -23
- package/crates/team-agent/src/leader/types.rs +44 -1
- package/crates/team-agent/src/lib.rs +3 -0
- package/crates/team-agent/src/lifecycle/display.rs +648 -50
- package/crates/team-agent/src/lifecycle/launch.rs +1048 -264
- package/crates/team-agent/src/lifecycle/mod.rs +3 -0
- package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
- package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +113 -26
- package/crates/team-agent/src/lifecycle/restart/common.rs +189 -102
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +465 -25
- package/crates/team-agent/src/lifecycle/restart/remove.rs +22 -6
- package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
- package/crates/team-agent/src/lifecycle/restart.rs +4 -1
- package/crates/team-agent/src/lifecycle/tests/core.rs +4 -4
- package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +39 -9
- package/crates/team-agent/src/lifecycle/types.rs +23 -0
- package/crates/team-agent/src/lifecycle/worker_command_context.rs +326 -0
- package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
- package/crates/team-agent/src/mcp_server/mod.rs +3 -74
- package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
- package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
- package/crates/team-agent/src/mcp_server/tools.rs +312 -111
- package/crates/team-agent/src/mcp_server/types.rs +6 -4
- package/crates/team-agent/src/mcp_server/wire.rs +19 -7
- package/crates/team-agent/src/message_store.rs +21 -4
- package/crates/team-agent/src/messaging/delivery.rs +87 -37
- package/crates/team-agent/src/messaging/mod.rs +9 -6
- package/crates/team-agent/src/messaging/results.rs +153 -16
- package/crates/team-agent/src/messaging/selftest.rs +199 -12
- package/crates/team-agent/src/messaging/send.rs +35 -3
- package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
- package/crates/team-agent/src/messaging/types.rs +11 -3
- package/crates/team-agent/src/os_probe.rs +119 -0
- package/crates/team-agent/src/packaging/migrate.rs +10 -2
- package/crates/team-agent/src/packaging/tests.rs +23 -0
- package/crates/team-agent/src/provider/adapter.rs +483 -67
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
- package/crates/team-agent/src/provider/classify.rs +51 -4
- package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
- package/crates/team-agent/src/provider/types.rs +47 -0
- package/crates/team-agent/src/session_capture.rs +616 -0
- package/crates/team-agent/src/state/persist.rs +57 -0
- package/crates/team-agent/src/state/projection.rs +32 -23
- package/crates/team-agent/src/state/selector.rs +5 -2
- package/crates/team-agent/src/tmux_backend.rs +151 -60
- package/crates/team-agent/src/transport/test_support.rs +9 -0
- package/crates/team-agent/src/transport/tests/wire.rs +4 -0
- package/crates/team-agent/src/transport.rs +13 -2
- package/package.json +4 -4
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
use super::*;
|
|
2
1
|
use super::common::*;
|
|
3
2
|
use super::selection::classify_restart_plan;
|
|
3
|
+
use super::*;
|
|
4
4
|
|
|
5
5
|
// ── lifecycle::restart —— 整队 Route B resume-or-fresh 重建 ──────────────────
|
|
6
6
|
|
|
@@ -12,13 +12,23 @@ pub fn restart(
|
|
|
12
12
|
workspace: &Path,
|
|
13
13
|
allow_fresh: bool,
|
|
14
14
|
team: Option<&str>,
|
|
15
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
16
|
+
restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
pub fn restart_with_session_convergence_deadline(
|
|
20
|
+
workspace: &Path,
|
|
21
|
+
allow_fresh: bool,
|
|
22
|
+
team: Option<&str>,
|
|
23
|
+
session_converge_deadline_ms: Option<u64>,
|
|
15
24
|
) -> Result<RestartReport, LifecycleError> {
|
|
16
25
|
let run_ws = lifecycle_run_workspace(workspace)?;
|
|
17
|
-
|
|
26
|
+
restart_with_transport_with_session_convergence_deadline(
|
|
18
27
|
workspace,
|
|
19
28
|
allow_fresh,
|
|
20
29
|
team,
|
|
21
30
|
&crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
|
|
31
|
+
session_converge_deadline_ms,
|
|
22
32
|
)
|
|
23
33
|
}
|
|
24
34
|
|
|
@@ -30,6 +40,42 @@ pub fn restart_with_transport(
|
|
|
30
40
|
allow_fresh: bool,
|
|
31
41
|
team: Option<&str>,
|
|
32
42
|
transport: &dyn crate::transport::Transport,
|
|
43
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
44
|
+
match restart_with_transport_with_session_convergence_deadline(
|
|
45
|
+
workspace,
|
|
46
|
+
allow_fresh,
|
|
47
|
+
team,
|
|
48
|
+
transport,
|
|
49
|
+
None,
|
|
50
|
+
)? {
|
|
51
|
+
RestartReport::RefusedResumeNotReady {
|
|
52
|
+
missing,
|
|
53
|
+
allow_fresh,
|
|
54
|
+
error,
|
|
55
|
+
..
|
|
56
|
+
} => Ok(RestartReport::RefusedResumeAtomicity {
|
|
57
|
+
unresumable: missing
|
|
58
|
+
.into_iter()
|
|
59
|
+
.map(|agent_id| UnresumableWorker {
|
|
60
|
+
agent_id,
|
|
61
|
+
reason: "session_capture_incomplete".to_string(),
|
|
62
|
+
session_id: None,
|
|
63
|
+
first_send_at: None,
|
|
64
|
+
})
|
|
65
|
+
.collect(),
|
|
66
|
+
allow_fresh,
|
|
67
|
+
error,
|
|
68
|
+
}),
|
|
69
|
+
report => Ok(report),
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
pub fn restart_with_transport_with_session_convergence_deadline(
|
|
74
|
+
workspace: &Path,
|
|
75
|
+
allow_fresh: bool,
|
|
76
|
+
team: Option<&str>,
|
|
77
|
+
transport: &dyn crate::transport::Transport,
|
|
78
|
+
session_converge_deadline_ms: Option<u64>,
|
|
33
79
|
) -> Result<RestartReport, LifecycleError> {
|
|
34
80
|
if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
|
|
35
81
|
return Err(LifecycleError::TeamSelect(format!(
|
|
@@ -55,18 +101,62 @@ pub fn restart_with_transport(
|
|
|
55
101
|
.map_err(|e| LifecycleError::TeamSelect(e.to_string()))?;
|
|
56
102
|
let mut state = selected.state;
|
|
57
103
|
crate::lifecycle::launch::ensure_owner_allowed_for_state(&state, None)?;
|
|
58
|
-
let spec_workspace = selected
|
|
59
|
-
.
|
|
60
|
-
|
|
61
|
-
.ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
|
|
104
|
+
let spec_workspace = selected.spec_workspace.as_ref().ok_or_else(|| {
|
|
105
|
+
LifecycleError::TeamSelect("active team spec workspace not found".to_string())
|
|
106
|
+
})?;
|
|
62
107
|
let spec = load_team_spec(spec_workspace)?;
|
|
63
108
|
let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
|
|
64
|
-
|
|
109
|
+
let mut convergence = converge_missing_provider_sessions(
|
|
110
|
+
&mut state,
|
|
111
|
+
session_convergence_deadline(session_converge_deadline_ms),
|
|
112
|
+
session_convergence_poll_interval(),
|
|
113
|
+
&selected.run_workspace,
|
|
114
|
+
allow_fresh,
|
|
115
|
+
)?;
|
|
116
|
+
if convergence.converged && convergence.changed {
|
|
117
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
118
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
119
|
+
}
|
|
120
|
+
if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
|
|
121
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
122
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
123
|
+
let missing_after_repair = restart_required_missing_session_agent_ids(&state);
|
|
124
|
+
convergence.changed = true;
|
|
125
|
+
convergence.converged = missing_after_repair.is_empty();
|
|
126
|
+
convergence.missing = missing_after_repair;
|
|
127
|
+
}
|
|
128
|
+
if !convergence.converged && !allow_fresh {
|
|
129
|
+
return Ok(RestartReport::RefusedResumeNotReady {
|
|
130
|
+
missing: convergence
|
|
131
|
+
.missing
|
|
132
|
+
.iter()
|
|
133
|
+
.map(|agent_id| AgentId::new(agent_id.clone()))
|
|
134
|
+
.collect(),
|
|
135
|
+
allow_fresh,
|
|
136
|
+
deadline: convergence.deadline,
|
|
137
|
+
elapsed: convergence.elapsed,
|
|
138
|
+
error: "resume_not_ready: session_capture_incomplete".to_string(),
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
if !convergence.converged && convergence.changed {
|
|
65
142
|
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
66
143
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
67
144
|
}
|
|
145
|
+
let forced_fresh_missing = if convergence.converged {
|
|
146
|
+
std::collections::BTreeSet::new()
|
|
147
|
+
} else {
|
|
148
|
+
convergence.missing.iter().cloned().collect()
|
|
149
|
+
};
|
|
150
|
+
let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
|
|
68
151
|
let plan = classify_restart_plan(&state, allow_fresh)?;
|
|
69
|
-
write_restart_resume_decision_events(
|
|
152
|
+
write_restart_resume_decision_events(
|
|
153
|
+
&selected.run_workspace,
|
|
154
|
+
&state,
|
|
155
|
+
allow_fresh,
|
|
156
|
+
&plan.decisions,
|
|
157
|
+
&forced_fresh_missing,
|
|
158
|
+
forced_fresh_convergence.as_ref(),
|
|
159
|
+
)?;
|
|
70
160
|
if !plan.corrupt_entries.is_empty() {
|
|
71
161
|
return Ok(RestartReport::RefusedInvalidFirstSendAt {
|
|
72
162
|
invalid: plan.corrupt_entries,
|
|
@@ -86,8 +176,13 @@ pub fn restart_with_transport(
|
|
|
86
176
|
transport
|
|
87
177
|
.kill_session(&session_name)
|
|
88
178
|
.map_err(|e| LifecycleError::Transport(e.to_string()))?;
|
|
179
|
+
mark_leader_receiver_rebind_required(&mut state, &session_name);
|
|
180
|
+
mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
|
|
181
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
182
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
89
183
|
}
|
|
90
|
-
|
|
184
|
+
let mut last_spawned: Option<AgentId> = None;
|
|
185
|
+
for decision in &plan.decisions {
|
|
91
186
|
let agent = state
|
|
92
187
|
.get("agents")
|
|
93
188
|
.and_then(|v| v.get(decision.agent_id.as_str()))
|
|
@@ -96,36 +191,355 @@ pub fn restart_with_transport(
|
|
|
96
191
|
"agent {} not found for restart",
|
|
97
192
|
decision.agent_id
|
|
98
193
|
))
|
|
99
|
-
})
|
|
194
|
+
})?
|
|
195
|
+
.clone();
|
|
100
196
|
let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
|
|
101
197
|
decision.session_id.as_ref()
|
|
102
198
|
} else {
|
|
103
199
|
None
|
|
104
200
|
};
|
|
105
|
-
let
|
|
201
|
+
let session_live = session_live_or_default(transport, &session_name, false);
|
|
202
|
+
if !session_live {
|
|
203
|
+
if let Some(previous) = &last_spawned {
|
|
204
|
+
return Err(LifecycleError::Transport(format!(
|
|
205
|
+
"session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
|
|
206
|
+
previous,
|
|
207
|
+
session_name.as_str(),
|
|
208
|
+
decision.agent_id
|
|
209
|
+
)));
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
let spawn = spawn_agent_window(
|
|
106
213
|
&selected.run_workspace,
|
|
107
214
|
&session_name,
|
|
108
215
|
&decision.agent_id,
|
|
109
|
-
agent,
|
|
216
|
+
&agent,
|
|
110
217
|
session_id,
|
|
111
|
-
|
|
218
|
+
session_live,
|
|
112
219
|
transport,
|
|
113
220
|
Some(&safety),
|
|
221
|
+
Some(spec_workspace),
|
|
114
222
|
)?;
|
|
223
|
+
verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
|
|
224
|
+
mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
|
|
225
|
+
last_spawned = Some(decision.agent_id.clone());
|
|
226
|
+
if let Some(agent) = state
|
|
227
|
+
.get_mut("agents")
|
|
228
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
229
|
+
.and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
|
|
230
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
231
|
+
{
|
|
232
|
+
persist_effective_approval_policy_for_restart(agent, &safety);
|
|
233
|
+
}
|
|
115
234
|
}
|
|
235
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
236
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
116
237
|
let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
|
|
238
|
+
let attach_commands = crate::tmux_backend::attach_commands_for_windows(
|
|
239
|
+
&selected.run_workspace,
|
|
240
|
+
&session_name,
|
|
241
|
+
plan.decisions
|
|
242
|
+
.iter()
|
|
243
|
+
.map(|decision| decision.agent_id.as_str()),
|
|
244
|
+
);
|
|
245
|
+
let next_actions = attach_commands.clone();
|
|
117
246
|
Ok(RestartReport::Restarted {
|
|
118
247
|
session_name,
|
|
119
248
|
agents: plan.decisions,
|
|
120
249
|
coordinator_started,
|
|
250
|
+
next_actions,
|
|
251
|
+
attach_commands,
|
|
121
252
|
})
|
|
122
253
|
}
|
|
123
254
|
|
|
255
|
+
fn repair_resume_sessions_from_event_log(
|
|
256
|
+
workspace: &Path,
|
|
257
|
+
state: &mut serde_json::Value,
|
|
258
|
+
) -> Result<bool, LifecycleError> {
|
|
259
|
+
let agent_ids = state
|
|
260
|
+
.get("agents")
|
|
261
|
+
.and_then(serde_json::Value::as_object)
|
|
262
|
+
.map(|agents| agents.keys().cloned().collect::<Vec<_>>())
|
|
263
|
+
.unwrap_or_default();
|
|
264
|
+
let mut changed = false;
|
|
265
|
+
for agent_id in agent_ids {
|
|
266
|
+
let previous = state
|
|
267
|
+
.get("agents")
|
|
268
|
+
.and_then(|agents| agents.get(&agent_id))
|
|
269
|
+
.cloned()
|
|
270
|
+
.unwrap_or(serde_json::Value::Null);
|
|
271
|
+
if previous
|
|
272
|
+
.get("session_id")
|
|
273
|
+
.and_then(serde_json::Value::as_str)
|
|
274
|
+
.is_some_and(|session| !session.is_empty())
|
|
275
|
+
{
|
|
276
|
+
continue;
|
|
277
|
+
}
|
|
278
|
+
let Some(provider) = previous
|
|
279
|
+
.get("provider")
|
|
280
|
+
.and_then(serde_json::Value::as_str)
|
|
281
|
+
.and_then(parse_provider)
|
|
282
|
+
else {
|
|
283
|
+
continue;
|
|
284
|
+
};
|
|
285
|
+
let auth_mode = previous
|
|
286
|
+
.get("auth_mode")
|
|
287
|
+
.and_then(serde_json::Value::as_str)
|
|
288
|
+
.and_then(parse_auth_mode)
|
|
289
|
+
.unwrap_or(AuthMode::Subscription);
|
|
290
|
+
let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
|
|
291
|
+
let adapter = crate::provider::get_adapter(provider);
|
|
292
|
+
let repaired = crate::session_capture::recover_resume_session_from_events(
|
|
293
|
+
workspace,
|
|
294
|
+
&agent_id,
|
|
295
|
+
&previous,
|
|
296
|
+
adapter.as_ref(),
|
|
297
|
+
auth_mode,
|
|
298
|
+
&exclude_session_ids,
|
|
299
|
+
)
|
|
300
|
+
.map_err(|e| LifecycleError::Provider(e.to_string()))?;
|
|
301
|
+
let Some(repaired) = repaired else {
|
|
302
|
+
continue;
|
|
303
|
+
};
|
|
304
|
+
let old_session_id = previous
|
|
305
|
+
.get("session_id")
|
|
306
|
+
.and_then(serde_json::Value::as_str)
|
|
307
|
+
.filter(|session| !session.is_empty())
|
|
308
|
+
.map(str::to_string);
|
|
309
|
+
let session_id = repaired
|
|
310
|
+
.get("session_id")
|
|
311
|
+
.and_then(serde_json::Value::as_str)
|
|
312
|
+
.filter(|session| !session.is_empty())
|
|
313
|
+
.map(str::to_string);
|
|
314
|
+
let rollout_path = repaired
|
|
315
|
+
.get("rollout_path")
|
|
316
|
+
.and_then(serde_json::Value::as_str)
|
|
317
|
+
.filter(|path| !path.is_empty())
|
|
318
|
+
.map(str::to_string);
|
|
319
|
+
if let Some(agent) = state
|
|
320
|
+
.get_mut("agents")
|
|
321
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
322
|
+
.and_then(|agents| agents.get_mut(&agent_id))
|
|
323
|
+
{
|
|
324
|
+
*agent = repaired.clone();
|
|
325
|
+
}
|
|
326
|
+
crate::event_log::EventLog::new(workspace)
|
|
327
|
+
.write(
|
|
328
|
+
"resume.session_repaired",
|
|
329
|
+
serde_json::json!({
|
|
330
|
+
"agent_id": agent_id,
|
|
331
|
+
"provider": provider_wire(provider),
|
|
332
|
+
"old_session_id": old_session_id,
|
|
333
|
+
"session_id": session_id,
|
|
334
|
+
"rollout_path": rollout_path,
|
|
335
|
+
"captured_via": "event_log_repair",
|
|
336
|
+
"attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
|
|
337
|
+
}),
|
|
338
|
+
)
|
|
339
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
340
|
+
changed = true;
|
|
341
|
+
}
|
|
342
|
+
Ok(changed)
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
fn claimed_session_ids_except(
|
|
346
|
+
state: &serde_json::Value,
|
|
347
|
+
current_agent_id: &str,
|
|
348
|
+
) -> std::collections::BTreeSet<String> {
|
|
349
|
+
state
|
|
350
|
+
.get("agents")
|
|
351
|
+
.and_then(serde_json::Value::as_object)
|
|
352
|
+
.map(|agents| {
|
|
353
|
+
agents
|
|
354
|
+
.iter()
|
|
355
|
+
.filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
|
|
356
|
+
.filter_map(|(_, agent)| {
|
|
357
|
+
agent
|
|
358
|
+
.get("session_id")
|
|
359
|
+
.and_then(serde_json::Value::as_str)
|
|
360
|
+
.filter(|session| !session.is_empty())
|
|
361
|
+
.map(str::to_string)
|
|
362
|
+
})
|
|
363
|
+
.collect()
|
|
364
|
+
})
|
|
365
|
+
.unwrap_or_default()
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
|
|
369
|
+
if let Some(ms) = requested_ms {
|
|
370
|
+
return std::time::Duration::from_millis(ms);
|
|
371
|
+
}
|
|
372
|
+
env_duration_ms(
|
|
373
|
+
&[
|
|
374
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
|
|
375
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
376
|
+
"TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
|
|
377
|
+
"TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
|
|
378
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
|
|
379
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
380
|
+
"TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
|
|
381
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
|
|
382
|
+
"TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
|
|
383
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
|
|
384
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
385
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
386
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
387
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
388
|
+
],
|
|
389
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
|
|
390
|
+
)
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
fn session_convergence_poll_interval() -> std::time::Duration {
|
|
394
|
+
env_duration_ms(
|
|
395
|
+
&[
|
|
396
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
|
|
397
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
|
|
398
|
+
"TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
|
|
399
|
+
"TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
|
|
400
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
|
|
401
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
|
|
402
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
|
|
403
|
+
],
|
|
404
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
|
|
405
|
+
)
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
|
|
409
|
+
let ms = names
|
|
410
|
+
.iter()
|
|
411
|
+
.find_map(|name| {
|
|
412
|
+
std::env::var(name)
|
|
413
|
+
.ok()
|
|
414
|
+
.and_then(|value| parse_duration_value_ms(&value))
|
|
415
|
+
.or_else(|| {
|
|
416
|
+
name.strip_suffix("_MS").and_then(|prefix| {
|
|
417
|
+
std::env::var(prefix)
|
|
418
|
+
.ok()
|
|
419
|
+
.and_then(|value| parse_duration_value_seconds_ms(&value))
|
|
420
|
+
})
|
|
421
|
+
})
|
|
422
|
+
})
|
|
423
|
+
.unwrap_or(default_ms);
|
|
424
|
+
std::time::Duration::from_millis(ms)
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
/// Parses a configuration value as a whole number of milliseconds.
///
/// Leading and trailing whitespace is ignored, so a value carrying a stray
/// space or trailing newline is still honoured rather than silently discarded.
///
/// Returns `None` when the trimmed text is not a valid `u64` (empty, negative,
/// fractional, non-numeric, or out of range).
fn parse_duration_value_ms(value: &str) -> Option<u64> {
    value.trim().parse::<u64>().ok()
}
|
|
430
|
+
|
|
431
|
+
/// Parses a configuration value given in (possibly fractional) seconds and
/// returns it as whole milliseconds, rounded to the nearest millisecond.
///
/// Leading and trailing whitespace is ignored, so a value carrying a stray
/// space or trailing newline is still honoured rather than silently discarded.
///
/// Returns `None` when the trimmed text is not a number, or when the number is
/// negative, NaN, or infinite.
fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
    let seconds = value.trim().parse::<f64>().ok()?;
    if !seconds.is_finite() || seconds < 0.0 {
        return None;
    }
    // Float-to-integer `as` casts saturate, so an absurdly large value clamps to
    // `u64::MAX` milliseconds instead of wrapping around.
    Some((seconds * 1000.0).round() as u64)
}
|
|
439
|
+
|
|
440
|
+
/// Post-spawn liveness hook invoked after an agent window has been spawned.
///
/// Currently a no-op: all three arguments are ignored and `Ok(())` is always
/// returned, so no liveness check is actually performed here.
// NOTE(review): presumably a placeholder for a real pane-liveness probe — confirm intent.
fn verify_spawned_agent_live(
    _agent_id: &AgentId,
    _spawn: &SpawnedAgentWindow,
    _transport: &dyn crate::transport::Transport,
) -> Result<(), LifecycleError> {
    Ok(())
}
|
|
447
|
+
|
|
448
|
+
/// Flags the leader receiver in `state` as needing a rebind for `session_name`.
///
/// Does nothing when `state["leader_receiver"]` is missing or not an object, or
/// when the receiver records a string `session_name` different from
/// `session_name`. A receiver with no string `session_name` is treated as
/// belonging to this session. Only a receiver whose `status` is exactly
/// `"attached"` is rewritten, to `"rebind_required"`.
fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
    let receiver = match state
        .get_mut("leader_receiver")
        .and_then(serde_json::Value::as_object_mut)
    {
        Some(receiver) => receiver,
        None => return,
    };

    let bound_elsewhere = matches!(
        receiver.get("session_name").and_then(|v| v.as_str()),
        Some(recorded) if recorded != session_name.as_str()
    );
    let attached = receiver.get("status").and_then(|v| v.as_str()) == Some("attached");

    if !bound_elsewhere && attached {
        receiver.insert("status".to_string(), serde_json::json!("rebind_required"));
    }
}
|
|
471
|
+
|
|
472
|
+
fn mark_restart_targets_stopped_after_teardown(
|
|
473
|
+
state: &mut serde_json::Value,
|
|
474
|
+
decisions: &[RestartedAgent],
|
|
475
|
+
) {
|
|
476
|
+
let Some(agents) = state
|
|
477
|
+
.get_mut("agents")
|
|
478
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
479
|
+
else {
|
|
480
|
+
return;
|
|
481
|
+
};
|
|
482
|
+
for decision in decisions {
|
|
483
|
+
let Some(agent) = agents
|
|
484
|
+
.get_mut(decision.agent_id.as_str())
|
|
485
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
486
|
+
else {
|
|
487
|
+
continue;
|
|
488
|
+
};
|
|
489
|
+
agent.insert("status".to_string(), serde_json::json!("stopped"));
|
|
490
|
+
agent.remove("pane_id");
|
|
491
|
+
agent.remove("pane_pid");
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
fn mark_agent_respawned(
|
|
496
|
+
state: &mut serde_json::Value,
|
|
497
|
+
agent_id: &AgentId,
|
|
498
|
+
spawn: &SpawnedAgentWindow,
|
|
499
|
+
transport: &dyn crate::transport::Transport,
|
|
500
|
+
safety: &DangerousApproval,
|
|
501
|
+
) -> Result<(), LifecycleError> {
|
|
502
|
+
let Some(agent) = state
|
|
503
|
+
.get_mut("agents")
|
|
504
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
505
|
+
.and_then(|agents| agents.get_mut(agent_id.as_str()))
|
|
506
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
507
|
+
else {
|
|
508
|
+
return Err(LifecycleError::StatePersist(format!(
|
|
509
|
+
"agent {} state is not an object",
|
|
510
|
+
agent_id
|
|
511
|
+
)));
|
|
512
|
+
};
|
|
513
|
+
agent.insert("status".to_string(), serde_json::json!("running"));
|
|
514
|
+
agent.insert(
|
|
515
|
+
"pane_id".to_string(),
|
|
516
|
+
serde_json::json!(spawn.spawn.pane_id.as_str()),
|
|
517
|
+
);
|
|
518
|
+
let pane_pid = spawn.spawn.child_pid.or_else(|| {
|
|
519
|
+
transport
|
|
520
|
+
.list_targets()
|
|
521
|
+
.unwrap_or_default()
|
|
522
|
+
.into_iter()
|
|
523
|
+
.find(|pane| pane.pane_id == spawn.spawn.pane_id)
|
|
524
|
+
.and_then(|pane| pane.pane_pid)
|
|
525
|
+
});
|
|
526
|
+
if let Some(pane_pid) = pane_pid {
|
|
527
|
+
agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
|
|
528
|
+
}
|
|
529
|
+
crate::lifecycle::launch::persist_command_plan_state(agent, &spawn.plan, &spawn.profile_launch);
|
|
530
|
+
persist_effective_approval_policy_for_restart(agent, safety);
|
|
531
|
+
agent.remove("startup_prompts");
|
|
532
|
+
agent.remove("startup_prompt_status");
|
|
533
|
+
Ok(())
|
|
534
|
+
}
|
|
535
|
+
|
|
124
536
|
fn write_restart_resume_decision_events(
|
|
125
537
|
workspace: &Path,
|
|
126
538
|
state: &serde_json::Value,
|
|
127
539
|
allow_fresh: bool,
|
|
128
540
|
decisions: &[RestartedAgent],
|
|
541
|
+
forced_fresh_missing: &std::collections::BTreeSet<String>,
|
|
542
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
129
543
|
) -> Result<(), LifecycleError> {
|
|
130
544
|
for decision in decisions {
|
|
131
545
|
let agent = state
|
|
@@ -150,6 +564,8 @@ fn write_restart_resume_decision_events(
|
|
|
150
564
|
session_id,
|
|
151
565
|
allow_fresh,
|
|
152
566
|
decision_wire,
|
|
567
|
+
forced_fresh_missing.contains(decision.agent_id.as_str()),
|
|
568
|
+
forced_fresh_convergence,
|
|
153
569
|
)?;
|
|
154
570
|
}
|
|
155
571
|
Ok(())
|
|
@@ -162,15 +578,16 @@ fn write_restart_resume_decision_event(
|
|
|
162
578
|
session_id: Option<String>,
|
|
163
579
|
allow_fresh: bool,
|
|
164
580
|
decision: &str,
|
|
581
|
+
forced_fresh: bool,
|
|
582
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
165
583
|
) -> Result<(), LifecycleError> {
|
|
166
584
|
use std::io::Write as _;
|
|
167
585
|
|
|
168
586
|
let path = workspace.join(".team").join("logs").join("events.jsonl");
|
|
169
587
|
if let Some(parent) = path.parent() {
|
|
170
|
-
std::fs::create_dir_all(parent)
|
|
171
|
-
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
588
|
+
std::fs::create_dir_all(parent).map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
172
589
|
}
|
|
173
|
-
let event = serde_json::json!({
|
|
590
|
+
let mut event = serde_json::json!({
|
|
174
591
|
"ts": chrono::Utc::now().to_rfc3339(),
|
|
175
592
|
"event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
|
|
176
593
|
"worker_id": worker_id,
|
|
@@ -181,8 +598,26 @@ fn write_restart_resume_decision_event(
|
|
|
181
598
|
"first_send_at": first_send_at,
|
|
182
599
|
"session_id": session_id,
|
|
183
600
|
});
|
|
184
|
-
|
|
185
|
-
|
|
601
|
+
if forced_fresh {
|
|
602
|
+
if let Some(event) = event.as_object_mut() {
|
|
603
|
+
event.insert("forced_fresh".to_string(), serde_json::json!(true));
|
|
604
|
+
event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
|
|
605
|
+
if let Some(convergence) = forced_fresh_convergence {
|
|
606
|
+
event.insert(
|
|
607
|
+
"session_convergence".to_string(),
|
|
608
|
+
serde_json::json!({
|
|
609
|
+
"complete": false,
|
|
610
|
+
"deadline_s": convergence.deadline.as_secs_f64(),
|
|
611
|
+
"deadline_ms": convergence.deadline.as_millis(),
|
|
612
|
+
"elapsed_ms": convergence.elapsed.as_millis(),
|
|
613
|
+
"pending_agent_ids": convergence.missing.clone(),
|
|
614
|
+
}),
|
|
615
|
+
);
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
let line =
|
|
620
|
+
serde_json::to_string(&event).map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
186
621
|
let mut file = std::fs::OpenOptions::new()
|
|
187
622
|
.create(true)
|
|
188
623
|
.append(true)
|
|
@@ -233,7 +668,10 @@ pub fn select_restart_state(
|
|
|
233
668
|
.get("active_team_key")
|
|
234
669
|
.and_then(serde_json::Value::as_str)
|
|
235
670
|
.filter(|s| !s.is_empty())
|
|
236
|
-
.map_or_else(
|
|
671
|
+
.map_or_else(
|
|
672
|
+
|| crate::state::projection::team_state_key(&selected),
|
|
673
|
+
str::to_string,
|
|
674
|
+
);
|
|
237
675
|
Ok(restart_candidate_from_state(workspace, &key, &selected))
|
|
238
676
|
}
|
|
239
677
|
|
|
@@ -299,12 +737,14 @@ fn restart_candidate_has_context(state: &serde_json::Value) -> bool {
|
|
|
299
737
|
.and_then(serde_json::Value::as_object)
|
|
300
738
|
.is_some_and(|agents| {
|
|
301
739
|
agents.values().any(|agent| {
|
|
302
|
-
["session_id", "rollout_path", "first_send_at"]
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
740
|
+
["session_id", "rollout_path", "first_send_at"]
|
|
741
|
+
.iter()
|
|
742
|
+
.any(|key| {
|
|
743
|
+
agent
|
|
744
|
+
.get(*key)
|
|
745
|
+
.and_then(serde_json::Value::as_str)
|
|
746
|
+
.is_some_and(|s| !s.is_empty())
|
|
747
|
+
})
|
|
308
748
|
})
|
|
309
749
|
})
|
|
310
750
|
}
|
|
@@ -230,6 +230,7 @@ fn remove_agent_inner(
|
|
|
230
230
|
"agent_health",
|
|
231
231
|
None,
|
|
232
232
|
)?;
|
|
233
|
+
maybe_fail_remove_after_agent_health_delete()?;
|
|
233
234
|
Ok(RemoveSuccess {
|
|
234
235
|
outcome: RemoveAgentOutcome::Removed {
|
|
235
236
|
agent_id: agent_id.clone(),
|
|
@@ -585,15 +586,16 @@ fn select_agent_health(
|
|
|
585
586
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
586
587
|
let row = conn
|
|
587
588
|
.query_row(
|
|
588
|
-
"select status, last_output_at, context_usage_pct, current_task_id \
|
|
589
|
+
"select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
|
|
589
590
|
from agent_health where agent_id = ?1",
|
|
590
591
|
[agent_id.as_str()],
|
|
591
592
|
|r| {
|
|
592
593
|
Ok(CapturedHealth {
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
594
|
+
owner_team_id: r.get::<_, Option<String>>(0)?,
|
|
595
|
+
status: r.get::<_, Option<String>>(1)?,
|
|
596
|
+
last_output_at: r.get::<_, Option<String>>(2)?,
|
|
597
|
+
context_usage_pct: r.get::<_, Option<i64>>(3)?,
|
|
598
|
+
current_task_id: r.get::<_, Option<String>>(4)?,
|
|
597
599
|
})
|
|
598
600
|
},
|
|
599
601
|
)
|
|
@@ -622,8 +624,9 @@ fn restore_agent_health(
|
|
|
622
624
|
// health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
|
|
623
625
|
conn.execute(
|
|
624
626
|
"insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
|
|
625
|
-
values (
|
|
627
|
+
values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
|
|
626
628
|
rusqlite::params![
|
|
629
|
+
row.owner_team_id,
|
|
627
630
|
agent_id.as_str(),
|
|
628
631
|
status,
|
|
629
632
|
row.last_output_at,
|
|
@@ -638,12 +641,25 @@ fn restore_agent_health(
|
|
|
638
641
|
|
|
639
642
|
/// Snapshot of one `agent_health` row, captured so it can be re-inserted later.
///
/// Each field mirrors the column of the same name and is `None` when the
/// column is NULL.
#[derive(Clone, Debug)]
struct CapturedHealth {
    /// Value of the `owner_team_id` column.
    owner_team_id: Option<String>,
    /// Value of the `status` column.
    status: Option<String>,
    /// Value of the `last_output_at` column.
    last_output_at: Option<String>,
    /// Value of the `context_usage_pct` column.
    context_usage_pct: Option<i64>,
    /// Value of the `current_task_id` column.
    current_task_id: Option<String>,
}
|
|
646
650
|
|
|
651
|
+
fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
|
|
652
|
+
let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
|
|
653
|
+
return Ok(());
|
|
654
|
+
};
|
|
655
|
+
if reason.is_empty() {
|
|
656
|
+
return Ok(());
|
|
657
|
+
}
|
|
658
|
+
Err(LifecycleError::StatePersist(format!(
|
|
659
|
+
"injected remove failure after agent_health delete: {reason}"
|
|
660
|
+
)))
|
|
661
|
+
}
|
|
662
|
+
|
|
647
663
|
struct RemoveRollback {
|
|
648
664
|
agent_id: AgentId,
|
|
649
665
|
spec_text: Option<String>,
|