@team-agent/installer 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +34 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/adapters.rs +234 -26
- package/crates/team-agent/src/cli/diagnose.rs +144 -10
- package/crates/team-agent/src/cli/emit.rs +289 -54
- package/crates/team-agent/src/cli/leader.rs +37 -8
- package/crates/team-agent/src/cli/mod.rs +1281 -196
- package/crates/team-agent/src/cli/status_port.rs +195 -46
- package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
- package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
- package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
- package/crates/team-agent/src/cli/tests/run_delegation.rs +59 -3
- package/crates/team-agent/src/cli/types.rs +18 -0
- package/crates/team-agent/src/compiler.rs +15 -5
- package/crates/team-agent/src/coordinator/health.rs +95 -17
- package/crates/team-agent/src/coordinator/mod.rs +4 -0
- package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
- package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
- package/crates/team-agent/src/coordinator/tick.rs +222 -69
- package/crates/team-agent/src/coordinator/types.rs +15 -3
- package/crates/team-agent/src/db/schema.rs +37 -2
- package/crates/team-agent/src/diagnose/comms.rs +226 -0
- package/crates/team-agent/src/diagnose/mod.rs +45 -0
- package/crates/team-agent/src/diagnose/orphans.rs +658 -0
- package/crates/team-agent/src/fake_worker.rs +146 -3
- package/crates/team-agent/src/leader/start.rs +121 -23
- package/crates/team-agent/src/leader/types.rs +44 -1
- package/crates/team-agent/src/lib.rs +3 -0
- package/crates/team-agent/src/lifecycle/display.rs +645 -47
- package/crates/team-agent/src/lifecycle/launch.rs +1061 -146
- package/crates/team-agent/src/lifecycle/mod.rs +2 -0
- package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
- package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
- package/crates/team-agent/src/lifecycle/restart/common.rs +183 -24
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +498 -22
- package/crates/team-agent/src/lifecycle/restart/remove.rs +27 -7
- package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
- package/crates/team-agent/src/lifecycle/restart.rs +24 -1
- package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
- package/crates/team-agent/src/lifecycle/types.rs +19 -0
- package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
- package/crates/team-agent/src/mcp_server/mod.rs +3 -74
- package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
- package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
- package/crates/team-agent/src/mcp_server/tools.rs +312 -111
- package/crates/team-agent/src/mcp_server/types.rs +6 -4
- package/crates/team-agent/src/mcp_server/wire.rs +19 -7
- package/crates/team-agent/src/message_store.rs +21 -4
- package/crates/team-agent/src/messaging/delivery.rs +470 -59
- package/crates/team-agent/src/messaging/mod.rs +9 -6
- package/crates/team-agent/src/messaging/results.rs +353 -63
- package/crates/team-agent/src/messaging/selftest.rs +199 -12
- package/crates/team-agent/src/messaging/send.rs +35 -3
- package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
- package/crates/team-agent/src/messaging/types.rs +11 -3
- package/crates/team-agent/src/os_probe.rs +119 -0
- package/crates/team-agent/src/packaging/migrate.rs +10 -2
- package/crates/team-agent/src/packaging/tests.rs +23 -0
- package/crates/team-agent/src/provider/adapter.rs +564 -63
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
- package/crates/team-agent/src/provider/classify.rs +51 -4
- package/crates/team-agent/src/provider/helpers.rs +10 -1
- package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
- package/crates/team-agent/src/provider/types.rs +47 -0
- package/crates/team-agent/src/session_capture.rs +616 -0
- package/crates/team-agent/src/state/persist.rs +170 -1
- package/crates/team-agent/src/state/projection.rs +141 -8
- package/crates/team-agent/src/state/selector.rs +5 -2
- package/crates/team-agent/src/tmux_backend.rs +161 -64
- package/crates/team-agent/src/transport/test_support.rs +9 -0
- package/crates/team-agent/src/transport/tests/wire.rs +4 -0
- package/crates/team-agent/src/transport.rs +13 -2
- package/package.json +4 -4
|
@@ -12,13 +12,23 @@ pub fn restart(
|
|
|
12
12
|
workspace: &Path,
|
|
13
13
|
allow_fresh: bool,
|
|
14
14
|
team: Option<&str>,
|
|
15
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
16
|
+
restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
pub fn restart_with_session_convergence_deadline(
|
|
20
|
+
workspace: &Path,
|
|
21
|
+
allow_fresh: bool,
|
|
22
|
+
team: Option<&str>,
|
|
23
|
+
session_converge_deadline_ms: Option<u64>,
|
|
15
24
|
) -> Result<RestartReport, LifecycleError> {
|
|
16
25
|
let run_ws = lifecycle_run_workspace(workspace)?;
|
|
17
|
-
|
|
26
|
+
restart_with_transport_with_session_convergence_deadline(
|
|
18
27
|
workspace,
|
|
19
28
|
allow_fresh,
|
|
20
29
|
team,
|
|
21
30
|
&crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
|
|
31
|
+
session_converge_deadline_ms,
|
|
22
32
|
)
|
|
23
33
|
}
|
|
24
34
|
|
|
@@ -31,6 +41,48 @@ pub fn restart_with_transport(
|
|
|
31
41
|
team: Option<&str>,
|
|
32
42
|
transport: &dyn crate::transport::Transport,
|
|
33
43
|
) -> Result<RestartReport, LifecycleError> {
|
|
44
|
+
match restart_with_transport_with_session_convergence_deadline(
|
|
45
|
+
workspace,
|
|
46
|
+
allow_fresh,
|
|
47
|
+
team,
|
|
48
|
+
transport,
|
|
49
|
+
None,
|
|
50
|
+
)? {
|
|
51
|
+
RestartReport::RefusedResumeNotReady {
|
|
52
|
+
missing,
|
|
53
|
+
allow_fresh,
|
|
54
|
+
error,
|
|
55
|
+
..
|
|
56
|
+
} => Ok(RestartReport::RefusedResumeAtomicity {
|
|
57
|
+
unresumable: missing
|
|
58
|
+
.into_iter()
|
|
59
|
+
.map(|agent_id| UnresumableWorker {
|
|
60
|
+
agent_id,
|
|
61
|
+
reason: "session_capture_incomplete".to_string(),
|
|
62
|
+
session_id: None,
|
|
63
|
+
first_send_at: None,
|
|
64
|
+
})
|
|
65
|
+
.collect(),
|
|
66
|
+
allow_fresh,
|
|
67
|
+
error,
|
|
68
|
+
}),
|
|
69
|
+
report => Ok(report),
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
pub fn restart_with_transport_with_session_convergence_deadline(
|
|
74
|
+
workspace: &Path,
|
|
75
|
+
allow_fresh: bool,
|
|
76
|
+
team: Option<&str>,
|
|
77
|
+
transport: &dyn crate::transport::Transport,
|
|
78
|
+
session_converge_deadline_ms: Option<u64>,
|
|
79
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
80
|
+
if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
|
|
81
|
+
return Err(LifecycleError::TeamSelect(format!(
|
|
82
|
+
"missing spec for restart: {}",
|
|
83
|
+
workspace.join("team.spec.yaml").display()
|
|
84
|
+
)));
|
|
85
|
+
}
|
|
34
86
|
let run_candidate = crate::model::paths::canonical_run_workspace(workspace)
|
|
35
87
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
36
88
|
if !workspace.join("team.spec.yaml").exists()
|
|
@@ -47,7 +99,7 @@ pub fn restart_with_transport(
|
|
|
47
99
|
crate::state::selector::SelectorMode::RequireSpec,
|
|
48
100
|
)
|
|
49
101
|
.map_err(|e| LifecycleError::TeamSelect(e.to_string()))?;
|
|
50
|
-
let state = selected.state;
|
|
102
|
+
let mut state = selected.state;
|
|
51
103
|
crate::lifecycle::launch::ensure_owner_allowed_for_state(&state, None)?;
|
|
52
104
|
let spec_workspace = selected
|
|
53
105
|
.spec_workspace
|
|
@@ -55,8 +107,57 @@ pub fn restart_with_transport(
|
|
|
55
107
|
.ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
|
|
56
108
|
let spec = load_team_spec(spec_workspace)?;
|
|
57
109
|
let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
|
|
110
|
+
let mut convergence = converge_missing_provider_sessions(
|
|
111
|
+
&mut state,
|
|
112
|
+
session_convergence_deadline(session_converge_deadline_ms),
|
|
113
|
+
session_convergence_poll_interval(),
|
|
114
|
+
&selected.run_workspace,
|
|
115
|
+
allow_fresh,
|
|
116
|
+
)?;
|
|
117
|
+
if convergence.converged && convergence.changed {
|
|
118
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
119
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
120
|
+
}
|
|
121
|
+
if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
|
|
122
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
123
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
124
|
+
let missing_after_repair = restart_required_missing_session_agent_ids(&state);
|
|
125
|
+
convergence.changed = true;
|
|
126
|
+
convergence.converged = missing_after_repair.is_empty();
|
|
127
|
+
convergence.missing = missing_after_repair;
|
|
128
|
+
}
|
|
129
|
+
if !convergence.converged && !allow_fresh {
|
|
130
|
+
return Ok(RestartReport::RefusedResumeNotReady {
|
|
131
|
+
missing: convergence
|
|
132
|
+
.missing
|
|
133
|
+
.iter()
|
|
134
|
+
.map(|agent_id| AgentId::new(agent_id.clone()))
|
|
135
|
+
.collect(),
|
|
136
|
+
allow_fresh,
|
|
137
|
+
deadline: convergence.deadline,
|
|
138
|
+
elapsed: convergence.elapsed,
|
|
139
|
+
error: "resume_not_ready: session_capture_incomplete".to_string(),
|
|
140
|
+
});
|
|
141
|
+
}
|
|
142
|
+
if !convergence.converged && convergence.changed {
|
|
143
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
144
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
145
|
+
}
|
|
146
|
+
let forced_fresh_missing = if convergence.converged {
|
|
147
|
+
std::collections::BTreeSet::new()
|
|
148
|
+
} else {
|
|
149
|
+
convergence.missing.iter().cloned().collect()
|
|
150
|
+
};
|
|
151
|
+
let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
|
|
58
152
|
let plan = classify_restart_plan(&state, allow_fresh)?;
|
|
59
|
-
write_restart_resume_decision_events(
|
|
153
|
+
write_restart_resume_decision_events(
|
|
154
|
+
&selected.run_workspace,
|
|
155
|
+
&state,
|
|
156
|
+
allow_fresh,
|
|
157
|
+
&plan.decisions,
|
|
158
|
+
&forced_fresh_missing,
|
|
159
|
+
forced_fresh_convergence.as_ref(),
|
|
160
|
+
)?;
|
|
60
161
|
if !plan.corrupt_entries.is_empty() {
|
|
61
162
|
return Ok(RestartReport::RefusedInvalidFirstSendAt {
|
|
62
163
|
invalid: plan.corrupt_entries,
|
|
@@ -76,8 +177,13 @@ pub fn restart_with_transport(
|
|
|
76
177
|
transport
|
|
77
178
|
.kill_session(&session_name)
|
|
78
179
|
.map_err(|e| LifecycleError::Transport(e.to_string()))?;
|
|
180
|
+
mark_leader_receiver_rebind_required(&mut state, &session_name);
|
|
181
|
+
mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
|
|
182
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
183
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
79
184
|
}
|
|
80
|
-
|
|
185
|
+
let mut last_spawned: Option<AgentId> = None;
|
|
186
|
+
for decision in &plan.decisions {
|
|
81
187
|
let agent = state
|
|
82
188
|
.get("agents")
|
|
83
189
|
.and_then(|v| v.get(decision.agent_id.as_str()))
|
|
@@ -86,23 +192,49 @@ pub fn restart_with_transport(
|
|
|
86
192
|
"agent {} not found for restart",
|
|
87
193
|
decision.agent_id
|
|
88
194
|
))
|
|
89
|
-
})
|
|
195
|
+
})?
|
|
196
|
+
.clone();
|
|
90
197
|
let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
|
|
91
198
|
decision.session_id.as_ref()
|
|
92
199
|
} else {
|
|
93
200
|
None
|
|
94
201
|
};
|
|
95
|
-
let
|
|
202
|
+
let session_live = session_live_or_default(transport, &session_name, false);
|
|
203
|
+
if !session_live {
|
|
204
|
+
if let Some(previous) = &last_spawned {
|
|
205
|
+
return Err(LifecycleError::Transport(format!(
|
|
206
|
+
"session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
|
|
207
|
+
previous,
|
|
208
|
+
session_name.as_str(),
|
|
209
|
+
decision.agent_id
|
|
210
|
+
)));
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
let spawn = spawn_agent_window(
|
|
96
214
|
&selected.run_workspace,
|
|
97
215
|
&session_name,
|
|
98
216
|
&decision.agent_id,
|
|
99
|
-
agent,
|
|
217
|
+
&agent,
|
|
100
218
|
session_id,
|
|
101
|
-
|
|
219
|
+
session_live,
|
|
102
220
|
transport,
|
|
103
221
|
Some(&safety),
|
|
222
|
+
Some(spec_workspace),
|
|
104
223
|
)?;
|
|
224
|
+
verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
|
|
225
|
+
mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
|
|
226
|
+
last_spawned = Some(decision.agent_id.clone());
|
|
227
|
+
if let Some(agent) = state
|
|
228
|
+
.get_mut("agents")
|
|
229
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
230
|
+
.and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
|
|
231
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
232
|
+
{
|
|
233
|
+
persist_effective_approval_policy_for_restart(agent, &safety);
|
|
234
|
+
}
|
|
105
235
|
}
|
|
236
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
237
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
106
238
|
let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
|
|
107
239
|
Ok(RestartReport::Restarted {
|
|
108
240
|
session_name,
|
|
@@ -111,13 +243,302 @@ pub fn restart_with_transport(
|
|
|
111
243
|
})
|
|
112
244
|
}
|
|
113
245
|
|
|
246
|
+
fn repair_resume_sessions_from_event_log(
|
|
247
|
+
workspace: &Path,
|
|
248
|
+
state: &mut serde_json::Value,
|
|
249
|
+
) -> Result<bool, LifecycleError> {
|
|
250
|
+
let agent_ids = state
|
|
251
|
+
.get("agents")
|
|
252
|
+
.and_then(serde_json::Value::as_object)
|
|
253
|
+
.map(|agents| agents.keys().cloned().collect::<Vec<_>>())
|
|
254
|
+
.unwrap_or_default();
|
|
255
|
+
let mut changed = false;
|
|
256
|
+
for agent_id in agent_ids {
|
|
257
|
+
let previous = state
|
|
258
|
+
.get("agents")
|
|
259
|
+
.and_then(|agents| agents.get(&agent_id))
|
|
260
|
+
.cloned()
|
|
261
|
+
.unwrap_or(serde_json::Value::Null);
|
|
262
|
+
if previous
|
|
263
|
+
.get("session_id")
|
|
264
|
+
.and_then(serde_json::Value::as_str)
|
|
265
|
+
.is_some_and(|session| !session.is_empty())
|
|
266
|
+
{
|
|
267
|
+
continue;
|
|
268
|
+
}
|
|
269
|
+
let Some(provider) = previous
|
|
270
|
+
.get("provider")
|
|
271
|
+
.and_then(serde_json::Value::as_str)
|
|
272
|
+
.and_then(parse_provider)
|
|
273
|
+
else {
|
|
274
|
+
continue;
|
|
275
|
+
};
|
|
276
|
+
let auth_mode = previous
|
|
277
|
+
.get("auth_mode")
|
|
278
|
+
.and_then(serde_json::Value::as_str)
|
|
279
|
+
.and_then(parse_auth_mode)
|
|
280
|
+
.unwrap_or(AuthMode::Subscription);
|
|
281
|
+
let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
|
|
282
|
+
let adapter = crate::provider::get_adapter(provider);
|
|
283
|
+
let repaired = crate::session_capture::recover_resume_session_from_events(
|
|
284
|
+
workspace,
|
|
285
|
+
&agent_id,
|
|
286
|
+
&previous,
|
|
287
|
+
adapter.as_ref(),
|
|
288
|
+
auth_mode,
|
|
289
|
+
&exclude_session_ids,
|
|
290
|
+
)
|
|
291
|
+
.map_err(|e| LifecycleError::Provider(e.to_string()))?;
|
|
292
|
+
let Some(repaired) = repaired else {
|
|
293
|
+
continue;
|
|
294
|
+
};
|
|
295
|
+
let old_session_id = previous
|
|
296
|
+
.get("session_id")
|
|
297
|
+
.and_then(serde_json::Value::as_str)
|
|
298
|
+
.filter(|session| !session.is_empty())
|
|
299
|
+
.map(str::to_string);
|
|
300
|
+
let session_id = repaired
|
|
301
|
+
.get("session_id")
|
|
302
|
+
.and_then(serde_json::Value::as_str)
|
|
303
|
+
.filter(|session| !session.is_empty())
|
|
304
|
+
.map(str::to_string);
|
|
305
|
+
let rollout_path = repaired
|
|
306
|
+
.get("rollout_path")
|
|
307
|
+
.and_then(serde_json::Value::as_str)
|
|
308
|
+
.filter(|path| !path.is_empty())
|
|
309
|
+
.map(str::to_string);
|
|
310
|
+
if let Some(agent) = state
|
|
311
|
+
.get_mut("agents")
|
|
312
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
313
|
+
.and_then(|agents| agents.get_mut(&agent_id))
|
|
314
|
+
{
|
|
315
|
+
*agent = repaired.clone();
|
|
316
|
+
}
|
|
317
|
+
crate::event_log::EventLog::new(workspace)
|
|
318
|
+
.write(
|
|
319
|
+
"resume.session_repaired",
|
|
320
|
+
serde_json::json!({
|
|
321
|
+
"agent_id": agent_id,
|
|
322
|
+
"provider": provider_wire(provider),
|
|
323
|
+
"old_session_id": old_session_id,
|
|
324
|
+
"session_id": session_id,
|
|
325
|
+
"rollout_path": rollout_path,
|
|
326
|
+
"captured_via": "event_log_repair",
|
|
327
|
+
"attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
|
|
328
|
+
}),
|
|
329
|
+
)
|
|
330
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
331
|
+
changed = true;
|
|
332
|
+
}
|
|
333
|
+
Ok(changed)
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
fn claimed_session_ids_except(
|
|
337
|
+
state: &serde_json::Value,
|
|
338
|
+
current_agent_id: &str,
|
|
339
|
+
) -> std::collections::BTreeSet<String> {
|
|
340
|
+
state
|
|
341
|
+
.get("agents")
|
|
342
|
+
.and_then(serde_json::Value::as_object)
|
|
343
|
+
.map(|agents| {
|
|
344
|
+
agents
|
|
345
|
+
.iter()
|
|
346
|
+
.filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
|
|
347
|
+
.filter_map(|(_, agent)| {
|
|
348
|
+
agent
|
|
349
|
+
.get("session_id")
|
|
350
|
+
.and_then(serde_json::Value::as_str)
|
|
351
|
+
.filter(|session| !session.is_empty())
|
|
352
|
+
.map(str::to_string)
|
|
353
|
+
})
|
|
354
|
+
.collect()
|
|
355
|
+
})
|
|
356
|
+
.unwrap_or_default()
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
|
|
360
|
+
if let Some(ms) = requested_ms {
|
|
361
|
+
return std::time::Duration::from_millis(ms);
|
|
362
|
+
}
|
|
363
|
+
env_duration_ms(
|
|
364
|
+
&[
|
|
365
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
|
|
366
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
367
|
+
"TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
|
|
368
|
+
"TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
|
|
369
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
|
|
370
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
371
|
+
"TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
|
|
372
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
|
|
373
|
+
"TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
|
|
374
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
|
|
375
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
376
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
377
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
378
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
379
|
+
],
|
|
380
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
|
|
381
|
+
)
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
fn session_convergence_poll_interval() -> std::time::Duration {
|
|
385
|
+
env_duration_ms(
|
|
386
|
+
&[
|
|
387
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
|
|
388
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
|
|
389
|
+
"TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
|
|
390
|
+
"TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
|
|
391
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
|
|
392
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
|
|
393
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
|
|
394
|
+
],
|
|
395
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
|
|
396
|
+
)
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
|
|
400
|
+
let ms = names
|
|
401
|
+
.iter()
|
|
402
|
+
.find_map(|name| {
|
|
403
|
+
std::env::var(name)
|
|
404
|
+
.ok()
|
|
405
|
+
.and_then(|value| parse_duration_value_ms(&value))
|
|
406
|
+
.or_else(|| {
|
|
407
|
+
name.strip_suffix("_MS").and_then(|prefix| {
|
|
408
|
+
std::env::var(prefix)
|
|
409
|
+
.ok()
|
|
410
|
+
.and_then(|value| parse_duration_value_seconds_ms(&value))
|
|
411
|
+
})
|
|
412
|
+
})
|
|
413
|
+
})
|
|
414
|
+
.unwrap_or(default_ms);
|
|
415
|
+
std::time::Duration::from_millis(ms)
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
/// Parses an environment value expressed in whole milliseconds.
///
/// Surrounding whitespace (including a trailing newline) is ignored, so a value
/// such as `"500 "` is honoured rather than silently falling back to the default.
/// Returns `None` when the trimmed text is not a valid `u64`.
fn parse_duration_value_ms(value: &str) -> Option<u64> {
    value.trim().parse::<u64>().ok()
}
|
|
421
|
+
|
|
422
|
+
/// Parses an environment value expressed in seconds and converts it to milliseconds.
///
/// Surrounding whitespace (including a trailing newline) is ignored. Fractional
/// seconds are accepted and the result is rounded to the nearest millisecond.
/// Returns `None` when the text is not a number, is negative, or is not finite
/// (`nan`, `inf`).
fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
    let seconds = value.trim().parse::<f64>().ok()?;
    if !seconds.is_finite() || seconds < 0.0 {
        return None;
    }
    // Float-to-integer `as` saturates, so absurdly large inputs clamp to `u64::MAX`.
    Some((seconds * 1000.0).round() as u64)
}
|
|
430
|
+
|
|
431
|
+
/// Post-spawn liveness hook; currently a no-op.
///
/// All three parameters are unused (underscore-prefixed) and the function
/// always returns `Ok(())`, so no liveness check is performed here.
// NOTE(review): the name suggests a real check was intended — confirm whether
// this stub is deliberate before relying on it to detect a dead spawn.
fn verify_spawned_agent_live(
|
|
432
|
+
_agent_id: &AgentId,
|
|
433
|
+
_spawn: &SpawnedAgentWindow,
|
|
434
|
+
_transport: &dyn crate::transport::Transport,
|
|
435
|
+
) -> Result<(), LifecycleError> {
|
|
436
|
+
Ok(())
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
/// Flags the leader receiver as needing a rebind for `session_name`.
///
/// Does nothing when `state["leader_receiver"]` is not an object, or when its
/// `session_name` is a string that differs from `session_name` (a missing or
/// non-string `session_name` counts as matching). Otherwise, a `status` of
/// `"attached"` is replaced with `"rebind_required"`; any other status is left
/// untouched.
fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
    let receiver = match state.get_mut("leader_receiver") {
        Some(serde_json::Value::Object(fields)) => fields,
        _ => return,
    };

    // A receiver explicitly bound to another session is not ours to touch.
    if let Some(bound_session) = receiver.get("session_name").and_then(|v| v.as_str()) {
        if bound_session != session_name.as_str() {
            return;
        }
    }

    let currently_attached = matches!(
        receiver.get("status").and_then(|v| v.as_str()),
        Some("attached")
    );
    if currently_attached {
        receiver.insert(
            "status".to_string(),
            serde_json::json!("rebind_required"),
        );
    }
}
|
|
465
|
+
|
|
466
|
+
fn mark_restart_targets_stopped_after_teardown(
|
|
467
|
+
state: &mut serde_json::Value,
|
|
468
|
+
decisions: &[RestartedAgent],
|
|
469
|
+
) {
|
|
470
|
+
let Some(agents) = state
|
|
471
|
+
.get_mut("agents")
|
|
472
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
473
|
+
else {
|
|
474
|
+
return;
|
|
475
|
+
};
|
|
476
|
+
for decision in decisions {
|
|
477
|
+
let Some(agent) = agents
|
|
478
|
+
.get_mut(decision.agent_id.as_str())
|
|
479
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
480
|
+
else {
|
|
481
|
+
continue;
|
|
482
|
+
};
|
|
483
|
+
agent.insert("status".to_string(), serde_json::json!("stopped"));
|
|
484
|
+
agent.remove("pane_id");
|
|
485
|
+
agent.remove("pane_pid");
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
fn mark_agent_respawned(
|
|
490
|
+
state: &mut serde_json::Value,
|
|
491
|
+
agent_id: &AgentId,
|
|
492
|
+
spawn: &SpawnedAgentWindow,
|
|
493
|
+
transport: &dyn crate::transport::Transport,
|
|
494
|
+
safety: &DangerousApproval,
|
|
495
|
+
) -> Result<(), LifecycleError> {
|
|
496
|
+
let Some(agent) = state
|
|
497
|
+
.get_mut("agents")
|
|
498
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
499
|
+
.and_then(|agents| agents.get_mut(agent_id.as_str()))
|
|
500
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
501
|
+
else {
|
|
502
|
+
return Err(LifecycleError::StatePersist(format!(
|
|
503
|
+
"agent {} state is not an object",
|
|
504
|
+
agent_id
|
|
505
|
+
)));
|
|
506
|
+
};
|
|
507
|
+
agent.insert("status".to_string(), serde_json::json!("running"));
|
|
508
|
+
agent.insert(
|
|
509
|
+
"pane_id".to_string(),
|
|
510
|
+
serde_json::json!(spawn.spawn.pane_id.as_str()),
|
|
511
|
+
);
|
|
512
|
+
let pane_pid = spawn.spawn.child_pid.or_else(|| {
|
|
513
|
+
transport
|
|
514
|
+
.list_targets()
|
|
515
|
+
.unwrap_or_default()
|
|
516
|
+
.into_iter()
|
|
517
|
+
.find(|pane| pane.pane_id == spawn.spawn.pane_id)
|
|
518
|
+
.and_then(|pane| pane.pane_pid)
|
|
519
|
+
});
|
|
520
|
+
if let Some(pane_pid) = pane_pid {
|
|
521
|
+
agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
|
|
522
|
+
}
|
|
523
|
+
crate::lifecycle::launch::persist_command_plan_state(
|
|
524
|
+
agent,
|
|
525
|
+
&spawn.plan,
|
|
526
|
+
&spawn.profile_launch,
|
|
527
|
+
);
|
|
528
|
+
persist_effective_approval_policy_for_restart(agent, safety);
|
|
529
|
+
agent.remove("startup_prompts");
|
|
530
|
+
agent.remove("startup_prompt_status");
|
|
531
|
+
Ok(())
|
|
532
|
+
}
|
|
533
|
+
|
|
114
534
|
fn write_restart_resume_decision_events(
|
|
115
535
|
workspace: &Path,
|
|
116
536
|
state: &serde_json::Value,
|
|
117
537
|
allow_fresh: bool,
|
|
118
538
|
decisions: &[RestartedAgent],
|
|
539
|
+
forced_fresh_missing: &std::collections::BTreeSet<String>,
|
|
540
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
119
541
|
) -> Result<(), LifecycleError> {
|
|
120
|
-
let log = crate::event_log::EventLog::new(workspace);
|
|
121
542
|
for decision in decisions {
|
|
122
543
|
let agent = state
|
|
123
544
|
.get("agents")
|
|
@@ -134,23 +555,78 @@ fn write_restart_resume_decision_events(
|
|
|
134
555
|
ResumeDecision::FreshStart => "fresh_start",
|
|
135
556
|
ResumeDecision::Refuse => "refuse",
|
|
136
557
|
};
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
}),
|
|
148
|
-
)
|
|
149
|
-
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
558
|
+
write_restart_resume_decision_event(
|
|
559
|
+
workspace,
|
|
560
|
+
decision.agent_id.as_str(),
|
|
561
|
+
first_send_at,
|
|
562
|
+
session_id,
|
|
563
|
+
allow_fresh,
|
|
564
|
+
decision_wire,
|
|
565
|
+
forced_fresh_missing.contains(decision.agent_id.as_str()),
|
|
566
|
+
forced_fresh_convergence,
|
|
567
|
+
)?;
|
|
150
568
|
}
|
|
151
569
|
Ok(())
|
|
152
570
|
}
|
|
153
571
|
|
|
572
|
+
fn write_restart_resume_decision_event(
|
|
573
|
+
workspace: &Path,
|
|
574
|
+
worker_id: &str,
|
|
575
|
+
first_send_at: Option<String>,
|
|
576
|
+
session_id: Option<String>,
|
|
577
|
+
allow_fresh: bool,
|
|
578
|
+
decision: &str,
|
|
579
|
+
forced_fresh: bool,
|
|
580
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
581
|
+
) -> Result<(), LifecycleError> {
|
|
582
|
+
use std::io::Write as _;
|
|
583
|
+
|
|
584
|
+
let path = workspace.join(".team").join("logs").join("events.jsonl");
|
|
585
|
+
if let Some(parent) = path.parent() {
|
|
586
|
+
std::fs::create_dir_all(parent)
|
|
587
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
588
|
+
}
|
|
589
|
+
let mut event = serde_json::json!({
|
|
590
|
+
"ts": chrono::Utc::now().to_rfc3339(),
|
|
591
|
+
"event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
|
|
592
|
+
"worker_id": worker_id,
|
|
593
|
+
"has_first_send_at": first_send_at.is_some(),
|
|
594
|
+
"has_session_id": session_id.is_some(),
|
|
595
|
+
"allow_fresh": allow_fresh,
|
|
596
|
+
"decision": decision,
|
|
597
|
+
"first_send_at": first_send_at,
|
|
598
|
+
"session_id": session_id,
|
|
599
|
+
});
|
|
600
|
+
if forced_fresh {
|
|
601
|
+
if let Some(event) = event.as_object_mut() {
|
|
602
|
+
event.insert("forced_fresh".to_string(), serde_json::json!(true));
|
|
603
|
+
event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
|
|
604
|
+
if let Some(convergence) = forced_fresh_convergence {
|
|
605
|
+
event.insert(
|
|
606
|
+
"session_convergence".to_string(),
|
|
607
|
+
serde_json::json!({
|
|
608
|
+
"complete": false,
|
|
609
|
+
"deadline_s": convergence.deadline.as_secs_f64(),
|
|
610
|
+
"deadline_ms": convergence.deadline.as_millis(),
|
|
611
|
+
"elapsed_ms": convergence.elapsed.as_millis(),
|
|
612
|
+
"pending_agent_ids": convergence.missing.clone(),
|
|
613
|
+
}),
|
|
614
|
+
);
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
let line = serde_json::to_string(&event)
|
|
619
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
620
|
+
let mut file = std::fs::OpenOptions::new()
|
|
621
|
+
.create(true)
|
|
622
|
+
.append(true)
|
|
623
|
+
.open(&path)
|
|
624
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
625
|
+
file.write_all(line.as_bytes())
|
|
626
|
+
.and_then(|_| file.write_all(b"\n"))
|
|
627
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))
|
|
628
|
+
}
|
|
629
|
+
|
|
154
630
|
/// `restart_candidates(workspace)`(`restart/selection.py:12`)。从 snapshot + active
|
|
155
631
|
/// state 收集可重启 team。
|
|
156
632
|
pub fn restart_candidates(workspace: &Path) -> Result<Vec<RestartCandidate>, LifecycleError> {
|
|
@@ -169,7 +169,11 @@ fn remove_agent_inner(
|
|
|
169
169
|
// (team projection) — NOT a raw save, so other teams in a multi-team workspace are preserved.
|
|
170
170
|
let mut removed_state = working_state;
|
|
171
171
|
remove_agent_from_state(&mut removed_state, agent_id)?;
|
|
172
|
-
crate::state::projection::
|
|
172
|
+
crate::state::projection::save_team_scoped_state_with_deleted_agents(
|
|
173
|
+
paths.run_workspace,
|
|
174
|
+
&removed_state,
|
|
175
|
+
&[agent_id.as_str()],
|
|
176
|
+
)
|
|
173
177
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
174
178
|
cleared_locations.push(serde_json::json!("state.json:agents"));
|
|
175
179
|
write_remove_step_event(
|
|
@@ -226,6 +230,7 @@ fn remove_agent_inner(
|
|
|
226
230
|
"agent_health",
|
|
227
231
|
None,
|
|
228
232
|
)?;
|
|
233
|
+
maybe_fail_remove_after_agent_health_delete()?;
|
|
229
234
|
Ok(RemoveSuccess {
|
|
230
235
|
outcome: RemoveAgentOutcome::Removed {
|
|
231
236
|
agent_id: agent_id.clone(),
|
|
@@ -581,15 +586,16 @@ fn select_agent_health(
|
|
|
581
586
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
582
587
|
let row = conn
|
|
583
588
|
.query_row(
|
|
584
|
-
"select status, last_output_at, context_usage_pct, current_task_id \
|
|
589
|
+
"select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
|
|
585
590
|
from agent_health where agent_id = ?1",
|
|
586
591
|
[agent_id.as_str()],
|
|
587
592
|
|r| {
|
|
588
593
|
Ok(CapturedHealth {
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
594
|
+
owner_team_id: r.get::<_, Option<String>>(0)?,
|
|
595
|
+
status: r.get::<_, Option<String>>(1)?,
|
|
596
|
+
last_output_at: r.get::<_, Option<String>>(2)?,
|
|
597
|
+
context_usage_pct: r.get::<_, Option<i64>>(3)?,
|
|
598
|
+
current_task_id: r.get::<_, Option<String>>(4)?,
|
|
593
599
|
})
|
|
594
600
|
},
|
|
595
601
|
)
|
|
@@ -618,8 +624,9 @@ fn restore_agent_health(
|
|
|
618
624
|
// health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
|
|
619
625
|
conn.execute(
|
|
620
626
|
"insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
|
|
621
|
-
values (
|
|
627
|
+
values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
|
|
622
628
|
rusqlite::params![
|
|
629
|
+
row.owner_team_id,
|
|
623
630
|
agent_id.as_str(),
|
|
624
631
|
status,
|
|
625
632
|
row.last_output_at,
|
|
@@ -634,12 +641,25 @@ fn restore_agent_health(
|
|
|
634
641
|
|
|
635
642
|
/// Snapshot of one agent's `agent_health` row, with every column optional.
///
/// `select_agent_health` fills the fields from the `owner_team_id`, `status`,
/// `last_output_at`, `context_usage_pct` and `current_task_id` columns, and
/// `restore_agent_health` re-inserts values from it.
#[derive(Clone)]
|
|
636
643
|
struct CapturedHealth {
|
|
644
|
+
owner_team_id: Option<String>,
|
|
637
645
|
status: Option<String>,
|
|
638
646
|
last_output_at: Option<String>,
|
|
639
647
|
context_usage_pct: Option<i64>,
|
|
640
648
|
current_task_id: Option<String>,
|
|
641
649
|
}
|
|
642
650
|
|
|
651
|
+
fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
|
|
652
|
+
let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
|
|
653
|
+
return Ok(());
|
|
654
|
+
};
|
|
655
|
+
if reason.is_empty() {
|
|
656
|
+
return Ok(());
|
|
657
|
+
}
|
|
658
|
+
Err(LifecycleError::StatePersist(format!(
|
|
659
|
+
"injected remove failure after agent_health delete: {reason}"
|
|
660
|
+
)))
|
|
661
|
+
}
|
|
662
|
+
|
|
643
663
|
struct RemoveRollback {
|
|
644
664
|
agent_id: AgentId,
|
|
645
665
|
spec_text: Option<String>,
|