@team-agent/installer 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +34 -1
- package/Cargo.toml +1 -1
- package/crates/team-agent/Cargo.toml +1 -1
- package/crates/team-agent/src/cli/adapters.rs +196 -19
- package/crates/team-agent/src/cli/diagnose.rs +144 -10
- package/crates/team-agent/src/cli/emit.rs +286 -52
- package/crates/team-agent/src/cli/leader.rs +37 -8
- package/crates/team-agent/src/cli/mod.rs +799 -316
- package/crates/team-agent/src/cli/status_port.rs +25 -2
- package/crates/team-agent/src/cli/tests/divergence.rs +1 -2
- package/crates/team-agent/src/cli/tests/lane_c.rs +23 -13
- package/crates/team-agent/src/cli/tests/main_preserved.rs +2 -0
- package/crates/team-agent/src/cli/tests/run_delegation.rs +57 -3
- package/crates/team-agent/src/cli/types.rs +17 -0
- package/crates/team-agent/src/compiler.rs +15 -5
- package/crates/team-agent/src/coordinator/health.rs +89 -20
- package/crates/team-agent/src/coordinator/mod.rs +4 -0
- package/crates/team-agent/src/coordinator/runtime_detectors.rs +500 -0
- package/crates/team-agent/src/coordinator/runtime_observation.rs +58 -0
- package/crates/team-agent/src/coordinator/tick.rs +222 -69
- package/crates/team-agent/src/coordinator/types.rs +15 -3
- package/crates/team-agent/src/db/schema.rs +37 -2
- package/crates/team-agent/src/diagnose/comms.rs +226 -0
- package/crates/team-agent/src/diagnose/mod.rs +45 -0
- package/crates/team-agent/src/diagnose/orphans.rs +658 -0
- package/crates/team-agent/src/fake_worker.rs +146 -3
- package/crates/team-agent/src/leader/start.rs +121 -23
- package/crates/team-agent/src/leader/types.rs +44 -1
- package/crates/team-agent/src/lib.rs +3 -0
- package/crates/team-agent/src/lifecycle/display.rs +645 -47
- package/crates/team-agent/src/lifecycle/launch.rs +818 -116
- package/crates/team-agent/src/lifecycle/mod.rs +2 -0
- package/crates/team-agent/src/lifecycle/profile_launch.rs +810 -0
- package/crates/team-agent/src/lifecycle/profile_smoke.rs +522 -0
- package/crates/team-agent/src/lifecycle/restart/agent.rs +99 -23
- package/crates/team-agent/src/lifecycle/restart/common.rs +177 -83
- package/crates/team-agent/src/lifecycle/restart/rebuild.rs +443 -9
- package/crates/team-agent/src/lifecycle/restart/remove.rs +22 -6
- package/crates/team-agent/src/lifecycle/restart/team_state.rs +19 -0
- package/crates/team-agent/src/lifecycle/restart.rs +4 -1
- package/crates/team-agent/src/lifecycle/tests/lane_ops.rs +5 -5
- package/crates/team-agent/src/lifecycle/tests/launch_spawn.rs +37 -7
- package/crates/team-agent/src/lifecycle/types.rs +19 -0
- package/crates/team-agent/src/mcp_server/helpers.rs +1 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/agent_ops.rs +341 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/mod.rs +10 -0
- package/crates/team-agent/src/mcp_server/lifecycle_tools/state_status.rs +158 -0
- package/crates/team-agent/src/mcp_server/mod.rs +3 -74
- package/crates/team-agent/src/mcp_server/tests/scoped.rs +1 -1
- package/crates/team-agent/src/mcp_server/tests/send.rs +6 -5
- package/crates/team-agent/src/mcp_server/tools.rs +312 -111
- package/crates/team-agent/src/mcp_server/types.rs +6 -4
- package/crates/team-agent/src/mcp_server/wire.rs +19 -7
- package/crates/team-agent/src/message_store.rs +21 -4
- package/crates/team-agent/src/messaging/delivery.rs +87 -37
- package/crates/team-agent/src/messaging/mod.rs +9 -6
- package/crates/team-agent/src/messaging/results.rs +153 -16
- package/crates/team-agent/src/messaging/selftest.rs +199 -12
- package/crates/team-agent/src/messaging/send.rs +35 -3
- package/crates/team-agent/src/messaging/tests/runtime.rs +19 -4
- package/crates/team-agent/src/messaging/types.rs +11 -3
- package/crates/team-agent/src/os_probe.rs +119 -0
- package/crates/team-agent/src/packaging/migrate.rs +10 -2
- package/crates/team-agent/src/packaging/tests.rs +23 -0
- package/crates/team-agent/src/provider/adapter.rs +483 -67
- package/crates/team-agent/src/provider/approvals/runtime_prompts.rs +1 -7
- package/crates/team-agent/src/provider/classify.rs +51 -4
- package/crates/team-agent/src/provider/startup_prompt.rs +94 -0
- package/crates/team-agent/src/provider/types.rs +47 -0
- package/crates/team-agent/src/session_capture.rs +616 -0
- package/crates/team-agent/src/state/persist.rs +57 -0
- package/crates/team-agent/src/state/projection.rs +32 -23
- package/crates/team-agent/src/state/selector.rs +5 -2
- package/crates/team-agent/src/tmux_backend.rs +97 -60
- package/crates/team-agent/src/transport/test_support.rs +9 -0
- package/crates/team-agent/src/transport/tests/wire.rs +4 -0
- package/crates/team-agent/src/transport.rs +13 -2
- package/package.json +4 -4
|
@@ -12,13 +12,23 @@ pub fn restart(
|
|
|
12
12
|
workspace: &Path,
|
|
13
13
|
allow_fresh: bool,
|
|
14
14
|
team: Option<&str>,
|
|
15
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
16
|
+
restart_with_session_convergence_deadline(workspace, allow_fresh, team, None)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
pub fn restart_with_session_convergence_deadline(
|
|
20
|
+
workspace: &Path,
|
|
21
|
+
allow_fresh: bool,
|
|
22
|
+
team: Option<&str>,
|
|
23
|
+
session_converge_deadline_ms: Option<u64>,
|
|
15
24
|
) -> Result<RestartReport, LifecycleError> {
|
|
16
25
|
let run_ws = lifecycle_run_workspace(workspace)?;
|
|
17
|
-
|
|
26
|
+
restart_with_transport_with_session_convergence_deadline(
|
|
18
27
|
workspace,
|
|
19
28
|
allow_fresh,
|
|
20
29
|
team,
|
|
21
30
|
&crate::tmux_backend::TmuxBackend::for_workspace(&run_ws),
|
|
31
|
+
session_converge_deadline_ms,
|
|
22
32
|
)
|
|
23
33
|
}
|
|
24
34
|
|
|
@@ -30,6 +40,42 @@ pub fn restart_with_transport(
|
|
|
30
40
|
allow_fresh: bool,
|
|
31
41
|
team: Option<&str>,
|
|
32
42
|
transport: &dyn crate::transport::Transport,
|
|
43
|
+
) -> Result<RestartReport, LifecycleError> {
|
|
44
|
+
match restart_with_transport_with_session_convergence_deadline(
|
|
45
|
+
workspace,
|
|
46
|
+
allow_fresh,
|
|
47
|
+
team,
|
|
48
|
+
transport,
|
|
49
|
+
None,
|
|
50
|
+
)? {
|
|
51
|
+
RestartReport::RefusedResumeNotReady {
|
|
52
|
+
missing,
|
|
53
|
+
allow_fresh,
|
|
54
|
+
error,
|
|
55
|
+
..
|
|
56
|
+
} => Ok(RestartReport::RefusedResumeAtomicity {
|
|
57
|
+
unresumable: missing
|
|
58
|
+
.into_iter()
|
|
59
|
+
.map(|agent_id| UnresumableWorker {
|
|
60
|
+
agent_id,
|
|
61
|
+
reason: "session_capture_incomplete".to_string(),
|
|
62
|
+
session_id: None,
|
|
63
|
+
first_send_at: None,
|
|
64
|
+
})
|
|
65
|
+
.collect(),
|
|
66
|
+
allow_fresh,
|
|
67
|
+
error,
|
|
68
|
+
}),
|
|
69
|
+
report => Ok(report),
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
pub fn restart_with_transport_with_session_convergence_deadline(
|
|
74
|
+
workspace: &Path,
|
|
75
|
+
allow_fresh: bool,
|
|
76
|
+
team: Option<&str>,
|
|
77
|
+
transport: &dyn crate::transport::Transport,
|
|
78
|
+
session_converge_deadline_ms: Option<u64>,
|
|
33
79
|
) -> Result<RestartReport, LifecycleError> {
|
|
34
80
|
if crate::lifecycle::restart::input_has_no_local_team_context(workspace) {
|
|
35
81
|
return Err(LifecycleError::TeamSelect(format!(
|
|
@@ -61,12 +107,57 @@ pub fn restart_with_transport(
|
|
|
61
107
|
.ok_or_else(|| LifecycleError::TeamSelect("active team spec workspace not found".to_string()))?;
|
|
62
108
|
let spec = load_team_spec(spec_workspace)?;
|
|
63
109
|
let safety = crate::lifecycle::launch::effective_runtime_config(&spec)?;
|
|
64
|
-
|
|
110
|
+
let mut convergence = converge_missing_provider_sessions(
|
|
111
|
+
&mut state,
|
|
112
|
+
session_convergence_deadline(session_converge_deadline_ms),
|
|
113
|
+
session_convergence_poll_interval(),
|
|
114
|
+
&selected.run_workspace,
|
|
115
|
+
allow_fresh,
|
|
116
|
+
)?;
|
|
117
|
+
if convergence.converged && convergence.changed {
|
|
118
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
119
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
120
|
+
}
|
|
121
|
+
if repair_resume_sessions_from_event_log(&selected.run_workspace, &mut state)? {
|
|
122
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
123
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
124
|
+
let missing_after_repair = restart_required_missing_session_agent_ids(&state);
|
|
125
|
+
convergence.changed = true;
|
|
126
|
+
convergence.converged = missing_after_repair.is_empty();
|
|
127
|
+
convergence.missing = missing_after_repair;
|
|
128
|
+
}
|
|
129
|
+
if !convergence.converged && !allow_fresh {
|
|
130
|
+
return Ok(RestartReport::RefusedResumeNotReady {
|
|
131
|
+
missing: convergence
|
|
132
|
+
.missing
|
|
133
|
+
.iter()
|
|
134
|
+
.map(|agent_id| AgentId::new(agent_id.clone()))
|
|
135
|
+
.collect(),
|
|
136
|
+
allow_fresh,
|
|
137
|
+
deadline: convergence.deadline,
|
|
138
|
+
elapsed: convergence.elapsed,
|
|
139
|
+
error: "resume_not_ready: session_capture_incomplete".to_string(),
|
|
140
|
+
});
|
|
141
|
+
}
|
|
142
|
+
if !convergence.converged && convergence.changed {
|
|
65
143
|
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
66
144
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
67
145
|
}
|
|
146
|
+
let forced_fresh_missing = if convergence.converged {
|
|
147
|
+
std::collections::BTreeSet::new()
|
|
148
|
+
} else {
|
|
149
|
+
convergence.missing.iter().cloned().collect()
|
|
150
|
+
};
|
|
151
|
+
let forced_fresh_convergence = (!convergence.converged).then_some(convergence.clone());
|
|
68
152
|
let plan = classify_restart_plan(&state, allow_fresh)?;
|
|
69
|
-
write_restart_resume_decision_events(
|
|
153
|
+
write_restart_resume_decision_events(
|
|
154
|
+
&selected.run_workspace,
|
|
155
|
+
&state,
|
|
156
|
+
allow_fresh,
|
|
157
|
+
&plan.decisions,
|
|
158
|
+
&forced_fresh_missing,
|
|
159
|
+
forced_fresh_convergence.as_ref(),
|
|
160
|
+
)?;
|
|
70
161
|
if !plan.corrupt_entries.is_empty() {
|
|
71
162
|
return Ok(RestartReport::RefusedInvalidFirstSendAt {
|
|
72
163
|
invalid: plan.corrupt_entries,
|
|
@@ -86,8 +177,13 @@ pub fn restart_with_transport(
|
|
|
86
177
|
transport
|
|
87
178
|
.kill_session(&session_name)
|
|
88
179
|
.map_err(|e| LifecycleError::Transport(e.to_string()))?;
|
|
180
|
+
mark_leader_receiver_rebind_required(&mut state, &session_name);
|
|
181
|
+
mark_restart_targets_stopped_after_teardown(&mut state, &plan.decisions);
|
|
182
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
183
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
89
184
|
}
|
|
90
|
-
|
|
185
|
+
let mut last_spawned: Option<AgentId> = None;
|
|
186
|
+
for decision in &plan.decisions {
|
|
91
187
|
let agent = state
|
|
92
188
|
.get("agents")
|
|
93
189
|
.and_then(|v| v.get(decision.agent_id.as_str()))
|
|
@@ -96,23 +192,49 @@ pub fn restart_with_transport(
|
|
|
96
192
|
"agent {} not found for restart",
|
|
97
193
|
decision.agent_id
|
|
98
194
|
))
|
|
99
|
-
})
|
|
195
|
+
})?
|
|
196
|
+
.clone();
|
|
100
197
|
let session_id = if matches!(decision.restart_mode, StartMode::Resumed) {
|
|
101
198
|
decision.session_id.as_ref()
|
|
102
199
|
} else {
|
|
103
200
|
None
|
|
104
201
|
};
|
|
105
|
-
let
|
|
202
|
+
let session_live = session_live_or_default(transport, &session_name, false);
|
|
203
|
+
if !session_live {
|
|
204
|
+
if let Some(previous) = &last_spawned {
|
|
205
|
+
return Err(LifecycleError::Transport(format!(
|
|
206
|
+
"session_disappeared_after_spawn: provider_resume_exited for {}; session {} disappeared before spawning {}",
|
|
207
|
+
previous,
|
|
208
|
+
session_name.as_str(),
|
|
209
|
+
decision.agent_id
|
|
210
|
+
)));
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
let spawn = spawn_agent_window(
|
|
106
214
|
&selected.run_workspace,
|
|
107
215
|
&session_name,
|
|
108
216
|
&decision.agent_id,
|
|
109
|
-
agent,
|
|
217
|
+
&agent,
|
|
110
218
|
session_id,
|
|
111
|
-
|
|
219
|
+
session_live,
|
|
112
220
|
transport,
|
|
113
221
|
Some(&safety),
|
|
222
|
+
Some(spec_workspace),
|
|
114
223
|
)?;
|
|
224
|
+
verify_spawned_agent_live(&decision.agent_id, &spawn, transport)?;
|
|
225
|
+
mark_agent_respawned(&mut state, &decision.agent_id, &spawn, transport, &safety)?;
|
|
226
|
+
last_spawned = Some(decision.agent_id.clone());
|
|
227
|
+
if let Some(agent) = state
|
|
228
|
+
.get_mut("agents")
|
|
229
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
230
|
+
.and_then(|agents| agents.get_mut(decision.agent_id.as_str()))
|
|
231
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
232
|
+
{
|
|
233
|
+
persist_effective_approval_policy_for_restart(agent, &safety);
|
|
234
|
+
}
|
|
115
235
|
}
|
|
236
|
+
crate::state::projection::save_team_scoped_state(&selected.run_workspace, &state)
|
|
237
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
116
238
|
let coordinator_started = start_coordinator_for_workspace(&selected.run_workspace)?;
|
|
117
239
|
Ok(RestartReport::Restarted {
|
|
118
240
|
session_name,
|
|
@@ -121,11 +243,301 @@ pub fn restart_with_transport(
|
|
|
121
243
|
})
|
|
122
244
|
}
|
|
123
245
|
|
|
246
|
+
fn repair_resume_sessions_from_event_log(
|
|
247
|
+
workspace: &Path,
|
|
248
|
+
state: &mut serde_json::Value,
|
|
249
|
+
) -> Result<bool, LifecycleError> {
|
|
250
|
+
let agent_ids = state
|
|
251
|
+
.get("agents")
|
|
252
|
+
.and_then(serde_json::Value::as_object)
|
|
253
|
+
.map(|agents| agents.keys().cloned().collect::<Vec<_>>())
|
|
254
|
+
.unwrap_or_default();
|
|
255
|
+
let mut changed = false;
|
|
256
|
+
for agent_id in agent_ids {
|
|
257
|
+
let previous = state
|
|
258
|
+
.get("agents")
|
|
259
|
+
.and_then(|agents| agents.get(&agent_id))
|
|
260
|
+
.cloned()
|
|
261
|
+
.unwrap_or(serde_json::Value::Null);
|
|
262
|
+
if previous
|
|
263
|
+
.get("session_id")
|
|
264
|
+
.and_then(serde_json::Value::as_str)
|
|
265
|
+
.is_some_and(|session| !session.is_empty())
|
|
266
|
+
{
|
|
267
|
+
continue;
|
|
268
|
+
}
|
|
269
|
+
let Some(provider) = previous
|
|
270
|
+
.get("provider")
|
|
271
|
+
.and_then(serde_json::Value::as_str)
|
|
272
|
+
.and_then(parse_provider)
|
|
273
|
+
else {
|
|
274
|
+
continue;
|
|
275
|
+
};
|
|
276
|
+
let auth_mode = previous
|
|
277
|
+
.get("auth_mode")
|
|
278
|
+
.and_then(serde_json::Value::as_str)
|
|
279
|
+
.and_then(parse_auth_mode)
|
|
280
|
+
.unwrap_or(AuthMode::Subscription);
|
|
281
|
+
let exclude_session_ids = claimed_session_ids_except(state, &agent_id);
|
|
282
|
+
let adapter = crate::provider::get_adapter(provider);
|
|
283
|
+
let repaired = crate::session_capture::recover_resume_session_from_events(
|
|
284
|
+
workspace,
|
|
285
|
+
&agent_id,
|
|
286
|
+
&previous,
|
|
287
|
+
adapter.as_ref(),
|
|
288
|
+
auth_mode,
|
|
289
|
+
&exclude_session_ids,
|
|
290
|
+
)
|
|
291
|
+
.map_err(|e| LifecycleError::Provider(e.to_string()))?;
|
|
292
|
+
let Some(repaired) = repaired else {
|
|
293
|
+
continue;
|
|
294
|
+
};
|
|
295
|
+
let old_session_id = previous
|
|
296
|
+
.get("session_id")
|
|
297
|
+
.and_then(serde_json::Value::as_str)
|
|
298
|
+
.filter(|session| !session.is_empty())
|
|
299
|
+
.map(str::to_string);
|
|
300
|
+
let session_id = repaired
|
|
301
|
+
.get("session_id")
|
|
302
|
+
.and_then(serde_json::Value::as_str)
|
|
303
|
+
.filter(|session| !session.is_empty())
|
|
304
|
+
.map(str::to_string);
|
|
305
|
+
let rollout_path = repaired
|
|
306
|
+
.get("rollout_path")
|
|
307
|
+
.and_then(serde_json::Value::as_str)
|
|
308
|
+
.filter(|path| !path.is_empty())
|
|
309
|
+
.map(str::to_string);
|
|
310
|
+
if let Some(agent) = state
|
|
311
|
+
.get_mut("agents")
|
|
312
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
313
|
+
.and_then(|agents| agents.get_mut(&agent_id))
|
|
314
|
+
{
|
|
315
|
+
*agent = repaired.clone();
|
|
316
|
+
}
|
|
317
|
+
crate::event_log::EventLog::new(workspace)
|
|
318
|
+
.write(
|
|
319
|
+
"resume.session_repaired",
|
|
320
|
+
serde_json::json!({
|
|
321
|
+
"agent_id": agent_id,
|
|
322
|
+
"provider": provider_wire(provider),
|
|
323
|
+
"old_session_id": old_session_id,
|
|
324
|
+
"session_id": session_id,
|
|
325
|
+
"rollout_path": rollout_path,
|
|
326
|
+
"captured_via": "event_log_repair",
|
|
327
|
+
"attribution_confidence": repaired.get("attribution_confidence").cloned().unwrap_or(serde_json::Value::Null),
|
|
328
|
+
}),
|
|
329
|
+
)
|
|
330
|
+
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
331
|
+
changed = true;
|
|
332
|
+
}
|
|
333
|
+
Ok(changed)
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
fn claimed_session_ids_except(
|
|
337
|
+
state: &serde_json::Value,
|
|
338
|
+
current_agent_id: &str,
|
|
339
|
+
) -> std::collections::BTreeSet<String> {
|
|
340
|
+
state
|
|
341
|
+
.get("agents")
|
|
342
|
+
.and_then(serde_json::Value::as_object)
|
|
343
|
+
.map(|agents| {
|
|
344
|
+
agents
|
|
345
|
+
.iter()
|
|
346
|
+
.filter(|(agent_id, _)| agent_id.as_str() != current_agent_id)
|
|
347
|
+
.filter_map(|(_, agent)| {
|
|
348
|
+
agent
|
|
349
|
+
.get("session_id")
|
|
350
|
+
.and_then(serde_json::Value::as_str)
|
|
351
|
+
.filter(|session| !session.is_empty())
|
|
352
|
+
.map(str::to_string)
|
|
353
|
+
})
|
|
354
|
+
.collect()
|
|
355
|
+
})
|
|
356
|
+
.unwrap_or_default()
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
fn session_convergence_deadline(requested_ms: Option<u64>) -> std::time::Duration {
|
|
360
|
+
if let Some(ms) = requested_ms {
|
|
361
|
+
return std::time::Duration::from_millis(ms);
|
|
362
|
+
}
|
|
363
|
+
env_duration_ms(
|
|
364
|
+
&[
|
|
365
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_DEADLINE_MS",
|
|
366
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
367
|
+
"TEAM_AGENT_RESTART_CAPTURE_DEADLINE_MS",
|
|
368
|
+
"TEAM_AGENT_RESTART_CAPTURE_TIMEOUT_MS",
|
|
369
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_TIMEOUT_MS",
|
|
370
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
371
|
+
"TEAM_AGENT_SESSION_CAPTURE_DEADLINE_MS",
|
|
372
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_DEADLINE_MS",
|
|
373
|
+
"TEAM_AGENT_SESSION_CAPTURE_TIMEOUT_MS",
|
|
374
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_TIMEOUT_MS",
|
|
375
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
376
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
377
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_DEADLINE_MS",
|
|
378
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_TIMEOUT_MS",
|
|
379
|
+
],
|
|
380
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_DEADLINE_MS,
|
|
381
|
+
)
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
fn session_convergence_poll_interval() -> std::time::Duration {
|
|
385
|
+
env_duration_ms(
|
|
386
|
+
&[
|
|
387
|
+
"TEAM_AGENT_RESTART_SESSION_CAPTURE_POLL_MS",
|
|
388
|
+
"TEAM_AGENT_RESTART_SESSION_CONVERGENCE_POLL_MS",
|
|
389
|
+
"TEAM_AGENT_RESTART_CAPTURE_POLL_MS",
|
|
390
|
+
"TEAM_AGENT_SESSION_CAPTURE_POLL_MS",
|
|
391
|
+
"TEAM_AGENT_SESSION_CAPTURE_CONVERGENCE_POLL_MS",
|
|
392
|
+
"TEAM_AGENT_SESSION_CONVERGENCE_POLL_MS",
|
|
393
|
+
"TEAM_AGENT_PROVIDER_SESSION_CONVERGENCE_POLL_MS",
|
|
394
|
+
],
|
|
395
|
+
crate::session_capture::RESTART_SESSION_CONVERGENCE_POLL_MS,
|
|
396
|
+
)
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
fn env_duration_ms(names: &[&str], default_ms: u64) -> std::time::Duration {
|
|
400
|
+
let ms = names
|
|
401
|
+
.iter()
|
|
402
|
+
.find_map(|name| {
|
|
403
|
+
std::env::var(name)
|
|
404
|
+
.ok()
|
|
405
|
+
.and_then(|value| parse_duration_value_ms(&value))
|
|
406
|
+
.or_else(|| {
|
|
407
|
+
name.strip_suffix("_MS").and_then(|prefix| {
|
|
408
|
+
std::env::var(prefix)
|
|
409
|
+
.ok()
|
|
410
|
+
.and_then(|value| parse_duration_value_seconds_ms(&value))
|
|
411
|
+
})
|
|
412
|
+
})
|
|
413
|
+
})
|
|
414
|
+
.unwrap_or(default_ms);
|
|
415
|
+
std::time::Duration::from_millis(ms)
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
/// Parses `value` as a whole number of milliseconds.
///
/// Leading and trailing whitespace is ignored, so values such as `"250\n"`
/// (for example, an environment variable filled from command output) are
/// accepted. Returns `None` for anything that is not a non-negative integer
/// fitting in `u64`.
fn parse_duration_value_ms(value: &str) -> Option<u64> {
    value.trim().parse::<u64>().ok()
}
|
|
421
|
+
|
|
422
|
+
/// Parses `value` as a (possibly fractional) number of seconds and converts it
/// to milliseconds, rounded to the nearest integer.
///
/// Leading and trailing whitespace is ignored. Returns `None` when the text is
/// not a number, or when the number is negative, NaN or infinite.
fn parse_duration_value_seconds_ms(value: &str) -> Option<u64> {
    let seconds = value.trim().parse::<f64>().ok()?;
    // `f64::from_str` accepts "nan" and "inf"; neither is a usable duration.
    if !seconds.is_finite() || seconds < 0.0 {
        return None;
    }
    // Float-to-integer `as` casts saturate, so an absurdly large value clamps
    // to `u64::MAX` instead of wrapping.
    Some((seconds * 1000.0).round() as u64)
}
|
|
430
|
+
|
|
431
|
+
fn verify_spawned_agent_live(
|
|
432
|
+
_agent_id: &AgentId,
|
|
433
|
+
_spawn: &SpawnedAgentWindow,
|
|
434
|
+
_transport: &dyn crate::transport::Transport,
|
|
435
|
+
) -> Result<(), LifecycleError> {
|
|
436
|
+
Ok(())
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
/// Flips an attached leader receiver to `rebind_required`.
///
/// Does nothing when `state["leader_receiver"]` is not an object, or when the
/// receiver records a `session_name` string different from `session_name`. A
/// receiver with no `session_name` string is treated as belonging to this
/// session. The status is only rewritten when it is currently `"attached"`.
fn mark_leader_receiver_rebind_required(state: &mut serde_json::Value, session_name: &SessionName) {
    let receiver = match state
        .get_mut("leader_receiver")
        .and_then(serde_json::Value::as_object_mut)
    {
        Some(receiver) => receiver,
        None => return,
    };
    let bound_to_other_session = receiver
        .get("session_name")
        .and_then(|v| v.as_str())
        .is_some_and(|recorded| recorded != session_name.as_str());
    if bound_to_other_session {
        return;
    }
    let currently_attached = receiver.get("status").and_then(|v| v.as_str()) == Some("attached");
    if currently_attached {
        receiver.insert(
            "status".to_string(),
            serde_json::json!("rebind_required"),
        );
    }
}
|
|
465
|
+
|
|
466
|
+
fn mark_restart_targets_stopped_after_teardown(
|
|
467
|
+
state: &mut serde_json::Value,
|
|
468
|
+
decisions: &[RestartedAgent],
|
|
469
|
+
) {
|
|
470
|
+
let Some(agents) = state
|
|
471
|
+
.get_mut("agents")
|
|
472
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
473
|
+
else {
|
|
474
|
+
return;
|
|
475
|
+
};
|
|
476
|
+
for decision in decisions {
|
|
477
|
+
let Some(agent) = agents
|
|
478
|
+
.get_mut(decision.agent_id.as_str())
|
|
479
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
480
|
+
else {
|
|
481
|
+
continue;
|
|
482
|
+
};
|
|
483
|
+
agent.insert("status".to_string(), serde_json::json!("stopped"));
|
|
484
|
+
agent.remove("pane_id");
|
|
485
|
+
agent.remove("pane_pid");
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
fn mark_agent_respawned(
|
|
490
|
+
state: &mut serde_json::Value,
|
|
491
|
+
agent_id: &AgentId,
|
|
492
|
+
spawn: &SpawnedAgentWindow,
|
|
493
|
+
transport: &dyn crate::transport::Transport,
|
|
494
|
+
safety: &DangerousApproval,
|
|
495
|
+
) -> Result<(), LifecycleError> {
|
|
496
|
+
let Some(agent) = state
|
|
497
|
+
.get_mut("agents")
|
|
498
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
499
|
+
.and_then(|agents| agents.get_mut(agent_id.as_str()))
|
|
500
|
+
.and_then(serde_json::Value::as_object_mut)
|
|
501
|
+
else {
|
|
502
|
+
return Err(LifecycleError::StatePersist(format!(
|
|
503
|
+
"agent {} state is not an object",
|
|
504
|
+
agent_id
|
|
505
|
+
)));
|
|
506
|
+
};
|
|
507
|
+
agent.insert("status".to_string(), serde_json::json!("running"));
|
|
508
|
+
agent.insert(
|
|
509
|
+
"pane_id".to_string(),
|
|
510
|
+
serde_json::json!(spawn.spawn.pane_id.as_str()),
|
|
511
|
+
);
|
|
512
|
+
let pane_pid = spawn.spawn.child_pid.or_else(|| {
|
|
513
|
+
transport
|
|
514
|
+
.list_targets()
|
|
515
|
+
.unwrap_or_default()
|
|
516
|
+
.into_iter()
|
|
517
|
+
.find(|pane| pane.pane_id == spawn.spawn.pane_id)
|
|
518
|
+
.and_then(|pane| pane.pane_pid)
|
|
519
|
+
});
|
|
520
|
+
if let Some(pane_pid) = pane_pid {
|
|
521
|
+
agent.insert("pane_pid".to_string(), serde_json::json!(pane_pid));
|
|
522
|
+
}
|
|
523
|
+
crate::lifecycle::launch::persist_command_plan_state(
|
|
524
|
+
agent,
|
|
525
|
+
&spawn.plan,
|
|
526
|
+
&spawn.profile_launch,
|
|
527
|
+
);
|
|
528
|
+
persist_effective_approval_policy_for_restart(agent, safety);
|
|
529
|
+
agent.remove("startup_prompts");
|
|
530
|
+
agent.remove("startup_prompt_status");
|
|
531
|
+
Ok(())
|
|
532
|
+
}
|
|
533
|
+
|
|
124
534
|
fn write_restart_resume_decision_events(
|
|
125
535
|
workspace: &Path,
|
|
126
536
|
state: &serde_json::Value,
|
|
127
537
|
allow_fresh: bool,
|
|
128
538
|
decisions: &[RestartedAgent],
|
|
539
|
+
forced_fresh_missing: &std::collections::BTreeSet<String>,
|
|
540
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
129
541
|
) -> Result<(), LifecycleError> {
|
|
130
542
|
for decision in decisions {
|
|
131
543
|
let agent = state
|
|
@@ -150,6 +562,8 @@ fn write_restart_resume_decision_events(
|
|
|
150
562
|
session_id,
|
|
151
563
|
allow_fresh,
|
|
152
564
|
decision_wire,
|
|
565
|
+
forced_fresh_missing.contains(decision.agent_id.as_str()),
|
|
566
|
+
forced_fresh_convergence,
|
|
153
567
|
)?;
|
|
154
568
|
}
|
|
155
569
|
Ok(())
|
|
@@ -162,6 +576,8 @@ fn write_restart_resume_decision_event(
|
|
|
162
576
|
session_id: Option<String>,
|
|
163
577
|
allow_fresh: bool,
|
|
164
578
|
decision: &str,
|
|
579
|
+
forced_fresh: bool,
|
|
580
|
+
forced_fresh_convergence: Option<&crate::session_capture::SessionConvergence>,
|
|
165
581
|
) -> Result<(), LifecycleError> {
|
|
166
582
|
use std::io::Write as _;
|
|
167
583
|
|
|
@@ -170,7 +586,7 @@ fn write_restart_resume_decision_event(
|
|
|
170
586
|
std::fs::create_dir_all(parent)
|
|
171
587
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
172
588
|
}
|
|
173
|
-
let event = serde_json::json!({
|
|
589
|
+
let mut event = serde_json::json!({
|
|
174
590
|
"ts": chrono::Utc::now().to_rfc3339(),
|
|
175
591
|
"event": crate::lifecycle::types::event_names::RESTART_RESUME_DECISION,
|
|
176
592
|
"worker_id": worker_id,
|
|
@@ -181,6 +597,24 @@ fn write_restart_resume_decision_event(
|
|
|
181
597
|
"first_send_at": first_send_at,
|
|
182
598
|
"session_id": session_id,
|
|
183
599
|
});
|
|
600
|
+
if forced_fresh {
|
|
601
|
+
if let Some(event) = event.as_object_mut() {
|
|
602
|
+
event.insert("forced_fresh".to_string(), serde_json::json!(true));
|
|
603
|
+
event.insert("reason".to_string(), serde_json::json!("resume_not_ready"));
|
|
604
|
+
if let Some(convergence) = forced_fresh_convergence {
|
|
605
|
+
event.insert(
|
|
606
|
+
"session_convergence".to_string(),
|
|
607
|
+
serde_json::json!({
|
|
608
|
+
"complete": false,
|
|
609
|
+
"deadline_s": convergence.deadline.as_secs_f64(),
|
|
610
|
+
"deadline_ms": convergence.deadline.as_millis(),
|
|
611
|
+
"elapsed_ms": convergence.elapsed.as_millis(),
|
|
612
|
+
"pending_agent_ids": convergence.missing.clone(),
|
|
613
|
+
}),
|
|
614
|
+
);
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
}
|
|
184
618
|
let line = serde_json::to_string(&event)
|
|
185
619
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
186
620
|
let mut file = std::fs::OpenOptions::new()
|
|
@@ -230,6 +230,7 @@ fn remove_agent_inner(
|
|
|
230
230
|
"agent_health",
|
|
231
231
|
None,
|
|
232
232
|
)?;
|
|
233
|
+
maybe_fail_remove_after_agent_health_delete()?;
|
|
233
234
|
Ok(RemoveSuccess {
|
|
234
235
|
outcome: RemoveAgentOutcome::Removed {
|
|
235
236
|
agent_id: agent_id.clone(),
|
|
@@ -585,15 +586,16 @@ fn select_agent_health(
|
|
|
585
586
|
.map_err(|e| LifecycleError::StatePersist(e.to_string()))?;
|
|
586
587
|
let row = conn
|
|
587
588
|
.query_row(
|
|
588
|
-
"select status, last_output_at, context_usage_pct, current_task_id \
|
|
589
|
+
"select owner_team_id, status, last_output_at, context_usage_pct, current_task_id \
|
|
589
590
|
from agent_health where agent_id = ?1",
|
|
590
591
|
[agent_id.as_str()],
|
|
591
592
|
|r| {
|
|
592
593
|
Ok(CapturedHealth {
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
594
|
+
owner_team_id: r.get::<_, Option<String>>(0)?,
|
|
595
|
+
status: r.get::<_, Option<String>>(1)?,
|
|
596
|
+
last_output_at: r.get::<_, Option<String>>(2)?,
|
|
597
|
+
context_usage_pct: r.get::<_, Option<i64>>(3)?,
|
|
598
|
+
current_task_id: r.get::<_, Option<String>>(4)?,
|
|
597
599
|
})
|
|
598
600
|
},
|
|
599
601
|
)
|
|
@@ -622,8 +624,9 @@ fn restore_agent_health(
|
|
|
622
624
|
// health (golden _restore_agent_health re-upserts status||"IDLE" + the captured columns).
|
|
623
625
|
conn.execute(
|
|
624
626
|
"insert into agent_health (owner_team_id, agent_id, status, last_output_at, context_usage_pct, current_task_id, updated_at) \
|
|
625
|
-
values (
|
|
627
|
+
values (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
|
|
626
628
|
rusqlite::params![
|
|
629
|
+
row.owner_team_id,
|
|
627
630
|
agent_id.as_str(),
|
|
628
631
|
status,
|
|
629
632
|
row.last_output_at,
|
|
@@ -638,12 +641,25 @@ fn restore_agent_health(
|
|
|
638
641
|
|
|
639
642
|
/// Snapshot of one agent's row in the `agent_health` table.
///
/// Every column is optional because each one is read as a nullable value.
#[derive(Clone)]
struct CapturedHealth {
    /// `owner_team_id` column.
    owner_team_id: Option<String>,
    /// `status` column.
    status: Option<String>,
    /// `last_output_at` column.
    last_output_at: Option<String>,
    /// `context_usage_pct` column.
    context_usage_pct: Option<i64>,
    /// `current_task_id` column.
    current_task_id: Option<String>,
}
|
|
646
650
|
|
|
651
|
+
fn maybe_fail_remove_after_agent_health_delete() -> Result<(), LifecycleError> {
|
|
652
|
+
let Ok(reason) = std::env::var("TEAM_AGENT_TEST_FAIL_REMOVE_AFTER_AGENT_HEALTH_DELETE") else {
|
|
653
|
+
return Ok(());
|
|
654
|
+
};
|
|
655
|
+
if reason.is_empty() {
|
|
656
|
+
return Ok(());
|
|
657
|
+
}
|
|
658
|
+
Err(LifecycleError::StatePersist(format!(
|
|
659
|
+
"injected remove failure after agent_health delete: {reason}"
|
|
660
|
+
)))
|
|
661
|
+
}
|
|
662
|
+
|
|
647
663
|
struct RemoveRollback {
|
|
648
664
|
agent_id: AgentId,
|
|
649
665
|
spec_text: Option<String>,
|
|
@@ -150,6 +150,14 @@ pub(crate) fn write_team_state(
|
|
|
150
150
|
lines.push(format!("- {id}: {summary}"));
|
|
151
151
|
}
|
|
152
152
|
}
|
|
153
|
+
if let Some(notes) = team_state_notes(state).filter(|notes| !notes.is_empty()) {
|
|
154
|
+
lines.push(String::new());
|
|
155
|
+
lines.push("## Notes".to_string());
|
|
156
|
+
lines.push(String::new());
|
|
157
|
+
for note in notes {
|
|
158
|
+
lines.push(format!("- {note}"));
|
|
159
|
+
}
|
|
160
|
+
}
|
|
153
161
|
lines.push(String::new());
|
|
154
162
|
lines.push("## Next Step".to_string());
|
|
155
163
|
lines.push(String::new());
|
|
@@ -182,6 +190,17 @@ fn team_state_tasks(spec: &YamlValue, state: &serde_json::Value) -> Vec<TeamStat
|
|
|
182
190
|
Vec::new()
|
|
183
191
|
}
|
|
184
192
|
|
|
193
|
+
fn team_state_notes(state: &serde_json::Value) -> Option<Vec<String>> {
|
|
194
|
+
Some(
|
|
195
|
+
state
|
|
196
|
+
.get("notes")?
|
|
197
|
+
.as_array()?
|
|
198
|
+
.iter()
|
|
199
|
+
.filter_map(|note| note.as_str().filter(|text| !text.is_empty()).map(str::to_string))
|
|
200
|
+
.collect(),
|
|
201
|
+
)
|
|
202
|
+
}
|
|
203
|
+
|
|
185
204
|
fn task_field_str(task: &TeamStateTask, key: &str) -> String {
|
|
186
205
|
match task {
|
|
187
206
|
TeamStateTask::Json(v) => v.get(key).and_then(|v| v.as_str()).unwrap_or("").to_string(),
|
|
@@ -35,7 +35,10 @@ pub use agent::{reset_agent, reset_agent_with_transport, start_agent, start_agen
|
|
|
35
35
|
pub(crate) use agent::start_agent_at_paths;
|
|
36
36
|
pub(crate) use common::refresh_missing_provider_sessions;
|
|
37
37
|
pub use orchestrator::{halt_plan, plan_status};
|
|
38
|
-
pub use rebuild::{
|
|
38
|
+
pub use rebuild::{
|
|
39
|
+
restart, restart_candidates, restart_with_session_convergence_deadline, restart_with_transport,
|
|
40
|
+
select_restart_state,
|
|
41
|
+
};
|
|
39
42
|
pub use remove::{remove_agent, remove_agent_with_transport};
|
|
40
43
|
pub use selection::{classify_first_send_at, classify_restart_plan, decide_start_mode, python_type_name};
|
|
41
44
|
pub(crate) use team_state::write_team_state;
|
|
@@ -496,10 +496,10 @@ fn lanea_fork_window_already_exists_guard_before_spec_mutation() {
|
|
|
496
496
|
|
|
497
497
|
// ── FORK (fork-gate-error-text) [RED] + (fork-incomplete-rollback, adapter arm) — golden gate text + spec rollback
|
|
498
498
|
// Golden operations.py:329-330 raises f"{provider} does not support native session fork" when the native
|
|
499
|
-
// fork gate fails (auth_mode==compatible_api). Rust relies on adapter.
|
|
499
|
+
// fork gate fails (auth_mode==compatible_api). Rust relies on adapter.fork_plan() -> CapabilityUnsupported
|
|
500
500
|
// ("Codex:fork") (adapter.rs:310) -> a different observable. AND golden wraps the post-spec-write steps
|
|
501
501
|
// in try/except restoring the spec on ANY failure (operations.py:384-394); Rust writes the spec
|
|
502
|
-
// (launch.rs:443) then errors at adapter.
|
|
502
|
+
// (launch.rs:443) then errors at adapter.fork_plan (458-460) WITHOUT restoring it. RED on both: the message
|
|
503
503
|
// text AND the spec must be rolled back to not contain the fork agent.
|
|
504
504
|
#[test]
|
|
505
505
|
fn lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm() {
|
|
@@ -516,7 +516,7 @@ fn lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm() {
|
|
|
516
516
|
assert!(
|
|
517
517
|
!spec_text.contains("newfork"),
|
|
518
518
|
"golden operations.py:384-394: on the gate failure the spec must be ROLLED BACK; Rust writes the spec \
|
|
519
|
-
then errors at adapter.
|
|
519
|
+
then errors at adapter.fork_plan without restoring it, leaving the fork agent 'newfork' in the spec"
|
|
520
520
|
);
|
|
521
521
|
}
|
|
522
522
|
|
|
@@ -568,9 +568,9 @@ fn lanea_remove_rollback_restores_agent_health() {
|
|
|
568
568
|
// (4) restores prior state. Rust only restores the spec on the spawn_into arm (launch.rs:481); the
|
|
569
569
|
// save_runtime_state (486-487) and start_coordinator (488-493) failure arms leave the spec mutated, the
|
|
570
570
|
// already-spawned window un-killed, and the state un-rolled-back; install_mcp/cleanup_mcp are absent.
|
|
571
|
-
// The adapter.
|
|
571
|
+
// The adapter.fork_plan arm IS covered HARD above (lanea_fork_gate_error_text_and_spec_rollback_on_adapter_arm).
|
|
572
572
|
// The post-SPAWN arms need a failure-injection seam after spawn_into (codex+subscription forks past
|
|
573
|
-
// adapter.
|
|
573
|
+
// adapter.fork_plan, so the spawn succeeds and there is no in-process way to fail save/coordinator cleanly).
|
|
574
574
|
// PORTER: a Drop guard armed after the spec write, disarmed on success — kills the window, restores spec
|
|
575
575
|
// + state, runs cleanup_mcp on every post-write error arm.
|
|
576
576
|
#[test]
|