@covibes/zeroshot 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/LICENSE +21 -0
- package/README.md +364 -0
- package/cli/index.js +3990 -0
- package/cluster-templates/base-templates/debug-workflow.json +181 -0
- package/cluster-templates/base-templates/full-workflow.json +455 -0
- package/cluster-templates/base-templates/single-worker.json +48 -0
- package/cluster-templates/base-templates/worker-validator.json +131 -0
- package/cluster-templates/conductor-bootstrap.json +122 -0
- package/cluster-templates/conductor-junior-bootstrap.json +69 -0
- package/docker/zeroshot-cluster/Dockerfile +132 -0
- package/lib/completion.js +174 -0
- package/lib/id-detector.js +53 -0
- package/lib/settings.js +97 -0
- package/lib/stream-json-parser.js +236 -0
- package/package.json +121 -0
- package/src/agent/agent-config.js +121 -0
- package/src/agent/agent-context-builder.js +241 -0
- package/src/agent/agent-hook-executor.js +329 -0
- package/src/agent/agent-lifecycle.js +555 -0
- package/src/agent/agent-stuck-detector.js +256 -0
- package/src/agent/agent-task-executor.js +1034 -0
- package/src/agent/agent-trigger-evaluator.js +67 -0
- package/src/agent-wrapper.js +459 -0
- package/src/agents/git-pusher-agent.json +20 -0
- package/src/attach/attach-client.js +438 -0
- package/src/attach/attach-server.js +543 -0
- package/src/attach/index.js +35 -0
- package/src/attach/protocol.js +220 -0
- package/src/attach/ring-buffer.js +121 -0
- package/src/attach/socket-discovery.js +242 -0
- package/src/claude-task-runner.js +468 -0
- package/src/config-router.js +80 -0
- package/src/config-validator.js +598 -0
- package/src/github.js +103 -0
- package/src/isolation-manager.js +1042 -0
- package/src/ledger.js +429 -0
- package/src/logic-engine.js +223 -0
- package/src/message-bus-bridge.js +139 -0
- package/src/message-bus.js +202 -0
- package/src/name-generator.js +232 -0
- package/src/orchestrator.js +1938 -0
- package/src/schemas/sub-cluster.js +156 -0
- package/src/sub-cluster-wrapper.js +545 -0
- package/src/task-runner.js +28 -0
- package/src/template-resolver.js +347 -0
- package/src/tui/CHANGES.txt +133 -0
- package/src/tui/LAYOUT.md +261 -0
- package/src/tui/README.txt +192 -0
- package/src/tui/TWO-LEVEL-NAVIGATION.md +186 -0
- package/src/tui/data-poller.js +325 -0
- package/src/tui/demo.js +208 -0
- package/src/tui/formatters.js +123 -0
- package/src/tui/index.js +193 -0
- package/src/tui/keybindings.js +383 -0
- package/src/tui/layout.js +317 -0
- package/src/tui/renderer.js +194 -0
@@ -0,0 +1,555 @@
// @ts-nocheck
/**
 * AgentLifecycle - Agent state machine and lifecycle management
 *
 * Provides:
 * - Agent startup and shutdown
 * - Message handling and routing
 * - Trigger action execution (execute_task, stop_cluster)
 * - Task execution with retry logic
 * - Liveness monitoring with multi-indicator stuck detection
 *
 * State machine: idle → evaluating_logic → building_context → executing_task → idle
 */

const { buildContext } = require('./agent-context-builder');
const { findMatchingTrigger, evaluateTrigger } = require('./agent-trigger-evaluator');
const { executeHook } = require('./agent-hook-executor');
const {
  analyzeProcessHealth,
  isPlatformSupported,
  STUCK_THRESHOLD,
} = require('./agent-stuck-detector');

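/*
 * Editorial sketch (not part of the published file): a minimal agent config of
 * the shape this lifecycle reads. Only fields actually referenced in this
 * module are shown (`triggers`, `hooks.onStart/onComplete/onError`,
 * `maxRetries`); the exact trigger/hook schemas live in
 * agent-trigger-evaluator.js and agent-hook-executor.js and are assumed here.
 *
 *   const exampleConfig = {
 *     triggers: [{ topic: 'TASK_READY', action: 'execute_task' }],
 *     hooks: {
 *       onComplete: {
 *         action: 'publish_message',
 *         config: { topic: 'WORK_DONE', receiver: 'broadcast' },
 *       },
 *     },
 *     maxRetries: 3, // opts in to the exponential-backoff retries in executeTask()
 *   };
 */
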
/**
 * Start the agent (begin listening for triggers)
 * @param {AgentWrapper} agent - Agent instance
 */
function start(agent) {
  if (agent.running) {
    throw new Error(`Agent ${agent.id} is already running`);
  }

  agent.running = true;
  agent.state = 'idle';

  // Subscribe to all messages for this cluster
  agent.unsubscribe = agent.messageBus.subscribe((message) => {
    if (message.cluster_id === agent.cluster.id) {
      handleMessage(agent, message).catch((error) => {
        // FATAL: Message handling failed - crash loud
        console.error(`\n${'='.repeat(80)}`);
        console.error(`🔴 FATAL: Agent ${agent.id} message handler crashed`);
        console.error(`${'='.repeat(80)}`);
        console.error(`Topic: ${message.topic}`);
        console.error(`Error: ${error.message}`);
        console.error(`Stack: ${error.stack}`);
        console.error(`${'='.repeat(80)}\n`);
        // Re-throw to crash the process - DO NOT SILENTLY CONTINUE
        throw error;
      });
    }
  });

  agent._log(`Agent ${agent.id} started (role: ${agent.role})`);
  agent._publishLifecycle('STARTED', {
    triggers: agent.config.triggers?.map((t) => t.topic) || [],
  });
}

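/*
 * Editorial sketch: the message envelope shape this subscription assumes.
 * Fields are inferred from reads in this module (`cluster_id`, `topic`,
 * `sender`, `content`); the authoritative shape is defined by message-bus.js.
 *
 *   const exampleMessage = {
 *     cluster_id: 'cluster-123', // compared against agent.cluster.id above
 *     topic: 'TASK_READY',       // matched against trigger topics
 *     sender: 'worker-1',
 *     receiver: 'broadcast',
 *     content: { text: '...', data: {} },
 *   };
 */
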
/**
 * Stop the agent
 * @param {AgentWrapper} agent - Agent instance
 */
function stop(agent) {
  if (!agent.running) {
    return;
  }

  agent.running = false;
  agent.state = 'stopped';

  if (agent.unsubscribe) {
    agent.unsubscribe();
    agent.unsubscribe = null;
  }

  // Kill current task if any
  if (agent.currentTask) {
    agent._killTask();
  }

  agent._log(`Agent ${agent.id} stopped`);
}

/**
 * Handle incoming message
 * @param {AgentWrapper} agent - Agent instance
 * @param {Object} message - Incoming message
 */
async function handleMessage(agent, message) {
  // Check if any trigger matches FIRST (before state check)
  const matchingTrigger = findMatchingTrigger({
    triggers: agent.config.triggers,
    message,
  });

  if (!matchingTrigger) {
    return; // No trigger for this message type
  }

  // Now check state - LOG if we're dropping a message we SHOULD handle
  if (!agent.running) {
    console.warn(`[${agent.id}] ⚠️ DROPPING message (not running): ${message.topic}`);
    return;
  }
  if (agent.state !== 'idle') {
    console.warn(
      `[${agent.id}] ⚠️ DROPPING message (busy, state=${agent.state}): ${message.topic}`
    );
    return;
  }

  // Evaluate trigger logic
  agent.state = 'evaluating_logic';

  const agentContext = {
    id: agent.id,
    role: agent.role,
    iteration: agent.iteration,
    cluster_id: agent.cluster.id,
  };

  const shouldExecute = evaluateTrigger({
    trigger: matchingTrigger,
    message,
    agent: agentContext,
    logicEngine: agent.logicEngine,
  });

  if (!shouldExecute) {
    agent.state = 'idle';
    return;
  }

  // Execute trigger action (lifecycle event published inside for execute_task)
  await executeTriggerAction(agent, matchingTrigger, message);
}

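/*
 * Editorial note: messages that match a trigger while the agent is busy
 * (state !== 'idle') are dropped with a warning, not queued, so producers that
 * must not lose events need to re-publish. Illustrative evaluateTrigger call,
 * mirroring the invocation above (the condition schema inside `trigger` is
 * defined by agent-trigger-evaluator.js / logic-engine.js, not shown here):
 *
 *   const shouldExecute = evaluateTrigger({
 *     trigger: { topic: 'TASK_READY' },
 *     message: exampleMessage,
 *     agent: { id: 'worker-1', role: 'worker', iteration: 0, cluster_id: 'cluster-123' },
 *     logicEngine: agent.logicEngine,
 *   });
 */
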
/**
 * Execute trigger action
 * @param {AgentWrapper} agent - Agent instance
 * @param {Object} trigger - Matched trigger config
 * @param {Object} message - Triggering message
 */
async function executeTriggerAction(agent, trigger, message) {
  const action = trigger.action || 'execute_task';

  if (action === 'execute_task') {
    await executeTask(agent, message);
  } else if (action === 'stop_cluster') {
    // Publish CLUSTER_COMPLETE message to signal successful completion
    agent._publish({
      topic: 'CLUSTER_COMPLETE',
      receiver: 'system',
      content: {
        text: 'All validation passed. Cluster completing successfully.',
        data: {
          reason: 'all_validators_approved',
          timestamp: Date.now(),
        },
      },
    });
    agent.state = 'completed';
    agent._log(`Agent ${agent.id}: Cluster completion triggered`);
  } else {
    console.warn(`Unknown action: ${action}`);
    agent.state = 'idle';
  }
}

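/*
 * Editorial sketch: trigger configs for the two supported actions. When
 * `action` is omitted it defaults to 'execute_task', per the fallback above.
 *
 *   const runTrigger = { topic: 'TASK_READY' }; // default action: execute_task
 *   const stopTrigger = { topic: 'ALL_APPROVED', action: 'stop_cluster' }; // publishes CLUSTER_COMPLETE
 */
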
/**
 * Execute claude-zeroshots with built context
 * Retries disabled by default. Set agent config `maxRetries` to enable (e.g., 3).
 * @param {AgentWrapper} agent - Agent instance
 * @param {Object} triggeringMessage - Message that triggered execution
 */
async function executeTask(agent, triggeringMessage) {
  // Default: no retries (maxRetries=1 means 1 attempt only)
  // Set agent config `maxRetries: 3` to enable exponential backoff retries
  const maxRetries = agent.config.maxRetries ?? 1;
  const baseDelay = 2000; // 2 seconds

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      // Execute onStart hook
      await executeHook({
        hook: agent.config.hooks?.onStart,
        agent: agent,
        message: triggeringMessage,
        result: undefined,
        messageBus: agent.messageBus,
        cluster: agent.cluster,
        orchestrator: agent.orchestrator,
      });

      // Check max iterations limit BEFORE incrementing (prevents infinite rejection loops)
      if (agent.iteration >= agent.maxIterations) {
        agent._log(
          `[Agent ${agent.id}] Hit max iterations (${agent.maxIterations}), stopping cluster`
        );
        agent._publishLifecycle('MAX_ITERATIONS_REACHED', {
          iteration: agent.iteration,
          maxIterations: agent.maxIterations,
        });
        // Publish failure message - orchestrator watches for this and auto-stops
        agent._publish({
          topic: 'CLUSTER_FAILED',
          receiver: 'system',
          content: {
            text: `Agent ${agent.id} hit max iterations limit (${agent.maxIterations}). Stopping cluster.`,
            data: {
              reason: 'max_iterations',
              iteration: agent.iteration,
              maxIterations: agent.maxIterations,
            },
          },
        });
        agent.state = 'failed';
        return;
      }

      // Increment iteration BEFORE building context so worker knows current iteration
      agent.iteration++;

      // Build context
      agent.state = 'building_context';
      const context = buildContext({
        id: agent.id,
        role: agent.role,
        iteration: agent.iteration,
        config: agent.config,
        messageBus: agent.messageBus,
        cluster: agent.cluster,
        lastTaskEndTime: agent.lastTaskEndTime,
        triggeringMessage,
        selectedPrompt: agent._selectPrompt(),
      });

      // Log input context (helps debug what each agent sees)
      if (!agent.quiet) {
        console.log(`\n${'='.repeat(80)}`);
        console.log(`📥 INPUT CONTEXT - Agent: ${agent.id} (Iteration: ${agent.iteration})`);
        console.log(`${'='.repeat(80)}`);
        console.log(context);
        console.log(`${'='.repeat(80)}\n`);
      }

      // Spawn claude-zeroshots
      agent.state = 'executing_task';

      agent._publishLifecycle('TASK_STARTED', {
        iteration: agent.iteration,
        model: agent._selectModel(),
        triggeredBy: triggeringMessage.topic,
        triggerFrom: triggeringMessage.sender,
      });

      const result = await agent._spawnClaudeTask(context);

      // Add task ID to result for debugging and hooks
      result.taskId = agent.currentTaskId;
      result.agentId = agent.id;
      result.iteration = agent.iteration;

      // Check if task execution failed
      if (!result.success) {
        throw new Error(result.error || 'Task execution failed');
      }

      // Set state to idle BEFORE publishing lifecycle event
      // (so lifecycle message includes correct state)
      agent.state = 'idle';

      // Track completion time for context filtering (used by "since: last_task_end")
      agent.lastTaskEndTime = Date.now();

      agent._publishLifecycle('TASK_COMPLETED', {
        iteration: agent.iteration,
        success: true,
        taskId: agent.currentTaskId,
      });

      // Execute onComplete hook
      await executeHook({
        hook: agent.config.hooks?.onComplete,
        agent: agent,
        message: triggeringMessage,
        result: result,
        messageBus: agent.messageBus,
        cluster: agent.cluster,
        orchestrator: agent.orchestrator,
      });

      // ✅ SUCCESS - exit retry loop
      return;
    } catch (error) {
      // Log attempt failure
      console.error(`\n${'='.repeat(80)}`);
      console.error(
        `🔴 TASK EXECUTION FAILED - AGENT: ${agent.id} (Attempt ${attempt}/${maxRetries})`
      );
      console.error(`${'='.repeat(80)}`);
      console.error(`Error: ${error.message}`);
      if (attempt < maxRetries) {
        console.error(`Will retry in ${baseDelay * Math.pow(2, attempt - 1)}ms...`);
      }
      console.error(`${'='.repeat(80)}\n`);

      // Last attempt - give up
      if (attempt >= maxRetries) {
        console.error(`\n${'='.repeat(80)}`);
        console.error(`🔴🔴🔴 MAX RETRIES EXHAUSTED - AGENT: ${agent.id} 🔴🔴🔴`);
        console.error(`${'='.repeat(80)}`);
        console.error(`All ${maxRetries} attempts failed`);
        console.error(`Final error: ${error.message}`);
        console.error(`Stack: ${error.stack}`);
        console.error(`${'='.repeat(80)}\n`);

        // ROBUSTNESS: If validator crashes after all retries → auto-approve to unblock cluster
        // Better to skip broken validation than block entire workflow
        if (agent.role === 'validator') {
          console.warn(`\n${'='.repeat(80)}`);
          console.warn(`⚠️ VALIDATOR AUTO-APPROVAL - Agent ${agent.id}`);
          console.warn(`${'='.repeat(80)}`);
          console.warn(`Validator crashed ${maxRetries} times, auto-approving to unblock cluster`);
          console.warn(`This validation was SKIPPED - review manually if needed`);
          console.warn(`${'='.repeat(80)}\n`);

          // Publish approval message (using hook config structure)
          const hook = agent.config.hooks?.onComplete;
          if (hook && hook.action === 'publish_message') {
            agent._publish({
              topic: hook.config.topic,
              receiver: hook.config.receiver || 'broadcast',
              content: {
                text: `Auto-approved after ${maxRetries} failed attempts: ${error.message}`,
                data: {
                  approved: 'true',
                  errors: JSON.stringify([
                    `VALIDATOR CRASHED: ${error.message}. Auto-approved to unblock cluster.`,
                  ]),
                  autoApproved: true,
                  attempts: maxRetries,
                },
              },
            });
          }

          agent.state = 'idle';
          return; // Auto-approved, continue cluster
        }

        // Non-validator agents: publish error and stop
        agent.state = 'error';

        // Save failure info to cluster for resume capability
        agent.cluster.failureInfo = {
          agentId: agent.id,
          taskId: agent.currentTaskId,
          iteration: agent.iteration,
          error: error.message,
          attempts: maxRetries,
          timestamp: Date.now(),
        };

        // Publish error to message bus for visibility in logs
        agent._publish({
          topic: 'AGENT_ERROR',
          receiver: 'broadcast',
          content: {
            text: `Task execution failed after ${maxRetries} attempts: ${error.message}`,
            data: {
              error: error.message,
              stack: error.stack,
              agent: agent.id,
              role: agent.role,
              iteration: agent.iteration,
              taskId: agent.currentTaskId,
              attempts: maxRetries,
              hookFailureContext: error.message.includes('Hook uses result')
                ? {
                    taskId: agent.currentTaskId || 'UNKNOWN',
                    retrieveLogs: agent.currentTaskId
                      ? `zeroshot task logs ${agent.currentTaskId}`
                      : 'N/A',
                  }
                : undefined,
            },
          },
          metadata: {
            triggeringTopic: triggeringMessage.topic,
          },
        });

        // Execute onError hook
        await executeHook({
          hook: agent.config.hooks?.onError,
          agent: agent,
          message: triggeringMessage,
          result: { error },
          messageBus: agent.messageBus,
          cluster: agent.cluster,
          orchestrator: agent.orchestrator,
        });

        agent.state = 'idle';
        return; // Give up
      }

      // Not the last attempt - prepare for retry
      const delay = baseDelay * Math.pow(2, attempt - 1); // 2s, 4s, 8s

      agent._publishLifecycle('RETRY_SCHEDULED', {
        attempt,
        maxRetries,
        delayMs: delay,
        error: error.message,
      });

      agent._log(`[${agent.id}] ⚠️ Retrying in ${delay}ms... (${attempt + 1}/${maxRetries})`);

      // Exponential backoff
      await new Promise((resolve) => setTimeout(resolve, delay));

      agent._log(`[${agent.id}] 🔄 Starting retry attempt ${attempt + 1}/${maxRetries}`);
      // Continue to next iteration of for loop
    }
  }
}

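/*
 * Editorial sketch: the backoff schedule used above is
 * delay = baseDelay * 2^(attempt - 1), so with baseDelay = 2000ms the waits
 * between attempts are 2s, 4s, 8s, ... A standalone equivalent:
 *
 *   function retryDelayMs(attempt, baseDelay = 2000) {
 *     return baseDelay * Math.pow(2, attempt - 1);
 *   }
 *   // retryDelayMs(1) === 2000, retryDelayMs(2) === 4000, retryDelayMs(3) === 8000
 */
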
/**
 * Start monitoring agent output liveness using multi-indicator stuck detection
 *
 * SAFE DETECTION: Only flags as stuck when MULTIPLE indicators agree:
 * - Process sleeping (state=S)
 * - Blocked on epoll/poll wait
 * - Low CPU usage (<1%)
 * - Low context switches (<10)
 * - No network data in flight
 *
 * Single-indicator detection (just output freshness) has HIGH false positive risk.
 * This multi-indicator approach drastically reduces false positives.
 *
 * @param {AgentWrapper} agent - Agent instance
 */
function startLivenessCheck(agent) {
  if (agent.livenessCheckInterval) {
    clearInterval(agent.livenessCheckInterval);
  }

  // Check if platform supports /proc filesystem (Linux only)
  if (!isPlatformSupported()) {
    agent._log(
      `[${agent.id}] Liveness check disabled: /proc filesystem not available (non-Linux platform)`
    );
    return;
  }

  // Check every 60 seconds (gives time for multi-indicator analysis)
  const CHECK_INTERVAL_MS = 60 * 1000;
  const ANALYSIS_SAMPLE_MS = 5000; // Sample CPU/context switches over 5 seconds

  agent.livenessCheckInterval = setInterval(async () => {
    // Skip if no task running or no PID tracked
    if (!agent.currentTask || !agent.processPid) {
      return;
    }

    // Skip if output is recent (process is clearly active)
    if (agent.lastOutputTime) {
      const timeSinceLastOutput = Date.now() - agent.lastOutputTime;
      if (timeSinceLastOutput < agent.staleDuration) {
        return; // Output is recent, definitely not stuck
      }
    }

    // Output is stale - run multi-indicator analysis to confirm
    agent._log(
      `[${agent.id}] Output stale for ${Math.round((Date.now() - (agent.lastOutputTime || 0)) / 1000)}s, running multi-indicator analysis...`
    );

    try {
      const analysis = await analyzeProcessHealth(agent.processPid, ANALYSIS_SAMPLE_MS);

      // Process died during analysis
      if (analysis.isLikelyStuck === null) {
        agent._log(`[${agent.id}] Process analysis inconclusive: ${analysis.reason}`);
        return;
      }

      // Log analysis details for debugging
      agent._log(
        `[${agent.id}] Analysis: score=${analysis.stuckScore}/${STUCK_THRESHOLD}, ` +
          `state=${analysis.state}, wchan=${analysis.wchan}, ` +
          `CPU=${analysis.cpuPercent}%, ctxSwitches=${analysis.ctxSwitchesDelta}`
      );

      if (analysis.isLikelyStuck) {
        agent._log(`⚠️ Agent ${agent.id}: CONFIRMED STUCK (confidence: ${analysis.confidence})`);
        agent._log(`   ${analysis.analysis}`);

        // CHANGED: Stale detection is informational only - never kills tasks
        // Publish stale detection event with full analysis (for logging/monitoring)
        agent._publishLifecycle('AGENT_STALE_WARNING', {
          timeSinceLastOutput: Date.now() - (agent.lastOutputTime || 0),
          staleDuration: agent.staleDuration,
          lastOutputTime: agent.lastOutputTime,
          // Multi-indicator analysis results
          stuckScore: analysis.stuckScore,
          confidence: analysis.confidence,
          processState: analysis.state,
          wchan: analysis.wchan,
          cpuPercent: analysis.cpuPercent,
          ctxSwitchesDelta: analysis.ctxSwitchesDelta,
          indicators: analysis.indicators,
          analysis: analysis.analysis,
        });

        // Keep monitoring - do NOT stop the agent
        // User can manually intervene with 'zeroshot resume' if needed
        // stopLivenessCheck(agent); // REMOVED - keep monitoring
      } else {
        agent._log(
          `[${agent.id}] Process appears WORKING despite stale output (score: ${analysis.stuckScore})`
        );
        agent._log(`   ${analysis.analysis}`);
        // Don't flag as stuck - process is legitimately working
      }
    } catch (err) {
      agent._log(`[${agent.id}] Error during stuck analysis: ${err.message}`);
      // Don't flag as stuck on analysis error
    }
  }, CHECK_INTERVAL_MS);
}

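/*
 * Editorial sketch: the analyzeProcessHealth() result shape consumed above.
 * Every key listed is read by startLivenessCheck(); the example values are
 * assumptions about agent-stuck-detector.js.
 *
 *   const exampleAnalysis = {
 *     isLikelyStuck: false,  // null means the process died mid-analysis
 *     reason: 'process exited',
 *     stuckScore: 2,         // logged against STUCK_THRESHOLD
 *     confidence: 'low',
 *     state: 'S',            // process state from /proc
 *     wchan: 'ep_poll',
 *     cpuPercent: 0.4,
 *     ctxSwitchesDelta: 3,
 *     indicators: [],
 *     analysis: 'human-readable summary',
 *   };
 */
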
/**
 * Stop liveness monitoring
 * @param {AgentWrapper} agent - Agent instance
 */
function stopLivenessCheck(agent) {
  if (agent.livenessCheckInterval) {
    clearInterval(agent.livenessCheckInterval);
    agent.livenessCheckInterval = null;
  }
}

module.exports = {
  start,
  stop,
  handleMessage,
  executeTriggerAction,
  executeTask,
  startLivenessCheck,
  stopLivenessCheck,
};