@covibes/zeroshot 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/LICENSE +21 -0
- package/README.md +364 -0
- package/cli/index.js +3990 -0
- package/cluster-templates/base-templates/debug-workflow.json +181 -0
- package/cluster-templates/base-templates/full-workflow.json +455 -0
- package/cluster-templates/base-templates/single-worker.json +48 -0
- package/cluster-templates/base-templates/worker-validator.json +131 -0
- package/cluster-templates/conductor-bootstrap.json +122 -0
- package/cluster-templates/conductor-junior-bootstrap.json +69 -0
- package/docker/zeroshot-cluster/Dockerfile +132 -0
- package/lib/completion.js +174 -0
- package/lib/id-detector.js +53 -0
- package/lib/settings.js +97 -0
- package/lib/stream-json-parser.js +236 -0
- package/package.json +121 -0
- package/src/agent/agent-config.js +121 -0
- package/src/agent/agent-context-builder.js +241 -0
- package/src/agent/agent-hook-executor.js +329 -0
- package/src/agent/agent-lifecycle.js +555 -0
- package/src/agent/agent-stuck-detector.js +256 -0
- package/src/agent/agent-task-executor.js +1034 -0
- package/src/agent/agent-trigger-evaluator.js +67 -0
- package/src/agent-wrapper.js +459 -0
- package/src/agents/git-pusher-agent.json +20 -0
- package/src/attach/attach-client.js +438 -0
- package/src/attach/attach-server.js +543 -0
- package/src/attach/index.js +35 -0
- package/src/attach/protocol.js +220 -0
- package/src/attach/ring-buffer.js +121 -0
- package/src/attach/socket-discovery.js +242 -0
- package/src/claude-task-runner.js +468 -0
- package/src/config-router.js +80 -0
- package/src/config-validator.js +598 -0
- package/src/github.js +103 -0
- package/src/isolation-manager.js +1042 -0
- package/src/ledger.js +429 -0
- package/src/logic-engine.js +223 -0
- package/src/message-bus-bridge.js +139 -0
- package/src/message-bus.js +202 -0
- package/src/name-generator.js +232 -0
- package/src/orchestrator.js +1938 -0
- package/src/schemas/sub-cluster.js +156 -0
- package/src/sub-cluster-wrapper.js +545 -0
- package/src/task-runner.js +28 -0
- package/src/template-resolver.js +347 -0
- package/src/tui/CHANGES.txt +133 -0
- package/src/tui/LAYOUT.md +261 -0
- package/src/tui/README.txt +192 -0
- package/src/tui/TWO-LEVEL-NAVIGATION.md +186 -0
- package/src/tui/data-poller.js +325 -0
- package/src/tui/demo.js +208 -0
- package/src/tui/formatters.js +123 -0
- package/src/tui/index.js +193 -0
- package/src/tui/keybindings.js +383 -0
- package/src/tui/layout.js +317 -0
- package/src/tui/renderer.js +194 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AgentStuckDetector - Multi-indicator process health analysis
|
|
3
|
+
*
|
|
4
|
+
* Detects stuck Claude processes using multiple indicators:
|
|
5
|
+
* - Process state (S=sleeping vs R=running)
|
|
6
|
+
* - Wait channel (ep_poll = blocked on epoll_wait)
|
|
7
|
+
* - CPU usage over sample period
|
|
8
|
+
* - Context switches (activity indicator)
|
|
9
|
+
* - Network socket state (data in flight)
|
|
10
|
+
*
|
|
11
|
+
* CRITICAL: Single-indicator detection (just output freshness) has HIGH false positive risk.
|
|
12
|
+
* Multi-indicator approach ONLY flags processes whose combined score across the indicators reaches the threshold below.
|
|
13
|
+
*
|
|
14
|
+
* Scoring system:
|
|
15
|
+
* - isSleeping: +1
|
|
16
|
+
* - isBlockedOnPoll: +1
|
|
17
|
+
* - lowCpuUsage: +1
|
|
18
|
+
* - lowCtxSwitches: +1
|
|
19
|
+
* - noDataInFlight: +0.5 (secondary signal)
|
|
20
|
+
* - hasSynSent: +1 (stuck trying to connect)
|
|
21
|
+
* - hasDataInFlight: -2 (active I/O = working)
|
|
22
|
+
*
|
|
23
|
+
* Threshold: stuckScore >= 3.5 = likely stuck
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
const { execSync } = require('child_process');
|
|
27
|
+
const fs = require('fs');
|
|
28
|
+
|
|
29
|
+
// Thresholds used when scoring whether a process is stuck.

/** Stuck score at or above which a process is considered stuck. */
const STUCK_THRESHOLD = 3.5;

/** Stuck score at or above which the verdict is reported with 'high' confidence. */
const HIGH_CONFIDENCE_THRESHOLD = 4.5;

/** CPU usage in percent; anything below this is considered "low". */
const CPU_LOW_THRESHOLD = 1;

/** Context-switch delta over the sample; anything below this is considered "inactive". */
const CTX_SWITCHES_LOW_THRESHOLD = 10;
|
|
34
|
+
|
|
35
|
+
/**
 * Read a snapshot of a process's state from the /proc filesystem.
 *
 * @param {number} pid - Process ID
 * @returns {object} `{ exists: false }` when `/proc/<pid>/stat` is absent,
 *   `{ exists: false, error }` when reading or parsing fails, otherwise
 *   `{ exists: true, state, wchan, cpuTicks, threads, volCtxSwitches }`.
 */
function getProcessState(pid) {
  try {
    const statPath = `/proc/${pid}/stat`;
    if (!fs.existsSync(statPath)) {
      return { exists: false };
    }

    const stat = fs.readFileSync(statPath, 'utf8');

    // stat fields: pid, (comm), state, ppid, pgrp, ...
    // comm is wrapped in parentheses and may itself contain spaces or ')', so a
    // plain split(' ') can shift every later field. Anchor on the LAST ')' and
    // split only what follows it.
    const commEnd = stat.lastIndexOf(')');
    if (commEnd === -1) {
      throw new Error(`Unexpected format in ${statPath}`);
    }
    const fields = stat
      .slice(commEnd + 1)
      .trim()
      .split(/\s+/);

    // fields[0] is stat field 3 (state): R=running, S=sleeping, D=disk sleep, Z=zombie
    const state = fields[0];

    // Get wchan (what the process is waiting on)
    let wchan = '';
    try {
      wchan = fs.readFileSync(`/proc/${pid}/wchan`, 'utf8').trim();
    } catch {
      // wchan may not be readable; leave it empty (best effort)
    }

    // utime (field 14) + stime (field 15) = total CPU ticks.
    // Relative to `fields` (which starts at field 3) these are offsets 11 and 12.
    const utime = Number.parseInt(fields[11], 10);
    const stime = Number.parseInt(fields[12], 10);

    // Thread count and voluntary context switches come from the status file
    const status = fs.readFileSync(`/proc/${pid}/status`, 'utf8');
    const threads = status.match(/Threads:\s+(\d+)/)?.[1] ?? '1';
    const volCtxSwitches = status.match(/voluntary_ctxt_switches:\s+(\d+)/)?.[1] ?? '0';

    return {
      exists: true,
      state,
      wchan,
      cpuTicks: utime + stime,
      threads: Number.parseInt(threads, 10),
      volCtxSwitches: Number.parseInt(volCtxSwitches, 10),
    };
  } catch (err) {
    // Covers the process exiting between the existence check and the reads
    return { exists: false, error: err.message };
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Get network socket activity for a process by filtering `ss -tunp` output.
 *
 * @param {number} pid - Process ID
 * @returns {object} `{ hasNetwork: false }` when the process has no fd directory,
 *   the pid is not a positive integer, or `ss` fails;
 *   `{ hasNetwork: false, connections: [] }` when no sockets are listed for the pid;
 *   otherwise `{ hasNetwork, connections, establishedCount, hasDataInFlight, hasSynSent }`.
 *   On unexpected errors: `{ hasNetwork: false, error }`.
 */
function getNetworkState(pid) {
  try {
    const fdPath = `/proc/${pid}/fd`;
    if (!fs.existsSync(fdPath)) {
      return { hasNetwork: false };
    }

    // pid is interpolated into a shell command below, so accept only a
    // positive integer (numeric strings are tolerated).
    const numericPid = Number(pid);
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
      return { hasNetwork: false };
    }

    // Use ss to get socket states for this process
    let ssOutput = '';
    try {
      ssOutput = execSync(`ss -tunp 2>/dev/null | grep ",pid=${numericPid}," || true`, {
        encoding: 'utf8',
        timeout: 5000,
      });
    } catch {
      return { hasNetwork: false };
    }

    if (!ssOutput.trim()) {
      return { hasNetwork: false, connections: [] };
    }

    // ss output: [Netid] State Recv-Q Send-Q Local Address:Port Peer Address:Port Process
    // Because both -t and -u are requested, ss prints a leading Netid column
    // (tcp/udp). The lazy optional group skips it when present while still
    // accepting output that has no Netid column.
    const ssLinePattern = /^(?:\S+\s+)??(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/;

    const connections = [];
    for (const line of ssOutput.trim().split('\n')) {
      const match = line.match(ssLinePattern);
      if (match) {
        connections.push({
          state: match[1],
          recvQ: Number.parseInt(match[2], 10),
          sendQ: Number.parseInt(match[3], 10),
          local: match[4],
          peer: match[5],
        });
      }
    }

    // Analyze connection health
    const establishedCount = connections.filter((c) => c.state === 'ESTAB').length;
    // Non-empty receive/send queues mean bytes are pending on a socket
    const hasDataInFlight = connections.some((c) => c.recvQ > 0 || c.sendQ > 0);
    const hasSynSent = connections.some((c) => c.state === 'SYN-SENT');

    return {
      hasNetwork: connections.length > 0,
      connections,
      establishedCount,
      hasDataInFlight,
      hasSynSent,
    };
  } catch (err) {
    return { hasNetwork: false, error: err.message };
  }
}
|
|
145
|
+
|
|
146
|
+
/**
 * Analyze process health using multi-indicator approach.
 *
 * Takes two process-state samples separated by `samplePeriodMs`, derives CPU
 * usage and voluntary context-switch deltas, reads the current socket state,
 * and combines the indicators into a weighted stuck score.
 *
 * @param {number} pid - Process ID
 * @param {number} samplePeriodMs - How long to sample (default 5000ms)
 * @returns {Promise<object>} `{ isLikelyStuck: null, reason, pid }` when the
 *   process is missing at either sample; otherwise an analysis result with
 *   isLikelyStuck, stuckScore, confidence, indicators, network and a
 *   human-readable `analysis` string.
 */
async function analyzeProcessHealth(pid, samplePeriodMs = 5000) {
  const t0 = getProcessState(pid);
  if (!t0.exists) {
    return { isLikelyStuck: null, reason: 'Process does not exist', pid };
  }
  const sampleStart = process.hrtime.bigint();

  // Wait and sample again
  await new Promise((resolve) => setTimeout(resolve, samplePeriodMs));

  const t1 = getProcessState(pid);
  const elapsedMs = Number(process.hrtime.bigint() - sampleStart) / 1e6;
  if (!t1.exists) {
    return { isLikelyStuck: null, reason: 'Process died during analysis', pid };
  }

  // Deltas accumulated between the two samples
  const cpuTicksDelta = t1.cpuTicks - t0.cpuTicks;
  const ctxSwitchesDelta = t1.volCtxSwitches - t0.volCtxSwitches;

  // Clock ticks per second (typically 100 on Linux)
  const clockTicks = 100;

  // Timers can fire late, so use the larger of the nominal and the measured
  // window. The 1ms floor keeps a zero/negative period from producing
  // NaN or Infinity for the CPU percentage.
  const sampleMs = Math.max(Number(samplePeriodMs) || 0, elapsedMs, 1);

  // CPU seconds used during sample
  const cpuSeconds = cpuTicksDelta / clockTicks;
  const sampleSeconds = sampleMs / 1000;
  const cpuPercent = (cpuSeconds / sampleSeconds) * 100;

  // Get network state
  const network = getNetworkState(pid);

  // Analyze stuck indicators
  const indicators = {
    isSleeping: t1.state === 'S',
    isBlockedOnPoll: t1.wchan.includes('poll') || t1.wchan.includes('wait'),
    lowCpuUsage: cpuPercent < CPU_LOW_THRESHOLD,
    lowCtxSwitches: ctxSwitchesDelta < CTX_SWITCHES_LOW_THRESHOLD,
    // Network indicators (only apply if process has network connections)
    noDataInFlight: network.hasNetwork && !network.hasDataInFlight,
    // Stuck trying to connect; coerced so the field is always a boolean even
    // when getNetworkState returned no socket details
    hasSynSent: Boolean(network.hasSynSent),
  };

  // Calculate stuck score using weighted indicators
  let stuckScore = 0;
  if (indicators.isSleeping) stuckScore += 1;
  if (indicators.isBlockedOnPoll) stuckScore += 1;
  if (indicators.lowCpuUsage) stuckScore += 1;
  if (indicators.lowCtxSwitches) stuckScore += 1;
  if (indicators.noDataInFlight) stuckScore += 0.5; // Secondary signal
  if (indicators.hasSynSent) stuckScore += 1; // Strong signal - stuck connecting

  // CRITICAL: If data IS flowing, REDUCE stuck score (legitimate work)
  if (network.hasDataInFlight) {
    stuckScore = Math.max(0, stuckScore - 2); // Active I/O = likely working
  }

  const isLikelyStuck = stuckScore >= STUCK_THRESHOLD;

  let confidence = 'low';
  if (stuckScore >= HIGH_CONFIDENCE_THRESHOLD) {
    confidence = 'high';
  } else if (isLikelyStuck) {
    confidence = 'medium';
  }

  return {
    pid,
    state: t1.state,
    wchan: t1.wchan,
    cpuPercent: parseFloat(cpuPercent.toFixed(2)),
    ctxSwitchesDelta,
    threads: t1.threads,
    network: {
      hasConnections: network.hasNetwork,
      establishedCount: network.establishedCount || 0,
      hasDataInFlight: network.hasDataInFlight || false,
      hasSynSent: network.hasSynSent || false,
    },
    indicators,
    stuckScore: parseFloat(stuckScore.toFixed(1)),
    isLikelyStuck,
    confidence,
    analysis: isLikelyStuck
      ? `Process appears STUCK: sleeping on ${t1.wchan}, ${cpuPercent.toFixed(1)}% CPU, ${ctxSwitchesDelta} ctx switches`
      : `Process appears WORKING: ${cpuPercent.toFixed(1)}% CPU, ${ctxSwitchesDelta} ctx switches, state=${t1.state}`,
  };
}
|
|
237
|
+
|
|
238
|
+
/**
 * Report whether this detector can run on the current host: the platform
 * must be Linux and the /proc directory must exist.
 * @returns {boolean} true only when both conditions hold
 */
function isPlatformSupported() {
  if (process.platform !== 'linux') {
    return false;
  }
  return fs.existsSync('/proc');
}
|
|
245
|
+
|
|
246
|
+
module.exports = {
|
|
247
|
+
analyzeProcessHealth,
|
|
248
|
+
getProcessState,
|
|
249
|
+
getNetworkState,
|
|
250
|
+
isPlatformSupported,
|
|
251
|
+
// Export thresholds for testing
|
|
252
|
+
STUCK_THRESHOLD,
|
|
253
|
+
HIGH_CONFIDENCE_THRESHOLD,
|
|
254
|
+
CPU_LOW_THRESHOLD,
|
|
255
|
+
CTX_SWITCHES_LOW_THRESHOLD,
|
|
256
|
+
};
|