@covibes/zeroshot 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +43 -0
- package/cli/index.js +130 -49
- package/cli/message-formatters-normal.js +77 -38
- package/cluster-templates/base-templates/debug-workflow.json +11 -2
- package/cluster-templates/base-templates/full-workflow.json +20 -7
- package/cluster-templates/base-templates/single-worker.json +8 -1
- package/cluster-templates/base-templates/worker-validator.json +10 -2
- package/docker/zeroshot-cluster/Dockerfile +7 -0
- package/package.json +3 -1
- package/src/agent/agent-config.js +19 -6
- package/src/agent/agent-context-builder.js +9 -0
- package/src/agent/agent-task-executor.js +149 -65
- package/src/config-validator.js +13 -0
- package/src/isolation-manager.js +11 -7
- package/src/orchestrator.js +78 -1
- package/src/status-footer.js +59 -6
- package/src/template-resolver.js +23 -1
|
@@ -6,6 +6,10 @@
|
|
|
6
6
|
|
|
7
7
|
FROM node:20-slim
|
|
8
8
|
|
|
9
|
+
# Upgrade npm to fix Arborist isDescendantOf bug (npm 10.x crash on complex peer deps)
|
|
10
|
+
# See: https://github.com/npm/cli/issues/7682
|
|
11
|
+
RUN npm install -g npm@latest
|
|
12
|
+
|
|
9
13
|
# Version pinning for infrastructure tools
|
|
10
14
|
ARG AWS_CLI_VERSION=2.15.10
|
|
11
15
|
ARG TERRAFORM_VERSION=1.6.6
|
|
@@ -19,6 +23,9 @@ ARG TFSEC_VERSION=1.28.4
|
|
|
19
23
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
20
24
|
git \
|
|
21
25
|
curl \
|
|
26
|
+
# Build tools for native modules (node-gyp needs make, gcc, g++)
|
|
27
|
+
build-essential \
|
|
28
|
+
python3-dev \
|
|
22
29
|
ca-certificates \
|
|
23
30
|
gnupg \
|
|
24
31
|
unzip \
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@covibes/zeroshot",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "1.5.0",
|
|
4
4
|
"description": "Multi-agent orchestration engine for Claude - cluster coordinator and CLI",
|
|
5
5
|
"main": "src/orchestrator.js",
|
|
6
6
|
"bin": {
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
"deadcode:files": "unimported",
|
|
25
25
|
"deadcode:deps": "depcheck",
|
|
26
26
|
"deadcode:all": "npm run deadcode && npm run deadcode:files && npm run deadcode:deps",
|
|
27
|
+
"dupcheck": "jscpd src/ --min-lines 5 --min-tokens 50 --threshold 5",
|
|
27
28
|
"check": "npm run typecheck && npm run lint",
|
|
28
29
|
"check:all": "npm run check && npm run deadcode:all",
|
|
29
30
|
"release": "semantic-release",
|
|
@@ -111,6 +112,7 @@
|
|
|
111
112
|
"eslint-config-prettier": "^10.1.8",
|
|
112
113
|
"eslint-plugin-unused-imports": "^4.3.0",
|
|
113
114
|
"husky": "^9.1.7",
|
|
115
|
+
"jscpd": "^3.5.10",
|
|
114
116
|
"mocha": "^11.7.5",
|
|
115
117
|
"semantic-release": "^25.0.2",
|
|
116
118
|
"sinon": "^21.0.0",
|
|
@@ -11,10 +11,9 @@
|
|
|
11
11
|
// Default max iterations (high limit - let the user decide when to give up)
|
|
12
12
|
const DEFAULT_MAX_ITERATIONS = 100;
|
|
13
13
|
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
|
|
17
|
-
const DEFAULT_TASK_TIMEOUT_MS = Infinity;
|
|
14
|
+
// Default timeout: 0 = no timeout (task runs until completion or explicit kill)
|
|
15
|
+
// Use positive number for timeout in milliseconds
|
|
16
|
+
const DEFAULT_TIMEOUT = 0;
|
|
18
17
|
|
|
19
18
|
// Stale detection - ENABLED by default using multi-indicator analysis (safe from false positives)
|
|
20
19
|
// Multi-indicator approach checks: process state, CPU usage, context switches, network I/O
|
|
@@ -85,13 +84,28 @@ function validateAgentConfig(config, options = {}) {
|
|
|
85
84
|
throw new Error(`Agent "${config.id}": invalid prompt format`);
|
|
86
85
|
}
|
|
87
86
|
|
|
87
|
+
// Default timeout to 0 (no timeout) if not specified
|
|
88
|
+
// Use positive number for timeout in milliseconds
|
|
89
|
+
// ROBUST: Handle undefined, null, AND string values from template resolution
|
|
90
|
+
if (config.timeout === undefined || config.timeout === null || config.timeout === '') {
|
|
91
|
+
config.timeout = DEFAULT_TIMEOUT;
|
|
92
|
+
} else {
|
|
93
|
+
// Coerce to number (handles string "0" from template resolution)
|
|
94
|
+
config.timeout = Number(config.timeout);
|
|
95
|
+
}
|
|
96
|
+
if (!Number.isFinite(config.timeout) || config.timeout < 0) {
|
|
97
|
+
throw new Error(
|
|
98
|
+
`Agent "${config.id}": timeout must be a non-negative number (got ${config.timeout}).`
|
|
99
|
+
);
|
|
100
|
+
}
|
|
101
|
+
|
|
88
102
|
// Build normalized config
|
|
89
103
|
const normalizedConfig = {
|
|
90
104
|
...config,
|
|
91
105
|
modelConfig,
|
|
92
106
|
promptConfig,
|
|
93
107
|
maxIterations: config.maxIterations || DEFAULT_MAX_ITERATIONS,
|
|
94
|
-
timeout: config.timeout
|
|
108
|
+
timeout: config.timeout, // Defaults to 0 (no timeout) if not specified
|
|
95
109
|
staleDuration: config.staleDuration || DEFAULT_STALE_DURATION_MS,
|
|
96
110
|
enableLivenessCheck: config.enableLivenessCheck ?? DEFAULT_LIVENESS_CHECK_ENABLED, // On by default, opt-out with false
|
|
97
111
|
};
|
|
@@ -115,7 +129,6 @@ function validateAgentConfig(config, options = {}) {
|
|
|
115
129
|
module.exports = {
|
|
116
130
|
validateAgentConfig,
|
|
117
131
|
DEFAULT_MAX_ITERATIONS,
|
|
118
|
-
DEFAULT_TASK_TIMEOUT_MS,
|
|
119
132
|
DEFAULT_STALE_DURATION_MS,
|
|
120
133
|
DEFAULT_LIVENESS_CHECK_ENABLED,
|
|
121
134
|
};
|
|
@@ -54,6 +54,15 @@ function buildContext({
|
|
|
54
54
|
context += `- If unsure between "fix the code" vs "relax the rules" → ALWAYS fix the code\n`;
|
|
55
55
|
context += `- If unsure between "do more" vs "do less" → ALWAYS do what's required, nothing more\n\n`;
|
|
56
56
|
|
|
57
|
+
// MINIMAL OUTPUT - No verbose prose for background agents
|
|
58
|
+
context += `## 📝 OUTPUT STYLE - MINIMAL\n\n`;
|
|
59
|
+
context += `You are a background agent. The human CANNOT interact with you.\n`;
|
|
60
|
+
context += `- NO explanatory prose ("Let me explain...", "I'll now...")\n`;
|
|
61
|
+
context += `- NO step-by-step narration\n`;
|
|
62
|
+
context += `- YES: Brief status updates ("Implementing auth", "Fixed 3 errors")\n`;
|
|
63
|
+
context += `- YES: Error reports with actionable info\n`;
|
|
64
|
+
context += `- YES: Final summary of changes made\n\n`;
|
|
65
|
+
|
|
57
66
|
// Add prompt from config (system prompt, instructions, output format)
|
|
58
67
|
// If selectedPrompt is provided (iteration-based), use it directly
|
|
59
68
|
// Otherwise fall back to legacy config.prompt handling
|
|
@@ -51,6 +51,72 @@ function sanitizeErrorMessage(error) {
|
|
|
51
51
|
return error;
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
/**
 * Strip the timestamp prefix from a log line.
 * Log lines may have the format: [epochMs]{json...} or [epochMs]text,
 * where epochMs is exactly 13 digits. Lines without such a prefix are
 * returned trimmed but otherwise unchanged.
 *
 * @param {string} line - Raw log line
 * @returns {string} Trimmed line content without the timestamp prefix; empty string for
 *   non-string, empty, or whitespace-only input
 */
function stripTimestampPrefix(line) {
  if (!line || typeof line !== 'string') return '';

  // trim() removes surrounding whitespace including a trailing \r from CRLF logs,
  // so no separate carriage-return strip is required.
  const trimmed = line.trim();
  if (!trimmed) return '';

  // Only the content after the bracketed 13-digit timestamp is of interest.
  const match = trimmed.match(/^\[\d{13}\](.*)$/);
  return match ? match[1] : trimmed;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Extract error context from task output.
 * Shared by both isolated and non-isolated modes.
 *
 * Resolution order:
 *   1. `isNotFound` set -> fixed "task not found" message.
 *   2. `Error: ...` line in `statusOutput` (when provided).
 *   3. First matching error pattern in the last 500 chars of `output` (capped at 200 chars).
 *   4. Generic message carrying the last 200 chars of `output`, or a "no output" message.
 *
 * @param {Object} params - Extraction parameters
 * @param {string} params.output - Full task output
 * @param {string} [params.statusOutput] - Status command output (non-isolated only)
 * @param {string} params.taskId - Task ID for error messages
 * @param {boolean} [params.isNotFound=false] - True if task was not found
 * @returns {string|null} Result of sanitizeErrorMessage() for the chosen message. This
 *   function always hands sanitizeErrorMessage a non-empty string and has no null-returning
 *   path of its own.
 */
function extractErrorContext({ output, statusOutput, taskId, isNotFound = false }) {
  // Task not found - explicit error
  if (isNotFound) {
    return sanitizeErrorMessage(`Task ${taskId} not found (may have crashed or been killed)`);
  }

  // Try status output first (only available in non-isolated mode)
  if (statusOutput) {
    const statusErrorMatch = statusOutput.match(/Error:\s*(.+)/);
    if (statusErrorMatch) {
      return sanitizeErrorMessage(statusErrorMatch[1].trim());
    }
  }

  // Fall back to extracting from output (last 500 chars)
  const lastOutput = (output || '').slice(-500).trim();
  if (!lastOutput) {
    return sanitizeErrorMessage('Task failed with no output (check if task was interrupted or timed out)');
  }

  // Common error patterns, tried in priority order. All are case-insensitive, so a
  // single "error:" entry covers both "Error:" and "error:".
  const errorPatterns = [
    /error:\s*(.+)/i,
    /failed:\s*(.+)/i,
    /Exception:\s*(.+)/i,
    /panic:\s*(.+)/i,
  ];

  for (const pattern of errorPatterns) {
    const match = lastOutput.match(pattern);
    if (match) {
      // Cap the captured text so a long line cannot bloat the error message
      return sanitizeErrorMessage(match[1].slice(0, 200));
    }
  }

  // No pattern matched - include last portion of output
  return sanitizeErrorMessage(`Task failed. Last output: ${lastOutput.slice(-200)}`);
}
|
|
119
|
+
|
|
54
120
|
// Track if we've already ensured the AskUserQuestion hook is installed
|
|
55
121
|
let askUserQuestionHookInstalled = false;
|
|
56
122
|
|
|
@@ -68,10 +134,11 @@ function extractTokenUsage(output) {
|
|
|
68
134
|
|
|
69
135
|
// Find the result line containing usage data
|
|
70
136
|
for (const line of lines) {
|
|
71
|
-
|
|
137
|
+
const content = stripTimestampPrefix(line);
|
|
138
|
+
if (!content) continue;
|
|
72
139
|
|
|
73
140
|
try {
|
|
74
|
-
const event = JSON.parse(
|
|
141
|
+
const event = JSON.parse(content);
|
|
75
142
|
if (event.type === 'result') {
|
|
76
143
|
const usage = event.usage || {};
|
|
77
144
|
return {
|
|
@@ -527,14 +594,45 @@ function followClaudeTaskLogs(agent, taskId) {
|
|
|
527
594
|
// Track exec failures - if status command keeps failing, something is wrong
|
|
528
595
|
if (error) {
|
|
529
596
|
consecutiveExecFailures++;
|
|
530
|
-
if (consecutiveExecFailures
|
|
597
|
+
if (consecutiveExecFailures >= MAX_CONSECUTIVE_FAILURES) {
|
|
531
598
|
console.error(
|
|
532
|
-
`[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively
|
|
599
|
+
`[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively! STOPPING.`
|
|
533
600
|
);
|
|
534
601
|
console.error(` Command: ${ctPath} status ${taskId}`);
|
|
535
602
|
console.error(` Error: ${error.message}`);
|
|
536
603
|
console.error(` Stderr: ${stderr || 'none'}`);
|
|
537
604
|
console.error(` This may indicate zeroshot is not in PATH or task storage is corrupted.`);
|
|
605
|
+
|
|
606
|
+
// Stop polling and resolve with failure
|
|
607
|
+
if (!resolved) {
|
|
608
|
+
resolved = true;
|
|
609
|
+
clearInterval(pollInterval);
|
|
610
|
+
clearInterval(statusCheckInterval);
|
|
611
|
+
agent.currentTask = null;
|
|
612
|
+
|
|
613
|
+
// Publish error for orchestrator/resume
|
|
614
|
+
agent._publish({
|
|
615
|
+
topic: 'AGENT_ERROR',
|
|
616
|
+
receiver: 'broadcast',
|
|
617
|
+
content: {
|
|
618
|
+
text: `Task ${taskId} polling failed after ${MAX_CONSECUTIVE_FAILURES} consecutive failures`,
|
|
619
|
+
data: {
|
|
620
|
+
taskId,
|
|
621
|
+
error: 'polling_timeout',
|
|
622
|
+
attempts: consecutiveExecFailures,
|
|
623
|
+
role: agent.role,
|
|
624
|
+
iteration: agent.iteration,
|
|
625
|
+
},
|
|
626
|
+
},
|
|
627
|
+
});
|
|
628
|
+
|
|
629
|
+
resolve({
|
|
630
|
+
success: false,
|
|
631
|
+
output,
|
|
632
|
+
error: `Status polling failed ${MAX_CONSECUTIVE_FAILURES} times - task may not exist`,
|
|
633
|
+
});
|
|
634
|
+
}
|
|
635
|
+
return;
|
|
538
636
|
}
|
|
539
637
|
return; // Keep polling - might be transient
|
|
540
638
|
}
|
|
@@ -566,47 +664,15 @@ function followClaudeTaskLogs(agent, taskId) {
|
|
|
566
664
|
clearInterval(statusCheckInterval);
|
|
567
665
|
agent.currentTask = null;
|
|
568
666
|
|
|
569
|
-
// Extract
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
const statusErrorMatch = stdout.match(/Error:\s*(.+)/);
|
|
574
|
-
if (statusErrorMatch) {
|
|
575
|
-
errorContext = statusErrorMatch[1].trim();
|
|
576
|
-
} else {
|
|
577
|
-
// Fall back to last 500 chars of output (likely contains the failure reason)
|
|
578
|
-
const lastOutput = output.slice(-500).trim();
|
|
579
|
-
if (lastOutput) {
|
|
580
|
-
// Look for common error patterns in output
|
|
581
|
-
const errorPatterns = [
|
|
582
|
-
/Error:\s*(.+)/i,
|
|
583
|
-
/error:\s*(.+)/i,
|
|
584
|
-
/failed:\s*(.+)/i,
|
|
585
|
-
/Exception:\s*(.+)/i,
|
|
586
|
-
/panic:\s*(.+)/i,
|
|
587
|
-
];
|
|
588
|
-
for (const pattern of errorPatterns) {
|
|
589
|
-
const match = lastOutput.match(pattern);
|
|
590
|
-
if (match) {
|
|
591
|
-
errorContext = match[1].slice(0, 200);
|
|
592
|
-
break;
|
|
593
|
-
}
|
|
594
|
-
}
|
|
595
|
-
// If no pattern matched, include last portion of output
|
|
596
|
-
if (!errorContext) {
|
|
597
|
-
errorContext = `Task failed. Last output: ${lastOutput.slice(-200)}`;
|
|
598
|
-
}
|
|
599
|
-
} else {
|
|
600
|
-
errorContext =
|
|
601
|
-
'Task failed with no output (check if task was interrupted or timed out)';
|
|
602
|
-
}
|
|
603
|
-
}
|
|
604
|
-
}
|
|
667
|
+
// Extract error context using shared helper
|
|
668
|
+
const errorContext = !success
|
|
669
|
+
? extractErrorContext({ output, statusOutput: stdout, taskId })
|
|
670
|
+
: null;
|
|
605
671
|
|
|
606
672
|
resolve({
|
|
607
673
|
success,
|
|
608
674
|
output,
|
|
609
|
-
error:
|
|
675
|
+
error: errorContext,
|
|
610
676
|
tokenUsage: extractTokenUsage(output),
|
|
611
677
|
});
|
|
612
678
|
}, 500);
|
|
@@ -912,12 +978,14 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
|
|
|
912
978
|
`zeroshot status ${taskId} 2>/dev/null || echo "not_found"`,
|
|
913
979
|
]);
|
|
914
980
|
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
)
|
|
981
|
+
// Use same regex patterns as non-isolated mode (lines 649-650)
|
|
982
|
+
// CRITICAL: Don't use substring matching - it matches "error" in "is_error":false
|
|
983
|
+
const statusOutput = statusResult.stdout;
|
|
984
|
+
const isSuccess = /Status:\s+completed/i.test(statusOutput);
|
|
985
|
+
const isError = /Status:\s+failed/i.test(statusOutput);
|
|
986
|
+
const isNotFound = statusOutput.includes('not_found');
|
|
987
|
+
|
|
988
|
+
if (isSuccess || isError || isNotFound) {
|
|
921
989
|
// Task finished - read final output and resolve
|
|
922
990
|
const finalReadResult = await manager.execInContainer(clusterId, [
|
|
923
991
|
'sh',
|
|
@@ -940,13 +1008,23 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
|
|
|
940
1008
|
cleanup();
|
|
941
1009
|
taskExited = true;
|
|
942
1010
|
|
|
943
|
-
//
|
|
1011
|
+
// Determine success status
|
|
1012
|
+
const success = isSuccess && !isError;
|
|
1013
|
+
|
|
1014
|
+
// Extract error context using shared helper
|
|
1015
|
+
const errorContext = !success
|
|
1016
|
+
? extractErrorContext({ output: fullOutput, taskId, isNotFound })
|
|
1017
|
+
: null;
|
|
1018
|
+
|
|
1019
|
+
// Parse result from output
|
|
944
1020
|
const parsedResult = agent._parseResultOutput(fullOutput);
|
|
945
1021
|
|
|
946
1022
|
resolve({
|
|
1023
|
+
success,
|
|
947
1024
|
output: fullOutput,
|
|
948
1025
|
taskId,
|
|
949
1026
|
result: parsedResult,
|
|
1027
|
+
error: errorContext,
|
|
950
1028
|
tokenUsage: extractTokenUsage(fullOutput),
|
|
951
1029
|
});
|
|
952
1030
|
}
|
|
@@ -956,18 +1034,19 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
|
|
|
956
1034
|
}
|
|
957
1035
|
}, 500);
|
|
958
1036
|
|
|
959
|
-
// Safety timeout (
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
1037
|
+
// Safety timeout (0 = no timeout, task runs until completion)
|
|
1038
|
+
if (agent.timeout > 0) {
|
|
1039
|
+
setTimeout(() => {
|
|
1040
|
+
if (!taskExited) {
|
|
1041
|
+
cleanup();
|
|
1042
|
+
reject(
|
|
1043
|
+
new Error(
|
|
1044
|
+
`Task ${taskId} timeout after ${agent.timeout}ms (isolated mode)`
|
|
1045
|
+
)
|
|
1046
|
+
);
|
|
1047
|
+
}
|
|
1048
|
+
}, agent.timeout);
|
|
1049
|
+
}
|
|
971
1050
|
})
|
|
972
1051
|
.catch((err) => {
|
|
973
1052
|
cleanup();
|
|
@@ -994,11 +1073,14 @@ function parseResultOutput(agent, output) {
|
|
|
994
1073
|
let trimmedOutput = output.trim();
|
|
995
1074
|
|
|
996
1075
|
// IMPORTANT: Output is NDJSON (one JSON object per line) from streaming log
|
|
1076
|
+
// Lines may have timestamp prefix: [epochMs]{json...}
|
|
997
1077
|
// Find the line with "type":"result" which contains the actual result
|
|
998
1078
|
const lines = trimmedOutput.split('\n');
|
|
999
1079
|
const resultLine = lines.find((line) => {
|
|
1000
1080
|
try {
|
|
1001
|
-
const
|
|
1081
|
+
const content = stripTimestampPrefix(line);
|
|
1082
|
+
if (!content.startsWith('{')) return false;
|
|
1083
|
+
const obj = JSON.parse(content);
|
|
1002
1084
|
return obj.type === 'result';
|
|
1003
1085
|
} catch {
|
|
1004
1086
|
return false;
|
|
@@ -1006,13 +1088,15 @@ function parseResultOutput(agent, output) {
|
|
|
1006
1088
|
});
|
|
1007
1089
|
|
|
1008
1090
|
// Use the result line if found, otherwise use last non-empty line
|
|
1091
|
+
// CRITICAL: Strip timestamp prefix before assigning to trimmedOutput
|
|
1009
1092
|
if (resultLine) {
|
|
1010
|
-
trimmedOutput = resultLine
|
|
1093
|
+
trimmedOutput = stripTimestampPrefix(resultLine);
|
|
1011
1094
|
} else if (lines.length > 1) {
|
|
1012
|
-
// Fallback: use last non-empty line
|
|
1095
|
+
// Fallback: use last non-empty line (also strip timestamp)
|
|
1013
1096
|
for (let i = lines.length - 1; i >= 0; i--) {
|
|
1014
|
-
|
|
1015
|
-
|
|
1097
|
+
const content = stripTimestampPrefix(lines[i]);
|
|
1098
|
+
if (content) {
|
|
1099
|
+
trimmedOutput = content;
|
|
1016
1100
|
break;
|
|
1017
1101
|
}
|
|
1018
1102
|
}
|
package/src/config-validator.js
CHANGED
|
@@ -415,6 +415,19 @@ function validateAgents(config) {
|
|
|
415
415
|
}
|
|
416
416
|
}
|
|
417
417
|
|
|
418
|
+
// Check for git operations in validator prompts (unreliable in agents)
|
|
419
|
+
if (agent.role === 'validator') {
|
|
420
|
+
const prompt = typeof agent.prompt === 'string' ? agent.prompt : agent.prompt?.system;
|
|
421
|
+
const gitPatterns = ['git diff', 'git status', 'git log', 'git show'];
|
|
422
|
+
for (const pattern of gitPatterns) {
|
|
423
|
+
if (prompt?.includes(pattern)) {
|
|
424
|
+
errors.push(
|
|
425
|
+
`Validator '${agent.id}' uses '${pattern}' - git state is unreliable in agents`
|
|
426
|
+
);
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
|
|
418
431
|
// JSON output without schema
|
|
419
432
|
if (agent.outputFormat === 'json' && !agent.jsonSchema) {
|
|
420
433
|
warnings.push(
|
package/src/isolation-manager.js
CHANGED
|
@@ -191,7 +191,7 @@ class IsolationManager {
|
|
|
191
191
|
try {
|
|
192
192
|
installResult = await this.execInContainer(
|
|
193
193
|
clusterId,
|
|
194
|
-
['sh', '-c', 'npm install --no-audit --no-fund
|
|
194
|
+
['sh', '-c', 'npm_config_engine_strict=false npm install --no-audit --no-fund'],
|
|
195
195
|
{}
|
|
196
196
|
);
|
|
197
197
|
|
|
@@ -201,16 +201,18 @@ class IsolationManager {
|
|
|
201
201
|
}
|
|
202
202
|
|
|
203
203
|
// Failed - retry if not last attempt
|
|
204
|
+
// Use stderr if available, otherwise stdout (npm writes some errors to stdout)
|
|
205
|
+
const errorOutput = (installResult.stderr || installResult.stdout || '').slice(0, 500);
|
|
204
206
|
if (attempt < maxRetries) {
|
|
205
207
|
const delay = baseDelay * Math.pow(2, attempt - 1);
|
|
206
208
|
console.warn(
|
|
207
209
|
`[IsolationManager] ⚠️ npm install failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms...`
|
|
208
210
|
);
|
|
209
|
-
console.warn(`[IsolationManager] Error: ${
|
|
211
|
+
console.warn(`[IsolationManager] Error: ${errorOutput}`);
|
|
210
212
|
await new Promise((_resolve) => setTimeout(_resolve, delay));
|
|
211
213
|
} else {
|
|
212
214
|
console.warn(
|
|
213
|
-
`[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${
|
|
215
|
+
`[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${errorOutput}`
|
|
214
216
|
);
|
|
215
217
|
}
|
|
216
218
|
} catch (execErr) {
|
|
@@ -342,8 +344,9 @@ class IsolationManager {
|
|
|
342
344
|
* @param {number} [timeout=10] - Timeout in seconds before SIGKILL
|
|
343
345
|
* @returns {Promise<void>}
|
|
344
346
|
*/
|
|
345
|
-
stopContainer(clusterId, timeout = 10) {
|
|
346
|
-
|
|
347
|
+
stopContainer(clusterId, timeout = 10, explicitContainerId = null) {
|
|
348
|
+
// Use explicit containerId (from restored state) or in-memory Map
|
|
349
|
+
const containerId = explicitContainerId || this.containers.get(clusterId);
|
|
347
350
|
if (!containerId) {
|
|
348
351
|
return; // Already stopped or never started
|
|
349
352
|
}
|
|
@@ -369,8 +372,9 @@ class IsolationManager {
|
|
|
369
372
|
* @param {boolean} [force=false] - Force remove running container
|
|
370
373
|
* @returns {Promise<void>}
|
|
371
374
|
*/
|
|
372
|
-
removeContainer(clusterId, force = false) {
|
|
373
|
-
|
|
375
|
+
removeContainer(clusterId, force = false, explicitContainerId = null) {
|
|
376
|
+
// Use explicit containerId (from restored state) or in-memory Map
|
|
377
|
+
const containerId = explicitContainerId || this.containers.get(clusterId);
|
|
374
378
|
if (!containerId) {
|
|
375
379
|
return;
|
|
376
380
|
}
|
package/src/orchestrator.js
CHANGED
|
@@ -126,9 +126,55 @@ class Orchestrator {
|
|
|
126
126
|
const clusterIds = Object.keys(data);
|
|
127
127
|
this._log(`[Orchestrator] Found ${clusterIds.length} clusters in file:`, clusterIds);
|
|
128
128
|
|
|
129
|
+
// Track clusters to remove (missing .db files or 0 messages)
|
|
130
|
+
const clustersToRemove = [];
|
|
131
|
+
// Track clusters with 0 messages (corrupted from SIGINT race condition)
|
|
132
|
+
const corruptedClusters = [];
|
|
133
|
+
|
|
129
134
|
for (const [clusterId, clusterData] of Object.entries(data)) {
|
|
135
|
+
// Skip clusters whose .db file doesn't exist (orphaned registry entries)
|
|
136
|
+
const dbPath = path.join(this.storageDir, `${clusterId}.db`);
|
|
137
|
+
if (!fs.existsSync(dbPath)) {
|
|
138
|
+
console.warn(`[Orchestrator] Cluster ${clusterId} has no database file, removing from registry`);
|
|
139
|
+
clustersToRemove.push(clusterId);
|
|
140
|
+
continue;
|
|
141
|
+
}
|
|
142
|
+
|
|
130
143
|
this._log(`[Orchestrator] Loading cluster: ${clusterId}`);
|
|
131
|
-
this._loadSingleCluster(clusterId, clusterData);
|
|
144
|
+
const cluster = this._loadSingleCluster(clusterId, clusterData);
|
|
145
|
+
|
|
146
|
+
// VALIDATION: Detect 0-message clusters (corrupted from SIGINT during initialization)
|
|
147
|
+
// These clusters were created before the initCompletePromise fix was applied
|
|
148
|
+
if (cluster && cluster.messageBus) {
|
|
149
|
+
const messageCount = cluster.messageBus.count({ cluster_id: clusterId });
|
|
150
|
+
if (messageCount === 0) {
|
|
151
|
+
console.warn(`[Orchestrator] ⚠️ Cluster ${clusterId} has 0 messages (corrupted)`);
|
|
152
|
+
console.warn(`[Orchestrator] This likely occurred from SIGINT during initialization.`);
|
|
153
|
+
console.warn(`[Orchestrator] Marking as 'corrupted' - use 'zeroshot kill ${clusterId}' to remove.`);
|
|
154
|
+
corruptedClusters.push(clusterId);
|
|
155
|
+
// Mark cluster as corrupted for visibility in status/list commands
|
|
156
|
+
cluster.state = 'corrupted';
|
|
157
|
+
cluster.corruptedReason = 'SIGINT during initialization (0 messages in ledger)';
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Clean up orphaned entries from clusters.json
|
|
163
|
+
if (clustersToRemove.length > 0) {
|
|
164
|
+
for (const clusterId of clustersToRemove) {
|
|
165
|
+
delete data[clusterId];
|
|
166
|
+
}
|
|
167
|
+
fs.writeFileSync(clustersFile, JSON.stringify(data, null, 2));
|
|
168
|
+
this._log(`[Orchestrator] Removed ${clustersToRemove.length} orphaned cluster(s) from registry`);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Log summary of corrupted clusters
|
|
172
|
+
if (corruptedClusters.length > 0) {
|
|
173
|
+
console.warn(`\n[Orchestrator] ⚠️ Found ${corruptedClusters.length} corrupted cluster(s):`);
|
|
174
|
+
for (const clusterId of corruptedClusters) {
|
|
175
|
+
console.warn(` - ${clusterId}`);
|
|
176
|
+
}
|
|
177
|
+
console.warn(`[Orchestrator] Run 'zeroshot clear' to remove all corrupted clusters.\n`);
|
|
132
178
|
}
|
|
133
179
|
|
|
134
180
|
this._log(`[Orchestrator] Total clusters loaded: ${this.clusters.size}`);
|
|
@@ -494,6 +540,13 @@ class Orchestrator {
|
|
|
494
540
|
}
|
|
495
541
|
|
|
496
542
|
// Build cluster object
|
|
543
|
+
// CRITICAL: initComplete promise ensures ISSUE_OPENED is published before stop() completes
|
|
544
|
+
// This prevents 0-message clusters from SIGINT during async initialization
|
|
545
|
+
let resolveInitComplete;
|
|
546
|
+
const initCompletePromise = new Promise((resolve) => {
|
|
547
|
+
resolveInitComplete = resolve;
|
|
548
|
+
});
|
|
549
|
+
|
|
497
550
|
const cluster = {
|
|
498
551
|
id: clusterId,
|
|
499
552
|
config,
|
|
@@ -504,6 +557,9 @@ class Orchestrator {
|
|
|
504
557
|
createdAt: Date.now(),
|
|
505
558
|
// Track PID for zombie detection (this process owns the cluster)
|
|
506
559
|
pid: process.pid,
|
|
560
|
+
// Initialization completion tracking (for safe SIGINT handling)
|
|
561
|
+
initCompletePromise,
|
|
562
|
+
_resolveInitComplete: resolveInitComplete,
|
|
507
563
|
// Isolation state (only if enabled)
|
|
508
564
|
// CRITICAL: Store workDir for resume capability - without this, resume() can't recreate container
|
|
509
565
|
isolation: options.isolation
|
|
@@ -652,6 +708,12 @@ class Orchestrator {
|
|
|
652
708
|
},
|
|
653
709
|
});
|
|
654
710
|
|
|
711
|
+
// CRITICAL: Mark initialization complete AFTER ISSUE_OPENED is published
|
|
712
|
+
// This ensures stop() waits for at least 1 message before stopping
|
|
713
|
+
if (cluster._resolveInitComplete) {
|
|
714
|
+
cluster._resolveInitComplete();
|
|
715
|
+
}
|
|
716
|
+
|
|
655
717
|
this._log(`Cluster ${clusterId} started with ${cluster.agents.length} agents`);
|
|
656
718
|
|
|
657
719
|
// Watch for CLUSTER_COMPLETE message to auto-stop
|
|
@@ -818,6 +880,10 @@ class Orchestrator {
|
|
|
818
880
|
};
|
|
819
881
|
} catch (error) {
|
|
820
882
|
cluster.state = 'failed';
|
|
883
|
+
// CRITICAL: Resolve the promise on failure too, so stop() doesn't hang
|
|
884
|
+
if (cluster._resolveInitComplete) {
|
|
885
|
+
cluster._resolveInitComplete();
|
|
886
|
+
}
|
|
821
887
|
console.error(`Cluster ${clusterId} failed to start:`, error);
|
|
822
888
|
throw error;
|
|
823
889
|
}
|
|
@@ -833,6 +899,17 @@ class Orchestrator {
|
|
|
833
899
|
throw new Error(`Cluster ${clusterId} not found`);
|
|
834
900
|
}
|
|
835
901
|
|
|
902
|
+
// CRITICAL: Wait for initialization to complete before stopping
|
|
903
|
+
// This ensures ISSUE_OPENED is published, preventing 0-message clusters
|
|
904
|
+
// Timeout after 30s to prevent infinite hang if init truly fails
|
|
905
|
+
if (cluster.initCompletePromise && cluster.state === 'initializing') {
|
|
906
|
+
this._log(`[Orchestrator] Waiting for initialization to complete before stopping...`);
|
|
907
|
+
await Promise.race([
|
|
908
|
+
cluster.initCompletePromise,
|
|
909
|
+
new Promise((resolve) => setTimeout(resolve, 30000)),
|
|
910
|
+
]);
|
|
911
|
+
}
|
|
912
|
+
|
|
836
913
|
cluster.state = 'stopping';
|
|
837
914
|
|
|
838
915
|
// Stop all agents (including subclusters which handle their own children)
|