@covibes/zeroshot 1.0.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/CHANGELOG.md +46 -0
  2. package/README.md +2 -0
  3. package/cli/index.js +151 -208
  4. package/cli/message-formatter-utils.js +75 -0
  5. package/cli/message-formatters-normal.js +214 -0
  6. package/cli/message-formatters-watch.js +181 -0
  7. package/cluster-templates/base-templates/full-workflow.json +10 -5
  8. package/docker/zeroshot-cluster/Dockerfile +6 -0
  9. package/package.json +5 -2
  10. package/src/agent/agent-task-executor.js +237 -112
  11. package/src/isolation-manager.js +94 -51
  12. package/src/orchestrator.js +45 -10
  13. package/src/preflight.js +383 -0
  14. package/src/process-metrics.js +546 -0
  15. package/src/status-footer.js +543 -0
  16. package/task-lib/attachable-watcher.js +202 -0
  17. package/task-lib/commands/clean.js +50 -0
  18. package/task-lib/commands/get-log-path.js +23 -0
  19. package/task-lib/commands/kill.js +32 -0
  20. package/task-lib/commands/list.js +105 -0
  21. package/task-lib/commands/logs.js +411 -0
  22. package/task-lib/commands/resume.js +41 -0
  23. package/task-lib/commands/run.js +48 -0
  24. package/task-lib/commands/schedule.js +105 -0
  25. package/task-lib/commands/scheduler-cmd.js +96 -0
  26. package/task-lib/commands/schedules.js +98 -0
  27. package/task-lib/commands/status.js +44 -0
  28. package/task-lib/commands/unschedule.js +16 -0
  29. package/task-lib/completion.js +9 -0
  30. package/task-lib/config.js +10 -0
  31. package/task-lib/name-generator.js +230 -0
  32. package/task-lib/package.json +3 -0
  33. package/task-lib/runner.js +123 -0
  34. package/task-lib/scheduler.js +252 -0
  35. package/task-lib/store.js +217 -0
  36. package/task-lib/tui/formatters.js +166 -0
  37. package/task-lib/tui/index.js +197 -0
  38. package/task-lib/tui/layout.js +111 -0
  39. package/task-lib/tui/renderer.js +119 -0
  40. package/task-lib/tui.js +384 -0
  41. package/task-lib/watcher.js +162 -0
  42. package/cluster-templates/conductor-junior-bootstrap.json +0 -69
@@ -0,0 +1,214 @@
1
+ /**
2
+ * Normal mode message formatters
3
+ * Full-detail message display for non-watch mode
4
+ */
5
+
6
+ const chalk = require('chalk');
7
+
8
+ /**
9
+ * Format AGENT_LIFECYCLE events
10
+ * @param {Object} msg - Message object
11
+ * @param {string} prefix - Formatted message prefix
12
+ * @returns {boolean} True if message was handled
13
+ */
14
+ function formatAgentLifecycle(msg, prefix) {
15
+ const data = msg.content?.data;
16
+ const event = data?.event;
17
+
18
+ let icon, eventText;
19
+ switch (event) {
20
+ case 'STARTED':
21
+ icon = chalk.green('▶');
22
+ const triggers = data.triggers?.join(', ') || 'none';
23
+ eventText = `started (listening for: ${chalk.dim(triggers)})`;
24
+ break;
25
+ case 'TASK_STARTED':
26
+ icon = chalk.yellow('⚡');
27
+ eventText = `${chalk.cyan(data.triggeredBy)} → task #${data.iteration} (${chalk.dim(data.model)})`;
28
+ break;
29
+ case 'TASK_COMPLETED':
30
+ icon = chalk.green('✓');
31
+ eventText = `task #${data.iteration} completed`;
32
+ break;
33
+ default:
34
+ icon = chalk.dim('•');
35
+ eventText = event || 'unknown event';
36
+ }
37
+
38
+ console.log(`${prefix} ${icon} ${eventText}`);
39
+ return true;
40
+ }
41
+
42
/**
 * Format AGENT_ERROR events as a red banner with optional message text
 * and a truncated stack trace (first 5 lines).
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatAgentError(msg, prefix, timestamp) {
  const divider = chalk.bold.red('─'.repeat(60));

  console.log(''); // Blank line before error
  console.log(divider);
  console.log(`${prefix} ${chalk.gray(timestamp)} ${chalk.bold.red('🔴 AGENT ERROR')}`);

  const text = msg.content?.text;
  if (text) {
    console.log(`${prefix} ${chalk.red(text)}`);
  }

  const stack = msg.content?.data?.stack;
  if (stack) {
    // Cap at 5 stack frames to keep the console readable.
    for (const line of stack.split('\n').slice(0, 5)) {
      if (line.trim()) {
        console.log(`${prefix} ${chalk.dim(line)}`);
      }
    }
  }

  console.log(divider);
  return true;
}
70
+
71
/**
 * Format ISSUE_OPENED events as a blue "NEW TASK" banner, de-duplicated
 * per cluster.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @param {Set} shownNewTaskForCluster - Set tracking clusters already shown
 * @returns {boolean} True if message was handled
 */
function formatIssueOpened(msg, prefix, timestamp, shownNewTaskForCluster) {
  // The conductor re-publishes ISSUE_OPENED after spawning agents;
  // only show the banner once per cluster.
  if (shownNewTaskForCluster.has(msg.cluster_id)) {
    return true;
  }
  shownNewTaskForCluster.add(msg.cluster_id);

  const divider = chalk.bold.blue('─'.repeat(60));

  console.log(''); // Blank line before new task
  console.log(divider);
  console.log(`${prefix} ${chalk.gray(timestamp)} ${chalk.bold.blue('📋 NEW TASK')}`);

  const text = msg.content?.text;
  if (text) {
    // Show at most the first 3 lines, skipping the synthetic header.
    for (const line of text.split('\n').slice(0, 3)) {
      const trimmed = line.trim();
      if (trimmed && trimmed !== '# Manual Input') {
        console.log(`${prefix} ${chalk.white(line)}`);
      }
    }
  }

  console.log(divider);
  return true;
}
102
+
103
/**
 * Format IMPLEMENTATION_READY events, showing the short commit hash
 * when the payload carries one.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatImplementationReady(msg, prefix, timestamp) {
  console.log(
    `${prefix} ${chalk.gray(timestamp)} ${chalk.bold.yellow('✅ IMPLEMENTATION READY')}`
  );

  const commit = msg.content?.data?.commit;
  if (commit) {
    // Abbreviate to the conventional 8-character short hash.
    console.log(`${prefix} ${chalk.gray('Commit:')} ${chalk.cyan(commit.substring(0, 8))}`);
  }

  return true;
}
123
+
124
/**
 * Format VALIDATION_RESULT events: APPROVED/REJECTED status, optional
 * summary text, and a prefixed JSON dump of the structured payload.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatValidationResult(msg, prefix, timestamp) {
  const data = msg.content?.data ?? {};
  // `approved` may arrive as a boolean or as the string 'true'.
  const isApproved = data.approved === true || data.approved === 'true';
  const status = isApproved ? chalk.bold.green('✓ APPROVED') : chalk.bold.red('✗ REJECTED');

  console.log(`${prefix} ${chalk.gray(timestamp)} ${status}`);

  // Skip summary text that still contains unexpanded template variables.
  const text = msg.content?.text;
  if (text && !text.includes('{{')) {
    console.log(`${prefix} ${text.substring(0, 100)}`);
  }

  // Dump the full payload, re-prefixing every JSON line so it stays aligned.
  const jsonBlock = JSON.stringify(data, null, 2).split('\n').join(`\n${prefix} `);
  console.log(`${prefix} ${chalk.dim(jsonBlock)}`);

  return true;
}
150
+
151
/**
 * Format CLUSTER_COMPLETE events as a green double-line banner with the
 * completion reason when present.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatClusterComplete(msg, prefix, timestamp) {
  const banner = chalk.bold.green('═'.repeat(60));

  console.log(''); // Blank line
  console.log(banner);
  console.log(`${prefix} ${chalk.gray(timestamp)} ${chalk.bold.green('🎉 CLUSTER COMPLETE')}`);

  const reason = msg.content?.data?.reason;
  if (reason) {
    console.log(`${prefix} ${chalk.green(reason)}`);
  }

  console.log(banner);
  return true;
}
168
+
169
/**
 * Format CLUSTER_FAILED events as a red double-line banner, printing the
 * message text and/or failure reason when available.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatClusterFailed(msg, prefix, timestamp) {
  const banner = chalk.bold.red('═'.repeat(60));

  console.log(''); // Blank line
  console.log(banner);
  console.log(`${prefix} ${chalk.gray(timestamp)} ${chalk.bold.red('❌ CLUSTER FAILED')}`);

  const text = msg.content?.text;
  if (text) {
    console.log(`${prefix} ${chalk.red(text)}`);
  }

  const reason = msg.content?.data?.reason;
  if (reason) {
    console.log(`${prefix} ${chalk.red(reason)}`);
  }

  console.log(banner);
  return true;
}
189
+
190
/**
 * Format generic messages (fallback): topic header plus raw message text.
 * @param {Object} msg - Message object
 * @param {string} prefix - Formatted message prefix
 * @param {string} timestamp - Formatted timestamp
 * @returns {boolean} True if message was handled
 */
function formatGenericMessage(msg, prefix, timestamp) {
  console.log(`${prefix} ${chalk.gray(timestamp)} ${chalk.bold(msg.topic)}`);

  const text = msg.content?.text;
  if (text) {
    console.log(`${prefix} ${text}`);
  }

  return true;
}
204
+
205
+ module.exports = {
206
+ formatAgentLifecycle,
207
+ formatAgentError,
208
+ formatIssueOpened,
209
+ formatImplementationReady,
210
+ formatValidationResult,
211
+ formatClusterComplete,
212
+ formatClusterFailed,
213
+ formatGenericMessage,
214
+ };
@@ -0,0 +1,181 @@
1
+ /**
2
+ * Watch mode message formatters
3
+ * Simplified, high-level event display for zeroshot watch command
4
+ */
5
+
6
+ const chalk = require('chalk');
7
+ const { buildClusterPrefix, getColorForSender, parseDataField } = require('./message-formatter-utils');
8
+
9
/**
 * Format AGENT_ERROR for watch mode: sender + red ERROR tag, then the
 * error text when present.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatAgentError(msg, clusterPrefix) {
  console.log(`${clusterPrefix} ${msg.sender} ${chalk.bold.red('ERROR')}`);

  const text = msg.content?.text;
  if (text) {
    console.log(`${clusterPrefix} ${chalk.red(text)}`);
  }
}
21
+
22
/**
 * Format ISSUE_OPENED for watch mode: one-line "Started #N - description"
 * with the description truncated to 60 characters.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatIssueOpened(msg, clusterPrefix) {
  const data = msg.content?.data;
  const issueNum = data?.issue_number || '';
  const title = data?.title || '';
  const prompt = data?.prompt || msg.content?.text || '';

  // Manual-input tasks carry the real description in `prompt`, not `title`.
  const taskDesc = title === 'Manual Input' && prompt ? prompt : title;
  const truncatedDesc =
    taskDesc && taskDesc.length > 60 ? taskDesc.substring(0, 60) + '...' : taskDesc;

  const label = issueNum ? `#${issueNum}` : 'task';
  const suffix = truncatedDesc ? chalk.dim(` - ${truncatedDesc}`) : '';
  console.log(`${clusterPrefix} Started ${label}${suffix}`);
}
39
+
40
/**
 * Format IMPLEMENTATION_READY for watch mode: colored sender name plus a
 * short completion note.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatImplementationReady(msg, clusterPrefix) {
  const agentName = getColorForSender(msg.sender)(msg.sender);
  console.log(`${clusterPrefix} ${agentName} completed implementation`);
}
51
+
52
/**
 * Format VALIDATION_RESULT for watch mode: APPROVED/REJECTED line with an
 * optional dimmed summary, followed by rejection details when rejected.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatValidationResult(msg, clusterPrefix) {
  const agentName = getColorForSender(msg.sender)(msg.sender);
  const data = msg.content?.data;
  // `approved` may arrive as a boolean or as the string 'true'.
  const approved = data?.approved === 'true' || data?.approved === true;
  const status = approved ? chalk.green('APPROVED') : chalk.red('REJECTED');

  // Only rejections get the inline summary — approvals stay terse.
  const detail = !approved && data?.summary ? chalk.dim(` - ${data.summary}`) : '';
  console.log(`${clusterPrefix} ${agentName} ${status}${detail}`);

  if (!approved) {
    printRejectionDetails(data, clusterPrefix);
  }
}
74
+
75
/**
 * Print rejection details (error/issue counts) below a REJECTED validation
 * line.
 * @param {Object|undefined} data - Validation data; may be undefined, since
 *   formatValidationResult passes `msg.content?.data` straight through and a
 *   rejection can arrive with no structured payload.
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function printRejectionDetails(data, clusterPrefix) {
  // Optional-chain `data` — previously `data.errors` threw a TypeError when
  // a rejection message carried no `content.data` payload.
  const errors = parseDataField(data?.errors);
  const issues = parseDataField(data?.issues);

  if (errors.length > 0) {
    const errorsCharCount = JSON.stringify(errors).length;
    console.log(
      `${clusterPrefix} ${chalk.red('•')} ${errors.length} error${errors.length > 1 ? 's' : ''} (${errorsCharCount} chars)`
    );
  }

  if (issues.length > 0) {
    const issuesCharCount = JSON.stringify(issues).length;
    console.log(
      `${clusterPrefix} ${chalk.yellow('•')} ${issues.length} issue${issues.length > 1 ? 's' : ''} (${issuesCharCount} chars)`
    );
  }
}
98
+
99
/**
 * Format PR_CREATED for watch mode: colored sender plus the PR number
 * when the payload carries one.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatPrCreated(msg, clusterPrefix) {
  const agentName = getColorForSender(msg.sender)(msg.sender);
  const prNum = msg.content?.data?.pr_number;
  const prLabel = prNum ? ` #${prNum}` : '';
  console.log(`${clusterPrefix} ${agentName} created PR${prLabel}`);
}
111
+
112
/**
 * Format PR_MERGED for watch mode: colored sender plus a merge note.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatPrMerged(msg, clusterPrefix) {
  const agentName = getColorForSender(msg.sender)(msg.sender);
  console.log(`${clusterPrefix} ${agentName} merged PR`);
}
123
+
124
/**
 * Format unknown topic for watch mode (fallback): sender plus the topic
 * name lower-cased with underscores replaced by spaces.
 * @param {Object} msg - Message object
 * @param {string} clusterPrefix - Formatted cluster prefix
 */
function formatUnknownTopic(msg, clusterPrefix) {
  const agentName = getColorForSender(msg.sender)(msg.sender);
  const description = msg.topic.toLowerCase().replace(/_/g, ' ');
  console.log(`${clusterPrefix} ${agentName} ${description}`);
}
135
+
136
/**
 * Main watch mode formatter: routes a message to its per-topic formatter.
 * @param {Object} msg - Message object
 * @param {boolean} isActive - Whether cluster is active
 * @returns {boolean} True if message was handled
 */
function formatWatchMode(msg, isActive) {
  // AGENT_OUTPUT / AGENT_LIFECYCLE are too noisy for the condensed view.
  if (msg.topic === 'AGENT_OUTPUT' || msg.topic === 'AGENT_LIFECYCLE') {
    return true;
  }

  // Clear the status line before printing; the status interval redraws it.
  process.stdout.write('\r' + ' '.repeat(120) + '\r');

  const clusterPrefix = buildClusterPrefix(msg.cluster_id, isActive);

  // Topic → formatter dispatch table; anything unrecognized falls back to
  // the generic formatter.
  const handlers = {
    AGENT_ERROR: formatAgentError,
    ISSUE_OPENED: formatIssueOpened,
    IMPLEMENTATION_READY: formatImplementationReady,
    VALIDATION_RESULT: formatValidationResult,
    PR_CREATED: formatPrCreated,
    PR_MERGED: formatPrMerged,
  };
  // Own-property check so inherited keys (e.g. 'constructor') never match.
  const handler = Object.prototype.hasOwnProperty.call(handlers, msg.topic)
    ? handlers[msg.topic]
    : formatUnknownTopic;
  handler(msg, clusterPrefix);

  return true;
}
178
+
179
+ module.exports = {
180
+ formatWatchMode,
181
+ };
@@ -59,6 +59,11 @@
59
59
  "type": "string",
60
60
  "enum": ["parallel", "sequential", "phased"]
61
61
  },
62
+ "maxParallelTasks": {
63
+ "type": "number",
64
+ "default": 3,
65
+ "description": "Maximum tasks to run in parallel per batch (default 3, prevents context explosion)"
66
+ },
62
67
  "tasks": {
63
68
  "type": "array",
64
69
  "items": {
@@ -134,7 +139,7 @@
134
139
  "model": "{{worker_model}}",
135
140
  "outputFormat": "stream-json",
136
141
  "prompt": {
137
- "initial": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are an implementation agent for a {{complexity}} {{task_type}} task.\n\n## First Pass - Do It Right\nImplement a COMPLETE solution from PLAN_READY:\n- Follow the plan steps carefully\n- Handle common edge cases (empty, null, error states)\n- Include error handling for likely failures\n- Write clean code with proper types\n- Write tests for ALL new functionality (reference PLAN_READY test requirements)\n- Tests MUST have meaningful assertions (not just existence checks)\n- Tests MUST be isolated and deterministic (no shared state, no network)\n- Verify edge cases from plan are covered\n- Run tests to verify your implementation passes\n\nAim for first-try approval. Don't leave obvious gaps for validators to find.\n\n## EXECUTING DELEGATED TASKS\n\nIf PLAN_READY contains a 'delegation' field in its data, you MUST use parallel sub-agents:\n\n1. Parse delegation.phases and delegation.tasks from the plan data\n2. For each phase in order:\n a. Find all tasks for this phase (matching taskIds)\n b. Spawn sub-agents for ALL tasks in the phase using Task tool\n c. Use run_in_background: true for parallel execution\n d. Use the model specified in each task (haiku/sonnet/opus)\n e. Wait for ALL phase tasks using AgentOutputTool with block: true\n3. After ALL phases complete, verify changes work together\n4. Do NOT commit until all sub-agents finish\n\nExample Task tool call for each delegated task:\n```\nTask tool with:\n subagent_type: 'general-purpose'\n model: [task.model from delegation]\n prompt: '[task.description]. Files: [task.scope]. 
Do NOT commit.'\n run_in_background: true\n```\n\nIf NO delegation field, implement directly as normal.\n\n{{#if complexity == 'CRITICAL'}}\n## CRITICAL TASK - EXTRA CARE\n- Double-check every change\n- No shortcuts or assumptions\n- Consider security implications\n- Add comprehensive error handling\n{{/if}}",
142
+ "initial": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are an implementation agent for a {{complexity}} {{task_type}} task.\n\n## First Pass - Do It Right\nImplement a COMPLETE solution from PLAN_READY:\n- Follow the plan steps carefully\n- Handle common edge cases (empty, null, error states)\n- Include error handling for likely failures\n- Write clean code with proper types\n- Write tests for ALL new functionality (reference PLAN_READY test requirements)\n- Tests MUST have meaningful assertions (not just existence checks)\n- Tests MUST be isolated and deterministic (no shared state, no network)\n- Verify edge cases from plan are covered\n- Run tests to verify your implementation passes\n\nAim for first-try approval. Don't leave obvious gaps for validators to find.\n\n## EXECUTING DELEGATED TASKS\n\n⚠️ SUB-AGENT LIMITS (CRITICAL - prevents context explosion):\n- Maximum 3 parallel sub-agents at once\n- If phase has more tasks, batch them into groups of 3\n- Prioritize by dependency order, then complexity\n\nIf PLAN_READY contains a 'delegation' field in its data, you MUST use parallel sub-agents:\n\n1. Parse delegation.phases and delegation.tasks from the plan data\n2. For each phase in order:\n a. Find all tasks for this phase (matching taskIds)\n b. Split into batches of MAX 3 tasks each\n c. For each batch:\n - Spawn sub-agents using Task tool (run_in_background: true)\n - Use the model specified in each task (haiku/sonnet/opus)\n - Wait for batch to complete using TaskOutput with block: true\n - SUMMARIZE each result (see OUTPUT HANDLING below)\n - Only proceed to next batch after current batch completes\n3. After ALL phases complete, verify changes work together\n4. 
Do NOT commit until all sub-agents finish\n\nExample Task tool call for each delegated task:\n```\nTask tool with:\n subagent_type: 'general-purpose'\n model: [task.model from delegation]\n prompt: '[task.description]. Files: [task.scope]. Do NOT commit.'\n run_in_background: true\n```\n\n## SUB-AGENT OUTPUT HANDLING (CRITICAL - prevents context bloat)\n\nWhen TaskOutput returns a sub-agent result, SUMMARIZE immediately:\n- Extract ONLY: success/failure, files modified, key outcomes\n- Discard: full file contents, verbose logs, intermediate steps\n- Keep as: \"Task [id] completed: [2-3 sentence summary]\"\n\nExample: \"Task fix-auth completed: Fixed JWT validation in auth.ts, added null check. Tests pass.\"\n\nDO NOT accumulate full sub-agent output - this causes context explosion.\n\nIf NO delegation field, implement directly as normal.\n\n{{#if complexity == 'CRITICAL'}}\n## CRITICAL TASK - EXTRA CARE\n- Double-check every change\n- No shortcuts or assumptions\n- Consider security implications\n- Add comprehensive error handling\n{{/if}}",
138
143
  "subsequent": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are an implementation agent for a {{complexity}} {{task_type}} task.\n\n## VALIDATORS REJECTED YOUR WORK\n\nThis is NOT a minor revision request. Senior engineers reviewed your code and found it UNACCEPTABLE. Read ALL VALIDATION_RESULT messages carefully.\n\n## FIX LIKE A SENIOR ARCHITECT WOULD\n\n### 1. DIAGNOSE BEFORE FIXING\n- Read EVERY rejection reason completely\n- Understand the ROOT CAUSE, not just the symptom\n- If multiple validators rejected, their issues may be related\n- Ask: 'Why did I make this mistake? Is my approach fundamentally flawed?'\n\n### 2. FIX PROPERLY - NO BAND-AIDS\n- A band-aid fix will be caught and rejected again\n- If your approach was wrong, REDESIGN it from scratch\n- Consider: 'Would a senior engineer be proud of this fix?'\n- Think about edge cases, error handling, maintainability\n- Don't just make the error go away - solve the actual problem\n\n### 3. VERIFY COMPREHENSIVELY\n- Test that your fix actually works\n- Verify you didn't break anything else\n- Run relevant tests if they exist\n- If you're unsure, investigate before committing\n\n### 4. 
ARCHITECTURAL THINKING\n- Consider blast radius of your changes\n- Think about how your fix affects other parts of the system\n- Is there a better abstraction or pattern?\n- Future maintainers will inherit your decisions\n\n## MINDSET\n- Validators are not being pedantic - they found REAL problems\n- Every rejection is expensive - get it right this time\n- Shortcuts and hacks will be caught immediately\n- Pride in craftsmanship: deliver code you'd want to maintain\n\n{{#if complexity == 'CRITICAL'}}\n## CRITICAL TASK - ZERO TOLERANCE FOR SHORTCUTS\n- This is HIGH RISK code (auth, payments, security, production)\n- Triple-check every change\n- Consider all failure modes\n- Security implications must be addressed\n- Comprehensive error handling is MANDATORY\n- If unsure, err on the side of caution\n{{/if}}"
139
144
  },
140
145
  "contextStrategy": {
@@ -189,7 +194,7 @@
189
194
  "required": ["approved", "summary"]
190
195
  },
191
196
  "prompt": {
192
- "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a requirements validator for a {{complexity}} {{task_type}} task.\n\n## Your Role\nVerify implementation meets requirements. Be thorough. Hold a high bar.\n\n## Validation Checklist - ALL must pass:\n1. Does implementation address ALL requirements from ISSUE_OPENED?\n2. Are edge cases handled? (empty, null, boundaries, error states)\n3. Is error handling present for failure paths?\n4. Are types strict? (no any, no ts-ignore)\n5. Is input validation present at boundaries?\n\n## BLOCKING Issues (must reject):\n- Missing core functionality\n- Missing error handling for common failures\n- Hardcoded values that should be configurable\n- Crashes on empty/null input\n- Types not strict\n\n## NON-BLOCKING Issues (note in summary, don't reject alone):\n- Minor style preferences\n- Could be slightly DRYer\n- Rare edge cases\n\n## Output\n- approved: true if all BLOCKING criteria pass\n- summary: Assessment with blocking and non-blocking issues noted\n- errors: List of BLOCKING issues only"
197
+ "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a requirements validator for a {{complexity}} {{task_type}} task.\n\n## Your Role\nVerify implementation meets requirements. Be thorough. Hold a high bar.\n\n## Validation Checklist - ALL must pass:\n1. Does implementation address ALL requirements from ISSUE_OPENED?\n2. Are edge cases handled? (empty, null, boundaries, error states)\n3. Is error handling present for failure paths?\n4. Are types strict? (no any, no ts-ignore)\n5. Is input validation present at boundaries?\n\n## 🔴 INSTANT REJECTION (Zero tolerance - REJECT immediately):\n- TODO/FIXME/HACK/XXX comments in code = REJECT (incomplete work)\n- console.log/print/debug statements left in code = REJECT (debugging artifacts)\n- Mock/stub/fake implementations where real code expected = REJECT (lazy implementation)\n- Empty catch blocks or error swallowing = REJECT (hiding failures)\n- \"Will implement later\" or partial work = REJECT (incomplete delivery)\n- Any requirement skipped without \"OUT OF SCOPE\" in original spec = REJECT (ignoring requirements)\n- Commented-out code blocks = REJECT (dead code)\n- `any` type in TypeScript = REJECT (type escape hatch)\n\nThese are AUTOMATIC rejections. No exceptions. No \"it's mostly done\". 
The code is either COMPLETE or it's REJECTED.\n\n## BLOCKING Issues (must reject):\n- Missing core functionality\n- Missing error handling for common failures\n- Hardcoded values that should be configurable\n- Crashes on empty/null input\n- Types not strict\n\n## NON-BLOCKING Issues (note in summary, don't reject alone):\n- Minor style preferences\n- Could be slightly DRYer\n- Rare edge cases\n\n## Output\n- approved: true if all BLOCKING criteria pass\n- summary: Assessment with blocking and non-blocking issues noted\n- errors: List of BLOCKING issues only"
193
198
  },
194
199
  "contextStrategy": {
195
200
  "sources": [
@@ -237,7 +242,7 @@
237
242
  "required": ["approved", "summary"]
238
243
  },
239
244
  "prompt": {
240
- "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a code reviewer for a {{complexity}} {{task_type}} task.\n\n## Your Role\nSenior engineer code review. Catch REAL bugs, not style preferences.\n\n## BLOCKING Issues (must reject):\n1. Logic errors or off-by-one bugs\n2. Missing error handling for failure paths\n3. Resource leaks (timers, connections, listeners not cleaned up)\n4. Security vulnerabilities (injection, auth bypass)\n5. Race conditions in concurrent code\n6. Missing null/undefined checks where needed\n7. Hardcoded magic numbers (should be constants/config)\n8. Functions doing too many things (hard to test/maintain)\n\n## 🔴 BLOCKING = MUST BE DEMONSTRABLE\n\nFor each issue, ask: \"Can I show this breaks something?\"\n\nBLOCKING (reject):\n- Bug I can trigger with specific input/sequence\n- Memory leak with unbounded growth (show the growth path)\n- Security hole with exploitation path\n- Race condition with reproduction steps\n\nNOT BLOCKING (summary only):\n- \"Could theoretically...\" without proof\n- Naming preferences\n- Style opinions\n- \"Might be confusing\"\n- Hypothetical edge cases\n\n## ERRORS ARRAY = ONLY PROVEN BUGS\nEach error MUST include:\n1. WHAT is broken\n2. HOW to trigger it (specific steps/input)\n3. 
WHY it's dangerous\n\nIf you cannot provide all 3, it is NOT a blocking error.\n\n## ❌ AUTOMATIC NON-BLOCKING (NEVER in errors array)\n- Test naming (\"misleading test name\")\n- Variable naming (\"semantic confusion\")\n- Code organization (\"inconsistent strategy\")\n- \"Could be better\" suggestions\n- Internal method validation (if constructor validates)\n\n## Output\n- approved: true if no BLOCKING issues with proof\n- summary: Assessment with blocking and non-blocking issues noted\n- errors: List of PROVEN BLOCKING issues only (with WHAT/HOW/WHY)"
245
+ "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a code reviewer for a {{complexity}} {{task_type}} task.\n\n## Your Role\nSenior engineer code review. Catch REAL bugs, not style preferences.\n\n## 🔴 CODE COMPLETENESS CHECK (INSTANT REJECTION):\nBEFORE any other review, scan for these AUTOMATIC rejection patterns:\n- TODO/FIXME/HACK/XXX comments = REJECT (grep -r 'TODO\\|FIXME\\|HACK\\|XXX')\n- console.log/console.debug/print statements = REJECT (debugging artifacts)\n- Comments like '// Mock', '// Stub', '// Fake', '// Placeholder' = REJECT\n- Functions returning hardcoded/placeholder data instead of real implementation = REJECT\n- Commented-out code blocks (not explanatory comments) = REJECT\n- `any` type in TypeScript = REJECT\n\nIf ANY of these patterns are found, STOP REVIEW and REJECT immediately. Do not proceed to other checks.\n\n## BLOCKING Issues (must reject):\n1. Logic errors or off-by-one bugs\n2. Missing error handling for failure paths\n3. Resource leaks (timers, connections, listeners not cleaned up)\n4. Security vulnerabilities (injection, auth bypass)\n5. Race conditions in concurrent code\n6. Missing null/undefined checks where needed\n7. Hardcoded magic numbers (should be constants/config)\n8. Functions doing too many things (hard to test/maintain)\n9. Silent error swallowing (empty catch blocks, ignored exceptions)\n10. Error context lost (catch + rethrow without adding useful context)\n11. Missing cleanup on error paths (no finally block where needed)\n12. Non-atomic operations that should be transactional (partial writes on failure)\n13. 
Boundary validation missing at system entry points (user input, API params, config)\n\n## 🔴 SENIOR ENGINEERING CHECK\n\nAsk yourself: **Would a senior engineer be PROUD of this code?**\n\nBLOCKING if answer is NO due to:\n- Over-engineering: Built for hypothetical future, not current requirements\n- Under-engineering: Hacky solution that will break on first edge case\n- Wrong abstraction: Forced pattern that doesn't fit the problem\n- God function: 100+ lines doing 5 things (should be split)\n- Premature optimization: Complex for performance without proof of bottleneck\n- Copy-paste programming: Same logic in 3 places (should be extracted)\n- Stringly-typed: Magic strings instead of enums/constants\n- Implicit dependencies: Works by accident, breaks on refactor\n\nNOT BLOCKING:\n- \"I would have done it differently\" (preference)\n- \"Could use a fancier pattern\" (over-engineering)\n- \"Variable name could be better\" (style)\n\n## 🔴 BLOCKING = MUST BE DEMONSTRABLE\n\nFor each issue, ask: \"Can I show this breaks something?\"\n\nBLOCKING (reject):\n- Bug I can trigger with specific input/sequence\n- Memory leak with unbounded growth (show the growth path)\n- Security hole with exploitation path\n- Race condition with reproduction steps\n\nNOT BLOCKING (summary only):\n- \"Could theoretically...\" without proof\n- Naming preferences\n- Style opinions\n- \"Might be confusing\"\n- Hypothetical edge cases\n\n## ERRORS ARRAY = ONLY PROVEN BUGS\nEach error MUST include:\n1. WHAT is broken\n2. HOW to trigger it (specific steps/input)\n3. 
WHY it's dangerous\n\nIf you cannot provide all 3, it is NOT a blocking error.\n\n## ❌ AUTOMATIC NON-BLOCKING (NEVER in errors array)\n- Test naming (\"misleading test name\")\n- Variable naming (\"semantic confusion\")\n- Code organization (\"inconsistent strategy\")\n- \"Could be better\" suggestions\n- Internal method validation (if constructor validates)\n\n## Output\n- approved: true if no BLOCKING issues with proof\n- summary: Assessment with blocking and non-blocking issues noted\n- errors: List of PROVEN BLOCKING issues only (with WHAT/HOW/WHY)"
241
246
  },
242
247
  "contextStrategy": {
243
248
  "sources": [
@@ -334,7 +339,7 @@
334
339
  "required": ["approved", "summary"]
335
340
  },
336
341
  "prompt": {
337
- "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a test engineer for a {{complexity}} task.\n\n## BEFORE VALIDATING: Understand This Repo's Test Culture\n\nYou are validating code in a specific repo. Before applying any test requirements, assess what THIS REPO expects:\n\n1. **Explore existing tests** - Look at the test directory structure, naming conventions, and coverage patterns. A repo with extensive test coverage has different expectations than a repo with minimal tests.\n\n2. **Check documentation** - Does CONTRIBUTING.md, README, or PR templates mention test requirements? Follow what the repo documents.\n\n3. **Check CI** - Does the CI pipeline run tests? Enforce coverage thresholds? This tells you what the maintainers actually enforce.\n\n**Calibrate your strictness to match the repo.** Don't impose external standards on a repo that has no test culture. 
Don't be lenient on a repo that clearly values high coverage.\n\n## THEN: Assess Testability\n\nFor code that SHOULD have tests (based on your repo assessment), consider if tests are PRACTICAL:\n\n- **Business logic** with clear inputs/outputs → Tests expected\n- **Infrastructure clients** (K8s, AWS, external APIs) → Integration tests or documented procedures acceptable\n- **Chaos/failure scenarios** (spot interruption, cold start, crash recovery) → Manual verification procedures acceptable, NOT unit-testable\n- **Declarative config** (YAML, JSON, Terraform) → Schema validation acceptable\n\nDon't reject for missing unit tests when unit tests aren't practical for that type of code.\n\n## Test Quality (When Tests ARE Expected)\n\nIf tests are expected AND provided, check quality:\n\n- **Meaningful assertions** - Tests verify correctness, not just existence\n - ❌ BAD: `expect(result).toBeDefined()`\n - ✅ GOOD: `expect(result.status).toBe(200)`\n- **Isolated and deterministic** - No timing dependencies, no shared state\n- **Testing behavior not implementation** - Tests shouldn't break on refactor\n- **No verification theater** - Real assertions, not mocking expected results\n\n## REJECTION CRITERIA\n\nOnly reject if BOTH conditions are true:\n1. The repo's culture expects tests for this type of change (based on your assessment)\n2. 
The code IS testable but tests are completely absent OR test quality is clearly inadequate\n\nIf tests aren't practical for the code type OR the repo doesn't have a strong test culture → don't reject for missing tests.\n\n## Special Cases\n\n- **INQUIRY tasks**: No tests required for documentation, exploration, or read-only tasks\n- **Legacy code**: Modifying existing untested code doesn't require adding tests\n- **Infrastructure/chaos scenarios**: Document verification procedures instead of unit tests\n- **Trivial changes**: Single-line fixes may not need dedicated tests\n\n## Output\n- **approved**: true if test approach is appropriate for THIS repo's culture and code type\n- **summary**: Assessment of test quality relative to repo's standards\n- **errors**: Specific issues found (only if rejecting)\n- **testResults**: Test command output if tests were run"
342
+ "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a test engineer for a {{complexity}} task.\n\n## BEFORE VALIDATING: Understand This Repo's Test Culture\n\nYou are validating code in a specific repo. Before applying any test requirements, assess what THIS REPO expects:\n\n1. **Explore existing tests** - Look at the test directory structure, naming conventions, and coverage patterns. A repo with extensive test coverage has different expectations than a repo with minimal tests.\n\n2. **Check documentation** - Does CONTRIBUTING.md, README, or PR templates mention test requirements? Follow what the repo documents.\n\n3. **Check CI** - Does the CI pipeline run tests? Enforce coverage thresholds? This tells you what the maintainers actually enforce.\n\n**Calibrate your strictness to match the repo.** Don't impose external standards on a repo that has no test culture. 
Don't be lenient on a repo that clearly values high coverage.\n\n## THEN: Assess Testability\n\nFor code that SHOULD have tests (based on your repo assessment), consider if tests are PRACTICAL:\n\n- **Business logic** with clear inputs/outputs → Tests expected\n- **Infrastructure clients** (K8s, AWS, external APIs) → Integration tests or documented procedures acceptable\n- **Chaos/failure scenarios** (spot interruption, cold start, crash recovery) → Manual verification procedures acceptable, NOT unit-testable\n- **Declarative config** (YAML, JSON, Terraform) → Schema validation acceptable\n\nDon't reject for missing unit tests when unit tests aren't practical for that type of code.\n\n## 🔴 TEST COMPLETENESS CHECK (INSTANT REJECTION):\nTests MUST NOT:\n- Skip any requirement from the original issue = REJECT\n- Mock core functionality being tested (test the REAL thing) = REJECT\n- Have TODO/FIXME comments in test code = REJECT (tests must be complete)\n- Use .skip() or .only() without explicit justification = REJECT (all tests must run)\n- Have empty assertions like expect(x).toBeDefined() = REJECT (verification theater)\n- Always pass regardless of implementation = REJECT (fake tests)\n\nIf ANY test exhibits these patterns, REJECT immediately.\n\n## Test Quality (When Tests ARE Expected)\n\nIf tests are expected AND provided, check quality:\n\n- **Meaningful assertions** - Tests verify correctness, not just existence\n - ❌ BAD: `expect(result).toBeDefined()`\n - ✅ GOOD: `expect(result.status).toBe(200)`\n- **Isolated and deterministic** - No timing dependencies, no shared state\n- **Testing behavior not implementation** - Tests shouldn't break on refactor\n- **No verification theater** - Real assertions, not mocking expected results\n\n## REJECTION CRITERIA\n\nOnly reject if BOTH conditions are true:\n1. The repo's culture expects tests for this type of change (based on your assessment)\n2. 
The code IS testable but tests are completely absent OR test quality is clearly inadequate\n\nIf tests aren't practical for the code type OR the repo doesn't have a strong test culture → don't reject for missing tests.\n\n## Special Cases\n\n- **INQUIRY tasks**: No tests required for documentation, exploration, or read-only tasks\n- **Legacy code**: Modifying existing untested code doesn't require adding tests\n- **Infrastructure/chaos scenarios**: Document verification procedures instead of unit tests\n- **Trivial changes**: Single-line fixes may not need dedicated tests\n\n## Output\n- **approved**: true if test approach is appropriate for THIS repo's culture and code type\n- **summary**: Assessment of test quality relative to repo's standards\n- **errors**: Specific issues found (only if rejecting)\n- **testResults**: Test command output if tests were run"
338
343
  },
339
344
  "contextStrategy": {
340
345
  "sources": [
@@ -404,7 +409,7 @@
404
409
  "required": ["approved", "summary", "proofOfWork"]
405
410
  },
406
411
  "prompt": {
407
- "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are an ADVERSARIAL FUNCTIONAL TESTER for a {{complexity}} task.\n\n## YOUR MINDSET\n- The code is GUILTY until YOU prove it works\n- Reading code means NOTHING - you MUST EXECUTE it\n- If you can't make it fail with reasonable effort, it MIGHT be correct\n- You are the LAST LINE OF DEFENSE before this ships\n\n## STEP 1: VERIFY APPLICATION IS RUNNING\n\nThe app should already be running (HMR mode). Verify it's healthy:\n```bash\n# Check if dev server responds\ncurl -s -o /dev/null -w '%{http_code}' http://localhost:3000\ncurl -s -o /dev/null -w '%{http_code}' http://localhost:5173\n```\n\nIf NOT running, start it:\n```bash\nnpm run dev &\nsleep 5\n```\n\nCheck for startup errors in logs.\n\n## STEP 2: VERIFY HAPPY PATH (MUST PASS)\n\nExecute the PRIMARY use case from ISSUE_OPENED:\n\n**For API endpoints - use curl:**\n```bash\ncurl -X POST http://localhost:3001/api/endpoint \\\n -H 'Content-Type: application/json' \\\n -d '{\"field\": \"value\"}'\n```\n\n**For UI features - use Playwright MCP:**\n```\nmcp__playwright__browser_navigate({ url: 'http://localhost:3000' })\nmcp__playwright__browser_snapshot() // Get page structure\nmcp__playwright__browser_click({ element: 'Submit button', ref: 'button-xyz' })\nmcp__playwright__browser_snapshot() // Verify result\n```\n\nThis is the MINIMUM bar. 
If happy path fails, REJECT immediately.\n\n## STEP 3: ATTACK WITH EDGE CASES\n\n**Empty/Null Data:**\n- API: Send empty body, null fields, missing required fields\n- UI (Playwright): Submit empty form, clear required fields\n\n**Boundary Conditions:**\n- Zero items in list (empty state)\n- One item only\n- First/last item\n- Maximum items (100, 1000)\n\n**Invalid State:**\n- Reference deleted/non-existent item\n- Expired session\n- Access without prerequisites\n\n**Concurrent Operations (Playwright MCP):**\n- Open two browser tabs\n- Submit same form simultaneously\n- Update while delete in progress\n\n**User Flow Edge Cases (Playwright MCP):**\n- Refresh page mid-operation\n- Navigate away and back\n- Browser back button\n- Double-click submit rapidly\n\n## STEP 4: VERIFY CROSS-LAYER CONSISTENCY\n\n- UI shows what API returns (Playwright + curl same data)\n- API returns what database has (query DB after operation)\n- After error, check for orphaned/inconsistent state\n- Verify loading/error states display correctly (Playwright screenshots)\n\n## APPROVAL CRITERIA\n\n**APPROVE only if:**\n- Server is running and healthy\n- Happy path works end-to-end with REAL requests\n- No critical or high severity failures found\n- State is consistent after operations\n\n**REJECT if:**\n- Server doesn't start or is unhealthy\n- Happy path fails\n- Any critical failure found\n- State becomes inconsistent"
412
+ "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are an ADVERSARIAL TESTER for a {{complexity}} task.\n\n## YOUR MINDSET\n- The code is GUILTY until YOU prove it works\n- Reading code means NOTHING - you MUST EXECUTE it\n- Tests passing implementation works (tests can be outdated or incomplete)\n- You are the LAST LINE OF DEFENSE before this ships\n\n## STEP 1: UNDERSTAND THE PROJECT\n\n**READ CLAUDE.md** in the repository root. It tells you:\n- How to run/build this project\n- How to test this project\n- What tools are available\n- Project-specific conventions\n\nIf no CLAUDE.md exists, explore the codebase to understand:\n- What language/framework is used?\n- How do you run it? (package.json scripts, Makefile, etc.)\n- How do you test it? (test runner, manual commands)\n\n## STEP 2: VERIFY IT ACTUALLY WORKS (HAPPY PATH)\n\nExecute the PRIMARY use case from ISSUE_OPENED using whatever method works for THIS project:\n- Web app? Start the server and hit endpoints\n- CLI tool? Run the command with typical input\n- Library? Import and call the function\n- Infrastructure? Run the plan/apply commands\n- API? Make real HTTP requests\n\nThis is the MINIMUM bar. 
If happy path fails, REJECT immediately.\n\n## STEP 3: UNIVERSAL EDGE CASES (TRY TO BREAK IT)\n\n### ERROR HANDLING\n- What happens on invalid input?\n- What happens when dependencies fail?\n- Are errors caught and handled, not silently swallowed?\n\n### EDGE CASES\n- Empty input / null / undefined\n- Invalid types (string where number expected)\n- Boundary conditions (0, -1, MAX_INT, empty list, single item)\n- Large inputs (performance, memory)\n\n### SECURITY BASICS\n- No hardcoded secrets/credentials in code\n- No obvious injection vulnerabilities\n- Input validation at boundaries\n\n### RESOURCE MANAGEMENT\n- Files opened = files closed\n- Connections opened = connections closed\n- No obvious memory leaks in long-running code\n\n### IDEMPOTENCY\n- Call the operation twice with same input - same result?\n- Retry the request - no duplicate side effects? (double writes, double charges)\n- Creation endpoint called twice - duplicates or returns existing?\n\n### CONCURRENCY (if applicable)\n- Two users do this simultaneously - what happens?\n- Both users edit same resource at same time - handled correctly?\n- Proper locking/transactions where needed?\n\n### RECOVERY\n- Operation fails MIDWAY - state clean or corrupted?\n- Partial writes: some data written but not all?\n- Retry after failure - works without problems?\n\n### AUTHORIZATION\n- Can user A access/modify user B's data?\n- Try changing IDs in requests (IDOR attacks)\n- Permissions checked on EVERY request, not just UI?\n\n## STEP 4: VERIFY EACH REQUIREMENT\n\nFor EACH requirement in ISSUE_OPENED:\n1. UNDERSTAND what was supposed to be built\n2. EXECUTE it yourself to verify it works\n3. 
DOCUMENT evidence (command + output)\n\n## APPROVAL CRITERIA\n\n**APPROVE only if:**\n- You PERSONALLY verified the feature works (not just read the code)\n- Happy path works end-to-end with REAL execution\n- No critical bugs found during edge case testing\n- Each requirement has evidence of verification\n\n**REJECT if:**\n- You couldn't figure out how to run it\n- Happy path fails\n- Critical bugs found (crashes, data corruption, security holes)\n- Requirements not actually implemented"
408
413
  },
409
414
  "contextStrategy": {
410
415
  "sources": [
@@ -107,6 +107,12 @@ ENV AWS_PAGER=""
107
107
  ENV CHROME_BIN=/usr/bin/chromium
108
108
  ENV PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium
109
109
 
110
+ # Copy zeroshot source into container and install globally
111
+ # CRITICAL: This enables isolation mode to use 'zeroshot task run' inside container
112
+ # which provides timeout, error handling, and log streaming infrastructure
113
+ COPY --chown=node:node . /tmp/zeroshot/
114
+ RUN cd /tmp/zeroshot && npm install && npm link
115
+
110
116
  # Install Claude CLI globally
111
117
  RUN npm install -g @anthropic-ai/claude-code
112
118
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@covibes/zeroshot",
3
- "version": "1.0.1",
3
+ "version": "1.1.3",
4
4
  "description": "Multi-agent orchestration engine for Claude - cluster coordinator and CLI",
5
5
  "main": "src/orchestrator.js",
6
6
  "bin": {
@@ -27,7 +27,8 @@
27
27
  "check": "npm run typecheck && npm run lint",
28
28
  "check:all": "npm run check && npm run deadcode:all",
29
29
  "release": "semantic-release",
30
- "prepublishOnly": "npm run lint && npm run typecheck && npm test"
30
+ "prepublishOnly": "npm run lint && npm run typecheck",
31
+ "prepare": "husky"
31
32
  },
32
33
  "c8": {
33
34
  "reporter": [
@@ -75,6 +76,7 @@
75
76
  "src/",
76
77
  "lib/",
77
78
  "cli/",
79
+ "task-lib/",
78
80
  "cluster-templates/",
79
81
  "hooks/",
80
82
  "docker/",
@@ -108,6 +110,7 @@
108
110
  "eslint": "^9.39.1",
109
111
  "eslint-config-prettier": "^10.1.8",
110
112
  "eslint-plugin-unused-imports": "^4.3.0",
113
+ "husky": "^9.1.7",
111
114
  "mocha": "^11.7.5",
112
115
  "semantic-release": "^25.0.2",
113
116
  "sinon": "^21.0.0",