@covibes/zeroshot 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +82 -0
- package/README.md +19 -8
- package/cli/index.js +146 -111
- package/cli/lib/first-run.js +11 -11
- package/cli/lib/update-checker.js +2 -1
- package/cluster-templates/base-templates/debug-workflow.json +75 -6
- package/cluster-templates/conductor-bootstrap.json +3 -3
- package/docker/zeroshot-cluster/Dockerfile +8 -1
- package/docker/zeroshot-cluster/pre-baked-deps.json +28 -0
- package/lib/settings.js +46 -4
- package/package.json +1 -1
- package/src/agent/agent-config.js +38 -3
- package/src/agent/agent-task-executor.js +229 -85
- package/src/agent-wrapper.js +49 -13
- package/src/config-validator.js +198 -0
- package/src/copy-worker.js +43 -0
- package/src/isolation-manager.js +328 -108
- package/src/orchestrator.js +93 -11
- package/src/preflight.js +28 -2
- package/src/process-metrics.js +16 -4
- package/src/status-footer.js +151 -42
package/cluster-templates/base-templates/debug-workflow.json
CHANGED

@@ -45,18 +45,32 @@
         "description": "Complete list of all failures/errors found"
       },
       "rootCauses": {
+        "type": "array",
+        "items": {
+          "type": "object",
+          "properties": {
+            "cause": { "type": "string", "description": "The root cause" },
+            "whyItsFundamental": { "type": "string", "description": "Why this is the ROOT cause, not a symptom" },
+            "howDiscovered": { "type": "string", "description": "Evidence trail that led to this conclusion" },
+            "affectedAreas": { "type": "array", "items": { "type": "string" }, "description": "ALL code areas affected by this cause" }
+          },
+          "required": ["cause", "whyItsFundamental", "howDiscovered", "affectedAreas"]
+        },
+        "description": "All independent root causes identified with proof they are fundamental"
+      },
+      "similarPatternLocations": {
         "type": "array",
         "items": { "type": "string" },
-        "description": "
+        "description": "ALL other files/locations where similar bug pattern exists (from codebase-wide scan)"
       },
       "evidence": { "type": "array", "items": { "type": "string" } },
       "fixPlan": { "type": "string" },
       "affectedFiles": { "type": "array", "items": { "type": "string" } }
     },
-    "required": ["successCriteria", "failureInventory", "rootCauses", "fixPlan"]
+    "required": ["successCriteria", "failureInventory", "rootCauses", "similarPatternLocations", "fixPlan"]
   },
   "prompt": {
-    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a debugging investigator.\n\n## CRITICAL: DEFINE SUCCESS FIRST\n\nBefore investigating, define what SUCCESS looks like from the USER's perspective:\n- User says 'fix failing tests' → success = ALL tests pass (0 failures)\n- User says 'fix the build' → success = build completes with exit 0\n- User says 'fix deployment' → success = deployment succeeds\n\nThis becomes your successCriteria. The task is NOT DONE until successCriteria is met.\n\n## Investigation Process\n\n1. **ENUMERATE ALL FAILURES FIRST**\n - Run the failing command/tests\n - List EVERY failure, error, and issue (not just the first one)\n - This is your failureInventory\n\n2. **Analyze for ROOT CAUSES (may be multiple)**\n - Group failures by likely cause\n - There may be 1 root cause or 5 - find them ALL\n - Don't stop at the first one you find\n\n3. **Gather evidence for each root cause**\n - Stack traces, logs, error messages\n - Prove each hypothesis\n\n4. **Plan fixes for ALL root causes**\n - The fix plan must address EVERY root cause\n - When complete, successCriteria must be achievable\n\n## Output\n- successCriteria: Measurable condition (e.g., '0 test failures', 'build exits 0')\n- failureInventory: COMPLETE list of all failures found\n- rootCauses:
+    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a debugging investigator.\n\n## CRITICAL: DEFINE SUCCESS FIRST\n\nBefore investigating, define what SUCCESS looks like from the USER's perspective:\n- User says 'fix failing tests' → success = ALL tests pass (0 failures)\n- User says 'fix the build' → success = build completes with exit 0\n- User says 'fix deployment' → success = deployment succeeds\n\nThis becomes your successCriteria. The task is NOT DONE until successCriteria is met.\n\n## Investigation Process\n\n1. **ENUMERATE ALL FAILURES FIRST**\n - Run the failing command/tests\n - List EVERY failure, error, and issue (not just the first one)\n - This is your failureInventory\n\n2. **Analyze for ROOT CAUSES (may be multiple)**\n - Group failures by likely cause\n - There may be 1 root cause or 5 - find them ALL\n - Don't stop at the first one you find\n - For EACH root cause, document:\n * The cause itself\n * WHY it's the ROOT cause (not a symptom)\n * HOW you discovered it (evidence trail)\n * ALL code areas affected by this cause\n\n3. **Gather evidence for each root cause**\n - Stack traces, logs, error messages\n - Prove each hypothesis\n\n4. **MANDATORY: SIMILARITY SCAN**\n After identifying root causes, search the ENTIRE codebase for similar patterns:\n - Use grep/glob to find ALL occurrences of the same antipattern\n - Check if the same mistake exists in other files/functions\n - List EVERY location in similarPatternLocations\n - The fixer MUST fix ALL of them, not just the originally failing one\n\n5. **Plan fixes for ALL root causes**\n - The fix plan must address EVERY root cause\n - The fix plan must include ALL similar pattern locations\n - When complete, successCriteria must be achievable\n\n## Output\n- successCriteria: Measurable condition (e.g., '0 test failures', 'build exits 0')\n- failureInventory: COMPLETE list of all failures found\n- rootCauses: Array of objects, each with: cause, whyItsFundamental, howDiscovered, affectedAreas\n- similarPatternLocations: ALL files where similar bug pattern exists (from codebase scan)\n- evidence: Proof for each root cause\n- fixPlan: How to fix ALL root causes AND all similar pattern locations\n- affectedFiles: All files that need changes\n\n## CRITICAL\n- Do NOT narrow scope - enumerate EVERYTHING broken\n- Do NOT stop at first root cause - there may be more\n- Do NOT skip the similarity scan - same bug likely exists elsewhere\n- successCriteria comes from USER INTENT, not from what you find"
   },
   "contextStrategy": {
     "sources": [{ "topic": "ISSUE_OPENED", "limit": 1 }],
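For orientation, here is a hypothetical investigator result that the expanded schema would accept. Only the field names and shapes come from the schema above; every value is invented for illustration:

```js
// Hypothetical result object matching the expanded investigator schema.
// Field names/shapes come from the diff; all values are illustrative.
const investigationResult = {
  successCriteria: '0 test failures in npm test',
  failureInventory: ['auth.test.ts: 3 failures', 'session.test.ts: 1 failure'],
  rootCauses: [
    {
      cause: 'Token expiry compared in seconds vs milliseconds',
      whyItsFundamental: 'Every failing assertion traces back to this one comparison',
      howDiscovered: 'All stack traces pass through isExpired() in auth/token.js',
      affectedAreas: ['auth/token.js', 'auth/session.js'],
    },
  ],
  similarPatternLocations: ['billing/trial.js'], // same ms-vs-s comparison, found by grep
  evidence: ['jest output (4 failures)', 'git blame on isExpired()'],
  fixPlan: 'Normalize both sides to milliseconds in all three locations',
  affectedFiles: ['auth/token.js', 'auth/session.js', 'billing/trial.js'],
};
```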
@@ -75,6 +89,7 @@
         "successCriteria": "{{result.successCriteria}}",
         "failureInventory": "{{result.failureInventory}}",
         "rootCauses": "{{result.rootCauses}}",
+        "similarPatternLocations": "{{result.similarPatternLocations}}",
         "evidence": "{{result.evidence}}",
         "affectedFiles": "{{result.affectedFiles}}"
       }
@@ -89,7 +104,7 @@
   "model": "{{fixer_model}}",
   "timeout": "{{timeout}}",
   "prompt": {
-    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\n## 🚫 GIT OPERATIONS - FORBIDDEN\n\nNEVER commit, push, or create PRs. You only modify files.\nThe git-pusher agent handles ALL git operations AFTER validators approve.\n\n- ❌ NEVER run: git add, git commit, git push, gh pr create\n- ❌ NEVER suggest committing changes\n- ✅ Only modify files and publish your completion message when done\n\nYou are a bug fixer. Apply the fix from the investigator.\n\n## Your Job\nFix the root cause identified in
+    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\n## 🚫 GIT OPERATIONS - FORBIDDEN\n\nNEVER commit, push, or create PRs. You only modify files.\nThe git-pusher agent handles ALL git operations AFTER validators approve.\n\n- ❌ NEVER run: git add, git commit, git push, gh pr create\n- ❌ NEVER suggest committing changes\n- ✅ Only modify files and publish your completion message when done\n\nYou are a bug fixer. Apply the fix from the investigator.\n\n## Your Job\nFix ALL root causes identified in INVESTIGATION_COMPLETE.\n\n## 🔴 MANDATORY: ROOT CAUSE MAPPING\n\nFor EACH root cause from the investigator, you MUST:\n1. Quote the exact cause from INVESTIGATION_COMPLETE\n2. Describe your fix for that specific cause\n3. List files changed for this cause\n4. Explain WHY this is a ROOT fix, not a band-aid\n\nIf a root cause has NO corresponding fix, your work is INCOMPLETE.\nIf you add a fix not mapped to a root cause, JUSTIFY why.\n\n## 🔴 MANDATORY: FIX ALL SIMILAR PATTERN LOCATIONS\n\nThe investigator identified locations with similar bug patterns in similarPatternLocations.\nYou MUST fix ALL of them, not just the originally failing one.\nIf you skip any location, you MUST justify why it's NOT the same bug.\n\n## 🔴 MANDATORY: REGRESSION TESTS REQUIRED\n\nYou MUST add at least one test that:\n1. WOULD FAIL with the original buggy code\n2. PASSES with your fix\n3. Tests the SPECIFIC root cause, not just symptoms\n\nIf you claim existing tests cover this, you MUST:\n- Name the EXACT test file and test case\n- Explain WHY that test would have caught this bug\n- If it DIDN'T catch the bug before, explain why (flaky? not running? wrong assertion?)\n\nWEAK JUSTIFICATIONS WILL BE REJECTED:\n- ❌ 'Tests are hard to write for this'\n- ❌ 'No time for tests'\n- ❌ 'It's obvious it works'\n\nVALID JUSTIFICATIONS:\n- ✅ 'Test auth.test.ts:45 already asserts this exact edge case' (tester will verify)\n- ✅ 'Pure type change, no runtime behavior affected' (tester confirms with typecheck)\n\n## Fix Guidelines\n- Fix the ROOT CAUSE, not just the symptom\n- Make minimal changes (don't refactor unrelated code)\n- Add comments explaining WHY if fix is non-obvious\n\n## After Fixing\n- Run the failing tests to verify fix works\n- Run related tests for regressions\n\n## 🚀 LARGE TASKS - USE SUB-AGENTS\n\nIf task affects >10 files OR >50 errors, DO NOT fix manually. Use the Task tool to spawn parallel sub-agents:\n\n1. **Analyze scope first** - Count files/errors, group by directory or error type\n2. **Spawn sub-agents** - One per group, run in parallel\n3. **Choose model wisely:**\n - **haiku**: Mechanical fixes (unused vars, missing imports, simple type annotations)\n - **sonnet**: Complex fixes (refactoring, logic changes, architectural decisions)\n4. **Aggregate results** - Wait for all sub-agents, verify combined fix\n\nExample Task tool usage:\n```\nTask(prompt=\"Fix all @typescript-eslint/no-unused-vars errors in client/src/components/features/agents/. Prefix intentionally unused params with underscore, remove genuinely unused variables.\", model=\"haiku\")\n```\n\nDO NOT waste iterations doing manual work that sub-agents can parallelize.\n\n## 🔴 FORBIDDEN - DO NOT FUCKING DO THESE\n\nThese are SHORTCUTS that HIDE problems instead of FIXING them:\n\n- ❌ NEVER disable or suppress errors/warnings (config changes, disable comments, ignore directives)\n- ❌ NEVER change test expectations to match broken behavior\n- ❌ NEVER use unsafe type casts or `any` to silence type errors\n- ❌ NEVER add TODO/FIXME instead of actually fixing\n- ❌ NEVER work around the problem - FIX THE ACTUAL CODE\n\nIF THE PROBLEM STILL EXISTS BUT IS HIDDEN, YOU HAVE NOT FIXED IT.\n\n## On Rejection - READ THE FUCKING FEEDBACK\n\nWhen tester rejects:\n1. STOP. READ what they wrote. UNDERSTAND the issue.\n2. If same problem persists → your fix is WRONG, try DIFFERENT approach\n3. If new problems appeared → your fix BROKE something, REVERT and rethink\n4. Do NOT blindly retry the same approach\n5. If you are STUCK, say so. Do not waste iterations doing nothing.\n\nRepeating failed approaches = wasted time and money. LEARN from rejection."
   },
   "contextStrategy": {
     "sources": [
@@ -135,13 +150,67 @@
       "properties": {
         "approved": { "type": "boolean" },
         "summary": { "type": "string" },
+        "commandResult": {
+          "type": "object",
+          "properties": {
+            "command": { "type": "string", "description": "Exact command run to verify successCriteria" },
+            "exitCode": { "type": "integer", "description": "Exit code (0=pass, non-0=fail)" },
+            "output": { "type": "string", "description": "Command output (truncated if needed)" }
+          },
+          "required": ["command", "exitCode"]
+        },
+        "rootCauseVerification": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "cause": { "type": "string", "description": "Root cause from INVESTIGATION_COMPLETE" },
+              "addressed": { "type": "boolean", "description": "Whether fixer explicitly addressed this cause" },
+              "fixType": { "type": "string", "enum": ["root_fix", "band_aid", "not_addressed"], "description": "Quality of the fix" },
+              "notes": { "type": "string", "description": "Why you classified it this way" }
+            },
+            "required": ["cause", "addressed", "fixType"]
+          },
+          "description": "Verification of EACH root cause from investigator"
+        },
+        "similarLocationVerification": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "location": { "type": "string", "description": "Location from similarPatternLocations" },
+              "fixed": { "type": "boolean", "description": "Whether fixer addressed this location" },
+              "notes": { "type": "string", "description": "Why it was skipped (if not fixed)" }
+            },
+            "required": ["location", "fixed"]
+          },
+          "description": "Verification of ALL similar pattern locations"
+        },
+        "testVerification": {
+          "type": "object",
+          "properties": {
+            "newTestsAdded": { "type": "boolean", "description": "Whether fixer added new regression tests" },
+            "testQuality": { "type": "string", "enum": ["adequate", "trivial", "none"], "description": "Quality of added tests" },
+            "wouldFailWithOriginalBug": { "type": "boolean", "description": "Whether new tests would fail with original buggy code" },
+            "justificationValid": { "type": "boolean", "description": "If no tests added, whether fixer's justification was valid" },
+            "notes": { "type": "string", "description": "Details about test quality or justification review" }
+          },
+          "required": ["newTestsAdded", "testQuality"]
+        },
+        "regressionCheck": {
+          "type": "object",
+          "properties": {
+            "broaderTestsRun": { "type": "boolean", "description": "Whether broader test suite was run beyond successCriteria" },
+            "newFailures": { "type": "array", "items": { "type": "string" }, "description": "Any NEW failures introduced by the fix" }
+          }
+        },
         "errors": { "type": "array", "items": { "type": "string" } },
         "testResults": { "type": "string" }
       },
-      "required": ["approved", "summary"]
+      "required": ["approved", "summary", "commandResult", "rootCauseVerification", "testVerification"]
     },
     "prompt": {
-      "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a bug fix tester. Verify the fix FULLY satisfies the user's request.\n\n##
+      "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a bug fix tester. Verify the fix FULLY satisfies the user's request.\n\n## 🔴 VERIFICATION CHECKLIST (ALL MUST PASS)\n\n### A. Command Verification\n1. Run the EXACT command from successCriteria\n2. Record exit code in commandResult.exitCode\n3. If exit code != 0 → REJECT immediately (skip other checks)\n\n### B. Root Cause Coverage (CRITICAL)\nFor EACH root cause in INVESTIGATION_COMPLETE.rootCauses:\n1. Verify fixer explicitly addressed this cause\n2. Classify as: root_fix (proper fix), band_aid (symptom fix), or not_addressed\n3. If ANY cause is not_addressed → REJECT\n4. If ANY cause is band_aid → REJECT (explain why it's a band-aid)\n\n### C. Similar Location Coverage\nFor EACH location in INVESTIGATION_COMPLETE.similarPatternLocations:\n1. Verify fixer addressed this location\n2. If any location skipped without justification → REJECT\n3. Valid skip: Fixer explained why it's NOT the same bug\n\n### D. Test Quality Verification\nCheck if fixer added new regression tests:\n\n**If new tests were added:**\n- Tests must have REAL assertions (not just `expect(x).toBeDefined()`)\n- Tests would FAIL with original buggy code (verify by reading test logic)\n- No test antipatterns: mocking expected results, timing dependencies, shared state\n- If tests are trivial → REJECT\n\n**If NO new tests added:**\n- Fixer MUST have provided justification\n- VERIFY the justification is valid:\n - 'Test X already covers this' → Read test X, confirm it would catch this bug\n - 'Pure type change' → Verify via git diff that it's behavior-preserving\n- Weak justifications → REJECT:\n - ❌ 'Tests are hard to write'\n - ❌ 'No time for tests'\n - ❌ 'It obviously works'\n\n### E. Regression Check (Smart Tiering)\nBased on successCriteria scope:\n- If narrow (single test file) → Run related tests in same directory\n- If medium (one function/endpoint) → Run parent test suite\n- If broad (npm test) → Already running full suite, skip additional\n- Record any NEW failures in regressionCheck.newFailures\n\n## FORBIDDEN RATIONALIZATIONS\n- ❌ 'The original bug is fixed but a new unrelated bug appeared' → REJECT\n- ❌ 'This is a test environment issue' → REJECT\n- ❌ 'The failure is not in failureInventory' → REJECT\n- ❌ 'Progress was made' → REJECT (not done until successCriteria met)\n- ❌ 'Most root causes were addressed' → REJECT (ALL must be addressed)\n\n## APPROVAL CRITERIA\nONLY approve if ALL of the following are true:\n1. successCriteria command exits 0\n2. ALL root causes addressed with root_fix (no band-aids, no skips)\n3. ALL similar locations fixed (or validly justified as different)\n4. Tests added OR valid justification for no tests\n5. No new regression failures introduced\n\n## Output Fields (Required)\n- approved: boolean\n- summary: 'SUCCESS CRITERIA MET' or 'REJECTED: [specific reason]'\n- commandResult: { command, exitCode, output }\n- rootCauseVerification: Array of { cause, addressed, fixType, notes }\n- similarLocationVerification: Array of { location, fixed, notes }\n- testVerification: { newTestsAdded, testQuality, wouldFailWithOriginalBug, justificationValid, notes }\n- regressionCheck: { broaderTestsRun, newFailures }\n- errors: Array of all issues found\n- testResults: Full command output"
     },
     "contextStrategy": {
       "sources": [
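Correspondingly, a tester verdict that the new required fields would accept might look like the sketch below. All values are invented; only the field names and enums come from the schema above:

```js
// Hypothetical tester output satisfying the new required fields
// (approved, summary, commandResult, rootCauseVerification, testVerification).
const testerVerdict = {
  approved: true,
  summary: 'SUCCESS CRITERIA MET',
  commandResult: { command: 'npm test', exitCode: 0, output: 'Tests: 42 passed' },
  rootCauseVerification: [
    { cause: 'ms-vs-s comparison', addressed: true, fixType: 'root_fix', notes: 'Both sides normalized to ms' },
  ],
  similarLocationVerification: [{ location: 'billing/trial.js', fixed: true }],
  testVerification: { newTestsAdded: true, testQuality: 'adequate', wouldFailWithOriginalBug: true },
  regressionCheck: { broaderTestsRun: true, newFailures: [] },
  errors: [],
  testResults: 'Tests: 42 passed, 42 total',
};
```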
package/cluster-templates/conductor-bootstrap.json
CHANGED

@@ -40,7 +40,7 @@
       "topic": "ISSUE_OPENED",
       "logic": {
         "engine": "javascript",
-        "script": "return message.sender === 'system';"
+        "script": "return message.sender === 'system' && !message.metadata?._republished;"
       },
       "action": "execute_task"
     }
@@ -50,7 +50,7 @@
       "action": "publish_message",
       "transform": {
         "engine": "javascript",
-        "script": "const { complexity, taskType, reasoning } = result;\nconst taskText = triggeringMessage.content?.text || '';\n\nif (complexity === 'UNCERTAIN') {\n  return {\n    topic: 'CONDUCTOR_ESCALATE',\n    content: {\n      text: reasoning,\n      data: { complexity, taskType, reasoning, taskText }\n    }\n  };\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `[${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText } }\n      ]\n    }\n  }\n};"
+        "script": "const { complexity, taskType, reasoning } = result;\nconst taskText = triggeringMessage.content?.text || '';\n\nif (complexity === 'UNCERTAIN') {\n  return {\n    topic: 'CONDUCTOR_ESCALATE',\n    content: {\n      text: reasoning,\n      data: { complexity, taskType, reasoning, taskText }\n    }\n  };\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `[${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText }, metadata: { _republished: true } }\n      ]\n    }\n  }\n};"
       }
     }
   }
@@ -113,7 +113,7 @@
       "action": "publish_message",
       "transform": {
         "engine": "javascript",
-        "script": "const { complexity, taskType, reasoning } = result;\n\nlet taskText = triggeringMessage.content?.data?.taskText || '';\nif (!taskText) {\n  taskText = triggeringMessage.content?.text || '';\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `Senior: [${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText } }\n      ]\n    }\n  }\n};"
+        "script": "const { complexity, taskType, reasoning } = result;\n\nlet taskText = triggeringMessage.content?.data?.taskText || '';\nif (!taskText) {\n  taskText = triggeringMessage.content?.text || '';\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `Senior: [${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText }, metadata: { _republished: true } }\n      ]\n    }\n  }\n};"
       }
     }
   }
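These three changes work together to close a re-trigger loop: after classifying a task and loading a cluster config, the conductor republishes the original task to ISSUE_OPENED, and without a marker its own trigger would fire on that republished message and triage it again. A minimal sketch of the guard, inlined from the `script` strings above (the real cluster engine evaluates those strings as trigger expressions; this is just the same logic as plain code):

```js
// Simplified model of the ISSUE_OPENED trigger guard from the diff above.
function shouldTriage(message) {
  // Fires only for fresh system-sent issues, not ones the conductor republished.
  return message.sender === 'system' && !message.metadata?._republished;
}

const fresh = { sender: 'system', content: { text: 'fix failing tests' } };
const republished = { sender: 'system', metadata: { _republished: true } };

console.log(shouldTriage(fresh));       // true  -> conductor triages it
console.log(shouldTriage(republished)); // false -> the loaded cluster handles it instead
```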
package/docker/zeroshot-cluster/Dockerfile
CHANGED

@@ -2,7 +2,7 @@
 # Provides: Node.js, Python, Git, Chromium, Claude CLI, Playwright deps, Infrastructure tools
 #
 # Build: docker build -t vibe-cluster-base vibe/cluster/docker/vibe-cluster/
-# Usage:
+# Usage: zeroshot run <task> --docker
 
 FROM node:20-slim
 
@@ -120,6 +120,13 @@ ENV PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium
 COPY --chown=node:node . /tmp/zeroshot/
 RUN cd /tmp/zeroshot && npm install && npm link
 
+# Pre-bake common npm dependencies (Issue #20: 30-40% startup reduction)
+# These are installed once at image build time, then copied to workspaces at runtime
+# if the project's dependencies are satisfied by the pre-baked set.
+COPY docker/zeroshot-cluster/pre-baked-deps.json /pre-baked-deps/package.json
+RUN cd /pre-baked-deps && npm install --ignore-scripts \
+    && chown -R node:node /pre-baked-deps
+
 # Install Claude CLI globally
 RUN npm install -g @anthropic-ai/claude-code
 
package/docker/zeroshot-cluster/pre-baked-deps.json
ADDED

@@ -0,0 +1,28 @@
+{
+  "name": "pre-baked-deps",
+  "version": "1.0.0",
+  "description": "Pre-baked npm dependencies for zeroshot isolation containers",
+  "private": true,
+  "dependencies": {
+    "typescript": "^5.3.0",
+    "eslint": "^8.56.0",
+    "@typescript-eslint/parser": "^6.19.0",
+    "@typescript-eslint/eslint-plugin": "^6.19.0",
+    "prettier": "^3.2.0",
+    "jest": "^29.7.0",
+    "@types/jest": "^29.5.0",
+    "ts-jest": "^29.1.0",
+    "@types/node": "^20.11.0",
+    "tsx": "^4.7.0",
+    "esbuild": "^0.19.0",
+    "dotenv": "^16.4.0",
+    "glob": "^10.3.0",
+    "chalk": "^5.3.0",
+    "commander": "^12.0.0",
+    "axios": "^1.6.0",
+    "node-fetch": "^3.3.0",
+    "zod": "^3.22.0",
+    "lodash": "^4.17.0",
+    "@types/lodash": "^4.14.0"
+  }
+}
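The Dockerfile comment describes a runtime decision: reuse the pre-baked node_modules only when the project's declared dependencies are satisfied by the pre-baked set. The new src/copy-worker.js presumably performs the copy; the check below is only a hedged sketch of how such a satisfaction test could look, not the package's actual code (the function name, paths, and use of node-semver are all assumptions):

```js
// Hypothetical sketch of a "pre-baked deps satisfy this project" check.
const fs = require('fs');
const semver = require('semver'); // assumes node-semver is available

function preBakedSatisfies(projectPkgPath, preBakedPkgPath = '/pre-baked-deps/package.json') {
  const project = JSON.parse(fs.readFileSync(projectPkgPath, 'utf8'));
  const preBaked = JSON.parse(fs.readFileSync(preBakedPkgPath, 'utf8')).dependencies;
  const wanted = { ...project.dependencies, ...project.devDependencies };
  // Every wanted package must exist in the pre-baked set with an overlapping range;
  // a real implementation might demand a stricter subset test than intersects().
  return Object.entries(wanted).every(
    ([name, range]) => preBaked[name] && semver.intersects(preBaked[name], range)
  );
}
```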
package/lib/settings.js
CHANGED

@@ -16,11 +16,49 @@ function getSettingsFile() {
   return process.env.ZEROSHOT_SETTINGS_FILE || path.join(os.homedir(), '.zeroshot', 'settings.json');
 }
 
+/**
+ * Model hierarchy for cost ceiling validation
+ * Higher number = more expensive/capable model
+ */
+const MODEL_HIERARCHY = {
+  opus: 3,
+  sonnet: 2,
+  haiku: 1,
+};
+
+const VALID_MODELS = Object.keys(MODEL_HIERARCHY);
+
+/**
+ * Validate a requested model against the maxModel ceiling
+ * @param {string} requestedModel - Model the agent wants to use
+ * @param {string} maxModel - Maximum allowed model (cost ceiling)
+ * @returns {string} The validated model
+ * @throws {Error} If requested model exceeds ceiling
+ */
+function validateModelAgainstMax(requestedModel, maxModel) {
+  if (!requestedModel) return maxModel; // Default to ceiling if unspecified
+
+  if (!VALID_MODELS.includes(requestedModel)) {
+    throw new Error(`Invalid model "${requestedModel}". Valid: ${VALID_MODELS.join(', ')}`);
+  }
+  if (!VALID_MODELS.includes(maxModel)) {
+    throw new Error(`Invalid maxModel "${maxModel}". Valid: ${VALID_MODELS.join(', ')}`);
+  }
+
+  if (MODEL_HIERARCHY[requestedModel] > MODEL_HIERARCHY[maxModel]) {
+    throw new Error(
+      `Agent requests "${requestedModel}" but maxModel is "${maxModel}". ` +
+      `Either lower agent's model or raise maxModel.`
+    );
+  }
+  return requestedModel;
+}
+
 // Default settings
 const DEFAULT_SETTINGS = {
-
+  maxModel: 'sonnet', // Cost ceiling - agents cannot use models above this
   defaultConfig: 'conductor-bootstrap',
-
+  defaultDocker: false,
   strictSchema: true, // true = reliable json output (default), false = live streaming (may crash - see bold-meadow-11)
   logLevel: 'normal',
   // Auto-update settings

@@ -69,8 +107,8 @@ function validateSetting(key, value) {
     return `Unknown setting: ${key}`;
   }
 
-  if (key === '
-    return `Invalid model: ${value}. Valid models:
+  if (key === 'maxModel' && !VALID_MODELS.includes(value)) {
+    return `Invalid model: ${value}. Valid models: ${VALID_MODELS.join(', ')}`;
   }
 
   if (key === 'logLevel' && !['quiet', 'normal', 'verbose'].includes(value)) {

@@ -108,6 +146,10 @@ module.exports = {
   coerceValue,
   DEFAULT_SETTINGS,
   getSettingsFile,
+  // Model validation exports
+  MODEL_HIERARCHY,
+  VALID_MODELS,
+  validateModelAgainstMax,
   // Backward compatibility: SETTINGS_FILE as getter (reads env var dynamically)
   get SETTINGS_FILE() {
     return getSettingsFile();
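Since the validator and hierarchy are now exported, they can be exercised directly. A usage sketch (the require path assumes the in-repo layout):

```js
const { validateModelAgainstMax, VALID_MODELS } = require('./lib/settings');

console.log(VALID_MODELS);                               // ['opus', 'sonnet', 'haiku']
console.log(validateModelAgainstMax('haiku', 'sonnet')); // 'haiku'  (within ceiling)
console.log(validateModelAgainstMax(null, 'sonnet'));    // 'sonnet' (defaults to ceiling)

try {
  validateModelAgainstMax('opus', 'sonnet');
} catch (err) {
  console.error(err.message); // Agent requests "opus" but maxModel is "sonnet". ...
}
```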
package/src/agent/agent-config.js
CHANGED

@@ -6,8 +6,11 @@
  * - Default values for optional fields
  * - Model configuration setup
  * - Safety checks for test mode
+ * - maxModel ceiling enforcement at config time
  */
 
+const { loadSettings, validateModelAgainstMax } = require('../../lib/settings');
+
 // Default max iterations (high limit - let the user decide when to give up)
 const DEFAULT_MAX_ITERATIONS = 100;
 

@@ -55,11 +58,40 @@ function validateAgentConfig(config, options = {}) {
   }
 
   // Model configuration: support both static model and dynamic rules
+  // If no model specified, model is null - _selectModel() will use maxModel as default
   let modelConfig;
   if (config.modelRules) {
     modelConfig = { type: 'rules', rules: config.modelRules };
   } else {
-    modelConfig = { type: 'static', model: config.model ||
+    modelConfig = { type: 'static', model: config.model || null };
+  }
+
+  // COST CEILING ENFORCEMENT: Validate model(s) against maxModel at config time
+  // Catches violations EARLY (config load) instead of at runtime (iteration N)
+  const settings = loadSettings();
+  const maxModel = settings.maxModel || 'sonnet';
+
+  if (modelConfig.type === 'static' && modelConfig.model) {
+    // Static model: validate once
+    try {
+      validateModelAgainstMax(modelConfig.model, maxModel);
+    } catch (error) {
+      throw new Error(`Agent "${config.id}": ${error.message}`);
+    }
+  } else if (modelConfig.type === 'rules') {
+    // Dynamic rules: validate ALL rules upfront (don't wait until iteration N)
+    for (const rule of modelConfig.rules) {
+      if (rule.model) {
+        try {
+          validateModelAgainstMax(rule.model, maxModel);
+        } catch {
+          throw new Error(
+            `Agent "${config.id}": modelRule "${rule.iterations}" requests "${rule.model}" ` +
+            `but maxModel is "${maxModel}". Either lower the rule's model or raise maxModel.`
+          );
+        }
+      }
+    }
   }
 
   // Prompt configuration: support static prompt OR iteration-based rules

@@ -116,9 +148,12 @@ function validateAgentConfig(config, options = {}) {
     (trigger) => !trigger.action || trigger.action === 'execute_task'
   );
 
-
+  // Accept either mockSpawnFn OR taskRunner as valid mock sources
+  const hasMock = options.mockSpawnFn || options.taskRunner;
+
+  if (options.testMode && !hasMock && executesTask) {
     throw new Error(
-      `AgentWrapper: testMode=true but no mockSpawnFn provided for agent '${config.id}'. ` +
+      `AgentWrapper: testMode=true but no mockSpawnFn/taskRunner provided for agent '${config.id}'. ` +
      `This would cause real Claude API calls. ABORTING.`
     );
   }
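The practical effect is that ceiling violations now surface when the config is validated rather than mid-run. A sketch, assuming validateAgentConfig is exported from this module and that a bare { id, model } config passes its other checks:

```js
const { validateAgentConfig } = require('./src/agent/agent-config'); // path assumed

try {
  // With the default maxModel of 'sonnet', requesting opus fails at config load...
  validateAgentConfig({ id: 'investigator', model: 'opus' });
} catch (err) {
  // ...instead of surfacing at iteration N of a running cluster.
  console.error(err.message);
  // e.g. Agent "investigator": Agent requests "opus" but maxModel is "sonnet". ...
}
```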