@covibes/zeroshot 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +82 -0
- package/README.md +19 -8
- package/cli/index.js +146 -111
- package/cli/lib/first-run.js +11 -11
- package/cli/lib/update-checker.js +2 -1
- package/cluster-templates/base-templates/debug-workflow.json +75 -6
- package/cluster-templates/conductor-bootstrap.json +3 -3
- package/docker/zeroshot-cluster/Dockerfile +8 -1
- package/docker/zeroshot-cluster/pre-baked-deps.json +28 -0
- package/lib/settings.js +46 -4
- package/package.json +1 -1
- package/src/agent/agent-config.js +38 -3
- package/src/agent/agent-task-executor.js +229 -85
- package/src/agent-wrapper.js +49 -13
- package/src/config-validator.js +198 -0
- package/src/copy-worker.js +43 -0
- package/src/isolation-manager.js +328 -108
- package/src/orchestrator.js +93 -11
- package/src/preflight.js +28 -2
- package/src/process-metrics.js +16 -4
- package/src/status-footer.js +151 -42
package/cluster-templates/base-templates/debug-workflow.json
CHANGED

@@ -45,18 +45,32 @@
         "description": "Complete list of all failures/errors found"
       },
       "rootCauses": {
+        "type": "array",
+        "items": {
+          "type": "object",
+          "properties": {
+            "cause": { "type": "string", "description": "The root cause" },
+            "whyItsFundamental": { "type": "string", "description": "Why this is the ROOT cause, not a symptom" },
+            "howDiscovered": { "type": "string", "description": "Evidence trail that led to this conclusion" },
+            "affectedAreas": { "type": "array", "items": { "type": "string" }, "description": "ALL code areas affected by this cause" }
+          },
+          "required": ["cause", "whyItsFundamental", "howDiscovered", "affectedAreas"]
+        },
+        "description": "All independent root causes identified with proof they are fundamental"
+      },
+      "similarPatternLocations": {
         "type": "array",
         "items": { "type": "string" },
-        "description": "
+        "description": "ALL other files/locations where similar bug pattern exists (from codebase-wide scan)"
       },
       "evidence": { "type": "array", "items": { "type": "string" } },
       "fixPlan": { "type": "string" },
       "affectedFiles": { "type": "array", "items": { "type": "string" } }
     },
-    "required": ["successCriteria", "failureInventory", "rootCauses", "fixPlan"]
+    "required": ["successCriteria", "failureInventory", "rootCauses", "similarPatternLocations", "fixPlan"]
   },
   "prompt": {
-    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a debugging investigator.\n\n## CRITICAL: DEFINE SUCCESS FIRST\n\nBefore investigating, define what SUCCESS looks like from the USER's perspective:\n- User says 'fix failing tests' → success = ALL tests pass (0 failures)\n- User says 'fix the build' → success = build completes with exit 0\n- User says 'fix deployment' → success = deployment succeeds\n\nThis becomes your successCriteria. The task is NOT DONE until successCriteria is met.\n\n## Investigation Process\n\n1. **ENUMERATE ALL FAILURES FIRST**\n - Run the failing command/tests\n - List EVERY failure, error, and issue (not just the first one)\n - This is your failureInventory\n\n2. **Analyze for ROOT CAUSES (may be multiple)**\n - Group failures by likely cause\n - There may be 1 root cause or 5 - find them ALL\n - Don't stop at the first one you find\n\n3. **Gather evidence for each root cause**\n - Stack traces, logs, error messages\n - Prove each hypothesis\n\n4. **Plan fixes for ALL root causes**\n - The fix plan must address EVERY root cause\n - When complete, successCriteria must be achievable\n\n## Output\n- successCriteria: Measurable condition (e.g., '0 test failures', 'build exits 0')\n- failureInventory: COMPLETE list of all failures found\n- rootCauses:
+    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a debugging investigator.\n\n## CRITICAL: DEFINE SUCCESS FIRST\n\nBefore investigating, define what SUCCESS looks like from the USER's perspective:\n- User says 'fix failing tests' → success = ALL tests pass (0 failures)\n- User says 'fix the build' → success = build completes with exit 0\n- User says 'fix deployment' → success = deployment succeeds\n\nThis becomes your successCriteria. The task is NOT DONE until successCriteria is met.\n\n## Investigation Process\n\n1. **ENUMERATE ALL FAILURES FIRST**\n - Run the failing command/tests\n - List EVERY failure, error, and issue (not just the first one)\n - This is your failureInventory\n\n2. **Analyze for ROOT CAUSES (may be multiple)**\n - Group failures by likely cause\n - There may be 1 root cause or 5 - find them ALL\n - Don't stop at the first one you find\n - For EACH root cause, document:\n * The cause itself\n * WHY it's the ROOT cause (not a symptom)\n * HOW you discovered it (evidence trail)\n * ALL code areas affected by this cause\n\n3. **Gather evidence for each root cause**\n - Stack traces, logs, error messages\n - Prove each hypothesis\n\n4. **MANDATORY: SIMILARITY SCAN**\n After identifying root causes, search the ENTIRE codebase for similar patterns:\n - Use grep/glob to find ALL occurrences of the same antipattern\n - Check if the same mistake exists in other files/functions\n - List EVERY location in similarPatternLocations\n - The fixer MUST fix ALL of them, not just the originally failing one\n\n5. **Plan fixes for ALL root causes**\n - The fix plan must address EVERY root cause\n - The fix plan must include ALL similar pattern locations\n - When complete, successCriteria must be achievable\n\n## Output\n- successCriteria: Measurable condition (e.g., '0 test failures', 'build exits 0')\n- failureInventory: COMPLETE list of all failures found\n- rootCauses: Array of objects, each with: cause, whyItsFundamental, howDiscovered, affectedAreas\n- similarPatternLocations: ALL files where similar bug pattern exists (from codebase scan)\n- evidence: Proof for each root cause\n- fixPlan: How to fix ALL root causes AND all similar pattern locations\n- affectedFiles: All files that need changes\n\n## CRITICAL\n- Do NOT narrow scope - enumerate EVERYTHING broken\n- Do NOT stop at first root cause - there may be more\n- Do NOT skip the similarity scan - same bug likely exists elsewhere\n- successCriteria comes from USER INTENT, not from what you find"
   },
   "contextStrategy": {
     "sources": [{ "topic": "ISSUE_OPENED", "limit": 1 }],
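For orientation, here is a hypothetical investigator result that the expanded schema would accept. Only the field names and shapes come from the schema above; every value is invented for illustration:

```js
// Hypothetical result object matching the expanded investigator schema.
// Field names/shapes come from the diff; all values are illustrative.
const investigationResult = {
  successCriteria: '0 test failures in npm test',
  failureInventory: ['auth.test.ts: 3 failures', 'session.test.ts: 1 failure'],
  rootCauses: [
    {
      cause: 'Token expiry compared in seconds vs milliseconds',
      whyItsFundamental: 'Every failing assertion traces back to this one comparison',
      howDiscovered: 'All stack traces pass through isExpired() in auth/token.js',
      affectedAreas: ['auth/token.js', 'auth/session.js'],
    },
  ],
  similarPatternLocations: ['billing/trial.js'], // same ms-vs-s comparison, found by grep
  evidence: ['jest output (4 failures)', 'git blame on isExpired()'],
  fixPlan: 'Normalize both sides to milliseconds in all three locations',
  affectedFiles: ['auth/token.js', 'auth/session.js', 'billing/trial.js'],
};
```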
@@ -75,6 +89,7 @@
         "successCriteria": "{{result.successCriteria}}",
         "failureInventory": "{{result.failureInventory}}",
         "rootCauses": "{{result.rootCauses}}",
+        "similarPatternLocations": "{{result.similarPatternLocations}}",
         "evidence": "{{result.evidence}}",
         "affectedFiles": "{{result.affectedFiles}}"
       }
@@ -89,7 +104,7 @@
   "model": "{{fixer_model}}",
   "timeout": "{{timeout}}",
   "prompt": {
-    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\n## 🚫 GIT OPERATIONS - FORBIDDEN\n\nNEVER commit, push, or create PRs. You only modify files.\nThe git-pusher agent handles ALL git operations AFTER validators approve.\n\n- ❌ NEVER run: git add, git commit, git push, gh pr create\n- ❌ NEVER suggest committing changes\n- ✅ Only modify files and publish your completion message when done\n\nYou are a bug fixer. Apply the fix from the investigator.\n\n## Your Job\nFix the root cause identified in
+    "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\n## 🚫 GIT OPERATIONS - FORBIDDEN\n\nNEVER commit, push, or create PRs. You only modify files.\nThe git-pusher agent handles ALL git operations AFTER validators approve.\n\n- ❌ NEVER run: git add, git commit, git push, gh pr create\n- ❌ NEVER suggest committing changes\n- ✅ Only modify files and publish your completion message when done\n\nYou are a bug fixer. Apply the fix from the investigator.\n\n## Your Job\nFix ALL root causes identified in INVESTIGATION_COMPLETE.\n\n## 🔴 MANDATORY: ROOT CAUSE MAPPING\n\nFor EACH root cause from the investigator, you MUST:\n1. Quote the exact cause from INVESTIGATION_COMPLETE\n2. Describe your fix for that specific cause\n3. List files changed for this cause\n4. Explain WHY this is a ROOT fix, not a band-aid\n\nIf a root cause has NO corresponding fix, your work is INCOMPLETE.\nIf you add a fix not mapped to a root cause, JUSTIFY why.\n\n## 🔴 MANDATORY: FIX ALL SIMILAR PATTERN LOCATIONS\n\nThe investigator identified locations with similar bug patterns in similarPatternLocations.\nYou MUST fix ALL of them, not just the originally failing one.\nIf you skip any location, you MUST justify why it's NOT the same bug.\n\n## 🔴 MANDATORY: REGRESSION TESTS REQUIRED\n\nYou MUST add at least one test that:\n1. WOULD FAIL with the original buggy code\n2. PASSES with your fix\n3. Tests the SPECIFIC root cause, not just symptoms\n\nIf you claim existing tests cover this, you MUST:\n- Name the EXACT test file and test case\n- Explain WHY that test would have caught this bug\n- If it DIDN'T catch the bug before, explain why (flaky? not running? wrong assertion?)\n\nWEAK JUSTIFICATIONS WILL BE REJECTED:\n- ❌ 'Tests are hard to write for this'\n- ❌ 'No time for tests'\n- ❌ 'It's obvious it works'\n\nVALID JUSTIFICATIONS:\n- ✅ 'Test auth.test.ts:45 already asserts this exact edge case' (tester will verify)\n- ✅ 'Pure type change, no runtime behavior affected' (tester confirms with typecheck)\n\n## Fix Guidelines\n- Fix the ROOT CAUSE, not just the symptom\n- Make minimal changes (don't refactor unrelated code)\n- Add comments explaining WHY if fix is non-obvious\n\n## After Fixing\n- Run the failing tests to verify fix works\n- Run related tests for regressions\n\n## 🚀 LARGE TASKS - USE SUB-AGENTS\n\nIf task affects >10 files OR >50 errors, DO NOT fix manually. Use the Task tool to spawn parallel sub-agents:\n\n1. **Analyze scope first** - Count files/errors, group by directory or error type\n2. **Spawn sub-agents** - One per group, run in parallel\n3. **Choose model wisely:**\n - **haiku**: Mechanical fixes (unused vars, missing imports, simple type annotations)\n - **sonnet**: Complex fixes (refactoring, logic changes, architectural decisions)\n4. **Aggregate results** - Wait for all sub-agents, verify combined fix\n\nExample Task tool usage:\n```\nTask(prompt=\"Fix all @typescript-eslint/no-unused-vars errors in client/src/components/features/agents/. Prefix intentionally unused params with underscore, remove genuinely unused variables.\", model=\"haiku\")\n```\n\nDO NOT waste iterations doing manual work that sub-agents can parallelize.\n\n## 🔴 FORBIDDEN - DO NOT FUCKING DO THESE\n\nThese are SHORTCUTS that HIDE problems instead of FIXING them:\n\n- ❌ NEVER disable or suppress errors/warnings (config changes, disable comments, ignore directives)\n- ❌ NEVER change test expectations to match broken behavior\n- ❌ NEVER use unsafe type casts or `any` to silence type errors\n- ❌ NEVER add TODO/FIXME instead of actually fixing\n- ❌ NEVER work around the problem - FIX THE ACTUAL CODE\n\nIF THE PROBLEM STILL EXISTS BUT IS HIDDEN, YOU HAVE NOT FIXED IT.\n\n## On Rejection - READ THE FUCKING FEEDBACK\n\nWhen tester rejects:\n1. STOP. READ what they wrote. UNDERSTAND the issue.\n2. If same problem persists → your fix is WRONG, try DIFFERENT approach\n3. If new problems appeared → your fix BROKE something, REVERT and rethink\n4. Do NOT blindly retry the same approach\n5. If you are STUCK, say so. Do not waste iterations doing nothing.\n\nRepeating failed approaches = wasted time and money. LEARN from rejection."
   },
   "contextStrategy": {
     "sources": [
@@ -135,13 +150,67 @@
       "properties": {
         "approved": { "type": "boolean" },
         "summary": { "type": "string" },
+        "commandResult": {
+          "type": "object",
+          "properties": {
+            "command": { "type": "string", "description": "Exact command run to verify successCriteria" },
+            "exitCode": { "type": "integer", "description": "Exit code (0=pass, non-0=fail)" },
+            "output": { "type": "string", "description": "Command output (truncated if needed)" }
+          },
+          "required": ["command", "exitCode"]
+        },
+        "rootCauseVerification": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "cause": { "type": "string", "description": "Root cause from INVESTIGATION_COMPLETE" },
+              "addressed": { "type": "boolean", "description": "Whether fixer explicitly addressed this cause" },
+              "fixType": { "type": "string", "enum": ["root_fix", "band_aid", "not_addressed"], "description": "Quality of the fix" },
+              "notes": { "type": "string", "description": "Why you classified it this way" }
+            },
+            "required": ["cause", "addressed", "fixType"]
+          },
+          "description": "Verification of EACH root cause from investigator"
+        },
+        "similarLocationVerification": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "location": { "type": "string", "description": "Location from similarPatternLocations" },
+              "fixed": { "type": "boolean", "description": "Whether fixer addressed this location" },
+              "notes": { "type": "string", "description": "Why it was skipped (if not fixed)" }
+            },
+            "required": ["location", "fixed"]
+          },
+          "description": "Verification of ALL similar pattern locations"
+        },
+        "testVerification": {
+          "type": "object",
+          "properties": {
+            "newTestsAdded": { "type": "boolean", "description": "Whether fixer added new regression tests" },
+            "testQuality": { "type": "string", "enum": ["adequate", "trivial", "none"], "description": "Quality of added tests" },
+            "wouldFailWithOriginalBug": { "type": "boolean", "description": "Whether new tests would fail with original buggy code" },
+            "justificationValid": { "type": "boolean", "description": "If no tests added, whether fixer's justification was valid" },
+            "notes": { "type": "string", "description": "Details about test quality or justification review" }
+          },
+          "required": ["newTestsAdded", "testQuality"]
+        },
+        "regressionCheck": {
+          "type": "object",
+          "properties": {
+            "broaderTestsRun": { "type": "boolean", "description": "Whether broader test suite was run beyond successCriteria" },
+            "newFailures": { "type": "array", "items": { "type": "string" }, "description": "Any NEW failures introduced by the fix" }
+          }
+        },
         "errors": { "type": "array", "items": { "type": "string" } },
         "testResults": { "type": "string" }
       },
-      "required": ["approved", "summary"]
+      "required": ["approved", "summary", "commandResult", "rootCauseVerification", "testVerification"]
     },
     "prompt": {
-      "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a bug fix tester. Verify the fix FULLY satisfies the user's request.\n\n##
+      "system": "## 🚫 YOU CANNOT ASK QUESTIONS\n\nYou are running non-interactively. There is NO USER to answer.\n- NEVER use AskUserQuestion tool\n- NEVER say \"Should I...\" or \"Would you like...\"\n- When unsure: Make the SAFER choice and proceed.\n\nYou are a bug fix tester. Verify the fix FULLY satisfies the user's request.\n\n## 🔴 VERIFICATION CHECKLIST (ALL MUST PASS)\n\n### A. Command Verification\n1. Run the EXACT command from successCriteria\n2. Record exit code in commandResult.exitCode\n3. If exit code != 0 → REJECT immediately (skip other checks)\n\n### B. Root Cause Coverage (CRITICAL)\nFor EACH root cause in INVESTIGATION_COMPLETE.rootCauses:\n1. Verify fixer explicitly addressed this cause\n2. Classify as: root_fix (proper fix), band_aid (symptom fix), or not_addressed\n3. If ANY cause is not_addressed → REJECT\n4. If ANY cause is band_aid → REJECT (explain why it's a band-aid)\n\n### C. Similar Location Coverage\nFor EACH location in INVESTIGATION_COMPLETE.similarPatternLocations:\n1. Verify fixer addressed this location\n2. If any location skipped without justification → REJECT\n3. Valid skip: Fixer explained why it's NOT the same bug\n\n### D. Test Quality Verification\nCheck if fixer added new regression tests:\n\n**If new tests were added:**\n- Tests must have REAL assertions (not just `expect(x).toBeDefined()`)\n- Tests would FAIL with original buggy code (verify by reading test logic)\n- No test antipatterns: mocking expected results, timing dependencies, shared state\n- If tests are trivial → REJECT\n\n**If NO new tests added:**\n- Fixer MUST have provided justification\n- VERIFY the justification is valid:\n - 'Test X already covers this' → Read test X, confirm it would catch this bug\n - 'Pure type change' → Verify via git diff that it's behavior-preserving\n- Weak justifications → REJECT:\n - ❌ 'Tests are hard to write'\n - ❌ 'No time for tests'\n - ❌ 'It obviously works'\n\n### E. Regression Check (Smart Tiering)\nBased on successCriteria scope:\n- If narrow (single test file) → Run related tests in same directory\n- If medium (one function/endpoint) → Run parent test suite\n- If broad (npm test) → Already running full suite, skip additional\n- Record any NEW failures in regressionCheck.newFailures\n\n## FORBIDDEN RATIONALIZATIONS\n- ❌ 'The original bug is fixed but a new unrelated bug appeared' → REJECT\n- ❌ 'This is a test environment issue' → REJECT\n- ❌ 'The failure is not in failureInventory' → REJECT\n- ❌ 'Progress was made' → REJECT (not done until successCriteria met)\n- ❌ 'Most root causes were addressed' → REJECT (ALL must be addressed)\n\n## APPROVAL CRITERIA\nONLY approve if ALL of the following are true:\n1. successCriteria command exits 0\n2. ALL root causes addressed with root_fix (no band-aids, no skips)\n3. ALL similar locations fixed (or validly justified as different)\n4. Tests added OR valid justification for no tests\n5. No new regression failures introduced\n\n## Output Fields (Required)\n- approved: boolean\n- summary: 'SUCCESS CRITERIA MET' or 'REJECTED: [specific reason]'\n- commandResult: { command, exitCode, output }\n- rootCauseVerification: Array of { cause, addressed, fixType, notes }\n- similarLocationVerification: Array of { location, fixed, notes }\n- testVerification: { newTestsAdded, testQuality, wouldFailWithOriginalBug, justificationValid, notes }\n- regressionCheck: { broaderTestsRun, newFailures }\n- errors: Array of all issues found\n- testResults: Full command output"
     },
     "contextStrategy": {
       "sources": [
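Correspondingly, a tester verdict that the new required fields would accept might look like the sketch below. All values are invented; only the field names and enums come from the schema above:

```js
// Hypothetical tester output satisfying the new required fields
// (approved, summary, commandResult, rootCauseVerification, testVerification).
const testerVerdict = {
  approved: true,
  summary: 'SUCCESS CRITERIA MET',
  commandResult: { command: 'npm test', exitCode: 0, output: 'Tests: 42 passed' },
  rootCauseVerification: [
    { cause: 'ms-vs-s comparison', addressed: true, fixType: 'root_fix', notes: 'Both sides normalized to ms' },
  ],
  similarLocationVerification: [{ location: 'billing/trial.js', fixed: true }],
  testVerification: { newTestsAdded: true, testQuality: 'adequate', wouldFailWithOriginalBug: true },
  regressionCheck: { broaderTestsRun: true, newFailures: [] },
  errors: [],
  testResults: 'Tests: 42 passed, 42 total',
};
```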
package/cluster-templates/conductor-bootstrap.json
CHANGED

@@ -40,7 +40,7 @@
       "topic": "ISSUE_OPENED",
       "logic": {
         "engine": "javascript",
-        "script": "return message.sender === 'system';"
+        "script": "return message.sender === 'system' && !message.metadata?._republished;"
       },
       "action": "execute_task"
     }
@@ -50,7 +50,7 @@
       "action": "publish_message",
       "transform": {
         "engine": "javascript",
-        "script": "const { complexity, taskType, reasoning } = result;\nconst taskText = triggeringMessage.content?.text || '';\n\nif (complexity === 'UNCERTAIN') {\n  return {\n    topic: 'CONDUCTOR_ESCALATE',\n    content: {\n      text: reasoning,\n      data: { complexity, taskType, reasoning, taskText }\n    }\n  };\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `[${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText } }\n      ]\n    }\n  }\n};"
+        "script": "const { complexity, taskType, reasoning } = result;\nconst taskText = triggeringMessage.content?.text || '';\n\nif (complexity === 'UNCERTAIN') {\n  return {\n    topic: 'CONDUCTOR_ESCALATE',\n    content: {\n      text: reasoning,\n      data: { complexity, taskType, reasoning, taskText }\n    }\n  };\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `[${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText }, metadata: { _republished: true } }\n      ]\n    }\n  }\n};"
       }
     }
   }
@@ -113,7 +113,7 @@
       "action": "publish_message",
       "transform": {
         "engine": "javascript",
-        "script": "const { complexity, taskType, reasoning } = result;\n\nlet taskText = triggeringMessage.content?.data?.taskText || '';\nif (!taskText) {\n  taskText = triggeringMessage.content?.text || '';\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `Senior: [${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText } }\n      ]\n    }\n  }\n};"
+        "script": "const { complexity, taskType, reasoning } = result;\n\nlet taskText = triggeringMessage.content?.data?.taskText || '';\nif (!taskText) {\n  taskText = triggeringMessage.content?.text || '';\n}\n\nconst config = helpers.getConfig(complexity, taskType);\n\nreturn {\n  topic: 'CLUSTER_OPERATIONS',\n  content: {\n    text: `Senior: [${complexity}:${taskType}] ${reasoning}`,\n    data: {\n      complexity,\n      taskType,\n      operations: [\n        { action: 'load_config', config },\n        { action: 'publish', topic: 'ISSUE_OPENED', content: { text: taskText }, metadata: { _republished: true } }\n      ]\n    }\n  }\n};"
       }
     }
   }
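These three changes work together to close a re-trigger loop: after classifying a task and loading a cluster config, the conductor republishes the original task to ISSUE_OPENED, and without a marker its own trigger would fire on that republished message and triage it again. A minimal sketch of the guard, inlined from the `script` strings above (the real cluster engine evaluates those strings as trigger expressions; this is just the same logic as plain code):

```js
// Simplified model of the ISSUE_OPENED trigger guard from the diff above.
function shouldTriage(message) {
  // Fires only for fresh system-sent issues, not ones the conductor republished.
  return message.sender === 'system' && !message.metadata?._republished;
}

const fresh = { sender: 'system', content: { text: 'fix failing tests' } };
const republished = { sender: 'system', metadata: { _republished: true } };

console.log(shouldTriage(fresh));       // true  -> conductor triages it
console.log(shouldTriage(republished)); // false -> the loaded cluster handles it instead
```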
package/docker/zeroshot-cluster/Dockerfile
CHANGED

@@ -2,7 +2,7 @@
 # Provides: Node.js, Python, Git, Chromium, Claude CLI, Playwright deps, Infrastructure tools
 #
 # Build: docker build -t vibe-cluster-base vibe/cluster/docker/vibe-cluster/
-# Usage:
+# Usage: zeroshot run <task> --docker
 
 FROM node:20-slim
 
@@ -120,6 +120,13 @@ ENV PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium
 COPY --chown=node:node . /tmp/zeroshot/
 RUN cd /tmp/zeroshot && npm install && npm link
 
+# Pre-bake common npm dependencies (Issue #20: 30-40% startup reduction)
+# These are installed once at image build time, then copied to workspaces at runtime
+# if the project's dependencies are satisfied by the pre-baked set.
+COPY docker/zeroshot-cluster/pre-baked-deps.json /pre-baked-deps/package.json
+RUN cd /pre-baked-deps && npm install --ignore-scripts \
+    && chown -R node:node /pre-baked-deps
+
 # Install Claude CLI globally
 RUN npm install -g @anthropic-ai/claude-code
 
package/docker/zeroshot-cluster/pre-baked-deps.json
ADDED

@@ -0,0 +1,28 @@
+{
+  "name": "pre-baked-deps",
+  "version": "1.0.0",
+  "description": "Pre-baked npm dependencies for zeroshot isolation containers",
+  "private": true,
+  "dependencies": {
+    "typescript": "^5.3.0",
+    "eslint": "^8.56.0",
+    "@typescript-eslint/parser": "^6.19.0",
+    "@typescript-eslint/eslint-plugin": "^6.19.0",
+    "prettier": "^3.2.0",
+    "jest": "^29.7.0",
+    "@types/jest": "^29.5.0",
+    "ts-jest": "^29.1.0",
+    "@types/node": "^20.11.0",
+    "tsx": "^4.7.0",
+    "esbuild": "^0.19.0",
+    "dotenv": "^16.4.0",
+    "glob": "^10.3.0",
+    "chalk": "^5.3.0",
+    "commander": "^12.0.0",
+    "axios": "^1.6.0",
+    "node-fetch": "^3.3.0",
+    "zod": "^3.22.0",
+    "lodash": "^4.17.0",
+    "@types/lodash": "^4.14.0"
+  }
+}
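The Dockerfile comment describes a runtime decision: reuse the pre-baked node_modules only when the project's declared dependencies are satisfied by the pre-baked set. The new src/copy-worker.js presumably performs the copy; the check below is only a hedged sketch of how such a satisfaction test could look, not the package's actual code (the function name, paths, and use of node-semver are all assumptions):

```js
// Hypothetical sketch of a "pre-baked deps satisfy this project" check.
const fs = require('fs');
const semver = require('semver'); // assumes node-semver is available

function preBakedSatisfies(projectPkgPath, preBakedPkgPath = '/pre-baked-deps/package.json') {
  const project = JSON.parse(fs.readFileSync(projectPkgPath, 'utf8'));
  const preBaked = JSON.parse(fs.readFileSync(preBakedPkgPath, 'utf8')).dependencies;
  const wanted = { ...project.dependencies, ...project.devDependencies };
  // Every wanted package must exist in the pre-baked set with an overlapping range;
  // a real implementation might demand a stricter subset test than intersects().
  return Object.entries(wanted).every(
    ([name, range]) => preBaked[name] && semver.intersects(preBaked[name], range)
  );
}
```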
package/lib/settings.js
CHANGED

@@ -16,11 +16,49 @@ function getSettingsFile() {
   return process.env.ZEROSHOT_SETTINGS_FILE || path.join(os.homedir(), '.zeroshot', 'settings.json');
 }
 
+/**
+ * Model hierarchy for cost ceiling validation
+ * Higher number = more expensive/capable model
+ */
+const MODEL_HIERARCHY = {
+  opus: 3,
+  sonnet: 2,
+  haiku: 1,
+};
+
+const VALID_MODELS = Object.keys(MODEL_HIERARCHY);
+
+/**
+ * Validate a requested model against the maxModel ceiling
+ * @param {string} requestedModel - Model the agent wants to use
+ * @param {string} maxModel - Maximum allowed model (cost ceiling)
+ * @returns {string} The validated model
+ * @throws {Error} If requested model exceeds ceiling
+ */
+function validateModelAgainstMax(requestedModel, maxModel) {
+  if (!requestedModel) return maxModel; // Default to ceiling if unspecified
+
+  if (!VALID_MODELS.includes(requestedModel)) {
+    throw new Error(`Invalid model "${requestedModel}". Valid: ${VALID_MODELS.join(', ')}`);
+  }
+  if (!VALID_MODELS.includes(maxModel)) {
+    throw new Error(`Invalid maxModel "${maxModel}". Valid: ${VALID_MODELS.join(', ')}`);
+  }
+
+  if (MODEL_HIERARCHY[requestedModel] > MODEL_HIERARCHY[maxModel]) {
+    throw new Error(
+      `Agent requests "${requestedModel}" but maxModel is "${maxModel}". ` +
+      `Either lower agent's model or raise maxModel.`
+    );
+  }
+  return requestedModel;
+}
+
 // Default settings
 const DEFAULT_SETTINGS = {
-
+  maxModel: 'sonnet', // Cost ceiling - agents cannot use models above this
   defaultConfig: 'conductor-bootstrap',
-
+  defaultDocker: false,
   strictSchema: true, // true = reliable json output (default), false = live streaming (may crash - see bold-meadow-11)
   logLevel: 'normal',
   // Auto-update settings

@@ -69,8 +107,8 @@ function validateSetting(key, value) {
     return `Unknown setting: ${key}`;
   }
 
-  if (key === '
-    return `Invalid model: ${value}. Valid models:
+  if (key === 'maxModel' && !VALID_MODELS.includes(value)) {
+    return `Invalid model: ${value}. Valid models: ${VALID_MODELS.join(', ')}`;
   }
 
   if (key === 'logLevel' && !['quiet', 'normal', 'verbose'].includes(value)) {

@@ -108,6 +146,10 @@ module.exports = {
   coerceValue,
   DEFAULT_SETTINGS,
   getSettingsFile,
+  // Model validation exports
+  MODEL_HIERARCHY,
+  VALID_MODELS,
+  validateModelAgainstMax,
   // Backward compatibility: SETTINGS_FILE as getter (reads env var dynamically)
   get SETTINGS_FILE() {
     return getSettingsFile();
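Since the validator and hierarchy are now exported, they can be exercised directly. A usage sketch (the require path assumes the in-repo layout):

```js
const { validateModelAgainstMax, VALID_MODELS } = require('./lib/settings');

console.log(VALID_MODELS);                               // ['opus', 'sonnet', 'haiku']
console.log(validateModelAgainstMax('haiku', 'sonnet')); // 'haiku'  (within ceiling)
console.log(validateModelAgainstMax(null, 'sonnet'));    // 'sonnet' (defaults to ceiling)

try {
  validateModelAgainstMax('opus', 'sonnet');
} catch (err) {
  console.error(err.message); // Agent requests "opus" but maxModel is "sonnet". ...
}
```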
package/src/agent/agent-config.js
CHANGED

@@ -6,8 +6,11 @@
  * - Default values for optional fields
  * - Model configuration setup
  * - Safety checks for test mode
+ * - maxModel ceiling enforcement at config time
  */
 
+const { loadSettings, validateModelAgainstMax } = require('../../lib/settings');
+
 // Default max iterations (high limit - let the user decide when to give up)
 const DEFAULT_MAX_ITERATIONS = 100;
 

@@ -55,11 +58,40 @@ function validateAgentConfig(config, options = {}) {
   }
 
   // Model configuration: support both static model and dynamic rules
+  // If no model specified, model is null - _selectModel() will use maxModel as default
   let modelConfig;
   if (config.modelRules) {
     modelConfig = { type: 'rules', rules: config.modelRules };
   } else {
-    modelConfig = { type: 'static', model: config.model ||
+    modelConfig = { type: 'static', model: config.model || null };
+  }
+
+  // COST CEILING ENFORCEMENT: Validate model(s) against maxModel at config time
+  // Catches violations EARLY (config load) instead of at runtime (iteration N)
+  const settings = loadSettings();
+  const maxModel = settings.maxModel || 'sonnet';
+
+  if (modelConfig.type === 'static' && modelConfig.model) {
+    // Static model: validate once
+    try {
+      validateModelAgainstMax(modelConfig.model, maxModel);
+    } catch (error) {
+      throw new Error(`Agent "${config.id}": ${error.message}`);
+    }
+  } else if (modelConfig.type === 'rules') {
+    // Dynamic rules: validate ALL rules upfront (don't wait until iteration N)
+    for (const rule of modelConfig.rules) {
+      if (rule.model) {
+        try {
+          validateModelAgainstMax(rule.model, maxModel);
+        } catch {
+          throw new Error(
+            `Agent "${config.id}": modelRule "${rule.iterations}" requests "${rule.model}" ` +
+            `but maxModel is "${maxModel}". Either lower the rule's model or raise maxModel.`
+          );
+        }
+      }
+    }
   }
 
   // Prompt configuration: support static prompt OR iteration-based rules

@@ -116,9 +148,12 @@ function validateAgentConfig(config, options = {}) {
     (trigger) => !trigger.action || trigger.action === 'execute_task'
   );
 
-
+  // Accept either mockSpawnFn OR taskRunner as valid mock sources
+  const hasMock = options.mockSpawnFn || options.taskRunner;
+
+  if (options.testMode && !hasMock && executesTask) {
     throw new Error(
-      `AgentWrapper: testMode=true but no mockSpawnFn provided for agent '${config.id}'. ` +
+      `AgentWrapper: testMode=true but no mockSpawnFn/taskRunner provided for agent '${config.id}'. ` +
      `This would cause real Claude API calls. ABORTING.`
     );
   }
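The practical effect is that ceiling violations now surface when the config is validated rather than mid-run. A sketch, assuming validateAgentConfig is exported from this module and that a bare { id, model } config passes its other checks:

```js
const { validateAgentConfig } = require('./src/agent/agent-config'); // path assumed

try {
  // With the default maxModel of 'sonnet', requesting opus fails at config load...
  validateAgentConfig({ id: 'investigator', model: 'opus' });
} catch (err) {
  // ...instead of surfacing at iteration N of a running cluster.
  console.error(err.message);
  // e.g. Agent "investigator": Agent requests "opus" but maxModel is "sonnet". ...
}
```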