npm - @hamp10/agentforge - Versions diffs - 0.2.21 → 0.2.23 - Mend

@hamp10/agentforge 0.2.21 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/bin/agentforge.js +909 -115
package/package.json +2 -1
package/scripts/check-task-semantics.js +911 -0
package/scripts/postinstall.js +20 -5
package/src/OllamaAgent.js +1178 -246
package/src/OpenClawCLI.js +5897 -748
package/src/browser.js +392 -0
package/src/default-task-guides.js +95 -0
package/src/resolveOpenclaw.js +38 -7
package/src/selfUpdate.js +31 -3
package/src/supervisor.js +88 -20
package/src/taskSemantics.js +141 -0
package/src/worker.js +4257 -230
package/templates/agent/AGENTFORGE.md +151 -53
package/templates/hooks/agentforge-platform/handler.js +322 -0
package/src/HampAgentCLI.js +0 -125
package/src/hampagent/browser.js +0 -321
package/src/hampagent/runner.js +0 -277
package/src/hampagent/sessions.js +0 -62
package/src/hampagent/tools.js +0 -298

package/src/OllamaAgent.js CHANGED Viewed

@@ -1,10 +1,11 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
 import { mkdirSync, writeFileSync, readFileSync, existsSync, readdirSync, statSync, appendFileSync } from 'fs';
 import { EventEmitter } from 'events';
 import path from 'path';
+import { homedir } from 'os';
 import { promisify } from 'util';
 import { fileURLToPath } from 'url';
-import { browserAction } from './hampagent/browser.js';
+import { browserAction, releaseAgentTab } from './browser.js';
 const execAsync = promisify(exec);
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -136,8 +137,13 @@ function _parseWriteFileFences(content) {
   const re = /(?:WRITE_FILE|write_file)[:\s]+([^\n]+)\n```[^\n]*\n([\s\S]*?)```/gi;
   let m;
   while ((m = re.exec(content)) !== null) {
-    const filePath = m[1].trim();
+    const filePath = m[1].trim().replace(/\]$/, ''); // strip trailing ] if model used [write_file: /path] bracket notation
     const fileContent = m[2]; // raw content, no unescaping needed
+    // Reject compaction placeholders — model echoed the summary as content
+    if (/^\[wrote:/.test(fileContent.trim()) || /^\(\d+ chars, \d+ lines —/.test(fileContent.trim())) {
+      console.log(`   ⚠️  WRITE_FILE skipped: content is a compaction placeholder, not real file content (${filePath})`);
+      continue;
+    }
     if (filePath && fileContent !== undefined) {
       calls.push({ name: 'write_file', arguments: { path: filePath, content: fileContent } });
     }
@@ -261,7 +267,10 @@ function _parseTextToolCalls(content) {
         if (depth === 0 && jsonStr.trim()) break;
       }
       try {
-        const obj = JSON.parse(jsonStr.trim());
+        // Strip Gemma4 model artifacts that can appear after a complete JSON object:
+        // <tool_call|>, <|end_of_turn|>, <|end|>, etc.
+        const cleanJson = jsonStr.trim().replace(/<[^>]*>$/g, '').trimEnd();
+        const obj = JSON.parse(cleanJson);
         if (Array.isArray(obj)) {
           for (const item of obj) {
             const call = normalise(item);
@@ -334,15 +343,27 @@ export class OllamaAgent extends EventEmitter {
     return { agentId, workDir };
   }
-  async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, actualWorkDir = null, agentModel = null, customSystemPrompt = null, conversationHistory = null) {
+  async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, actualWorkDir = null, agentModel = null, customSystemPrompt = null, conversationHistory = null, allImages = null, visionModel = null, providerKeys = null) {
     const startTime = Date.now();
     const controller = new AbortController();
     // Use per-agent model override if provided (and not the placeholder 'Default').
     // Strip 'ollama/' prefix — catalog returns IDs like 'ollama/modelname:tag' but
     // Ollama's API expects bare names like 'modelname:tag'.
+    // Cloud model IDs (google/..., anthropic/..., openai/...) are not valid Ollama names —
+    // fall back to the configured local model so a mismatch doesn't crash the task.
     const rawModel = (agentModel && agentModel !== 'Default') ? agentModel : this.model;
-    const effectiveModel = rawModel.startsWith('ollama/') ? rawModel.slice(7) : rawModel;
+    const isCloudModel = /^(google|anthropic|openai|mistral|cohere|azure)\//i.test(rawModel);
+    const effectiveModel = isCloudModel ? this.model : (rawModel.startsWith('ollama/') ? rawModel.slice(7) : rawModel);
+    if (isCloudModel) console.log(`   [${agentId}] ⚠️  Cloud model ID "${rawModel}" ignored by local runner — using ${effectiveModel}`);
+    // Store per-task vision settings — used by _screenshotAndDescribe during this task
+    // Cleared at the end of the task so stale keys don't leak between tasks
+    this._taskVisionModel = visionModel || null;
+    this._taskProviderKeys = providerKeys || null;
+    const googleKey = providerKeys?.google || null;
+    if (visionModel) console.log(`   [${agentId}] 👁️ Vision model: ${visionModel} (google key: ${googleKey ? 'present' : 'MISSING'})`);
+    else console.log(`   [${agentId}] 👁️ Vision: Ollama default (no vision_model in flow config)`);
     // Fake proc-like object so worker.js pid checks don't crash
     const fakeProc = { pid: null };
@@ -356,7 +377,7 @@ export class OllamaAgent extends EventEmitter {
       // Load conversation history — prefer Railway DB history (sent via task payload, works across
       // any machine/user/model). Fall back to local file for offline or pre-fix sessions.
       const history = (conversationHistory && conversationHistory.length > 0)
-        ? conversationHistory.slice(-20)
+        ? conversationHistory.slice(-60)
         : this._loadHistory(agentId, workDir, sessionId);
       // Text-based tool format is used rather than XML schemas — more reliable across models.
@@ -364,6 +385,27 @@ export class OllamaAgent extends EventEmitter {
       // ALL models get the same rule set and tool format — no model-specific branching.
       const homeDir = process.env.HOME || '/tmp';
       const projectsDir = `${homeDir}/Desktop/Projects`;
+      // ── Per-agent port assignment ──────────────────────────────────────────
+      // Each agent gets a deterministic starting port in range 3100-59099 (a 56000-port space),
+      // derived from the last five digits of its ID modulo 56000. Distinct IDs can still map
+      // to the same port, so the assignment is not collision-free.
+      // Port 3000 is reserved for agent_dashboard. At task start, any stale process
+      // on the assigned port is killed. If the port is still occupied by a live
+      // unrelated process, we walk up until we find a free one.
+      const agentPortOffset = parseInt(agentId.replace(/\D/g, '').slice(-5) || '0') % 56000;
+      let assignedPort = 3100 + agentPortOffset;
+      // Kill any stale server from a previous run of THIS agent
+      try { await execAsync(`lsof -t -i:${assignedPort} | xargs kill -9 2>/dev/null || true`); } catch {}
+      // If something else is still on that port, scan upward for a free one
+      for (let attempts = 0; attempts < 100; attempts++) {
+        try {
+          const { stdout } = await execAsync(`lsof -t -i:${assignedPort} 2>/dev/null || true`);
+          if (!stdout.trim()) break; // port is free
+          assignedPort++;
+          if (assignedPort > 59099) assignedPort = 3100;
+        } catch { break; }
+      }
+      console.log(`   [${agentId}] 🔌 Assigned port: ${assignedPort}`);
       const universalRules = `
 == WHAT YOU CAN DO ==
 You have these tools:
@@ -373,7 +415,7 @@ read_file: Read a local file.
 WRITE_FILE: Write a local file (code-fence format only).
 list_directory: List a local directory.
 web_fetch: Fetch any public URL — websites, APIs, docs, raw data. Fast, text-only.
-screenshot_and_describe: Navigate a real browser to any URL and screenshot it. Use this when pages require JavaScript, you need visual output, or web_fetch returns nothing useful.
+screenshot_and_describe: Take a screenshot and analyze it with vision so YOU can see and reason about what's on screen. Use this when: pages are JS-heavy, snapshot gives partial/empty data, you need to read numbers/text that aren't in the DOM, or you want to verify what's actually visible. Pass url to navigate first, or omit url to screenshot the current browser tab. Returns a text description YOU can reason about — this is NOT just for the user, it is how YOU SEE THE PAGE.
 browser: Control the AgentForge Browser directly (Chrome, always running, logged into user's services). Use for ALL browser interaction — navigating, clicking, typing, reading page content, screenshots.
 BROWSER TOOL — use this instead of writing CDP scripts:
@@ -385,7 +427,7 @@ BROWSER TOOL — use this instead of writing CDP scripts:
 {"name":"browser","arguments":{"action":"click","text":"Show Filter"}}      ← click element by visible text
 {"name":"browser","arguments":{"action":"click","selector":"#filter-btn"}}  ← click by CSS selector
 {"name":"browser","arguments":{"action":"type","selector":"input","text":"hello"}} ← type text
-{"name":"browser","arguments":{"action":"screenshot"}}                      ← take screenshot
+{"name":"browser","arguments":{"action":"screenshot"}}                      ← sends screenshot to user (YOU cannot see it — use screenshot_and_describe to see the page yourself)
 {"name":"browser","arguments":{"action":"evaluate","script":"document.title"}} ← run JS
 {"name":"browser","arguments":{"action":"scroll","y":400}}                  ← scroll down
@@ -398,46 +440,181 @@ WORKFLOW when user says "the tab is already open":
 The browser has the user's sessions and cookies. You CAN click any button, filter, or link visible on the page.
 == GENERAL RULES (all tasks) ==
-G1. IDENTIFY THE TASK TYPE. Build? Research? Question? Match approach to task.
-G2. START IMMEDIATELY. No intro text, no plans, no asking permission. First output = first tool call or direct answer.
+G1. IDENTIFY THE TASK TYPE FIRST:
+  - CONVERSATIONAL/QUESTION (asking for names, opinions, definitions, advice, comparisons, brainstorming): Answer in text. NO tools. Do NOT use browser, bash, screenshot, or any tool. Match the depth of your response to the complexity of the question — a simple factual question gets a concise answer, an open-ended or creative question gets a full, substantive response with reasoning.
+  - RESEARCH (look something up online): Use web_fetch or browser to find info, then answer in text.
+  - BUILD (create an app, game, script, file): Use bash, WRITE_FILE, browser as needed.
+  - BROWSER TASK (interact with a website): Use browser tools.
+G2. START IMMEDIATELY. No intro text, no plans, no asking permission. First output = first tool call or direct answer. DO NOT repeat the user's question or task back to them — just respond.
 G3. ANY WEBSITE/URL IS ACCESSIBLE. User mentions a site or open tab? Use browser snapshot to see what's currently open, then browser navigate/click/type to interact. Never ask "what's the URL?" — find it yourself.
 G4. NEVER ASK PERMISSION. Never say "should I use X or Y?" — pick the right tool and use it.
-G5. IF A TOOL FAILS: Try a different approach. web_fetch empty → screenshot_and_describe. Never repeat a failing call identically.
+G4a. STOP WHEN DONE. After completing the task, STOP. Do NOT add meta-commentary about your capabilities, limitations, or what information you don't have. Do NOT explain what you cannot do. Answer and stop.
+G4b. FORMATTING: Use **bold** for section labels and emphasis. Do NOT use markdown headers (# ## ### ####) — use **bold** instead. For bullet lists, ALWAYS write "- item" (dash + space + text). NEVER write "*item" (asterisk directly before text with no space) — that is not valid markdown and shows as a raw asterisk.
+G5. IF A TOOL FAILS: Try a different approach. Browser snapshot empty? → try web_fetch on the same URL. web_fetch empty? → try screenshot_and_describe. NEVER repeat a failing call more than twice with different selectors — take a snapshot to see what's actually on the page. IF WEB BROWSING FAILS REPEATEDLY: fall back to web_fetch on the site's URL, or try a different URL entirely. NEVER write files, build code, or start a server as a fallback for web research — stay in browser/web_fetch tools until you have the data.
+G5a. BROWSER FORM SUBMISSION: After typing into a search/input field, ALWAYS submit with {"action":"press","key":"Enter","selector":"<same-selector-you-typed-into>"} — pass the selector of the field you just typed in so Enter fires in the right element. NEVER try to click submit/compute/search buttons by ref, text, or selector. Buttons shift, break, or trigger ads. Enter always works.
+G5b. BROWSER INTERACTION RULE: After navigating to a page, ALWAYS take a snapshot FIRST to see real element text, IDs, and indices before attempting to click or type. Do NOT guess selectors from memory — selectors change. Snapshot → read elements → interact.
+G5c. READING PAGE CONTENT: For reading text on a page (titles, scores, prices, numbers), use browser → snapshot — it returns all DOM text fast. Use screenshot_and_describe only when you need to visually verify something rendered (canvas, image, CSS layout) OR when snapshot body text is under 200 chars (JS-heavy page, results not yet in DOM). When using screenshot_and_describe to find specific data, ALWAYS pass check_for with exactly what you need.
 G6. RESEARCH TASKS: web_fetch → read → reason → respond in text. No server, no localhost.
-G7. NEVER INVENT TASKS. Do exactly what was asked. Do not build a web app when asked to analyze data.
+G7. NEVER INVENT TASKS. Do exactly what was asked. Do not build a web app when asked to analyze data. Do not write files when asked to look up information. Do not start coding when the task is browsing.
 G8. WHEN GENUINELY STUCK: State what you tried, what failed, ask ONE specific question.
 G9. KEEP GOING until the task is fully complete.
 == BUILD RULES (only when building apps/games/tools) ==
-B1. PROJECT LOCATION: Always put projects in ${projectsDir}/PROJECT_NAME/ (no spaces — use underscores).
+B0. STATIC-FILE TASKS (saves to a local path, no deployment/hosting mentioned): If the task says "save to ~/some/path.html" or "create a file at ~/some/path" and does NOT mention serving, hosting, or deploying — just WRITE_FILE to that exact path, then open it with {"name":"browser","arguments":{"action":"navigate","url":"file:///abs/path/index.html"}} and screenshot to visually verify. Do NOT spin up a server, do NOT run npm init, do NOT install packages. Pure HTML/CSS/JS files run directly in browsers via file:// URLs — no server needed.
+B1. PROJECT LOCATION: Always put projects in ${projectsDir}/PROJECT_NAME/ (no spaces — use underscores). NEVER create directories or write project files under /tmp/agentforge/ — that path is platform-managed. Your Working directory (${workDir}) is only for tool execution context, NOT for storing project files.
 B2. WRITE EVERY FILE COMPLETELY — no stubs, no placeholders, no TODOs. Full working code only.
+B2a. NEVER use echo or cat to append code line-by-line (e.g. echo 'code' >> file.js). Always use WRITE_FILE with the COMPLETE file content in one call. Appending one line per bash call wastes 100 turns to write what one WRITE_FILE does instantly.
 B3. BUILD FILE BY FILE — write each file completely before writing the next.
 B4. ALWAYS use absolute paths.
-B5. SERVING FILES: Node.js server: nohup /usr/local/bin/node /abs/path/server.js > /tmp/server.log 2>&1 & — NEVER blocking. Pure HTML/JS (no backend): nohup python3 -m http.server PORT --directory /abs/path/ > /tmp/server.log 2>&1 &
-B6. npm install: cd ${projectsDir}/PROJECT_NAME && /usr/local/bin/npm init -y && /usr/local/bin/npm install express
-B7. After starting server, verify: sleep 3 && curl -s -o /dev/null -w '%{http_code}' http://localhost:PORT — if 000, check /tmp/server.log and fix the error.
-B8. PORT MANAGEMENT: Check port before starting: lsof -i :PORT | head -3. If in use: kill old process, restart. If crashed: restart. If busy with something else: pick different port.
+B5. SERVING FILES: Node.js server MUST cd into the project dir first — ALWAYS use this exact pattern: cd /abs/project/path && nohup /usr/local/bin/node /abs/project/path/server.js > /tmp/server.log 2>&1 & — NEVER use a bare filename like "nohup node server.js" without cd, or Node will look for server.js in the wrong directory and crash. NEVER blocking. Pure HTML/JS (no backend): nohup python3 -m http.server ${assignedPort} --directory /abs/path/ > /tmp/server.log 2>&1 &
+B5b. STOPPING A SERVER: NEVER use "pkill -f node" — it kills the platform itself. To stop a running server: kill $(lsof -ti:PORT) 2>/dev/null || true
+B6. npm install: ALL npm commands MUST be in ONE bash call with cd: {"name":"bash","arguments":{"command":"cd /abs/project/path && /usr/local/bin/npm init -y && /usr/local/bin/npm install express"}} — NEVER run npm init or npm install as a separate bash call without cd, or packages install in the wrong directory and the server will crash with "Cannot find module".
+B7. After starting server, verify using the ACTUAL PORT the server is listening on (not the assigned port): sleep 3 && curl -s -o /dev/null -w '%{http_code}' http://localhost:ACTUAL_PORT
+- If 000: server crashed. Read /tmp/server.log, fix the error, restart server (kill $(lsof -ti:PORT) 2>/dev/null || true && cd /abs/project/path && nohup /usr/local/bin/node /abs/path/server.js > /tmp/server.log 2>&1 &), then curl again.
+- If 404: server is running but missing a file. Read /tmp/server.log — if you see "ENOENT" for public/index.html, that HTML file was NOT written. Write it immediately, then curl again. Do NOT rewrite server.js for a 404.
+- If 200: server is up. Proceed to B10 screenshot QA.
+After fixing any error, ALWAYS restart the server AND re-verify with curl before proceeding.
+B8. PORT RULE: If the user's task explicitly specifies a port number, use that exact port everywhere — in server.js, in the verification curl, everywhere. If no port is specified, use your ASSIGNED PORT ${assignedPort}. In server.js: const PORT = process.env.PORT || YOUR_CHOSEN_PORT; Never use port 3000 (reserved by system).
 B9. EXPRESS WILDCARD ROUTE: NEVER write app.get('*', ...) — crashes in newer versions. Use app.use((req, res) => { ... }) instead.
-B10. MANDATORY SCREENSHOT QA: After curl returns 200, call screenshot_and_describe with send_to_user:true. You are NOT done until the screenshot shows the real working app.
-B11. ALWAYS open the finished app: bash open http://localhost:PORT
+B10a. STATIC FILE PATHS: ALWAYS use path.join(__dirname, 'public') for express.static — NEVER './public' or 'public'. For res.sendFile on the root route: ALWAYS path.join(__dirname, 'public', 'index.html') — NEVER path.join(__dirname, 'index.html'). Relative paths break under nohup.
+B10b. server.js IS FOR LOGIC ONLY — NEVER EMBED HTML: All HTML belongs in public/index.html. Route handlers must NOT contain template literals with HTML (backtick strings with <div>, <h1>, etc.) — these cause SyntaxErrors. server.js should only have: require/import, middleware, JSON API routes, express.static, and app.listen. Anything visual goes in public/.
+B10. MANDATORY SCREENSHOT QA — KEEP ITERATING UNTIL THE DESIGN PASSES:
+After curl returns 200, call screenshot_and_describe(url:"http://localhost:PORT", send_to_user:true).
+Evaluate against these pass/fail criteria. If ANY fail, fix immediately and screenshot again:
+✗ FAIL: Plain/unstyled HTML — no colors, raw browser defaults, looks like a text document
+✗ FAIL: Text barely visible or poor contrast against the background
+✗ FAIL: Layout broken, elements overlapping, or content spilling outside containers
+✗ FAIL: Buttons are plain gray browser defaults — unstyled
+✗ FAIL: Inputs are plain white browser defaults — unstyled
+✗ FAIL: No consistent color theme applied throughout
+✓ PASS: All of the above are satisfied — consistent theme, readable text, styled controls, proper layout
+Stop only when ALL criteria pass. There is no fixed iteration count — stop when it genuinely looks good, whether that takes 1 screenshot or 10. Do NOT stop just because the server is running.
+B11. CSS DESIGN STANDARDS — apply from the start, before any screenshot:
+Use a dark background (#1a1a2e or #0d1117 or similar), white/light text, colored accents (#00b4d8, #4ade80, #f472b6, etc.). Style ALL inputs and buttons — no raw browser defaults. Use border-radius, padding, box-shadow, and flex/grid layout. Minimum: background gradient or solid dark color, styled form inputs (border: 1px solid #444, bg: #1e1e2e, color: #fff), primary buttons with colored background. The first version should already look good — not a plain HTML skeleton.
 B12. CANVAS GAMES: canvas 800×600, dark background #1a1a2e, all elements clearly visible. Dark theme, styled UI.
 B13. OBSERVE BEFORE FIXING: Screenshot first, then make targeted edits. Never rewrite an entire file from scratch when the server is running.
 B14. TARGETED EDITS: read_file to see current code, write_file only the changed section. Never throw away working code.
 B15. QUALITY LOOP: After each fix, screenshot again to verify. Iterate until it looks correct.
-B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different states. Not just the header.`;
+B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different states. Not just the header.
+B17. AFTER DEPLOYING: Once a deployment command succeeds, immediately run the platform's URL command (e.g. railway domain, vercel --prod, netlify open:deploy, fly status) to get the live public URL. Your final message MUST include the full URL so the user can open it.`;
       // Text-based tool format works reliably across all local models.
       // WRITE_FILE uses code-fence to avoid JSON-escaping issues; all other tools use JSON.
-      const jsonToolFormat = `You are an AI agent. Working directory: ${workDir}\n\nDO NOT describe what you will do. DO NOT write plans. START EXECUTING IMMEDIATELY.\n\nTO WRITE A FILE (only when actually writing code/content to disk):\nWriting server.js...\nWRITE_FILE /abs/path/to/server.js\n\`\`\`\n...complete file content here...\n\`\`\`\n\nFOR ALL OTHER TOOLS — output JSON on its own line:\nRunning command...\n{"name":"bash","arguments":{"command":"shell command here"}}\n\nTools:\n- WRITE_FILE /path — write a local file. ONLY use this when actually creating/editing a file on disk.\n- {"name":"bash","arguments":{"command":"..."}} — run any shell command\n- {"name":"read_file","arguments":{"path":"/abs/path"}} — read a local file\n- {"name":"list_directory","arguments":{"path":"/abs/path"}} — list local directory\n- {"name":"web_fetch","arguments":{"url":"https://any-public-url.com"}} — fetch ANY website or URL and read its content. Use for research, data, docs, scraping public sites.\n- {"name":"screenshot_and_describe","arguments":{"url":"https://any-url.com","check_for":"what to look for","send_to_user":true}} — open ANY URL in a real browser and screenshot it. Use when pages are dynamic/JS-heavy or you need to show the user visuals.\n\n${universalRules}`;
+      const jsonToolFormat = `You are an AI agent. Working directory: ${workDir}\n\nCONVERSATIONAL QUESTIONS — answer directly with text, NO tools: brainstorming, opinions, explanations, greetings, "what is X", "give me ideas", "how does X work", anything you can answer from knowledge. Only use tools when you need to actually DO something: read/write files, run commands, browse real-time data, build or deploy something.\n\nACTION TASKS — DO NOT describe what you will do. DO NOT write plans. START EXECUTING IMMEDIATELY.\n\nCRITICAL — THINK SILENTLY: Any reasoning, planning, self-doubt, or "I cannot" thoughts MUST go inside <think>...</think> tags and NEVER appear as visible text. Your visible output must be ONLY tool calls and final answers. NEVER output limitations or explanations before calling a tool — think it, don't say it.\n\nTO WRITE A FILE — output WRITE_FILE with the FULL ABSOLUTE PATH on the same line, then a code fence:\nWRITE_FILE /abs/path/to/server.js\n\`\`\`\n...complete file content here...\n\`\`\`\nCRITICAL: the path is MANDATORY — WRITE_FILE alone (no path) is invalid and will be ignored.\n\nTO RUN A COMMAND — output JSON on its own line:\n{"name":"bash","arguments":{"command":"shell command here"}}\n\nTools:\n- WRITE_FILE /path — write a local file. ONLY use this when actually creating/editing a file on disk.\n- {"name":"bash","arguments":{"command":"..."}} — run any shell command\n- {"name":"read_file","arguments":{"path":"/abs/path"}} — read a local file\n- {"name":"list_directory","arguments":{"path":"/abs/path"}} — list local directory\n- {"name":"web_fetch","arguments":{"url":"https://any-public-url.com"}} — fetch ANY website or URL and read its content. Use for research, data, docs, scraping public sites.\n- {"name":"screenshot_and_describe","arguments":{"url":"https://any-url.com","check_for":"what to look for"}} — screenshot a page and analyze it with vision so YOU can SEE what's on screen. 
CRITICAL: this is how you visually read a page — use it whenever snapshot returns partial/empty data or you need to read numbers/text from a JS-heavy page. Omit url to screenshot the current browser tab. Returns text description YOU can reason about.\n- {"name":"browser","arguments":{"action":"tabs"}} — control the REAL Chrome browser (pre-logged in with user's sessions). Use for bookmarks, logged-in sites, JavaScript-heavy pages. Actions: tabs, snapshot, navigate, click, type, press, screenshot (sends to user only — YOU cannot see it), evaluate, scroll, focus. SUBMITTING FORMS: after typing into a search box, use {"action":"press","key":"Enter"} to submit — do NOT click ref numbers which can hit ads. CLICKING BUTTONS: prefer {"action":"click","text":"button label"} over {"action":"click","ref":N} — ref numbers shift and can click the wrong element. To visually READ a page yourself, use screenshot_and_describe instead of browser screenshot.\n\n${universalRules}`;
       const systemPrompt = customSystemPrompt || jsonToolFormat;
+      // Build message array. When there is prior history, scan the last few assistant turns
+      // for signs the model got stuck (declared inability, looped, gave up). If stuck, trim
+      // the history so the user's new instruction lands with full weight rather than being
+      // buried under a wall of failed reasoning the model is anchored to.
+      let activeHistory = history;
+      if (activeHistory.length > 0) {
+        const recentAssistant = activeHistory
+          .filter(m => m.role === 'assistant')
+          .slice(-4)
+          .map(m => (m.content || '').toLowerCase());
+        const stuckSignals = [
+          'i cannot', 'i am unable', 'unfortunately', 'environment does not',
+          'not possible', 'i lack', 'i do not have the ability', 'i have exhausted',
+          'cannot be done', 'is not supported', 'failed to', 'i have tried',
+          'every attempt', 'cannot complete',
+          'no specific task', 'no task has been given', 'no task was given',
+          'cannot proceed with a meaningful', 'i must wait for a task',
+          'waiting for a task', 'please provide a task', 'specify a task',
+        ];
+        const isStuck = recentAssistant.some(text =>
+          stuckSignals.some(sig => text.includes(sig))
+        );
+        if (isStuck) {
+          // Keep only the last 6 turns (3 exchanges) so the new instruction dominates.
+          // The user is course-correcting — don't let stale failure reasoning override them.
+          activeHistory = activeHistory.slice(-6);
+          console.log(`   [${agentId}] 🔄 Stuck signals detected in history — trimmed to last 6 turns so new instruction takes priority`);
+        }
+      }
       const messages = [
         { role: 'system', content: systemPrompt },
-        ...history,
+        ...activeHistory,
       ];
+      // Inject context the agent needs to work on existing projects.
+      // Registry is always injected (small, always relevant).
+      // Workspace files list only injected on fresh sessions (no history).
+      let taskContent = task;
+      {
+        const contextParts = [];
+        // 1. Known running projects from the global registry (always inject)
+        try {
+          const REGISTRY = '/tmp/agentforge/projects.json';
+          if (existsSync(REGISTRY)) {
+            const registry = JSON.parse(readFileSync(REGISTRY, 'utf8'));
+            const entries = Object.values(registry);
+            if (entries.length > 0) {
+              const lines = entries.map(e => {
+                let info = `- "${e.name}" → ${e.path} (running on port ${e.port}`;
+                if (e.railwayProject) info += `, Railway project: "${e.railwayProject}"`;
+                if (e.liveUrl) info += `, live URL: ${e.liveUrl}`;
+                return info + ')';
+              });
+              contextParts.push(`Known projects on this machine:\n${lines.join('\n')}`);
+            }
+          }
+        } catch {}
+        // 1b. Available deployment/publishing CLIs — probe what's actually installed and authed.
+        // Inject so the agent knows it CAN deploy rather than claiming it lacks credentials.
+        try {
+          const deployTools = [];
+          const candidates = [
+            { cmd: 'railway', check: 'railway whoami 2>/dev/null', label: 'railway' },
+            { cmd: 'vercel',  check: 'vercel whoami 2>/dev/null',  label: 'vercel'  },
+            { cmd: 'netlify', check: 'netlify status 2>/dev/null', label: 'netlify' },
+            { cmd: 'fly',     check: 'fly auth whoami 2>/dev/null',label: 'fly'     },
+            { cmd: 'surge',   check: 'surge whoami 2>/dev/null',   label: 'surge'   },
+            { cmd: 'gh',      check: 'gh auth status 2>/dev/null', label: 'gh'      },
+          ];
+          await Promise.all(candidates.map(async ({ cmd, check, label }) => {
+            try {
+              const { stdout } = await execAsync(`which ${cmd} 2>/dev/null && ${check}`, { timeout: 4000 });
+              if (stdout.trim()) deployTools.push(`${label} (authenticated: ${stdout.trim().split('\n')[0].slice(0, 60)})`);
+            } catch {}
+          }));
+          if (deployTools.length > 0) {
+            contextParts.push(`Deployment CLIs available and authenticated on this machine:\n${deployTools.map(t => `- ${t}`).join('\n')}\n\nYou can use these tools directly via bash to deploy projects publicly.`);
+          }
+        } catch {}
+        // 2. Existing files in this agent's workspace (fresh sessions only)
+        if (activeHistory.length === 0) {
+          try {
+            const SKIP_NAMES = new Set(['MEMORY.md', 'AGENTS.md', 'AGENTFORGE.md', 'node_modules', '.git', 'memory', '.npm', 'package-lock.json']);
+            const collectFiles = (dir, base = '', depth = 0) => {
+              if (depth > 3) return [];
+              let files = [];
+              for (const e of readdirSync(dir, { withFileTypes: true })) {
+                if (SKIP_NAMES.has(e.name)) continue;
+                const rel = base ? `${base}/${e.name}` : e.name;
+                if (e.isDirectory()) files.push(...collectFiles(path.join(dir, e.name), rel, depth + 1));
+                else files.push(rel);
+              }
+              return files;
+            };
+            const existingFiles = collectFiles(workDir);
+            if (existingFiles.length > 0) {
+              contextParts.push(`Your workspace already contains these files:\n${existingFiles.map(f => `- ${workDir}/${f}`).join('\n')}\n\nRead the relevant files before making any changes. Make targeted edits — do NOT rewrite working files from scratch.`);
+            }
+          } catch {}
+        }
+        if (contextParts.length > 0) {
+          taskContent = `${contextParts.join('\n\n')}\n\n${task}`;
+        }
+      }
       // Attach initial image if provided — always include it; models that don't support
       // images will ignore the field, and if they error we catch it below.
-      const userMessage = { role: 'user', content: task };
+      const userMessage = { role: 'user', content: taskContent };
       if (image) {
         const base64 = image.replace(/^data:image\/\w+;base64,/, '');
         userMessage.images = [base64];
@@ -459,10 +636,52 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
       const toolsUsed = []; // track tool names called (for fallback summary)
       // No hard turn limit — agent runs until done, loop-detected, or wall-clock timeout.
       const recentCalls = []; // last N tool calls for loop detection
+      const recentBashCalls = []; // bash-only window — write_file doesn't contaminate bash loop detection
       let emptyRetries = 0; // consecutive empty-response retries
+      const recentOutputs = []; // last N no-tool-call outputs for repeated-output detection
+      let incompleteKicks = 0; // consecutive times _isTaskComplete returned false
+      let noToolKicks = 0; // consecutive turns with content but no tool calls — escalate message
+      let taskDoneEarly = false; // set by completion-language detector inside tool loop
+      let localBrowserTurns = 0; // consecutive browser tool calls on localhost — capped to prevent infinite QA loops
+      let successfulScreenshots = 0; // how many times we've seen a working (non-placeholder) localhost app
+      let midRefusalKicks = 0; // how many times we've overridden a mid-task refusal
+      let echoAppendCalls = 0; // consecutive bash calls using echo >> to append to a file
+      let consecutiveTruncations = 0; // how many times in a row the same truncated JSON was re-output
+      const fileReadCounts = new Map(); // path -> # of reads since last write_file (cross-turn read-loop detector)
       for (let turn = 0; ; turn++) {
         if (controller.signal.aborted) break;
+        let toolsUsedThisTurn = 0; // Fix 10: per-turn tool count — reset each turn so _isTaskComplete
+                                   // only fires when the current turn actually ran tools, not just
+                                   // because prior turns did. Prevents kicking mid-plan text outputs.
+        // Hard turn cap: prevent runaway agents. 60 turns handles complex multi-file projects.
+        if (turn >= 60) {
+          console.log(`   [${agentId}] ⚠️ Turn cap (60) reached — forcing completion`);
+          messages.push({ role: 'user', content: 'You have used 60 turns. Provide your final answer now — describe what you built and any important notes. Be concise.' });
+          break;
+        }
+        // ── Per-turn context trim ────────────────────────────────────────────
+        // After large file writes the messages array can accumulate 15K+ tokens
+        // making each subsequent Ollama call slower and causing empty responses.
+        // When total content exceeds 30K chars (~7.5K tokens), drop middle messages
+        // (keep system prompt + first user task + last 8 messages).
+        // Threshold lowered from 60K: a single large WRITE_FILE can add 20K chars,
+        // causing every subsequent turn to have slow prefill.
+        const totalMsgChars = messages.reduce((s, m) => s + (typeof m.content === 'string' ? m.content.length : 0), 0);
+        if (totalMsgChars > 30000 && messages.length > 10) {
+          const systemMsg = messages[0];
+          const firstUserMsg = messages.find(m => m.role === 'user');
+          const recentMsgs = messages.slice(-8);
+          const trimmed = [systemMsg, firstUserMsg, ...recentMsgs].filter(Boolean);
+          // Only trim if it actually reduces messages (avoids trimming to same set)
+          if (trimmed.length < messages.length) {
+            console.log(`   [${agentId}] ✂️ Turn ${turn}: context trim ${messages.length}→${trimmed.length} msgs (${Math.round(totalMsgChars/1000)}KB chars)`);
+            messages.length = 0;
+            messages.push(...trimmed);
+          }
+        }
         this.emit('tool_activity', { agentId, event: 'tool_start', tool: 'model', description: `Thinking…` });
@@ -472,6 +691,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         const isOllamaBackend = this.baseUrl.includes('11434') || this.baseUrl.includes('localhost') || this.baseUrl.includes('127.0.0.1');
         const useNativeEndpoint = isOllamaBackend; // all local models use native endpoint
+        const inferenceStart = Date.now();
         let response;
         try {
@@ -498,14 +718,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             };
           }
+          // Per-turn inference timeout: 8 minutes. Without this, a huge context (e.g. 37K-char
+          // file in messages) can make Ollama spin for 10+ minutes with no output. The context
+          // trim (60K char threshold) prevents most cases, but this is a safety valve.
+          const turnAbort = new AbortController();
+          const turnTimeoutId = setTimeout(() => {
+            console.log(`   [${agentId}] ⏰ Turn ${turn}: inference timeout (8 min) — aborting and retrying with trimmed context`);
+            turnAbort.abort();
+          }, 8 * 60 * 1000);
+          const combinedSignal = AbortSignal.any
+            ? AbortSignal.any([controller.signal, turnAbort.signal])
+            : turnAbort.signal; // fallback: use turn signal only if any() unavailable
           response = await fetch(endpoint, {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
-            signal: controller.signal,
+            signal: combinedSignal,
             body: JSON.stringify(requestBody)
           });
+          clearTimeout(turnTimeoutId);
         } catch (fetchErr) {
-          if (fetchErr.name === 'AbortError') break;
+          if (fetchErr.name === 'AbortError') {
+            // If the task-level controller was aborted, exit cleanly
+            if (controller.signal.aborted) break;
+            // Otherwise this was a turn-level timeout — treat like empty response and retry
+            console.log(`   [${agentId}] ⏰ Turn ${turn}: inference timed out — forcing context trim and retry`);
+            // Trim aggressively: keep system + first user + last 4 messages
+            const _sys = messages[0]; const _usr = messages.find(m => m.role === 'user');
+            const _recent = messages.slice(-4);
+            messages.length = 0; messages.push(_sys, _usr, ..._recent.filter(Boolean));
+            continue; // retry this turn with trimmed context
+          }
           throw new Error(`Cannot reach local model server at ${this.baseUrl}. Is it running? (${fetchErr.message})`);
         }
@@ -529,18 +772,43 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         let inFenceBlock = false;      // inside WRITE_FILE code fence — suppress content from streaming
         let fenceDepth = 0;            // ``` count since last WRITE_FILE (even=closed, odd=open)
         let rawTokenCount = 0;
+        let tokenCapTruncatedFile = false; // true when token cap fired mid-WRITE_FILE fence
         let lastVisibleAt = Date.now(); // track when we last got visible output (for think timeout)
         const reader = response.body.getReader();
         const decoder = new TextDecoder();
         let buf = '';
-        // No timeouts — local model can take as long as it needs on any turn.
-        // Only the user abort (controller.signal) or stream end stops a turn.
+        // No hard timeout on inference — local model can take as long as it needs.
+        // But we DO time out individual reader.read() calls (30s) so a silently-dropped
+        // connection never hangs the worker forever. And when Ollama signals done:true we
+        // immediately cancel the reader instead of waiting for the HTTP body to close on
+        // its own (which can stall indefinitely on keep-alive connections).
         let turnRetry = false;
-        while (true) {
+        streamLoop: while (true) {
           if (controller.signal.aborted) break;
-          const { done, value } = await reader.read();
+          // Time-box each individual read() call. If no bytes arrive for 30s the stream
+          // has stalled (Ollama crashed / connection dropped silently) — abort it.
+          let _readTimer;
+          let readResult;
+          try {
+            readResult = await Promise.race([
+              reader.read(),
+              new Promise((_, reject) => {
+                _readTimer = setTimeout(() => reject(new Error('stream_read_stall')), 30000);
+              })
+            ]);
+          } catch (e) {
+            if (e.message === 'stream_read_stall') {
+              console.log(`   [${agentId}] ⏱️ Stream stalled (no data for 30s) — aborting`);
+              reader.cancel().catch(() => {});
+              break;
+            }
+            throw e;
+          } finally {
+            clearTimeout(_readTimer);
+          }
+          const { done, value } = readResult;
           if (done) break;
           buf += decoder.decode(value, { stream: true });
@@ -556,7 +824,12 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               // Ollama native NDJSON format
               let nativeEvt;
               try { nativeEvt = JSON.parse(line); } catch { continue; }
-              if (nativeEvt.done) continue;
+              if (nativeEvt.done) {
+                // Ollama says generation is complete — cancel the reader and exit now.
+                // Do NOT fall back to reader.read() which can hang on keep-alive connections.
+                reader.cancel().catch(() => {});
+                break streamLoop;
+              }
               tokenText = nativeEvt.message?.content ?? null;
             } else {
               // OpenAI SSE format
@@ -586,21 +859,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             rawTokenCount++;
             streamContent += tokenText;
+            // Per-turn token cap — once a single turn has streamed 6000 tokens, the model is
+            // probably writing multiple large files in one shot or looping. Truncate the stream
+            // and let the agent loop handle the (partial) output. Keeps single-turn inference
+            // bounded to ~3-5 minutes on local hardware.
+            if (rawTokenCount >= 6000) {
+              console.log(`   [${agentId}] ⚠️ Turn ${turn}: token cap (${rawTokenCount}) — truncating stream`);
+              // If the cap hit inside an open WRITE_FILE fence, append a closing fence so the
+              // WRITE_FILE parser can still extract the partial content, and record that fact.
+              if (inFenceBlock && fenceDepth % 2 === 1) {
+                streamContent += '\n```\n';
+                inFenceBlock = false;
+                tokenCapTruncatedFile = true; // remembered so a hint can be injected after the tool loop
+              }
+              reader.cancel().catch(() => {});
+              break streamLoop;
+            }
             // Process token through think + tool_call filters, emit visible text live
             // We scan only the new delta token against the current buffer state
             const chunk = tokenText;
             let visible = '';
+            const wasInThinkBlock = inThinkBlock;
             // Simple per-token state machine — handles split tags across tokens by tracking state flags
             if (!inThinkBlock && !inToolCallBlock) {
-              // Check if this chunk starts a filtered block
-              if (streamContent.includes('<think>') && !streamContent.includes('</think>')) {
+              // Check if this chunk starts a filtered block.
+              // Use `<think` (no closing >) to catch split tokens where `>` arrives separately.
+              // `<think` won't false-positive on `</think>` since that starts with `</`.
+              if (streamContent.includes('<think') && !streamContent.includes('</think>')) {
                 inThinkBlock = true;
-                // emit text before the <think> tag
-                const before = streamContent.lastIndexOf('<think>');
-                // already streamed everything before this point; just suppress from here
               } else if (streamContent.includes('<tool_call>') && !streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
                 inToolCallBlock = true;
-                // Text before <tool_call> on this same token — already emitted or trivial
               } else if (!inThinkBlock && !inToolCallBlock) {
                 visible = chunk;
               }
@@ -614,6 +903,26 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               inToolCallBlock = false;
             }
+            // Stream think block content live — shown in a collapsible "Thinking…" panel in the dashboard
+            {
+              let thinkChunk = '';
+              if (!wasInThinkBlock && inThinkBlock) {
+                // Just entered think block — emit content after the opening <think> tag
+                const tagEnd = chunk.indexOf('<think>');
+                thinkChunk = tagEnd >= 0 ? chunk.slice(tagEnd + 7) : chunk;
+              } else if (wasInThinkBlock && inThinkBlock) {
+                // Mid-think block — emit raw chunk (strip stray tag fragments)
+                thinkChunk = chunk.replace(/<\/?think>/g, '');
+              } else if (wasInThinkBlock && !inThinkBlock) {
+                // Just exited think block — emit content before the closing </think> tag
+                const tagStart = chunk.indexOf('</think>');
+                thinkChunk = tagStart >= 0 ? chunk.slice(0, tagStart) : chunk;
+              }
+              if (thinkChunk) {
+                this.emit('agent_output', { agentId, output: thinkChunk, isThinking: true, isChunk: true });
+              }
+            }
             // Scan ALL lines completed in this token for state transitions.
             // Multi-char tokens can contain multiple lines (WRITE_FILE + ``` in same token).
             if (tokenText.includes('\n')) {
@@ -622,7 +931,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               while (nlIdx !== -1) {
                 const lineStart = Math.max(0, streamContent.lastIndexOf('\n', nlIdx - 1)) + 1;
                 const line = streamContent.slice(lineStart, nlIdx).trim();
-                if (/^(WRITE_FILE|write_file)[:\s]+\S/i.test(line)) {
+                if (/^(WRITE_FILE|write_file)/i.test(line)) {
                   inFenceBlock = true; fenceDepth = 0;
                 } else if (inFenceBlock && /^```/.test(line)) {
                   fenceDepth++;
@@ -639,16 +948,28 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               const cleanSC = streamContent.replace(/<think>[\s\S]*?<\/think>/g, '');
               const lastNL = cleanSC.lastIndexOf('\n');
               const curLine = cleanSC.slice(lastNL + 1).trimStart();
-              if (!inFenceBlock && /^(WRITE_FILE|write_file)[:\s]+\S/i.test(curLine)) {
+              // Suppress as soon as "WRITE_FILE" appears at start of partial line —
+              // don't wait for the path to arrive or the word streams char-by-char to the user.
+              if (!inFenceBlock && /^(WRITE_FILE|write_file)/i.test(curLine)) {
                 inFenceBlock = true; fenceDepth = 0;
               }
-              if (!inJsonBlob && !inFenceBlock && (curLine.startsWith('{') || curLine.startsWith('['))) {
+              // Only treat as JSON blob if it looks like actual JSON — `[{` or `["` or `[` followed by quote/brace.
+              // Avoid false-positive on `[bash result]:`, `[tool result]:`, etc.
+              if (!inJsonBlob && !inFenceBlock && (curLine.startsWith('{') || /^\[[\[{"']/.test(curLine))) {
                 inJsonBlob = true;
               }
             }
             // Emit visible content — safety filter removes any ``` or WRITE_FILE lines
-            // that slipped through (e.g. partial token at detection boundary)
+            // that slipped through (e.g. partial token at detection boundary).
+            // If a complete <think>...</think> block arrived in one token (state machine missed it),
+            // route its content as a thinking chunk so users can see the agent's reasoning.
+            if (visible) {
+              visible = visible.replace(/<think>([\s\S]*?)<\/think>/g, (_, content) => {
+                if (content.trim()) this.emit('agent_output', { agentId, output: content, isThinking: true, isChunk: true });
+                return '';
+              }).replace(/<think>[\s\S]*/g, '');
+            }
             if (visible && !inThinkBlock && !inToolCallBlock && !inJsonBlob && !inFenceBlock) {
               const safe = visible.split('\n').filter(ln => {
                 const t = ln.trimStart();
@@ -666,13 +987,40 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             if (inThinkBlock && (Date.now() - lastVisibleAt) > 90000 && rawTokenCount > 100) {
               console.log(`   [${agentId}] ⏱️ Think timeout (>90s, ${rawTokenCount} tokens) — aborting stream`);
               reader.cancel().catch(() => {});
-              break;
+              break streamLoop;
+            }
+            // Repetition loop detection — catches runaway token loops (e.g. hundreds of </li> repeating).
+            // Small local models can get stuck when fed malformed HTML or very large context.
+            // Check every 50 tokens after warmup: if any short pattern fills most of the recent output → abort.
+            if (rawTokenCount % 50 === 0 && rawTokenCount > 150) {
+              const tail = streamContent.slice(-800);
+              let loopDetected = false;
+              for (let pLen = 4; pLen <= 15; pLen++) {
+                const pat = tail.slice(-pLen);
+                if (!pat.trim()) continue;
+                let count = 0, pos = 0;
+                while ((pos = tail.indexOf(pat, pos)) !== -1) { count++; pos += pLen; }
+                if (count >= 30) { loopDetected = true; break; } // Fix 9: raised from 20 — HTML/CSS files have naturally repetitive short patterns (px;, </div>, etc.)
+              }
+              if (loopDetected) {
+                console.log(`   [${agentId}] 🔄 Repetition loop detected at ${rawTokenCount} tokens — aborting stream`);
+                reader.cancel().catch(() => {});
+                break streamLoop;
+              }
             }
           }
         }
-        console.log(`   [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} chars, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}`);
-        if (streamContent) console.log(`   [${agentId}] 📝 First 200 chars: ${streamContent.slice(0, 200)}`);
+        const inferenceMs = Date.now() - inferenceStart;
+        console.log(`   [${agentId}] 📊 Turn ${turn}: ${rawTokenCount} tokens, ${streamContent.length} chars raw, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}, inference=${(inferenceMs/1000).toFixed(1)}s`);
+        if (rawTokenCount === 0 && inferenceMs > 10000) {
+          console.log(`   [${agentId}] ⚠️ Turn ${turn}: Ollama spent ${(inferenceMs/1000).toFixed(1)}s returning 0 tokens — possible OOM, KV cache eviction, or model degenerate state`);
+        }
+        // Log visible content (what the user sees) — helps diagnose planning vs acting
+        if (visibleContent.trim()) console.log(`   [${agentId}] 👁️  Visible: ${visibleContent.trim().replace(/\n/g, ' ').slice(0, 300)}`);
+        // Log raw content if no visible (pure tool call turn) — helps diagnose tool format
+        else if (streamContent.trim()) console.log(`   [${agentId}] 📝 Raw: ${streamContent.trim().replace(/\n/g, ' ').slice(0, 200)}`);
         // ── Extract tool calls from content ───────────────────────────────────
         // Try <tool_call> XML tags first (some models emit this format), then fall through
@@ -712,6 +1060,19 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
           }
         }
+        // Detect model mimicking compaction format: [wrote: /path — N chars, M lines]
+        // This happens after context trim — model sees these summaries and generates them as fake outputs.
+        // The model THINKS it wrote the file but it hasn't. Correct it immediately.
+        if (!parsedTagCalls && Object.keys(streamToolCalls).length === 0 && streamContent) {
+          const fakeWroteMatch = streamContent.match(/\[wrote:\s*([^\s\]]+)[^\]]*\]/i);
+          if (fakeWroteMatch) {
+            const fakePath = fakeWroteMatch[1];
+            console.log(`   [${agentId}] ⚠️ Model generated fake [wrote: ...] summary — correcting`);
+            messages.push({ role: 'user', content: `You output "[wrote: ${fakePath}...]" but that is a SUMMARY FORMAT from your context history — you did NOT actually write any file. To actually write a file, you MUST use WRITE_FILE format:\n\nWRITE_FILE ${fakePath}\n\`\`\`\n...complete file content...\n\`\`\`\n\nOutput the full file content now using WRITE_FILE.` });
+            continue;
+          }
+        }
         // Fallback 4: if we found ONLY bash tool calls but content has writing blocks too,
         // merge them so files get written AND bash runs
         if (parsedTagCalls && streamContent) {
@@ -757,10 +1118,18 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         // ── Push assistant message ────────────────────────────────────────────
         // All local models now use JSON-in-text format on the native endpoint.
         // Strip <think>...</think> blocks to avoid burning context on reasoning traces.
+        // Also compact WRITE_FILE fences: replace the file body with a summary line
+        // to prevent large file contents from flooding the context on every future turn.
         const toolCallsArray = Object.values(streamToolCalls);
         const hasToolCalls = toolCallsArray.length > 0;
         const cleanedContent = (streamContent || '')
           .replace(/<think>[\s\S]*?<\/think>/g, '')
+          // Compact WRITE_FILE fence bodies: replace with a non-fence note so the model
+          // cannot mistake the summary for real file content and echo it back.
+          .replace(/(?:WRITE_FILE|write_file)[:\s]+([^\n]+)\n```[^\n]*\n([\s\S]*?)```/gi, (match, filePath, fileContent) => {
+            const lines = fileContent.split('\n').length;
+            return `[wrote: ${filePath.trim()} — ${fileContent.length} chars, ${lines} lines]`;
+          })
           .trim();
         messages.push({ role: 'assistant', content: cleanedContent || '' });
@@ -769,6 +1138,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         // ── Execute tool calls ────────────────────────────────────────────────
         if (toolCallsArray.length > 0) {
+          let completionCheckedThisTurn = false; // deduplicate _isTaskComplete across tool calls in same turn
           for (const toolCall of toolCallsArray) {
             if (controller.signal.aborted) break;
@@ -808,7 +1178,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             });
             console.log(`   [${agentId}] 🔧 ${name}: ${JSON.stringify(parsedArgs).slice(0, 120)}`);
             toolsUsed.push(name);
+            toolsUsedThisTurn++; // Fix 10: track per-turn for _isTaskComplete gating
             emptyRetries = 0; // reset on successful tool call
+            recentOutputs.length = 0; // reset repeated-output tracker on any tool execution
+            // Track consecutive browser/screenshot calls on a locally-built app.
+            // After 6 such calls the agent has browsed enough — check if done and stop.
+            if (name === 'browser' || name === 'screenshot_and_describe') localBrowserTurns++;
+            else if (name === 'write_file' || name === 'bash') localBrowserTurns = 0; // reset on real work
+            if (localBrowserTurns >= 6 && toolsUsed.filter(t => t === 'write_file').length > 0) {
+              const originalTask3 = messages.find(m => m.role === 'user')?.content || task;
+              const isDoneBrowse = await this._isTaskComplete(originalTask3, visibleContent || allOutput, controller.signal);
+              if (isDoneBrowse) {
+                console.log(`   [${agentId}] ✅ Done after ${localBrowserTurns} browser interactions — stopping`);
+                if (visibleContent) finalContent = visibleContent;
+                taskDoneEarly = true;
+                break;
+              }
+              // Not done yet after 6 browser calls — push a targeted hint rather than silently resetting.
+              // This fires every 6 browser calls to redirect the agent toward evaluation or completion.
+              console.log(`   [${agentId}] ⚠️ ${localBrowserTurns} browser interactions without completion — injecting guidance`);
+              messages.push({ role: 'user', content: `You have taken ${localBrowserTurns} screenshots/browser interactions. You need to complete the task or make progress.
+If you are trying to verify that a DYNAMIC feature works (timer counting down, animation playing, real-time updates), STOP using screenshots — they capture a single frozen moment and CANNOT prove motion or state change.
+Use browser evaluate instead to directly check JavaScript state:
+{"name":"browser","arguments":{"action":"evaluate","script":"document.querySelector('#display').textContent"}}
+Or to read a value, wait 2 seconds, and compare:
+{"name":"browser","arguments":{"action":"evaluate","script":"(function(){ return new Promise(r => { var t1 = document.body.innerText; setTimeout(() => r('before: ' + t1.slice(0,50) + ' | after: ' + document.body.innerText.slice(0,50)), 2000); }); })()"}}
+If the CODE is correct and the app LOOKS right, declare the task DONE — you do not need to prove every dynamic behavior via screenshot. State what you verified and what you built, then stop.` });
+              localBrowserTurns = 0; // reset so hint fires again after 6 more if still stuck
+            }
             // Loop detection: catch repeated single calls AND alternating A/B/A/B patterns.
             // Normalize curl commands: strip sleep prefix so "sleep 3 && curl ...URL" and
@@ -821,15 +1221,44 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             recentCalls.push(callKey);
             if (recentCalls.length > 6) recentCalls.shift();
+            // Bash-only window: write_file calls don't contaminate bash loop detection.
+            // A write_file between two bash loops was causing the detector to miss patterns
+            // like curl→cat→nohup→write_file(rewrite)→curl→cat→nohup (server-start loop).
+            if (name === 'bash') {
+              recentBashCalls.push(callKey);
+              if (recentBashCalls.length > 6) recentBashCalls.shift();
+              // Detect echo-append pattern: echo '...' >> file (building file line by line)
+              if (/echo\s+['"]/.test(parsedArgs.command || '') && />>\s*\S/.test(parsedArgs.command || '')) {
+                echoAppendCalls++;
+                if (echoAppendCalls >= 4) {
+                  const appendTarget = (parsedArgs.command || '').match(/>>[ ]*(\S+)/)?.[1] || 'the file';
+                  console.log(`   [${agentId}] ⚠️ echo-append loop (${echoAppendCalls}x) — injecting WRITE_FILE hint`);
+                  messages.push({ role: 'user', content: `STOP using echo >> to append code line by line — this wastes turns. You have already called echo >> ${echoAppendCalls} times. Use WRITE_FILE with the COMPLETE content of ${appendTarget} in ONE call instead:\n\nWRITE_FILE /abs/path/to/${appendTarget.split('/').pop()}\n\`\`\`\n...complete file content...\n\`\`\`` });
+                  echoAppendCalls = 0; // reset so hint only fires once per burst
+                }
+              } else {
+                echoAppendCalls = 0; // non-echo bash call resets the counter
+              }
+            } else if (name === 'write_file') {
+              // A successful write_file is progress; don't reset entirely but clear bash window
+              // so the loop detector starts fresh for the post-rewrite phase.
+              recentBashCalls.length = 0;
+              echoAppendCalls = 0;
+            }
             // Detect: same call 3x in a row (2x for screenshot — never valid to screenshot without a change)
             const screenshotLoop = name === 'screenshot_and_describe' && recentCalls.length >= 2 && recentCalls.slice(-2).every(c => c === callKey);
             const last3Same = screenshotLoop || (recentCalls.length >= 3 && recentCalls.slice(-3).every(c => c === callKey));
-            // Detect: alternating A,B,A,B pattern (last 4 calls)
+            // Detect: alternating A,B,A,B pattern (last 4 calls) — check both windows
             const last4 = recentCalls.slice(-4);
-            const abab = last4.length === 4 && last4[0] === last4[2] && last4[1] === last4[3] && last4[0] !== last4[1];
-            // Detect: A,B,C,A,B,C pattern (last 6)
+            const last4bash = recentBashCalls.slice(-4);
+            const abab = (last4.length === 4 && last4[0] === last4[2] && last4[1] === last4[3] && last4[0] !== last4[1])
+                      || (last4bash.length === 4 && last4bash[0] === last4bash[2] && last4bash[1] === last4bash[3] && last4bash[0] !== last4bash[1]);
+            // Detect: A,B,C,A,B,C pattern (last 6) — check both windows
             const last6 = recentCalls.slice(-6);
-            const abcabc = last6.length === 6 && last6[0] === last6[3] && last6[1] === last6[4] && last6[2] === last6[5];
+            const last6bash = recentBashCalls.slice(-6);
+            const abcabc = (last6.length === 6 && last6[0] === last6[3] && last6[1] === last6[4] && last6[2] === last6[5])
+                        || (last6bash.length === 6 && last6bash[0] === last6bash[3] && last6bash[1] === last6bash[4] && last6bash[2] === last6bash[5]);
             if (last3Same || abab || abcabc) {
               const pattern = last3Same ? 'same call 3x' : abab ? 'A/B/A/B alternating' : 'A/B/C repeating';
@@ -847,14 +1276,36 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
                 const openPort = openPortMatch ? openPortMatch[1] : '????';
                 loopFixMsg += `You are calling 'open http://localhost:${openPort}' repeatedly but the server is not running — opening the browser to a dead port does nothing. You must RESTART THE SERVER first:\n{"name":"bash","arguments":{"command":"pkill -f 'node.*${openPort}' 2>/dev/null; sleep 1; cd YOUR_PROJECT_DIR && nohup /usr/local/bin/node server.js > /tmp/server.log 2>&1 & sleep 3 && curl -s -o /dev/null -w '%{http_code}' http://localhost:${openPort}"}}\nIf curl returns 000, check the crash: bash cat /tmp/server.log. Fix the crash FIRST. Only call 'open' after curl returns 200.`;
               } else if (name === 'bash' && (loopCmd.includes('curl') || loopCmd.includes('http_code'))) {
-                loopFixMsg += `The server check is looping. Check /tmp/server.log for errors:\n{"name":"bash","arguments":{"command":"cat /tmp/server.log | tail -20"}}\nThen fix the actual error in the code. NEVER change the port.`;
+                // Auto-read crash log now so the hint can include the actual error
+                let crashLogNow = '';
+                try {
+                  crashLogNow = String(await this._executeTool('bash', { command: 'cat /tmp/server.log 2>/dev/null | tail -30 || echo "No server.log"' }, workDir, agentId)).trim();
+                } catch {}
+                let serverLoopHint = `The server is stuck in a crash-restart loop — curl keeps returning 000.\n\nLatest crash log:\n${crashLogNow}\n\n`;
+                // If crash log has a SyntaxError, auto-read the code snippet
+                const synMatch = crashLogNow.match(/^(\/[^\n:]+\.(?:js|ts|mjs|cjs)):(\d+)\n/m);
+                if (synMatch && /SyntaxError/.test(crashLogNow)) {
+                  const synFile = synMatch[1];
+                  const synLine = parseInt(synMatch[2], 10);
+                  let snippet = '';
+                  try {
+                    snippet = String(await this._executeTool('bash', {
+                      command: `awk 'NR>=${Math.max(1, synLine - 5)} && NR<=${synLine + 5} {printf "%4d: %s\\n", NR, $0}' "${synFile}" 2>/dev/null`
+                    }, workDir, agentId)).trim();
+                  } catch {}
+                  serverLoopHint += `⚠️  SyntaxError in ${synFile} at line ${synLine}${snippet ? `:\n\`\`\`\n${snippet}\n\`\`\`` : ''}.\n\n`;
+                  serverLoopHint += `Fix the syntax error:\n1. write_file to patch only the broken line (do NOT rewrite the whole file unless it is tiny)\n2. Then restart with nohup\nNEVER restart before fixing the syntax error — it will always crash again.`;
+                } else {
+                  serverLoopHint += `The error is shown above. Fix the code, then restart. Do NOT call curl or cat again before making a fix.`;
+                }
+                loopFixMsg += serverLoopHint;
               } else if (loopCmd.includes('npm install')) {
                 loopFixMsg += `npm install is looping — packages likely already installed. Skip it and start the server directly with nohup.`;
               } else if (name === 'bash' && (loopCmd.includes('/tmp/') && (loopCmd.includes('.js') || loopCmd.includes('node')) && loopCmd.includes('9223'))) {
                 loopFixMsg += `Your Node.js/CDP script is only READING the page — that is why nothing changes. You need to WRITE A NEW SCRIPT THAT CLICKS.\n\nReplace your /tmp script with one that clicks the target element:\n\nWRITE_FILE /tmp/cdp_click.js\n\`\`\`javascript\nconst ws = new WebSocket('ws://localhost:9223/devtools/page/TAB_ID_HERE');\nws.onopen = () => {\n  // Click element containing the text you need (change "Filter" to what you see on the page)\n  ws.send(JSON.stringify({id:1, method:'Runtime.evaluate', params:{expression: 'Array.from(document.querySelectorAll("a,button,input,span,div,th")).find(el=>el.textContent.trim().includes("Filter"))?.click() || "not found"', returnByValue:true}}));\n};\nws.onmessage = e => { console.log(JSON.parse(e.data)); ws.close(); };\nsetTimeout(() => ws.close(), 5000);\n\`\`\`\n\nThen run: bash → /usr/local/bin/node --experimental-websocket /tmp/cdp_click.js\n\nYou CAN click. You CAN interact. Stop saying you cannot — write the clicking script.`;
               } else if (name === 'screenshot_and_describe') {
                 const loopPort = (parsedArgs.url || '').match(/:(\d+)/)?.[1] || '????';
-                loopFixMsg += `You are calling screenshot_and_describe repeatedly — STOP. Taking the same screenshot over and over changes nothing. You have two choices:\n\nA) If the user asked a question or gave feedback — answer them with TEXT. You do NOT need a screenshot to reply to a conversation. Just write your response.\n\nB) If the app needs to be improved — make a CODE CHANGE first, then take ONE screenshot to verify:\n1. read_file the file that needs changing\n2. write_file with the improvement\n3. restart the server: bash pkill+nohup\n4. screenshot ONCE to verify\n\nDo NOT take another screenshot without first doing one of the above.`;
+                loopFixMsg += `You are calling screenshot_and_describe repeatedly — STOP. Taking the same screenshot over and over changes nothing.\n\nIf you are trying to verify DYNAMIC behavior (timer running, animation, countdown, live updates): screenshots CANNOT prove this — they capture a frozen moment. Use browser evaluate instead:\n{"name":"browser","arguments":{"action":"evaluate","script":"document.querySelector('#timer-display, .display, #display, [id*=time], [class*=time]')?.textContent || document.body.innerText.slice(0,200)"}}\nOr wait 2s and compare: {"name":"browser","arguments":{"action":"evaluate","script":"(function(){ return new Promise(r => { var t1 = document.body.innerText.slice(0,100); setTimeout(() => r({before:t1, after:document.body.innerText.slice(0,100)}), 2000); }); })()" }}\n\nOtherwise, you have two choices:\nA) If the code is correct and the app looks right — declare the task DONE. You do not need to screenshot every feature.\nB) If something specific is visually wrong — make a code change FIRST, then ONE screenshot to verify.`;
               } else {
                 loopFixMsg += `Observe the tool results above, identify what is specifically broken, then make a targeted fix. Do not repeat commands that already ran.`;
               }
@@ -862,9 +1313,31 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               messages.push({ role: 'user', content: loopFixMsg });
               // Don't fully reset — keep 1 entry so next identical call fires after 2 more (not 3)
               recentCalls.splice(0, recentCalls.length - 1);
+              recentBashCalls.splice(0, recentBashCalls.length - 1);
               break; // break inner tool loop, let model respond to hint
             }
+            // ── Read-loop detector (cross-turn) ──────────────────────────────
+            // Tracks how many times each file path has been read since the last write_file.
+            // If the agent reads the same file 3+ times without writing anything, it is stuck
+            // in a "read to plan" loop that never produces output — force it to write now.
+            if (name === 'read_file' && parsedArgs.path) {
+              const rp = parsedArgs.path;
+              const readCount = (fileReadCounts.get(rp) || 0) + 1;
+              fileReadCounts.set(rp, readCount);
+              if (readCount >= 3) {
+                const fname = path.basename(rp);
+                console.log(`   [${agentId}] 🔁 Read-loop: "${rp}" read ${readCount}x without a write — forcing write`);
+                fileReadCounts.set(rp, 0); // reset so hint can fire again if agent persists
+                messages.push({ role: 'user', content: `STOP. You have read ${fname} ${readCount} times in a row without writing anything. You already have the full file content in your context. Reading it again changes nothing.\n\nSTOP READING. Write your next WRITE_FILE now — put the updated ${fname} content in a fence:\n\nWRITE_FILE ${rp}\n\`\`\`\n...updated content...\n\`\`\`\n\nDo NOT read any more files. Write.` });
+                break; // break inner tool loop
+              }
+            }
+            // Any write_file clears the read counts — fresh slate after actual progress
+            if (name === 'write_file') {
+              fileReadCounts.clear();
+            }
             const result = await this._executeTool(name, parsedArgs, workDir, agentId);
             this.emit('tool_activity', { agentId, event: 'tool_end', tool: name, description: `✓ ${name}` });
@@ -874,15 +1347,71 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
               this.emit('agent_image', { agentId, image: result });
             }
-            // ── Bash: curl returned 000 = server not running — force log read ──
+            // ── Bash: curl result handling ────────────────────────────────────
             if (name === 'bash') {
               const resultStr = String(result).trim();
               const isCurlZero = resultStr === '000' || resultStr.endsWith('\n000') || /\b000$/.test(resultStr);
-              if (isCurlZero) {
-                const logRead = await this._executeTool('bash', { command: 'cat /tmp/server.log 2>/dev/null | tail -30 || echo "No server.log found"' }, workDir, agentId);
-                messages.push({ role: 'user', content: `[bash result]: 000\n\nThe server is NOT running — curl got 000 (connection refused). Here are the crash logs:\n\n${logRead}\n\nThe server crashed. Read the error above, fix the bug in the code, then restart. Do NOT assume it is running. Do NOT change the port. Fix the actual error.` });
+              const isCurl404 = resultStr === '404' || resultStr.endsWith('\n404') || /\b404$/.test(resultStr);
+              const isCurl200 = resultStr === '200' || resultStr.endsWith('\n200') || /\b200$/.test(resultStr);
+              if (isCurlZero || isCurl404) {
+                const logRead = await this._executeTool('bash', { command: 'cat /tmp/server.log 2>/dev/null | tail -40 || echo "No server.log found"' }, workDir, agentId);
+                if (isCurlZero) {
+                  // If crash log has a SyntaxError with file:line, auto-read the snippet to save the agent
+                  // a read_file round-trip and make the fix obvious
+                  let syntaxSnippet = '';
+                  const synErrMatch = String(logRead).match(/^(\/[^\n:]+\.(?:js|ts|mjs|cjs)):(\d+)\n/m);
+                  if (synErrMatch && /SyntaxError/.test(String(logRead))) {
+                    const synFile = synErrMatch[1];
+                    const synLine = parseInt(synErrMatch[2], 10);
+                    try {
+                      const snippet = await this._executeTool('bash', {
+                        command: `awk 'NR>=${Math.max(1, synLine - 8)} && NR<=${synLine + 8} {printf "%4d: %s\\n", NR, $0}' "${synFile}" 2>/dev/null`
+                      }, workDir, agentId);
+                      if (snippet && String(snippet).trim()) {
+                        syntaxSnippet = `\n\n⚠️  SYNTAX ERROR in ${synFile} near line ${synLine}. The relevant code:\n\`\`\`\n${snippet}\n\`\`\`\nFix the syntax error in that file BEFORE trying to restart.`;
+                      }
+                    } catch {}
+                  }
+                  messages.push({ role: 'user', content: `[bash result]: 000 (connection refused — server is NOT running)\n\nCrash log:\n${logRead}${syntaxSnippet}\n\nThe server crashed or never started. Fix the actual error shown above. Do NOT assume it is running. Do NOT change the port. Make a targeted fix to the code then restart.` });
+                } else {
+                  messages.push({ role: 'user', content: `[bash result]: 404 (server is running but root route not found)\n\nServer log:\n${logRead}\n\nCommon cause: static files path is wrong. In server.js: (1) express.static must use path.join(__dirname, 'public'); (2) any res.sendFile for the root route must use path.join(__dirname, 'public', 'index.html') — NEVER path.join(__dirname, 'index.html') or relative paths. Fix and restart. Do NOT rewrite the whole file.` });
+                }
                 continue;
               }
+              // ── curl 200: server confirmed running — open in AgentForge browser ──
+              // Platform responsibility: always show the user their app the moment it's live.
+              // Agent does not need to call 'open' — the platform handles it here.
+              if (isCurl200) {
+                const curlCmd = parsedArgs.command || '';
+                const portMatch = curlCmd.match(/localhost:(\d+)/);
+                if (portMatch) {
+                  const appUrl = `http://localhost:${portMatch[1]}`;
+                  const { opened } = await this._openInBrowser(appUrl, agentId);
+                  // ── Register project in global registry so other agents can find it ──
+                  try {
+                    const REGISTRY = '/tmp/agentforge/projects.json';
+                    const registry = existsSync(REGISTRY) ? JSON.parse(readFileSync(REGISTRY, 'utf8')) : {};
+                    // Derive a readable project name: prefer Desktop/Projects subdir name, else workDir basename
+                    let projectName = path.basename(workDir);
+                    const homeDir2 = process.env.HOME || '/tmp';
+                    const desktopProjects = `${homeDir2}/Desktop/Projects`;
+                    try {
+                      // Walk Desktop/Projects for the most recently modified dir — likely the active project
+                      const dirs = readdirSync(desktopProjects, { withFileTypes: true })
+                        .filter(e => e.isDirectory())
+                        .map(e => ({ name: e.name, mtime: statSync(path.join(desktopProjects, e.name)).mtimeMs }))
+                        .sort((a, b) => b.mtime - a.mtime);
+                      if (dirs.length > 0) projectName = dirs[0].name;
+                    } catch {}
+                    registry[portMatch[1]] = { port: parseInt(portMatch[1]), path: workDir, agentId, name: projectName, updated: new Date().toISOString() };
+                    writeFileSync(REGISTRY, JSON.stringify(registry, null, 2));
+                  } catch {}
+                  messages.push({ role: 'user', content: `[bash result]: 200 — server is running at ${appUrl}${opened ? '. App opened in browser and screenshot sent to user.' : '.'}\n\nNow call screenshot_and_describe with url:"${appUrl}" and send_to_user:true to verify it looks correct, then iterate to improve it.` });
+                  continue;
+                }
+              }
             }
             // ALL models get tool results fed back — no model should run blind.
@@ -890,12 +1419,90 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             // must be in context so the model can see what happened and react correctly.
             {
               const noThink = '';
+              // Deployment URL detection: if bash output contains a public HTTPS URL
+              // from a known hosting platform, extract it, persist it to the project registry,
+              // and tell the agent to report it.
+              if (name === 'bash') {
+                const resultStr = String(result);
+                const deployUrlMatch = resultStr.match(/https:\/\/[a-zA-Z0-9._-]+\.(railway\.app|vercel\.app|netlify\.app|fly\.dev|surge\.sh|pages\.dev|web\.app|github\.io|onrender\.com|up\.railway\.app)[^\s]*/);
+                if (deployUrlMatch) {
+                  const deployUrl = deployUrlMatch[0];
+                  console.log(`   [${agentId}] 🌐 Deployment URL detected: ${deployUrl}`);
+                  // Persist the live URL (and Railway project name if available) into the registry
+                  // so future agents know the deployed URL without re-running CLI commands.
+                  try {
+                    const REGISTRY = '/tmp/agentforge/projects.json';
+                    const registry = existsSync(REGISTRY) ? JSON.parse(readFileSync(REGISTRY, 'utf8')) : {};
+                    const entry = Object.values(registry).find(e => e.path === workDir || e.agentId === agentId);
+                    if (entry) {
+                      entry.liveUrl = deployUrl;
+                      // Capture Railway project name if railway status is available
+                      try {
+                        const { stdout: statusOut } = await execAsync('railway status 2>/dev/null', { cwd: workDir, timeout: 5000 });
+                        const projectMatch = statusOut.match(/Project:\s*(.+)/);
+                        if (projectMatch) entry.railwayProject = projectMatch[1].trim();
+                      } catch {}
+                      const key = Object.keys(registry).find(k => registry[k] === entry);
+                      if (key) {
+                        registry[key] = entry;
+                        writeFileSync(REGISTRY, JSON.stringify(registry, null, 2));
+                        console.log(`   [${agentId}] 💾 Saved live URL to registry: ${deployUrl}`);
+                      }
+                    }
+                  } catch {}
+                  messages.push({ role: 'user', content: `[bash result]:\n${resultStr.slice(0, 3000)}\n\nDeployment succeeded. The live URL is: ${deployUrl}\n\nReport this URL to the user as your final response.` });
+                  continue;
+                }
+              }
+              // After writing files, check if the task is complete — don't just blindly kick "Continue".
+              // Run _isTaskComplete after any write_file call (≥2 tools used so agent has done real work).
+              // Only check once per turn to avoid redundant LLM calls when multiple files are written.
+              // Note: visibleContent may be just "WRITE_FILE" (10 chars) for pure file-write turns — don't
+              // require long visible content here; the write_file result itself is sufficient evidence.
+              if (name === 'write_file' && toolsUsed.length >= 2 && !completionCheckedThisTurn) {
+                completionCheckedThisTurn = true;
+                const originalTask2 = messages.find(m => m.role === 'user')?.content || task;
+                // Use write_file result as context (includes path that was written) + any visible text
+                const completionContext = (visibleContent.length > 10 ? visibleContent + '\n' : '') + 'Just wrote: ' + String(result).slice(0, 500);
+                const isDoneEarly = await this._isTaskComplete(originalTask2, completionContext, controller.signal);
+                if (isDoneEarly) {
+                  console.log(`   [${agentId}] ✅ Task complete after write_file — stopping`);
+                  if (visibleContent) finalContent = visibleContent;
+                  taskDoneEarly = true;
+                  break; // break inner tool loop; outer loop checks taskDoneEarly
+                }
+              }
               if (isImageResult) {
                 const base64 = result.replace(/^data:image\/\w+;base64,/, '');
                 messages.push({ role: 'user', content: `[${name} result]: Screenshot captured. Continue with the next step.${noThink}`, images: [base64] });
               } else {
                 const resultText = isImageResult ? '[Screenshot captured]' : String(result).slice(0, 6000);
-                messages.push({ role: 'user', content: `[${name} result]:\n${resultText}\n\nContinue with the next step.${noThink}` });
+                // Fix 12/19: after writing an HTML file for a static task, automatically navigate the browser
+                // to the file:// URL so the agent's tab IS on the correct page before the next turn.
+                // Previously we only injected a guidance message, which models often ignored — jumping straight
+                // to screenshot_and_describe with no URL and getting a blank screenshot of the wrong tab.
+                let continueMsg = `Continue with the next step.${noThink}`;
+                if (name === 'write_file' && successfulScreenshots === 0) {
+                  const writtenPath = parsedArgs?.path || '';
+                  const isHtmlFile = /\.html?$/i.test(writtenPath);
+                  const taskLower2 = (messages.find(m => m.role === 'user')?.content || task).toLowerCase();
+                  const isStaticTask = isHtmlFile && !/\b(railway|vercel|render|netlify|fly\.io|heroku|deploy|server\.js|express|http\.createserver)\b/.test(taskLower2);
+                  if (isStaticTask && writtenPath) {
+                    const absolutePath = writtenPath.startsWith('~') ? writtenPath.replace(/^~/, process.env.HOME || '/Users/' + (workDir.split('/')[2] || 'user')) : writtenPath;
+                    // Fix 19: auto-navigate the browser tab to the file so it's already loaded.
+                    try {
+                      await browserAction({ action: 'navigate', url: `file://${absolutePath}` }, agentId);
+                      console.log(`   [${agentId}] 🌐 Auto-navigated to file://${absolutePath}`);
+                      continueMsg = `File written and opened in browser at file://${absolutePath}. Now take a screenshot to verify it looks correct:\n{"name":"browser","arguments":{"action":"screenshot_and_describe","check_for":"the complete app with all required features"}}\n${noThink}`;
+                    } catch (navErr) {
+                      // Navigation failed — fall back to instruction-only
+                      continueMsg = `File written. YOUR NEXT ACTION MUST BE THIS — navigate to the file first, then screenshot:\n1. {"name":"browser","arguments":{"action":"navigate","url":"file://${absolutePath}"}}\n2. {"name":"browser","arguments":{"action":"screenshot_and_describe","check_for":"the complete app"}}\nDO NOT call screenshot_and_describe without url first — you will get a blank screenshot.${noThink}`;
+                    }
+                  }
+                }
+                messages.push({ role: 'user', content: `[${name} result]:\n${resultText}\n\n${continueMsg}` });
                 if (name === 'screenshot_and_describe') {
                   const screenshotResult = String(result);
@@ -917,26 +1524,45 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
                   }
                   // Successful screenshot of a build task — push to make a code change
                   else if (isLocalhost) {
-                    messages.push({ role: 'user', content: `You have seen the current state. Now make your next improvement: read_file the code, write_file the fix, restart server, then screenshot once to verify.` });
+                    // Catch placeholder/hello world pages on localhost — force the model to keep building
+                    const screenshotText = String(result).toLowerCase();
+                    const isPlaceholder = (
+                      screenshotText.includes('hello world') ||
+                      screenshotText.includes('cannot get /') ||
+                      (screenshotText.includes('express') && screenshotText.includes('error')) ||
+                      // Only match "placeholder" as an unbuilt-page indicator, not Gemini describing
+                      // a UI element's placeholder attribute (e.g. "Placeholder Text: Start typing...")
+                      /\bplaceholder\s*(page|app|content|site)\b/.test(screenshotText) ||
+                      screenshotText.includes('coming soon') ||
+                      (screenshotText.includes('blank') && !screenshotText.includes('not blank'))
+                    );
+                    if (isPlaceholder) {
+                      messages.push({ role: 'user', content: `The screenshot shows a placeholder or empty page — the app is not done yet. Continue writing complete working code. Identify which files still need real implementation and write them now.${noThink}` });
+                    } else {
+                      successfulScreenshots++;
+                      if (successfulScreenshots >= 2) {
+                        // Agent has confirmed the app works at least twice. Time to wrap up rather
+                        // than looping indefinitely on minor improvements.
+                        messages.push({ role: 'user', content: `The app is working correctly (confirmed twice). Your task is complete. Write your final reply now: describe what you built, what it does, and how to use it. Do NOT make any more code changes — just reply in text.` });
+                      } else {
+                        messages.push({ role: 'user', content: `The app is running. If there is one specific thing that is clearly missing or broken, fix it now (read_file → write_file → restart → screenshot). If the app already fulfills all the requirements, skip improvements and write your final reply instead.` });
+                      }
+                    }
                   }
                   // Successful screenshot of a public URL — agent is doing research, let it reason
                 }
-                // Catch placeholder/hello world pages — force the model to keep building
-                const screenshotText = String(result).toLowerCase();
-                const isPlaceholder = (
-                  screenshotText.includes('hello world') ||
-                  screenshotText.includes('cannot get /') ||
-                  (screenshotText.includes('express') && screenshotText.includes('error')) ||
-                  screenshotText.includes('placeholder') ||
-                  screenshotText.includes('coming soon') ||
-                  (screenshotText.includes('blank') && !screenshotText.includes('not blank'))
-                );
-                if (isPlaceholder) {
-                  messages.push({ role: 'user', content: `The screenshot shows a placeholder or empty page — the app is not done yet. Continue writing complete working code. Identify which files still need real implementation and write them now.${noThink}` });
-                }
               }
             }
           }
+          // Token cap fired mid-WRITE_FILE — the last file written is truncated.
+          // Alert the agent so it knows to complete the file instead of immediately starting the server.
+          if (tokenCapTruncatedFile) {
+            tokenCapTruncatedFile = false;
+            console.log(`   [${agentId}] ⚠️ Token cap truncated a file — injecting continuation hint`);
+            messages.push({ role: 'user', content: `⚠️ Your last response was cut off — the file was only partially written. The server will crash with a SyntaxError.\n\nDo NOT run the server yet. First complete the truncated file: read_file it to see where it was cut, then write_file to add the missing code (closing braces, remaining routes, etc.). Make sure the file is syntactically complete before starting the server.` });
+            tokenCapTruncatedFile = false;
+          }
+          if (taskDoneEarly) break; // completion language detected inside tool loop — stop the turn loop
           continue; // loop back for next model turn
         }
@@ -946,13 +1572,78 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
           const hasContent = combined.trim().length > 30;
           const isEmpty = combined.trim().length === 0;
+          // Structural: agent writing "Running command..." or "WRITE_FILE" headers but no actual tool JSON.
+          // Happens when the model plans multiple steps using the header format but forgets the JSON body.
+          const hasFakeHeaders = (streamContent.match(/^Running command\.\.\./gm) || []).length >= 2 ||
+            /^WRITE_FILE\s*$/m.test(streamContent); // WRITE_FILE with no path on same line
+          if (hasFakeHeaders) {
+            console.log(`   [${agentId}] ⚡ Turn ${turn}: agent writing planning headers without tool calls — showing correct format`);
+            messages.push({ role: 'user', content: `You are writing "Running command..." or "WRITE_FILE" as planning text but not outputting actual tool calls.\n\nSTOP PLANNING. Execute now. First step: create the project directory:\n{"name":"bash","arguments":{"command":"mkdir -p ${projectsDir}/PROJECT_NAME && cd ${projectsDir}/PROJECT_NAME && /usr/local/bin/npm init -y && /usr/local/bin/npm install express"}}\n\nThen write files with ABSOLUTE paths:\nWRITE_FILE ${projectsDir}/PROJECT_NAME/server.js\n\`\`\`\nconst express = require('express');\nconst PORT = process.env.PORT || ${assignedPort};\n// complete file here\n\`\`\`\n\nOutput ONLY the bash JSON tool call right now. Nothing else.` });
+            continue;
+          }
           // Structural: truncated JSON — model started a tool call but stream ended early
           const hasTruncatedJson = /\{"name"\s*:\s*"(bash|web_fetch|screenshot_and_describe|read_file|write_file|list_directory)"/i.test(streamContent) && Object.keys(streamToolCalls).length === 0;
           if (hasTruncatedJson) {
-            console.log(`   [${agentId}] ⚡ Turn ${turn}: truncated JSON tool call — kicking to re-output`);
-            messages.push({ role: 'user', content: 'Your tool call was cut off. Output the complete JSON on one line now.' });
+            consecutiveTruncations++;
+            console.log(`   [${agentId}] ⚡ Turn ${turn}: truncated JSON tool call (${consecutiveTruncations}x) — kicking to re-output`);
+            // WRITE_FILE called as JSON — model is trying {"name":"WRITE_FILE","path":"...","content":"..."} which
+            // always truncates because file content doesn't fit in a JSON string. Redirect immediately, every time.
+            const isJsonWriteFileCall = /\{"name"\s*:\s*"WRITE_FILE"\s*,\s*"(path|arguments)"/i.test(streamContent);
+            if (isJsonWriteFileCall) {
+              const pathMatch = streamContent.match(/"path"\s*:\s*"([^"]+)"/);
+              const filePath = pathMatch ? pathMatch[1] : '/Users/hamp/Desktop/Projects/PROJECTNAME/filename.js';
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: WRITE_FILE used as JSON tool call — correcting to fence format`);
+              consecutiveTruncations = 0;
+              // Permanently inject reminder into system message so it survives all context trims
+              if (messages[0] && messages[0].role === 'system' && !messages[0].content.includes('NEVER use {"name":"WRITE_FILE"')) {
+                messages[0] = { ...messages[0], content: messages[0].content + `\n\n⚠️ WRITE_FILE REMINDER (injected after format error): NEVER use {"name":"WRITE_FILE",...} JSON. ALWAYS use the code-fence format:\nWRITE_FILE /absolute/path/to/file\n\`\`\`\nfull file content\n\`\`\`` };
+              }
+              messages.push({ role: 'user', content: `WRITE_FILE is NOT a JSON tool. It uses a code-fence format — the ONLY correct way:\n\nWRITE_FILE ${filePath}\n\`\`\`\n...complete file content here...\n\`\`\`\n\nOutput ONLY the WRITE_FILE fence now. No JSON, no explanation.` });
+              continue;
+            }
+            // Fix 8: bash command that contains WRITE_FILE — model is confusing WRITE_FILE fence with a shell command.
+            const isBashWriteFile = /\{"name"\s*:\s*"bash"[\s\S]{0,300}WRITE_FILE\s+(\/\S+)/i.test(streamContent);
+            // Fix 18: bash command embedding file content via node -e writeFileSync, echo, cat, etc.
+            // These always truncate because the file content doesn't fit in max_tokens.
+            // After 3+ consecutive truncations, escalate with a firm WRITE_FILE redirect.
+            const isBashEmbedFile = /\{"name"\s*:\s*"bash"[\s\S]{0,200}(writeFileSync|echo|cat\s+<<|printf)[\s\S]{0,200}\.(css|html|js|ts|py|json|txt|md)/i.test(streamContent);
+            const fileNameMatch = streamContent.match(/writeFileSync\s*\(\s*['"`]?([^'"`\s,)]+\.[a-z]{1,5})/i)
+                               || streamContent.match(/>\s*['"]?([A-Za-z0-9_.-]+\.(css|html|js|ts|py|json|txt|md))['"]?/i);
+            const fname = fileNameMatch ? fileNameMatch[1] : 'server.js';
+            if (isBashWriteFile) {
+              const pathMatch = streamContent.match(/WRITE_FILE\s+(\/[^\s\\'"]+)/);
+              const filePath = pathMatch ? pathMatch[1] : '/path/to/file';
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: bash+WRITE_FILE pattern — correcting format`);
+              messages.push({ role: 'user', content: `WRITE_FILE is NOT a bash command. Use the WRITE_FILE fence format directly at the top level (outside any bash call):\n\nWRITE_FILE ${filePath}\n\`\`\`\n...complete file content here...\n\`\`\`\n\nOutput ONLY the WRITE_FILE fence now — do NOT wrap it in a bash tool call.` });
+            } else if (isBashEmbedFile || consecutiveTruncations >= 3) {
+              // Large file embedded in bash always truncates. Redirect to WRITE_FILE.
+              const dirMatch = streamContent.match(/cd\s+([\w/~.-]+)/);
+              const dir = dirMatch ? dirMatch[1] : '/absolute/path/to/dir';
+              const truncCount = consecutiveTruncations;
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: bash-embed-file pattern (${truncCount}x) — redirecting to WRITE_FILE`);
+              consecutiveTruncations = 0; // reset after escalation
+              // Check if the task already had a successful write_file — if so, remind agent the file exists
+              const hadPriorWrite = toolsUsed.filter(t => t === 'write_file').length > 0;
+              const priorWriteHint = hadPriorWrite
+                ? `\n\nNOTE: You already wrote a file earlier in this task. Check if you still need to write more files, or if you should instead verify the existing file works.`
+                : '';
+              messages.push({ role: 'user', content: `STOP. Your bash command embeds file content as a string and will ALWAYS be truncated — it cannot work. You have tried this ${truncCount} times in a row.\n\nTo write a file, ALWAYS use WRITE_FILE which handles files of any size:\n\nWRITE_FILE ${dir}/${fname}\n\`\`\`\n...complete file content here...\n\`\`\`${priorWriteHint}` });
+            } else {
+              const isEchoFileWrite = /\{"name"\s*:\s*"bash"[\s\S]{0,300}(echo|cat\s+<<|printf)[\s\S]{0,200}>\s*\S+\.(css|html|js|ts|py|json|txt|md)/i.test(streamContent);
+              if (isEchoFileWrite) {
+                const fnMatch = streamContent.match(/>\s*['"]?(\S+\.(css|html|js|ts|py|json|txt|md))['"]?/i);
+                const fn = fnMatch ? fnMatch[1] : 'the file';
+                messages.push({ role: 'user', content: `Your bash echo/cat command is too large and will always be truncated. You MUST use WRITE_FILE instead — it handles any file size:\n\nWriting ${fn}...\nWRITE_FILE /absolute/path/to/${fn}\n\`\`\`\n...complete file content here...\n\`\`\`` });
+              } else {
+                messages.push({ role: 'user', content: 'Your tool call was cut off. Output the complete JSON on one line now.' });
+              }
+            }
             continue;
           }
+          consecutiveTruncations = 0; // reset on any successful parse
           // Structural: empty response — model produced nothing
           if (isEmpty) {
@@ -965,23 +1656,150 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             console.log(`   [${agentId}] ⚠️ Turn ${turn}: empty after 3 retries`);
           }
+          // Structural: model echoed tool result / non-JSON bracket text as plain output.
+          // visibleContent=0 despite having raw content means inJsonBlob fired on a false-positive
+          // (e.g. "[bash result]: ..." starts with "[") or model output was all inside <think>.
+          // Either way: no tool calls, nothing visible, task not done — kick it to continue.
+          if (!isEmpty && visibleContent.length === 0 && toolsUsedThisTurn === 0 && toolsUsed.length > 0) {
+            console.log(`   [${agentId}] ⚡ Turn ${turn}: raw output with 0 visible content and no tool calls — model echoed tool result or thought-only response, kicking to continue`);
+            messages.push({ role: 'user', content: 'You echoed a result instead of making your next tool call. Keep going — call the next tool now.' });
+            continue;
+          }
+          // Repeated output detection — context overflow causes model to output same text repeatedly.
+          // Collapse whitespace and keep only the first 300 chars of the combined output, then compare.
+          // If the last 3 recorded outputs are identical, hard-stop and use the current output as final.
+          if (hasContent) {
+            const normalizedOutput = combined.trim().replace(/\s+/g, ' ').slice(0, 300);
+            recentOutputs.push(normalizedOutput);
+            if (recentOutputs.length > 4) recentOutputs.shift();
+            // Check: last 3 outputs identical (context maxed — repeating same text)
+            const last3Same = recentOutputs.length >= 3 &&
+              recentOutputs[recentOutputs.length - 1] === recentOutputs[recentOutputs.length - 2] &&
+              recentOutputs[recentOutputs.length - 2] === recentOutputs[recentOutputs.length - 3];
+            if (last3Same) {
+              console.log(`   [${agentId}] 🛑 Repeated identical output detected — context likely maxed. Hard-stopping.`);
+              finalContent = combined.trim();
+              break;
+            }
+          }
+          // Structural: agent outputting code as chat text instead of writing files.
+          // Detected by markdown code fences (```html/css/js) in visible output.
+          // This happens when tool calls fail repeatedly and the agent falls back to showing code.
+          // Redirect to WRITE_FILE — never accept code dumps as a substitute for file writes.
+          if (hasContent && /```(html|css|js|javascript|typescript|python|json)/i.test(visibleContent)) {
+            console.log(`   [${agentId}] ⚡ Turn ${turn}: agent dumping code as chat text — redirecting to WRITE_FILE`);
+            messages.push({ role: 'user', content: 'Do NOT show code in chat. You MUST write files to disk using WRITE_FILE:\n\nWriting filename.ext...\nWRITE_FILE /absolute/path/to/filename.ext\n```\n...complete file content here...\n```\n\nWrite every file now.' });
+            continue;
+          }
           // Structural: agent hasn't used any tools yet — it must act before it can answer
           if (toolsUsed.length === 0 && hasContent) {
-            console.log(`   [${agentId}] ⚡ Turn ${turn}: no tools used yet — kicking to act`);
-            messages.push({ role: 'user', content: 'Make your first tool call now.' });
+            // Ask the LLM whether the text response actually satisfies the task.
+            // If complete, stop. If not, kick into tool use.
+            if (visibleContent.length > 50) {
+              const originalTask = messages.find(m => m.role === 'user')?.content || task;
+              const isDone = await this._isTaskComplete(originalTask, visibleContent, controller.signal);
+              if (isDone) {
+                console.log(`   [${agentId}] ✅ Turn ${turn}: text-only response satisfies task — stopping`);
+                if (visibleContent) finalContent = visibleContent;
+                break;
+              }
+            }
+            noToolKicks++;
+            const refusalPhrases = ['i cannot', 'i am unable', 'i don\'t have', 'i do not have', 'not possible', 'impossible', 'cannot be done', 'outside my', 'beyond my', 'you must use', 'you should use', 'you will need to', 'recommend using'];
+            const isRefusing = refusalPhrases.some(p => visibleContent.toLowerCase().includes(p));
+            // Also catch "I can't do X but I'll build a prototype/concept instead" pivot patterns
+            const isDowngrading = /\b(prototype|concept|simplified|scaled.down|limited version|mock.up|demo|without.*backend|without.*server|front.?end only|client.?side only)\b/i.test(visibleContent) && /\b(i (can'?t|cannot|am unable|won'?t|will not)|not possible|too complex|beyond|outside)\b/i.test(visibleContent);
+            if (isRefusing || isDowngrading) {
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: agent ${isDowngrading ? 'downgrading task' : 'refusing task'} — overriding`);
+              messages.push({ role: 'user', content: 'Do NOT explain what you cannot do. Do NOT build a prototype or simplified version unless explicitly asked. Build the real thing. You have bash, Node.js, npm, and a full browser. Start executing now — first tool call only, no text.' });
+            } else if (noToolKicks >= 3) {
+              // Agent has been kicked 3+ times and still not calling tools — give an explicit example
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: no tools after ${noToolKicks} kicks — showing exact format`);
+              messages.push({ role: 'user', content: `STOP writing plans. You have been asked ${noToolKicks} times and have not called a single tool.\n\nHere is exactly what a tool call looks like — output ONLY this, right now:\n{"name":"bash","arguments":{"command":"ls ${workDir}"}}\n\nNothing before it. Nothing after it. No "Running command...", no explanation, no plan. Just that one line of JSON. DO IT NOW.` });
+            } else {
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: no tools used yet — kicking to act (${noToolKicks})`);
+              messages.push({ role: 'user', content: 'Stop planning. Make your first tool call now. Output only the JSON, nothing else.' });
+            }
             continue;
           }
+          noToolKicks = 0; // reset when tools are actually used
-          // Semantic: ask the LLM whether the task is actually complete.
-          // This replaces all regex-based intent detection — the model judges its own output.
+          // Mid-task refusal detection — agent used some tools but then refused to continue.
+          // e.g. "I cannot access X" after reading files. Override with capability reminder.
+          // NEVER accept defeat — always push harder with alternative approaches.
           if (hasContent && toolsUsed.length > 0) {
+            const midRefusalPhrases = [
+              'i cannot', 'i am unable', 'i do not have', "i don't have", 'not possible', 'impossible',
+              'cannot be done', 'outside my capabilities', 'beyond my capabilities', 'you must use',
+              'you will need to', 'you should use',
+              // Deployment give-up patterns
+              'all.*methods failed', 'methods have failed', 'deployment.*failed', 'failed.*deployment',
+              'cannot provide a', 'i must stop', 'have concluded', 'since all', 'every attempt',
+              'all automated', 'i have tried', 'methods failed sequentially',
+            ];
+            const isMidRefusing = midRefusalPhrases.some(p => {
+              if (p.includes('.*')) return new RegExp(p, 'i').test(visibleContent);
+              return visibleContent.toLowerCase().includes(p);
+            });
+            // Also catch mid-task downgrade: agent did work but is wrapping up with "I cannot fully
+            // replicate X" or pivoting to a framework/stub instead of real implementation.
+            const isMidDowngrading = /\b(prototype|concept|simplified|scaled.down|limited version|mock.up|demo|without.*backend|without.*server|front.?end only|client.?side only|framework only|basic framework|foundational framework)\b/i.test(visibleContent)
+              || /\bi cannot fully (replicate|implement|build|create)/i.test(visibleContent)
+              || /\bthe application is ready for you to interact\b/i.test(visibleContent);
+            if (isMidRefusing || isMidDowngrading) {
+              midRefusalKicks++;
+              const _taskText = messages.find(m => m.role === 'user')?.content || task;
+              const isDeployTask = /\b(deploy|railway|vercel|render|netlify|fly\.io|heroku|public url|live url|host|hosting)\b/i.test(_taskText);
+              if (midRefusalKicks >= 3 && isDeployTask) {
+                // Deployment specifically — try alternative deploy strategy, never give up
+                console.log(`   [${agentId}] ⚡ Turn ${turn}: deployment refusal ${midRefusalKicks}x — forcing alternative strategy`);
+                messages.push({ role: 'user', content: `STOP GIVING UP. Deployment is NOT optional. You have bash and Railway CLI at ~/.npm-global/bin/railway.\n\nExact commands to deploy:\n1. cd to project dir\n2. ~/.npm-global/bin/railway up --detach\n3. If "No service" error: ~/.npm-global/bin/railway up --service PROJECT_NAME --detach\n4. Get URL: ~/.npm-global/bin/railway domain\n\nIf Railway fails entirely, fall back: cd project && nohup node server.js > /tmp/server.log 2>&1 & then use ngrok or use python3 -m http.server in public/ dir and report the local URL.\n\nExecute a bash tool call RIGHT NOW. Do not explain. Do not summarize. Run the command.` });
+              } else if (midRefusalKicks >= 3) {
+                // Genuinely stuck on non-deploy task — try one more different approach before giving up
+                console.log(`   [${agentId}] ⚡ Turn ${turn}: mid-task refusal ${midRefusalKicks}x — forcing different approach`);
+                messages.push({ role: 'user', content: `You keep saying you cannot do this, but giving up is NOT acceptable. Try a completely different approach. You have bash, Node.js, npm, WRITE_FILE, and a full browser. What is the simplest possible working solution? Do it now — execute a tool call immediately.` });
+              } else if (isMidDowngrading) {
+                console.log(`   [${agentId}] ⚡ Turn ${turn}: mid-task downgrade detected — overriding (${midRefusalKicks})`);
+                messages.push({ role: 'user', content: `Do NOT deliver a framework, prototype, or stub. Build the real thing. You have bash, Node.js, npm, canvas, and a full browser. Keep going — implement it fully now.` });
+              } else {
+                console.log(`   [${agentId}] ⚡ Turn ${turn}: mid-task refusal detected — overriding (${midRefusalKicks})`);
+                messages.push({ role: 'user', content: `You have bash access and can run any shell command. Stop saying you cannot. Try a different approach. Execute a tool call now — no explanations.` });
+              }
+              continue;
+            }
+          }
+          midRefusalKicks = 0; // reset when agent proceeds normally
+          // Semantic: ask the LLM whether the task is actually complete.
+          // Fix 10: only fire when the CURRENT turn actually used tools (toolsUsedThisTurn > 0),
+          // OR when many turns have passed (turn >= 5). Using the cumulative toolsUsed.length caused
+          // premature kicks on mid-plan text outputs (agent says "I will click the buttons" → gets
+          // kicked → abandons button-click plan and starts a Node server instead).
+          if (hasContent && (toolsUsedThisTurn > 0 || turn >= 5) && visibleContent.length > 100) {
+            if (successfulScreenshots >= 2) {
+              console.log(`   [${agentId}] ✅ Turn ${turn}: app confirmed twice by screenshots — accepting final output`);
+              if (visibleContent) finalContent = visibleContent;
+              break;
+            }
             const originalTask = messages.find(m => m.role === 'user')?.content || task;
             const isDone = await this._isTaskComplete(originalTask, combined, controller.signal);
             if (!isDone) {
-              console.log(`   [${agentId}] ⚡ Turn ${turn}: LLM says task incomplete — kicking`);
-              messages.push({ role: 'user', content: 'You have not completed the task yet. Try a different approach and keep going.' });
+              incompleteKicks++;
+              console.log(`   [${agentId}] ⚡ Turn ${turn}: LLM says task incomplete — kicking (${incompleteKicks}/3)`);
+              // After 3 consecutive incomplete verdicts, the agent likely has the answer
+              // but is adding self-doubt text. Force-complete to stop the spiral.
+              if (incompleteKicks >= 3) {
+                console.log(`   [${agentId}] 🛑 3 incomplete verdicts — forcing completion with current output`);
+                if (visibleContent) finalContent = visibleContent;
+                break;
+              }
+              messages.push({ role: 'user', content: 'The task is not complete yet. Continue making progress.' });
               continue;
             }
+            incompleteKicks = 0; // reset on success
             console.log(`   [${agentId}] ✅ Turn ${turn}: LLM confirmed task complete`);
           }
         }
@@ -993,6 +1811,19 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
       }
       if (!finalContent && allOutput) finalContent = allOutput;
+      // Final safety strip — remove any <think> blocks that leaked through the per-token filter
+      if (finalContent) finalContent = finalContent.replace(/<think>[\s\S]*?<\/think>/g, '').replace(/<\/?think>/g, '').trim();
+      // Quality gate: reject finalContent that is just a tool header with no real text.
+      // e.g. "WRITE_FILE" or "{\"name\":" — these are tool invocations, not agent replies.
+      if (finalContent) {
+        const fc = finalContent.trim();
+        const isToolHeader = /^WRITE_FILE\b|^READ_FILE\b|^\{"name":|^{"name":/.test(fc) || fc.length < 15;
+        if (isToolHeader) {
+          console.log(`   [${agentId}] ⚠️ finalContent looks like a tool header ("${fc.slice(0, 40)}") — requesting summary`);
+          finalContent = '';
+        }
+      }
       // If still no output (model did only tool calls, never wrote text), ask for a summary.
       // Use only the last 6 messages to avoid context overflow after many tool-call turns.
@@ -1067,6 +1898,8 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
       const duration = Date.now() - startTime;
       this.activeAgents.delete(agentId);
+      this._taskVisionModel = null;
+      this._taskProviderKeys = null;
       this.emit('agent_completed', {
         agentId,
@@ -1076,10 +1909,13 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
       });
       console.log(`\n✅ [Ollama] Agent ${agentId} completed in ${(duration / 1000).toFixed(2)}s\n`);
+      releaseAgentTab(agentId);
       return { success: true, agentId, duration, result: { output: finalContent } };
     } catch (err) {
       this.activeAgents.delete(agentId);
+      this._taskVisionModel = null;
+      this._taskProviderKeys = null;
       if (err.name === 'AbortError' || controller.signal.aborted) {
         this.emit('agent_cancelled', { agentId });
@@ -1114,6 +1950,26 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
     return Array.from(this.activeAgents.values());
   }
+  // ─── Open URL in AgentForge browser ──────────────────────────────────────
+  // Single abstraction for navigating the user-facing browser: navigate, pause for render, screenshot, emit 'agent_image'.
+  // Uses browserAction (puppeteer-core) — never raw CDP WebSocket directly.
+  // Called by: bash 'open' intercept, curl 200 auto-launch.
+  async _openInBrowser(url, agentId = 'agent') { // resolves { opened: boolean }; errors are caught and logged below
+    try {
+      await browserAction({ action: 'navigate', url }, agentId);
+      await new Promise(r => setTimeout(r, 1500)); // fixed 1.5s pause to let the page render
+      const shot = await browserAction({ action: 'screenshot' }, agentId);
+      if (shot && shot.__screenshot) { // only emit when a screenshot payload actually came back
+        this.emit('agent_image', { agentId, image: `data:image/png;base64,${shot.base64}` }); // forwarded as a PNG data URL
+      }
+      console.log(`   [${agentId}] 🌐 Opened ${url} in AgentForge browser`);
+      return { opened: true };
+    } catch (err) { // best-effort: log the failure and report it to the caller instead of throwing
+      console.log(`   [${agentId}] ⚠️ _openInBrowser(${url}): ${err.message}`);
+      return { opened: false };
+    }
+  }
   // ─── Tool execution ───────────────────────────────────────────────────────
   async _executeTool(name, args, workDir, agentId = 'agent') {
@@ -1144,40 +2000,32 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             }
           }
-          // Intercept "open http://..." — navigate the AgentForge CDP browser directly,
-          // then auto-screenshot so the agent immediately sees what it built.
+          // Intercept bash calls where command is exactly a tool name — model confused tool names
+          // with CLI commands. e.g. {"name":"bash","arguments":{"command":"screenshot_and_describe"}}
+          // Most common pattern: screenshot_and_describe / web_fetch called inside bash.
+          const cmdTrimmed = args.command.trim().replace(/\s+.*$/, ''); // first word only
+          if (cmdTrimmed === 'screenshot_and_describe') {
+            console.log(`   [${agentId}] 🔀 bash("screenshot_and_describe") → redirecting to screenshot_and_describe tool`);
+            const urlMatch = args.command.match(/https?:\/\/\S+/);
+            const result = await this._screenshotAndDescribe(urlMatch ? urlMatch[0] : null, null, agentId);
+            if (this._lastScreenshotData) { this.emit('agent_image', { agentId, image: this._lastScreenshotData }); this._lastScreenshotData = null; }
+            return result;
+          }
+          // Intercept "open http://..." — navigate the AgentForge browser via _openInBrowser,
+          // then get an AI description so the agent can reason about what it built.
           const openUrlMatch = args.command.trim().match(/^open\s+(https?:\/\/\S+)/);
           if (openUrlMatch) {
             const targetUrl = openUrlMatch[1];
-            let openedViaCDP = false;
-            try {
-              const newTabRes = await fetch('http://127.0.0.1:9223/json/new', { method: 'PUT', signal: AbortSignal.timeout(3000) });
-              const newTabData = await newTabRes.json();
-              const tabWs = new WebSocket(`ws://127.0.0.1:9223/devtools/page/${newTabData.id}`);
-              await new Promise(r => tabWs.on('open', r));
-              await new Promise(r => {
-                let navigated = false;
-                tabWs.send(JSON.stringify({ id: 1, method: 'Page.navigate', params: { url: targetUrl } }));
-                tabWs.on('message', () => { if (!navigated) { navigated = true; tabWs.close(); r(); } });
-                setTimeout(() => { tabWs.close(); r(); }, 3000);
-              });
-              openedViaCDP = true;
-            } catch {
-              // CDP unavailable — fall through to OS open
-              try { await execAsync(`open "${targetUrl}"`); } catch {}
-            }
-            // Auto-screenshot after opening so the agent sees what it built.
-            // Wait for page to load, then call screenshot_and_describe.
-            await new Promise(r => setTimeout(r, 2500));
+            const { opened } = await this._openInBrowser(targetUrl, agentId);
+            // Get AI description for agent context (screenshot already emitted by _openInBrowser)
+            await new Promise(r => setTimeout(r, 800));
             try {
-              const screenshotResult = await this._executeTool('screenshot_and_describe', {
-                url: targetUrl,
-                check_for: 'the running application',
-                send_to_user: true
-              }, workDir, agentId);
-              return `Opened ${targetUrl} in browser${openedViaCDP ? ' (AgentForge browser)' : ''}.\n\nVisual snapshot of what is currently visible:\n${screenshotResult}`;
+              const desc = await this._screenshotAndDescribe(targetUrl, 'the running application', agentId);
+              this._lastScreenshotData = null; // suppress duplicate emit — raw already sent above
+              return `Opened ${targetUrl}${opened ? ' in AgentForge browser' : ''}.\n\nWhat is currently visible:\n${desc}`;
             } catch {
-              return `Opened ${targetUrl} in browser. (Screenshot failed — verify with screenshot_and_describe)`;
+              return `Opened ${targetUrl}${opened ? ' in AgentForge browser' : ''}.`;
             }
           }
@@ -1186,18 +2034,39 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
           let bashCwd = workDir;
           const _home = process.env.HOME || '/tmp';
           try { if (!existsSync(bashCwd)) bashCwd = _home; } catch { bashCwd = _home; }
-          // Background commands (ending with &) return no stdout — the model interprets
-          // silence as failure and loops. Run them, then read back any log file to confirm.
+          // Inject a PATH that includes the directories needed to find node/npm/python3
+          // regardless of how the worker was started (nohup/launchd strip the user PATH).
+          // process.execPath is the node binary running this worker — its directory always
+          // contains npm too, and is correct on any machine/version/install method.
+          const bashEnv = {
+            ...process.env,
+            PATH: [
+              path.dirname(process.execPath), // node + npm, always matches running version
+              '/usr/local/bin',               // homebrew, system tools
+              '/usr/local/sbin',
+              process.env.HOME ? `${process.env.HOME}/.npm-global/bin` : '',
+              process.env.PATH || '',
+            ].filter(Boolean).join(':'),
+          };
+          // Background commands (ending with &): use spawn with detached+stdio:ignore so the
+          // child process is fully detached from our pipe FDs and returns immediately.
+          // Using execAsync here hangs for the full 120s timeout because the background process
+          // inherits the exec pipe and keeps it open as long as the server runs.
           const isBackground = /&\s*$/.test(args.command.trim());
-          const { stdout, stderr } = await execAsync(args.command, {
-            cwd: bashCwd,
-            timeout: 120000,
-            maxBuffer: 1024 * 1024 * 2 // 2MB
-          });
-          const out = (stdout + stderr).trim();
-          if (isBackground && !out) {
-            // Give the process a moment to start, then check /tmp/server.log if it exists
-            await new Promise(r => setTimeout(r, 1500));
+          if (isBackground) {
+            // Strip trailing & — spawn will run detached
+            const cmd = args.command.replace(/&\s*$/, '').trim();
+            await new Promise((resolve) => {
+              const child = spawn('/bin/sh', ['-c', cmd], {
+                cwd: bashCwd,
+                env: bashEnv,
+                detached: true,
+                stdio: 'ignore',
+              });
+              child.unref();
+              // Give the process a moment to start up, then read back any log file
+              setTimeout(resolve, 1500);
+            });
             let confirmation = 'Background process started.';
             try {
               const logContent = readFileSync('/tmp/server.log', 'utf-8').trim().split('\n').slice(-3).join('\n');
@@ -1205,12 +2074,33 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
             } catch { /* no log yet */ }
             return confirmation;
           }
-          return out || '(no output)';
+          const { stdout, stderr } = await execAsync(args.command, {
+            cwd: bashCwd,
+            timeout: 120000,
+            maxBuffer: 1024 * 1024 * 2, // 2MB
+            env: bashEnv,
+          });
+          const rawOut = (stdout + stderr).trim() || '(no output)';
+          // Truncate large outputs to prevent context flooding (e.g. npm install, large file cats)
+          const MAX_BASH_OUTPUT = 3000;
+          if (rawOut.length > MAX_BASH_OUTPUT) {
+            const head = rawOut.slice(0, 500);
+            const tail = rawOut.slice(-2000);
+            return `${head}\n...(${rawOut.length - 2500} chars omitted)...\n${tail}`;
+          }
+          return rawOut;
         }
         case 'read_file': {
           const fp = this._resolvePath(args.path, workDir);
-          return readFileSync(fp, 'utf-8');
+          const fileContent = readFileSync(fp, 'utf-8');
+          const MAX_READ_OUTPUT = 8000;
+          if (fileContent.length > MAX_READ_OUTPUT) {
+            const head = fileContent.slice(0, 3000);
+            const tail = fileContent.slice(-3000);
+            return `${head}\n...(${fileContent.length - 6000} chars omitted — file is ${fileContent.length} chars total)...\n${tail}`;
+          }
+          return fileContent;
         }
         case 'write_file': {
@@ -1242,7 +2132,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
           if (target === 'browser') {
             // Navigate + screenshot via CDP on agent browser (port 9223)
-            return await this._cdpScreenshot(args.url, tmpFile);
+            return await this._cdpScreenshot(args.url, tmpFile, agentId);
           } else {
             // Full screen capture
             await execAsync(`screencapture -x "${tmpFile}"`);
@@ -1253,7 +2143,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         }
         case 'screenshot_and_describe': {
-          const result = await this._screenshotAndDescribe(args.url, args.check_for);
+          const result = await this._screenshotAndDescribe(args.url, args.check_for, agentId);
           // Always send screenshot to user — agent called this tool, user should always see it
           if (this._lastScreenshotData) {
             this.emit('agent_image', { agentId, image: this._lastScreenshotData });
@@ -1263,7 +2153,22 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         }
         case 'browser': {
-          const result = await browserAction(args);
+          // Intercept browser→screenshot_and_describe misuse — agent confused the browser action
+          // namespace with the standalone tool name. Redirect to the real vision handler so the
+          // agent gets back a text description it can reason about, not just "Image sent to chat."
+          if (args.action === 'screenshot_and_describe' || args.action === 'describe') {
+            const result = await this._screenshotAndDescribe(args.url || null, args.check_for || null, agentId);
+            if (this._lastScreenshotData) {
+              this.emit('agent_image', { agentId, image: this._lastScreenshotData });
+              this._lastScreenshotData = null;
+            }
+            return result;
+          }
+          const t0 = Date.now();
+          const result = await browserAction(args, agentId);
+          const elapsed = Date.now() - t0;
+          const resultPreview = typeof result === 'string' ? result.slice(0, 200) : (result?.__screenshot ? `[screenshot ${Math.round((result.base64?.length||0)*0.75/1024)}KB]` : JSON.stringify(result).slice(0,200));
+          console.log(`   [${agentId}] 🌐 browser(${args.action}) → ${elapsed}ms → ${resultPreview.replace(/\n/g,' ')}`);
           if (result && result.__screenshot) {
             const imgData = `data:image/png;base64,${result.base64}`;
             this.emit('agent_image', { agentId, image: imgData });
@@ -1281,81 +2186,51 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
   }
   // ─── CDP browser screenshot ───────────────────────────────────────────────
+  // Uses the persistent browserAction connection (puppeteer-core) — never raw CDP WebSocket.
+  // This reuses the existing connection to port 9223 with ad blocking already active.
-  async _cdpScreenshot(navigateUrl, tmpFile) {
-    const CDP_PORT = 9223;
-    // Always create a NEW tab — never hijack the dashboard or other existing tabs
-    const newTabRes = await fetch(`http://127.0.0.1:${CDP_PORT}/json/new`, { method: 'PUT' });
-    const newTabData = await newTabRes.json();
-    const tabId = newTabData.id;
-    return new Promise((resolve, reject) => {
-      const ws = new WebSocket(`ws://127.0.0.1:${CDP_PORT}/devtools/page/${tabId}`);
-      let msgId = 1;
-      const pending = new Map();
-      const send = (method, params = {}) => new Promise((res, rej) => {
-        const id = msgId++;
-        pending.set(id, { resolve: res, reject: rej });
-        ws.send(JSON.stringify({ id, method, params }));
-      });
-      ws.addEventListener('message', (evt) => {
-        const msg = JSON.parse(evt.data);
-        if (msg.id && pending.has(msg.id)) {
-          const { resolve: res, reject: rej } = pending.get(msg.id);
-          pending.delete(msg.id);
-          if (msg.error) rej(new Error(msg.error.message));
-          else res(msg.result);
-        }
-      });
-      ws.addEventListener('open', async () => {
-        try {
-          if (navigateUrl) {
-            await send('Page.navigate', { url: navigateUrl });
-            // Wait for page to fully render
-            await new Promise(r => setTimeout(r, 3000));
-          }
-          const { data } = await send('Page.captureScreenshot', { format: 'png' });
-          // Close the temporary tab
-          await send('Target.closeTarget', { targetId: tabId }).catch(() => {});
-          ws.close();
-          resolve(`data:image/png;base64,${data}`);
-        } catch (err) {
-          ws.close();
-          reject(err);
-        }
-      });
-      ws.addEventListener('error', (err) => reject(new Error(`CDP WebSocket error: ${err.message}`)));
-      setTimeout(() => { ws.close(); reject(new Error('CDP screenshot timeout')); }, 25000);
-    });
+  async _cdpScreenshot(navigateUrl, _tmpFile, agentId = 'agent') { // resolves a PNG data URL string; _tmpFile is not read in this body
+    if (navigateUrl) {
+      await browserAction({ action: 'navigate', url: navigateUrl }, agentId); // load the requested URL before capturing
+    } else {
+      // No navigation — page may be mid-render (e.g., after press:Enter form submit or JS SPA update)
+      // Give in-flight rendering a fixed 1.5s to settle before snapping
+      await new Promise(r => setTimeout(r, 1500));
+    }
+    const result = await browserAction({ action: 'screenshot' }, agentId);
+    if (result && result.__screenshot) { // screenshot payloads are flagged with __screenshot and carry base64 data
+      return `data:image/png;base64,${result.base64}`;
+    }
+    throw new Error('Screenshot returned no image data'); // anything else is treated as a failed capture
   }
   // ─── Screenshot + vision analysis ─────────────────────────────────────────
   // Takes a screenshot of a URL, then asks the active vision model to describe it.
   // Returns a plain-text description the main agent can reason about.
-  async _screenshotAndDescribe(url, checkFor) {
+  async _screenshotAndDescribe(url, checkFor, agentId = 'agent') {
     const question = checkFor
-      ? `Does this web page look like it's working? Specifically check: ${checkFor}. Describe precisely what you see — the background color, any canvas element, colored shapes (even tiny dots), text, buttons, game elements, or error messages. Is the background dark or white? Are there any colored pixels at all?`
-      : `Describe what you see on this web page. What is the background color? Are there any colored shapes, text, buttons, or UI elements? Is there a canvas? Even tiny colored dots count — be precise about what you see.`;
+      ? `Look at this web page and specifically find: ${checkFor}. List exactly what you see — exact text, numbers, titles, labels, counts. CRITICAL: Preserve ALL spaces between words exactly as they appear — never merge adjacent words or labels together without a space between them. Also note the background color, any canvas element, or visual errors.`
+      : `Describe this web page in full. List ALL visible text content: headlines, titles, labels, numbers, post titles, scores, counts — copy them exactly as shown. CRITICAL: Preserve ALL spaces between words — never concatenate adjacent text elements without a space. If two pieces of text appear next to each other (e.g. a label like "Posted" next to a value like "22 hr. ago"), always write them with a space between them. Then describe the visual layout: background color, UI elements, canvas, any errors.`;
-    // === Server reachability check — fast fail if server is down ===
-    try {
-      await fetch(url, { signal: AbortSignal.timeout(4000) });
-    } catch (reachErr) {
-      const portMatch = url.match(/:(\d+)/);
-      const port = portMatch ? portMatch[1] : '?';
-      return `SERVER IS NOT REACHABLE at ${url} (${reachErr.message}). The server on port ${port} is not running or crashed. You must restart it using bash before taking a screenshot:\n{"name":"bash","arguments":{"command":"pkill -f 'node.*${port}' 2>/dev/null; sleep 1; cd YOUR_PROJECT_DIR && nohup node server.js > /tmp/server.log 2>&1 & sleep 2 && echo started"}}\nCheck /tmp/server.log for errors if it still fails.`;
+    // === Server reachability check — only for local dev servers ===
+    // Skipped when url is null (current browser tab) or a public site.
+    const isLocalUrl = url && (url.includes('localhost') || url.includes('127.0.0.1') || url.match(/:\d{4,5}/));
+    if (isLocalUrl) {
+      try {
+        await fetch(url, { signal: AbortSignal.timeout(4000) });
+      } catch (reachErr) {
+        const portMatch = url.match(/:(\d+)/);
+        const port = portMatch ? portMatch[1] : '?';
+        return `SERVER IS NOT REACHABLE at ${url} (${reachErr.message}). The server on port ${port} is not running or crashed. You must restart it using bash before taking a screenshot:\n{"name":"bash","arguments":{"command":"pkill -f 'node.*${port}' 2>/dev/null; sleep 1; cd YOUR_PROJECT_DIR && nohup node server.js > /tmp/server.log 2>&1 & sleep 2 && echo started"}}\nCheck /tmp/server.log for errors if it still fails.`;
+      }
     }
-    // === HTML dependency audit (always runs — fast, reliable) ===
+    // === HTML dependency audit — only for local dev servers ===
     // Fetches the page HTML and checks for common missing client-side dependencies.
-    // This catches issues that screenshots can't detect (JS errors, missing script tags).
+    // Skipped for external sites (useless) and null url (current tab).
     let auditNotes = '';
+    if (isLocalUrl) {
     try {
       const htmlRes = await fetch(url, { signal: AbortSignal.timeout(8000) });
       const html = await htmlRes.text();
@@ -1375,46 +2250,31 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
         auditNotes = `\n\nHTML DEPENDENCY AUDIT FOUND ISSUES:\n${missing.map(m => '- ' + m).join('\n')}`;
       }
     } catch {}
+    } // end isLocalUrl audit block
+    // === DOM snapshot (when no URL — current page, or file:// URL) ===
+    // Captures all page text regardless of scroll position. Appended alongside the vision
+    // result so the agent always gets DOM content even when results are below the fold.
+    // Also runs for file:// URLs: vision models sometimes misidentify form inputs or static
+    // elements — the DOM snapshot provides ground-truth element types and values alongside
+    // the visual description so the agent can cross-reference and avoid false rewrites.
+    let domSnapshot = '';
+    if (!url || url.startsWith('file://')) {
+      try {
+        const snap = await browserAction({ action: 'snapshot' }, agentId);
+        if (typeof snap === 'string' && snap.length > 200) {
+          domSnapshot = `\n\n--- DOM snapshot (actual element types and values — use this to verify what is really on the page, not just what it looks like) ---\n${snap}`;
+        }
+      } catch {}
+    }
     let imageData;
-    const tmpFile = `/tmp/af_verify_${Date.now()}.png`;
-    // Try AgentForge browser via CDP first
+    // Use the AgentForge browser via browserAction (persistent puppeteer connection, ad blocking active)
     try {
-      imageData = await this._cdpScreenshot(url, null);
-    } catch (cdpErr) {
-      // CDP not available — try puppeteer headless screenshot
-      try {
-        const puppeteerModule = process.env.HOME + '/.npm-global/lib/node_modules/puppeteer';
-        const scriptFile = `/tmp/af_pup_${Date.now()}.js`;
-        const nodeScript = `
-const puppeteer = require(${JSON.stringify(puppeteerModule)});
-(async () => {
-  const browser = await puppeteer.launch({headless: true, protocolTimeout: 30000, args: ['--no-sandbox','--disable-setuid-sandbox','--disable-gpu','--disable-dev-shm-usage']});
-  const page = await browser.newPage();
-  await page.setDefaultNavigationTimeout(12000);
-  await page.setViewport({width: 1280, height: 900});
-  try {
-    await page.goto(${JSON.stringify(url)}, {waitUntil: 'domcontentloaded', timeout: 12000}).catch(()=>{});
-    await new Promise(r => setTimeout(r, 2500));
-    await page.screenshot({path: ${JSON.stringify(tmpFile)}, fullPage: true});
-    console.log('puppeteer screenshot ok');
-  } finally {
-    await browser.close();
-  }
-})().then(() => process.exit(0)).catch(e => { console.error(e.message); process.exit(1); });
-`;
-        writeFileSync(scriptFile, nodeScript);
-        await execAsync(`/usr/local/bin/node "${scriptFile}"`, { timeout: 45000 });
-        await execAsync(`rm -f "${scriptFile}"`).catch(() => {});
-        const raw = readFileSync(tmpFile).toString('base64');
-        await execAsync(`rm -f "${tmpFile}"`).catch(() => {});
-        imageData = `data:image/png;base64,${raw}`;
-      } catch (pupErr) {
-        console.warn(`   [screenshot_and_describe] puppeteer failed: ${pupErr.message}`);
-        // No screenshot possible — return audit notes only
-        return `Cannot take screenshot (CDP: ${cdpErr.message}, puppeteer: ${pupErr.message}). ${auditNotes || 'No dependency issues found in HTML. Check server logs for errors.'}`;
-      }
+      imageData = await this._cdpScreenshot(url, null, agentId);
+    } catch (err) {
+      return `Cannot take screenshot: ${err.message}. Is the AgentForge Browser running?${auditNotes}${domSnapshot}`;
     }
     // Store imageData so caller can emit to user if send_to_user=true
@@ -1422,39 +2282,95 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
     const base64 = imageData.replace(/^data:image\/\w+;base64,/, '');
-    // Use the active model for vision analysis.
-    try {
-      // /api/chat with images array — supported by all Ollama vision-capable models
-      const res = await fetch(`${this.baseUrl}/api/chat`, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-          model: this.model,
-          messages: [{ role: 'user', content: question, images: [base64] }],
-          stream: false,
-          options: { num_ctx: 4096 }
-        }),
-        signal: AbortSignal.timeout(120000)
-      });
+    // Resolve vision backend: use task-level vision model if configured (from modelflow),
+    // otherwise fall back to the agent's primary Ollama model.
+    const taskVisionModel = this._taskVisionModel;
+    const taskGeminiKey = this._taskProviderKeys?.google || null;
+    const isGemini = taskVisionModel && (taskVisionModel.startsWith('google/') || taskVisionModel.startsWith('gemini-'));
+    if (isGemini && taskGeminiKey) {
+      // ── Gemini vision via Google AI REST API ──────────────────────────────
+      // Model ID from flow is like "google/gemini-2.5-flash" → strip "google/" prefix
+      const geminiModel = taskVisionModel.startsWith('google/') ? taskVisionModel.slice(7) : taskVisionModel;
+      console.log(`   [screenshot_and_describe] Using Gemini vision: ${geminiModel}`);
+      try {
+        const geminiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent?key=${taskGeminiKey}`;
+        const res = await fetch(geminiUrl, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            contents: [{
+              parts: [
+                { text: question },
+                { inline_data: { mime_type: 'image/png', data: base64 } }
+              ]
+            }],
+            generationConfig: { maxOutputTokens: 1024 }
+          }),
+          signal: AbortSignal.timeout(30000)
+        });
+        if (res.ok) {
+          const json = await res.json();
+          const description = json.candidates?.[0]?.content?.parts?.[0]?.text || '';
+          const clean = description.trim();
+          if (clean) {
+            console.log(`   [screenshot_and_describe] Gemini: ${clean.slice(0, 200)}`);
+            return `Screenshot analysis of ${url || 'current page'}:\n${clean}${auditNotes}${domSnapshot}`;
+          }
+        } else {
+          const errText = await res.text().catch(() => '');
+          console.warn(`   [screenshot_and_describe] Gemini error ${res.status}: ${errText.slice(0, 200)}`);
+        }
+      } catch (err) {
+        console.warn(`   [screenshot_and_describe] Gemini vision call failed: ${err.message}`);
+      }
+    } else {
+      // ── Ollama vision (default) ───────────────────────────────────────────
+      try {
+        const res = await fetch(`${this.baseUrl}/api/chat`, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({
+            model: this.model,
+            messages: [{ role: 'user', content: question, images: [base64] }],
+            stream: false,
+            options: { num_ctx: 4096 }
+          }),
+          signal: AbortSignal.timeout(120000)
+        });
-      if (res.ok) {
-        const json = await res.json();
-        const description = json.message?.content || json.response || '';
-        const clean = description.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
-        if (clean) {
-          console.log(`   [screenshot_and_describe] ${clean.slice(0, 200)}`);
-          return `Screenshot analysis of ${url}:\n${clean}${auditNotes}`;
+        if (res.ok) {
+          const json = await res.json();
+          const description = json.message?.content || json.response || '';
+          const clean = description.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
+          if (clean) {
+            console.log(`   [screenshot_and_describe] ${clean.slice(0, 200)}`);
+            return `Screenshot analysis of ${url || 'current page'}:\n${clean}${auditNotes}${domSnapshot}`;
+          }
         }
+      } catch (err) {
+        console.warn(`   [screenshot_and_describe] vision call failed: ${err.message}`);
       }
-    } catch (err) {
-      console.warn(`   [screenshot_and_describe] vision call failed: ${err.message}`);
     }
-    return `Screenshot captured but description unavailable. The app is visible at ${url} — use read_file to check the code and make targeted improvements.${auditNotes}`;
+    return `Screenshot captured but description unavailable. The app is visible at ${url} — use read_file to check the code and make targeted improvements.${auditNotes}${domSnapshot}`;
   }
   _resolvePath(p, workDir) {
-    return path.isAbsolute(p) ? p : path.join(workDir, p);
+    // Returns the path to use for tool input p: "~" is expanded, relative paths are joined
+    // onto workDir, and paths directly under "/" are redirected into workDir (kept by name).
+    // path.isAbsolute('~/foo') === false, so "~" is expanded first — otherwise the result
+    // would be <workDir>/~/foo. slice() (not replace()) keeps "$" in the home dir literal.
+    const expanded = p === '~' || p.startsWith('~/') ? homedir() + p.slice(1) : p;
+    if (!path.isAbsolute(expanded)) return path.join(workDir, expanded);
+    // The filesystem root is treated as read-only, so such files are placed in workDir instead.
+    // Normalizing first also catches "//x", "/./x" and "/a/../x", which resolve to the root too.
+    const normalized = path.normalize(expanded);
+    if (path.dirname(normalized) === '/') {
+      console.log(`   [worker] ⚠️ Path "${expanded}" is at filesystem root — redirecting to ${workDir}`);
+      return path.join(workDir, path.basename(normalized));
+    }
+    return expanded;
   }
   _toolDesc(name, args) {
@@ -1472,6 +2388,22 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
       }
       case 'take_screenshot':
         return `Screenshot: ${args.url || args.target}`;
+      case 'browser': {
+        const action = args.action || 'browser';
+        if (action === 'navigate' || action === 'open') {
+          try { return `browser → ${new URL(args.url).hostname}`; } catch { return `browser → navigate`; }
+        }
+        if (action === 'snapshot') return 'browser → snapshot page';
+        if (action === 'screenshot') return 'browser → screenshot';
+        if (action === 'click') return `browser → click "${(args.text || args.selector || '').toString().slice(0, 40)}"`;
+        if (action === 'type') return `browser → type into ${(args.selector || 'input').toString().slice(0, 40)}`;
+        if (action === 'tabs') return 'browser → list tabs';
+        if (action === 'evaluate') return 'browser → run JS';
+        if (action === 'scroll') return 'browser → scroll';
+        if (action === 'find_elements') return 'browser → find elements';
+        if (action === 'get_bookmarks') return 'browser → get bookmarks';
+        return `browser → ${action}`;
+      }
       default:
         return name;
     }
@@ -1522,7 +2454,7 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
           model: this.model,
           messages: [
             { role: 'system', content: 'You determine if a task is complete. Reply with only "yes" or "no".' },
-            { role: 'user', content: `Task: ${task.slice(0, 300)}\n\nAgent output: ${output.slice(0, 600)}\n\nDid the agent fully complete the task with real results (not excuses, not plans, not partial attempts)?` }
+            { role: 'user', content: `Task: ${task.slice(0, 400)}\n\nAgent output (last part):\n${output.slice(-800)}\n\nDid the agent complete ALL requirements of the task? Judge based on evidence of completed actions (files written, commands run, results returned) — NOT based on the agent's own statements about what it can or cannot do. Agent self-assessments and disclaimers are unreliable.\n- For build/server tasks: code must be written AND server must be running locally. Do NOT require cloud deployment (Railway/Vercel/Render/etc.) unless the task explicitly says to deploy or host publicly.\n- For tasks that explicitly mention deploying to Railway/Vercel/Render/Netlify/fly.io/Heroku: there MUST be a live public URL in the output.\n- For research/Q&A tasks: specific facts must be present.\nAnswer "yes" only if ALL stated requirements are done. Answer "no" if ANY required step is missing.` }
           ],
           stream: false,
           think: false,