@hamp10/agentforge 0.2.21 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/agentforge.js +909 -115
- package/package.json +2 -1
- package/scripts/check-task-semantics.js +911 -0
- package/scripts/postinstall.js +20 -5
- package/src/OllamaAgent.js +1178 -246
- package/src/OpenClawCLI.js +5897 -748
- package/src/browser.js +392 -0
- package/src/default-task-guides.js +95 -0
- package/src/resolveOpenclaw.js +38 -7
- package/src/selfUpdate.js +31 -3
- package/src/supervisor.js +88 -20
- package/src/taskSemantics.js +141 -0
- package/src/worker.js +4257 -230
- package/templates/agent/AGENTFORGE.md +151 -53
- package/templates/hooks/agentforge-platform/handler.js +322 -0
- package/src/HampAgentCLI.js +0 -125
- package/src/hampagent/browser.js +0 -321
- package/src/hampagent/runner.js +0 -277
- package/src/hampagent/sessions.js +0 -62
- package/src/hampagent/tools.js +0 -298
package/src/OllamaAgent.js
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import { exec } from 'child_process';
|
|
1
|
+
import { exec, spawn } from 'child_process';
|
|
2
2
|
import { mkdirSync, writeFileSync, readFileSync, existsSync, readdirSync, statSync, appendFileSync } from 'fs';
|
|
3
3
|
import { EventEmitter } from 'events';
|
|
4
4
|
import path from 'path';
|
|
5
|
+
import { homedir } from 'os';
|
|
5
6
|
import { promisify } from 'util';
|
|
6
7
|
import { fileURLToPath } from 'url';
|
|
7
|
-
import { browserAction } from './
|
|
8
|
+
import { browserAction, releaseAgentTab } from './browser.js';
|
|
8
9
|
|
|
9
10
|
const execAsync = promisify(exec);
|
|
10
11
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
@@ -136,8 +137,13 @@ function _parseWriteFileFences(content) {
|
|
|
136
137
|
const re = /(?:WRITE_FILE|write_file)[:\s]+([^\n]+)\n```[^\n]*\n([\s\S]*?)```/gi;
|
|
137
138
|
let m;
|
|
138
139
|
while ((m = re.exec(content)) !== null) {
|
|
139
|
-
const filePath = m[1].trim();
|
|
140
|
+
const filePath = m[1].trim().replace(/\]$/, ''); // strip trailing ] if model used [write_file: /path] bracket notation
|
|
140
141
|
const fileContent = m[2]; // raw content, no unescaping needed
|
|
142
|
+
// Reject compaction placeholders — model echoed the summary as content
|
|
143
|
+
if (/^\[wrote:/.test(fileContent.trim()) || /^\(\d+ chars, \d+ lines —/.test(fileContent.trim())) {
|
|
144
|
+
console.log(` ⚠️ WRITE_FILE skipped: content is a compaction placeholder, not real file content (${filePath})`);
|
|
145
|
+
continue;
|
|
146
|
+
}
|
|
141
147
|
if (filePath && fileContent !== undefined) {
|
|
142
148
|
calls.push({ name: 'write_file', arguments: { path: filePath, content: fileContent } });
|
|
143
149
|
}
|
|
@@ -261,7 +267,10 @@ function _parseTextToolCalls(content) {
|
|
|
261
267
|
if (depth === 0 && jsonStr.trim()) break;
|
|
262
268
|
}
|
|
263
269
|
try {
|
|
264
|
-
|
|
270
|
+
// Strip Gemma4 model artifacts that can appear after a complete JSON object:
|
|
271
|
+
// <tool_call|>, <|end_of_turn|>, <|end|>, etc.
|
|
272
|
+
const cleanJson = jsonStr.trim().replace(/<[^>]*>$/g, '').trimEnd();
|
|
273
|
+
const obj = JSON.parse(cleanJson);
|
|
265
274
|
if (Array.isArray(obj)) {
|
|
266
275
|
for (const item of obj) {
|
|
267
276
|
const call = normalise(item);
|
|
@@ -334,15 +343,27 @@ export class OllamaAgent extends EventEmitter {
|
|
|
334
343
|
return { agentId, workDir };
|
|
335
344
|
}
|
|
336
345
|
|
|
337
|
-
async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, actualWorkDir = null, agentModel = null, customSystemPrompt = null, conversationHistory = null) {
|
|
346
|
+
async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, actualWorkDir = null, agentModel = null, customSystemPrompt = null, conversationHistory = null, allImages = null, visionModel = null, providerKeys = null) {
|
|
338
347
|
const startTime = Date.now();
|
|
339
348
|
const controller = new AbortController();
|
|
340
349
|
|
|
341
350
|
// Use per-agent model override if provided (and not the placeholder 'Default').
|
|
342
351
|
// Strip 'ollama/' prefix — catalog returns IDs like 'ollama/modelname:tag' but
|
|
343
352
|
// Ollama's API expects bare names like 'modelname:tag'.
|
|
353
|
+
// Cloud model IDs (google/..., anthropic/..., openai/...) are not valid Ollama names —
|
|
354
|
+
// fall back to the configured local model so a mismatch doesn't crash the task.
|
|
344
355
|
const rawModel = (agentModel && agentModel !== 'Default') ? agentModel : this.model;
|
|
345
|
-
const
|
|
356
|
+
const isCloudModel = /^(google|anthropic|openai|mistral|cohere|azure)\//i.test(rawModel);
|
|
357
|
+
const effectiveModel = isCloudModel ? this.model : (rawModel.startsWith('ollama/') ? rawModel.slice(7) : rawModel);
|
|
358
|
+
if (isCloudModel) console.log(` [${agentId}] ⚠️ Cloud model ID "${rawModel}" ignored by local runner — using ${effectiveModel}`);
|
|
359
|
+
|
|
360
|
+
// Store per-task vision settings — used by _screenshotAndDescribe during this task
|
|
361
|
+
// Cleared at the end of the task so stale keys don't leak between tasks
|
|
362
|
+
this._taskVisionModel = visionModel || null;
|
|
363
|
+
this._taskProviderKeys = providerKeys || null;
|
|
364
|
+
const googleKey = providerKeys?.google || null;
|
|
365
|
+
if (visionModel) console.log(` [${agentId}] 👁️ Vision model: ${visionModel} (google key: ${googleKey ? 'present' : 'MISSING'})`);
|
|
366
|
+
else console.log(` [${agentId}] 👁️ Vision: Ollama default (no vision_model in flow config)`);
|
|
346
367
|
|
|
347
368
|
// Fake proc-like object so worker.js pid checks don't crash
|
|
348
369
|
const fakeProc = { pid: null };
|
|
@@ -356,7 +377,7 @@ export class OllamaAgent extends EventEmitter {
|
|
|
356
377
|
// Load conversation history — prefer Railway DB history (sent via task payload, works across
|
|
357
378
|
// any machine/user/model). Fall back to local file for offline or pre-fix sessions.
|
|
358
379
|
const history = (conversationHistory && conversationHistory.length > 0)
|
|
359
|
-
? conversationHistory.slice(-
|
|
380
|
+
? conversationHistory.slice(-60)
|
|
360
381
|
: this._loadHistory(agentId, workDir, sessionId);
|
|
361
382
|
|
|
362
383
|
// Text-based tool format is used rather than XML schemas — more reliable across models.
|
|
@@ -364,6 +385,27 @@ export class OllamaAgent extends EventEmitter {
|
|
|
364
385
|
// ALL models get the same rule set and tool format — no model-specific branching.
|
|
365
386
|
const homeDir = process.env.HOME || '/tmp';
|
|
366
387
|
const projectsDir = `${homeDir}/Desktop/Projects`;
|
|
388
|
+
|
|
389
|
+
// ── Per-agent port assignment ──────────────────────────────────────────
|
|
390
|
+
// Each agent gets a deterministic port in range 3100-59099 derived from its ID
|
|
391
|
+
// (56000-port space; the offset is the last five digits of the ID modulo 56000, so distinct agent IDs can still map to the same starting port).
|
|
392
|
+
// Port 3000 is reserved for agent_dashboard. At task start, any stale process
|
|
393
|
+
// on the assigned port is killed. If the port is still occupied by a live
|
|
394
|
+
// unrelated process, we walk up until we find a free one.
|
|
395
|
+
const agentPortOffset = parseInt(agentId.replace(/\D/g, '').slice(-5) || '0') % 56000;
|
|
396
|
+
let assignedPort = 3100 + agentPortOffset;
|
|
397
|
+
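// Kill whatever is on the assigned port — presumed to be a stale server from a previous run of this agent, but the command does not check ownership. NOTE(review): lsof -i:PORT also matches client connections to that port; consider -sTCP:LISTEN.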
|
|
398
|
+
try { await execAsync(`lsof -t -i:${assignedPort} | xargs kill -9 2>/dev/null || true`); } catch {}
|
|
399
|
+
// If something else is still on that port, scan upward for a free one
|
|
400
|
+
for (let attempts = 0; attempts < 100; attempts++) {
|
|
401
|
+
try {
|
|
402
|
+
const { stdout } = await execAsync(`lsof -t -i:${assignedPort} 2>/dev/null || true`);
|
|
403
|
+
if (!stdout.trim()) break; // port is free
|
|
404
|
+
assignedPort++;
|
|
405
|
+
if (assignedPort > 59099) assignedPort = 3100;
|
|
406
|
+
} catch { break; }
|
|
407
|
+
}
|
|
408
|
+
console.log(` [${agentId}] 🔌 Assigned port: ${assignedPort}`);
|
|
367
409
|
const universalRules = `
|
|
368
410
|
== WHAT YOU CAN DO ==
|
|
369
411
|
You have these tools:
|
|
@@ -373,7 +415,7 @@ read_file: Read a local file.
|
|
|
373
415
|
WRITE_FILE: Write a local file (code-fence format only).
|
|
374
416
|
list_directory: List a local directory.
|
|
375
417
|
web_fetch: Fetch any public URL — websites, APIs, docs, raw data. Fast, text-only.
|
|
376
|
-
screenshot_and_describe:
|
|
418
|
+
screenshot_and_describe: Take a screenshot and analyze it with vision so YOU can see and reason about what's on screen. Use this when: pages are JS-heavy, snapshot gives partial/empty data, you need to read numbers/text that aren't in the DOM, or you want to verify what's actually visible. Pass url to navigate first, or omit url to screenshot the current browser tab. Returns a text description YOU can reason about — this is NOT just for the user, it is how YOU SEE THE PAGE.
|
|
377
419
|
browser: Control the AgentForge Browser directly (Chrome, always running, logged into user's services). Use for ALL browser interaction — navigating, clicking, typing, reading page content, screenshots.
|
|
378
420
|
|
|
379
421
|
BROWSER TOOL — use this instead of writing CDP scripts:
|
|
@@ -385,7 +427,7 @@ BROWSER TOOL — use this instead of writing CDP scripts:
|
|
|
385
427
|
{"name":"browser","arguments":{"action":"click","text":"Show Filter"}} ← click element by visible text
|
|
386
428
|
{"name":"browser","arguments":{"action":"click","selector":"#filter-btn"}} ← click by CSS selector
|
|
387
429
|
{"name":"browser","arguments":{"action":"type","selector":"input","text":"hello"}} ← type text
|
|
388
|
-
{"name":"browser","arguments":{"action":"screenshot"}} ←
|
|
430
|
+
{"name":"browser","arguments":{"action":"screenshot"}} ← sends screenshot to user (YOU cannot see it — use screenshot_and_describe to see the page yourself)
|
|
389
431
|
{"name":"browser","arguments":{"action":"evaluate","script":"document.title"}} ← run JS
|
|
390
432
|
{"name":"browser","arguments":{"action":"scroll","y":400}} ← scroll down
|
|
391
433
|
|
|
@@ -398,46 +440,181 @@ WORKFLOW when user says "the tab is already open":
|
|
|
398
440
|
The browser has the user's sessions and cookies. You CAN click any button, filter, or link visible on the page.
|
|
399
441
|
|
|
400
442
|
== GENERAL RULES (all tasks) ==
|
|
401
|
-
G1. IDENTIFY THE TASK TYPE
|
|
402
|
-
|
|
443
|
+
G1. IDENTIFY THE TASK TYPE FIRST:
|
|
444
|
+
- CONVERSATIONAL/QUESTION (asking for names, opinions, definitions, advice, comparisons, brainstorming): Answer in text. NO tools. Do NOT use browser, bash, screenshot, or any tool. Match the depth of your response to the complexity of the question — a simple factual question gets a concise answer, an open-ended or creative question gets a full, substantive response with reasoning.
|
|
445
|
+
- RESEARCH (look something up online): Use web_fetch or browser to find info, then answer in text.
|
|
446
|
+
- BUILD (create an app, game, script, file): Use bash, WRITE_FILE, browser as needed.
|
|
447
|
+
- BROWSER TASK (interact with a website): Use browser tools.
|
|
448
|
+
G2. START IMMEDIATELY. No intro text, no plans, no asking permission. First output = first tool call or direct answer. DO NOT repeat the user's question or task back to them — just respond.
|
|
403
449
|
G3. ANY WEBSITE/URL IS ACCESSIBLE. User mentions a site or open tab? Use browser snapshot to see what's currently open, then browser navigate/click/type to interact. Never ask "what's the URL?" — find it yourself.
|
|
404
450
|
G4. NEVER ASK PERMISSION. Never say "should I use X or Y?" — pick the right tool and use it.
|
|
405
|
-
|
|
451
|
+
G4a. STOP WHEN DONE. After completing the task, STOP. Do NOT add meta-commentary about your capabilities, limitations, or what information you don't have. Do NOT explain what you cannot do. Answer and stop.
|
|
452
|
+
G4b. FORMATTING: Use **bold** for section labels and emphasis. Do NOT use markdown headers (# ## ### ####) — use **bold** instead. For bullet lists, ALWAYS write "- item" (dash + space + text). NEVER write "*item" (asterisk directly before text with no space) — that is not valid markdown and shows as a raw asterisk.
|
|
453
|
+
G5. IF A TOOL FAILS: Try a different approach. Browser snapshot empty? → try web_fetch on the same URL. web_fetch empty? → try screenshot_and_describe. NEVER repeat a failing call more than twice with different selectors — take a snapshot to see what's actually on the page. IF WEB BROWSING FAILS REPEATEDLY: fall back to web_fetch on the site's URL, or try a different URL entirely. NEVER write files, build code, or start a server as a fallback for web research — stay in browser/web_fetch tools until you have the data.
|
|
454
|
+
G5a. BROWSER FORM SUBMISSION: After typing into a search/input field, ALWAYS submit with {"action":"press","key":"Enter","selector":"<same-selector-you-typed-into>"} — pass the selector of the field you just typed in so Enter fires in the right element. NEVER try to click submit/compute/search buttons by ref, text, or selector. Buttons shift, break, or trigger ads. Enter always works.
|
|
455
|
+
G5b. BROWSER INTERACTION RULE: After navigating to a page, ALWAYS take a snapshot FIRST to see real element text, IDs, and indices before attempting to click or type. Do NOT guess selectors from memory — selectors change. Snapshot → read elements → interact.
|
|
456
|
+
G5c. READING PAGE CONTENT: For reading text on a page (titles, scores, prices, numbers), use browser → snapshot — it returns all DOM text fast. Use screenshot_and_describe only when you need to visually verify something rendered (canvas, image, CSS layout) OR when snapshot body text is under 200 chars (JS-heavy page, results not yet in DOM). When using screenshot_and_describe to find specific data, ALWAYS pass check_for with exactly what you need.
|
|
406
457
|
G6. RESEARCH TASKS: web_fetch → read → reason → respond in text. No server, no localhost.
|
|
407
|
-
G7. NEVER INVENT TASKS. Do exactly what was asked. Do not build a web app when asked to analyze data.
|
|
458
|
+
G7. NEVER INVENT TASKS. Do exactly what was asked. Do not build a web app when asked to analyze data. Do not write files when asked to look up information. Do not start coding when the task is browsing.
|
|
408
459
|
G8. WHEN GENUINELY STUCK: State what you tried, what failed, ask ONE specific question.
|
|
409
460
|
G9. KEEP GOING until the task is fully complete.
|
|
410
461
|
|
|
411
462
|
== BUILD RULES (only when building apps/games/tools) ==
|
|
412
|
-
|
|
463
|
+
B0. STATIC-FILE TASKS (saves to a local path, no deployment/hosting mentioned): If the task says "save to ~/some/path.html" or "create a file at ~/some/path" and does NOT mention serving, hosting, or deploying — just WRITE_FILE to that exact path, then open it with {"name":"browser","arguments":{"action":"navigate","url":"file:///abs/path/index.html"}} and screenshot to visually verify. Do NOT spin up a server, do NOT run npm init, do NOT install packages. Pure HTML/CSS/JS files run directly in browsers via file:// URLs — no server needed.
|
|
464
|
+
B1. PROJECT LOCATION: Always put projects in ${projectsDir}/PROJECT_NAME/ (no spaces — use underscores). NEVER create directories or write project files under /tmp/agentforge/ — that path is platform-managed. Your Working directory (${workDir}) is only for tool execution context, NOT for storing project files.
|
|
413
465
|
B2. WRITE EVERY FILE COMPLETELY — no stubs, no placeholders, no TODOs. Full working code only.
|
|
466
|
+
B2a. NEVER use echo or cat to append code line-by-line (e.g. echo 'code' >> file.js). Always use WRITE_FILE with the COMPLETE file content in one call. Appending one line per bash call wastes 100 turns to write what one WRITE_FILE does instantly.
|
|
414
467
|
B3. BUILD FILE BY FILE — write each file completely before writing the next.
|
|
415
468
|
B4. ALWAYS use absolute paths.
|
|
416
|
-
B5. SERVING FILES: Node.js server: nohup /usr/local/bin/node /abs/path/server.js > /tmp/server.log 2>&1 & — NEVER blocking. Pure HTML/JS (no backend): nohup python3 -m http.server
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
469
|
+
B5. SERVING FILES: Node.js server MUST cd into the project dir first — ALWAYS use this exact pattern: cd /abs/project/path && nohup /usr/local/bin/node /abs/project/path/server.js > /tmp/server.log 2>&1 & — NEVER use a bare filename like "nohup node server.js" without cd, or Node will look for server.js in the wrong directory and crash. NEVER blocking. Pure HTML/JS (no backend): nohup python3 -m http.server ${assignedPort} --directory /abs/path/ > /tmp/server.log 2>&1 &
|
|
470
|
+
B5b. STOPPING A SERVER: NEVER use "pkill -f node" — it kills the platform itself. To stop a running server: kill $(lsof -ti:PORT) 2>/dev/null || true
|
|
471
|
+
B6. npm install: ALL npm commands MUST be in ONE bash call with cd: {"name":"bash","arguments":{"command":"cd /abs/project/path && /usr/local/bin/npm init -y && /usr/local/bin/npm install express"}} — NEVER run npm init or npm install as a separate bash call without cd, or packages install in the wrong directory and the server will crash with "Cannot find module".
|
|
472
|
+
B7. After starting server, verify using the ACTUAL PORT the server is listening on (not the assigned port): sleep 3 && curl -s -o /dev/null -w '%{http_code}' http://localhost:ACTUAL_PORT
|
|
473
|
+
- If 000: server crashed. Read /tmp/server.log, fix the error, restart server (kill $(lsof -ti:PORT) 2>/dev/null || true && cd /abs/project/path && nohup /usr/local/bin/node /abs/path/server.js > /tmp/server.log 2>&1 &), then curl again.
|
|
474
|
+
- If 404: server is running but missing a file. Read /tmp/server.log — if you see "ENOENT" for public/index.html, that HTML file was NOT written. Write it immediately, then curl again. Do NOT rewrite server.js for a 404.
|
|
475
|
+
- If 200: server is up. Proceed to B10 screenshot QA.
|
|
476
|
+
After fixing any error, ALWAYS restart the server AND re-verify with curl before proceeding.
|
|
477
|
+
B8. PORT RULE: If the user's task explicitly specifies a port number, use that exact port everywhere — in server.js, in the verification curl, everywhere. If no port is specified, use your ASSIGNED PORT ${assignedPort}. In server.js: const PORT = process.env.PORT || YOUR_CHOSEN_PORT; Never use port 3000 (reserved by system).
|
|
420
478
|
B9. EXPRESS WILDCARD ROUTE: NEVER write app.get('*', ...) — crashes in newer versions. Use app.use((req, res) => { ... }) instead.
|
|
421
|
-
|
|
422
|
-
|
|
479
|
+
B10a. STATIC FILE PATHS: ALWAYS use path.join(__dirname, 'public') for express.static — NEVER './public' or 'public'. For res.sendFile on the root route: ALWAYS path.join(__dirname, 'public', 'index.html') — NEVER path.join(__dirname, 'index.html'). Relative paths break under nohup.
|
|
480
|
+
B10b. server.js IS FOR LOGIC ONLY — NEVER EMBED HTML: All HTML belongs in public/index.html. Route handlers must NOT contain template literals with HTML (backtick strings with <div>, <h1>, etc.) — these cause SyntaxErrors. server.js should only have: require/import, middleware, JSON API routes, express.static, and app.listen. Anything visual goes in public/.
|
|
481
|
+
B10. MANDATORY SCREENSHOT QA — KEEP ITERATING UNTIL THE DESIGN PASSES:
|
|
482
|
+
After curl returns 200, call screenshot_and_describe(url:"http://localhost:PORT", send_to_user:true).
|
|
483
|
+
Evaluate against these pass/fail criteria. If ANY fail, fix immediately and screenshot again:
|
|
484
|
+
✗ FAIL: Plain/unstyled HTML — no colors, raw browser defaults, looks like a text document
|
|
485
|
+
✗ FAIL: Text barely visible or poor contrast against the background
|
|
486
|
+
✗ FAIL: Layout broken, elements overlapping, or content spilling outside containers
|
|
487
|
+
✗ FAIL: Buttons are plain gray browser defaults — unstyled
|
|
488
|
+
✗ FAIL: Inputs are plain white browser defaults — unstyled
|
|
489
|
+
✗ FAIL: No consistent color theme applied throughout
|
|
490
|
+
✓ PASS: All of the above are satisfied — consistent theme, readable text, styled controls, proper layout
|
|
491
|
+
Stop only when ALL criteria pass. There is no fixed iteration count — stop when it genuinely looks good, whether that takes 1 screenshot or 10. Do NOT stop just because the server is running.
|
|
492
|
+
B11. CSS DESIGN STANDARDS — apply from the start, before any screenshot:
|
|
493
|
+
Use a dark background (#1a1a2e or #0d1117 or similar), white/light text, colored accents (#00b4d8, #4ade80, #f472b6, etc.). Style ALL inputs and buttons — no raw browser defaults. Use border-radius, padding, box-shadow, and flex/grid layout. Minimum: background gradient or solid dark color, styled form inputs (border: 1px solid #444, bg: #1e1e2e, color: #fff), primary buttons with colored background. The first version should already look good — not a plain HTML skeleton.
|
|
423
494
|
B12. CANVAS GAMES: canvas 800×600, dark background #1a1a2e, all elements clearly visible. Dark theme, styled UI.
|
|
424
495
|
B13. OBSERVE BEFORE FIXING: Screenshot first, then make targeted edits. Never rewrite an entire file from scratch when the server is running.
|
|
425
496
|
B14. TARGETED EDITS: read_file to see current code, write_file only the changed section. Never throw away working code.
|
|
426
497
|
B15. QUALITY LOOP: After each fix, screenshot again to verify. Iterate until it looks correct.
|
|
427
|
-
B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different states. Not just the header
|
|
498
|
+
B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different states. Not just the header.
|
|
499
|
+
B17. AFTER DEPLOYING: Once a deployment command succeeds, immediately run the platform's URL command (e.g. railway domain, vercel --prod, netlify open:deploy, fly status) to get the live public URL. Your final message MUST include the full URL so the user can open it.`;
|
|
428
500
|
// Text-based tool format works reliably across all local models.
|
|
429
501
|
// WRITE_FILE uses code-fence to avoid JSON-escaping issues; all other tools use JSON.
|
|
430
|
-
const jsonToolFormat = `You are an AI agent. Working directory: ${workDir}\n\
|
|
502
|
+
const jsonToolFormat = `You are an AI agent. Working directory: ${workDir}\n\nCONVERSATIONAL QUESTIONS — answer directly with text, NO tools: brainstorming, opinions, explanations, greetings, "what is X", "give me ideas", "how does X work", anything you can answer from knowledge. Only use tools when you need to actually DO something: read/write files, run commands, browse real-time data, build or deploy something.\n\nACTION TASKS — DO NOT describe what you will do. DO NOT write plans. START EXECUTING IMMEDIATELY.\n\nCRITICAL — THINK SILENTLY: Any reasoning, planning, self-doubt, or "I cannot" thoughts MUST go inside <think>...</think> tags and NEVER appear as visible text. Your visible output must be ONLY tool calls and final answers. NEVER output limitations or explanations before calling a tool — think it, don't say it.\n\nTO WRITE A FILE — output WRITE_FILE with the FULL ABSOLUTE PATH on the same line, then a code fence:\nWRITE_FILE /abs/path/to/server.js\n\`\`\`\n...complete file content here...\n\`\`\`\nCRITICAL: the path is MANDATORY — WRITE_FILE alone (no path) is invalid and will be ignored.\n\nTO RUN A COMMAND — output JSON on its own line:\n{"name":"bash","arguments":{"command":"shell command here"}}\n\nTools:\n- WRITE_FILE /path — write a local file. ONLY use this when actually creating/editing a file on disk.\n- {"name":"bash","arguments":{"command":"..."}} — run any shell command\n- {"name":"read_file","arguments":{"path":"/abs/path"}} — read a local file\n- {"name":"list_directory","arguments":{"path":"/abs/path"}} — list local directory\n- {"name":"web_fetch","arguments":{"url":"https://any-public-url.com"}} — fetch ANY website or URL and read its content. Use for research, data, docs, scraping public sites.\n- {"name":"screenshot_and_describe","arguments":{"url":"https://any-url.com","check_for":"what to look for"}} — screenshot a page and analyze it with vision so YOU can SEE what's on screen. 
CRITICAL: this is how you visually read a page — use it whenever snapshot returns partial/empty data or you need to read numbers/text from a JS-heavy page. Omit url to screenshot the current browser tab. Returns text description YOU can reason about.\n- {"name":"browser","arguments":{"action":"tabs"}} — control the REAL Chrome browser (pre-logged in with user's sessions). Use for bookmarks, logged-in sites, JavaScript-heavy pages. Actions: tabs, snapshot, navigate, click, type, press, screenshot (sends to user only — YOU cannot see it), evaluate, scroll, focus. SUBMITTING FORMS: after typing into a search box, use {"action":"press","key":"Enter"} to submit — do NOT click ref numbers which can hit ads. CLICKING BUTTONS: prefer {"action":"click","text":"button label"} over {"action":"click","ref":N} — ref numbers shift and can click the wrong element. To visually READ a page yourself, use screenshot_and_describe instead of browser screenshot.\n\n${universalRules}`;
|
|
431
503
|
const systemPrompt = customSystemPrompt || jsonToolFormat;
|
|
432
504
|
|
|
505
|
+
// Build message array. When there is prior history, scan the last few assistant turns
|
|
506
|
+
// for signs the model got stuck (declared inability, looped, gave up). If stuck, trim
|
|
507
|
+
// the history so the user's new instruction lands with full weight rather than being
|
|
508
|
+
// buried under a wall of failed reasoning the model is anchored to.
|
|
509
|
+
let activeHistory = history;
|
|
510
|
+
if (activeHistory.length > 0) {
|
|
511
|
+
const recentAssistant = activeHistory
|
|
512
|
+
.filter(m => m.role === 'assistant')
|
|
513
|
+
.slice(-4)
|
|
514
|
+
.map(m => (m.content || '').toLowerCase());
|
|
515
|
+
const stuckSignals = [
|
|
516
|
+
'i cannot', 'i am unable', 'unfortunately', 'environment does not',
|
|
517
|
+
'not possible', 'i lack', 'i do not have the ability', 'i have exhausted',
|
|
518
|
+
'cannot be done', 'is not supported', 'failed to', 'i have tried',
|
|
519
|
+
'every attempt', 'cannot complete',
|
|
520
|
+
'no specific task', 'no task has been given', 'no task was given',
|
|
521
|
+
'cannot proceed with a meaningful', 'i must wait for a task',
|
|
522
|
+
'waiting for a task', 'please provide a task', 'specify a task',
|
|
523
|
+
];
|
|
524
|
+
const isStuck = recentAssistant.some(text =>
|
|
525
|
+
stuckSignals.some(sig => text.includes(sig))
|
|
526
|
+
);
|
|
527
|
+
if (isStuck) {
|
|
528
|
+
// Keep only the last 6 history messages (roughly 3 user/assistant exchanges) so the new instruction dominates.
|
|
529
|
+
// The user is course-correcting — don't let stale failure reasoning override them.
|
|
530
|
+
activeHistory = activeHistory.slice(-6);
|
|
531
|
+
console.log(` [${agentId}] 🔄 Stuck signals detected in history — trimmed to last 6 turns so new instruction takes priority`);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
433
535
|
const messages = [
|
|
434
536
|
{ role: 'system', content: systemPrompt },
|
|
435
|
-
...
|
|
537
|
+
...activeHistory,
|
|
436
538
|
];
|
|
437
539
|
|
|
540
|
+
// Inject context the agent needs to work on existing projects.
|
|
541
|
+
// Registry is always injected (small, always relevant).
|
|
542
|
+
// Workspace files list only injected on fresh sessions (no history).
|
|
543
|
+
let taskContent = task;
|
|
544
|
+
{
|
|
545
|
+
const contextParts = [];
|
|
546
|
+
|
|
547
|
+
// 1. Known running projects from the global registry (always inject)
|
|
548
|
+
try {
|
|
549
|
+
const REGISTRY = '/tmp/agentforge/projects.json';
|
|
550
|
+
if (existsSync(REGISTRY)) {
|
|
551
|
+
const registry = JSON.parse(readFileSync(REGISTRY, 'utf8'));
|
|
552
|
+
const entries = Object.values(registry);
|
|
553
|
+
if (entries.length > 0) {
|
|
554
|
+
const lines = entries.map(e => {
|
|
555
|
+
let info = `- "${e.name}" → ${e.path} (running on port ${e.port}`;
|
|
556
|
+
if (e.railwayProject) info += `, Railway project: "${e.railwayProject}"`;
|
|
557
|
+
if (e.liveUrl) info += `, live URL: ${e.liveUrl}`;
|
|
558
|
+
return info + ')';
|
|
559
|
+
});
|
|
560
|
+
contextParts.push(`Known projects on this machine:\n${lines.join('\n')}`);
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
} catch {}
|
|
564
|
+
|
|
565
|
+
// 1b. Available deployment/publishing CLIs — probe what's actually installed and authed.
|
|
566
|
+
// Inject so the agent knows it CAN deploy rather than claiming it lacks credentials.
|
|
567
|
+
try {
|
|
568
|
+
const deployTools = [];
|
|
569
|
+
const candidates = [
|
|
570
|
+
{ cmd: 'railway', check: 'railway whoami 2>/dev/null', label: 'railway' },
|
|
571
|
+
{ cmd: 'vercel', check: 'vercel whoami 2>/dev/null', label: 'vercel' },
|
|
572
|
+
{ cmd: 'netlify', check: 'netlify status 2>/dev/null', label: 'netlify' },
|
|
573
|
+
{ cmd: 'fly', check: 'fly auth whoami 2>/dev/null',label: 'fly' },
|
|
574
|
+
{ cmd: 'surge', check: 'surge whoami 2>/dev/null', label: 'surge' },
|
|
575
|
+
{ cmd: 'gh', check: 'gh auth status 2>/dev/null', label: 'gh' },
|
|
576
|
+
];
|
|
577
|
+
await Promise.all(candidates.map(async ({ cmd, check, label }) => {
|
|
578
|
+
try {
|
|
579
|
+
const { stdout } = await execAsync(`which ${cmd} 2>/dev/null && ${check}`, { timeout: 4000 });
|
|
580
|
+
if (stdout.trim()) deployTools.push(`${label} (authenticated: ${stdout.trim().split('\n')[0].slice(0, 60)})`);
|
|
581
|
+
} catch {}
|
|
582
|
+
}));
|
|
583
|
+
if (deployTools.length > 0) {
|
|
584
|
+
contextParts.push(`Deployment CLIs available and authenticated on this machine:\n${deployTools.map(t => `- ${t}`).join('\n')}\n\nYou can use these tools directly via bash to deploy projects publicly.`);
|
|
585
|
+
}
|
|
586
|
+
} catch {}
|
|
587
|
+
|
|
588
|
+
// 2. Existing files in this agent's workspace (fresh sessions only)
|
|
589
|
+
if (activeHistory.length === 0) {
|
|
590
|
+
try {
|
|
591
|
+
const SKIP_NAMES = new Set(['MEMORY.md', 'AGENTS.md', 'AGENTFORGE.md', 'node_modules', '.git', 'memory', '.npm', 'package-lock.json']);
|
|
592
|
+
const collectFiles = (dir, base = '', depth = 0) => {
|
|
593
|
+
if (depth > 3) return [];
|
|
594
|
+
let files = [];
|
|
595
|
+
for (const e of readdirSync(dir, { withFileTypes: true })) {
|
|
596
|
+
if (SKIP_NAMES.has(e.name)) continue;
|
|
597
|
+
const rel = base ? `${base}/${e.name}` : e.name;
|
|
598
|
+
if (e.isDirectory()) files.push(...collectFiles(path.join(dir, e.name), rel, depth + 1));
|
|
599
|
+
else files.push(rel);
|
|
600
|
+
}
|
|
601
|
+
return files;
|
|
602
|
+
};
|
|
603
|
+
const existingFiles = collectFiles(workDir);
|
|
604
|
+
if (existingFiles.length > 0) {
|
|
605
|
+
contextParts.push(`Your workspace already contains these files:\n${existingFiles.map(f => `- ${workDir}/${f}`).join('\n')}\n\nRead the relevant files before making any changes. Make targeted edits — do NOT rewrite working files from scratch.`);
|
|
606
|
+
}
|
|
607
|
+
} catch {}
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
if (contextParts.length > 0) {
|
|
611
|
+
taskContent = `${contextParts.join('\n\n')}\n\n${task}`;
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
|
|
438
615
|
// Attach initial image if provided — always include it; models that don't support
|
|
439
616
|
// images will ignore the field, and if they error we catch it below.
|
|
440
|
-
const userMessage = { role: 'user', content:
|
|
617
|
+
const userMessage = { role: 'user', content: taskContent };
|
|
441
618
|
if (image) {
|
|
442
619
|
const base64 = image.replace(/^data:image\/\w+;base64,/, '');
|
|
443
620
|
userMessage.images = [base64];
|
|
@@ -459,10 +636,52 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
459
636
|
const toolsUsed = []; // track tool names called (for fallback summary)
|
|
460
637
|
// Hard cap of 60 turns (checked at the top of the loop); otherwise the agent runs until done, loop-detected, or wall-clock timeout.
|
|
461
638
|
const recentCalls = []; // last N tool calls for loop detection
|
|
639
|
+
const recentBashCalls = []; // bash-only window — write_file doesn't contaminate bash loop detection
|
|
462
640
|
let emptyRetries = 0; // consecutive empty-response retries
|
|
641
|
+
const recentOutputs = []; // last N no-tool-call outputs for repeated-output detection
|
|
642
|
+
let incompleteKicks = 0; // consecutive times _isTaskComplete returned false
|
|
643
|
+
let noToolKicks = 0; // consecutive turns with content but no tool calls — escalate message
|
|
644
|
+
let taskDoneEarly = false; // set by completion-language detector inside tool loop
|
|
645
|
+
let localBrowserTurns = 0; // consecutive browser tool calls on localhost — capped to prevent infinite QA loops
|
|
646
|
+
let successfulScreenshots = 0; // how many times we've seen a working (non-placeholder) localhost app
|
|
647
|
+
let midRefusalKicks = 0; // how many times we've overridden a mid-task refusal
|
|
648
|
+
let echoAppendCalls = 0; // consecutive bash calls using echo >> to append to a file
|
|
649
|
+
let consecutiveTruncations = 0; // how many times in a row the same truncated JSON was re-output
|
|
650
|
+
const fileReadCounts = new Map(); // path -> # of reads since last write_file (cross-turn read-loop detector)
|
|
463
651
|
|
|
464
652
|
for (let turn = 0; ; turn++) {
|
|
465
653
|
if (controller.signal.aborted) break;
|
|
654
|
+
let toolsUsedThisTurn = 0; // Fix 10: per-turn tool count — reset each turn so _isTaskComplete
|
|
655
|
+
// only fires when the current turn actually ran tools, not just
|
|
656
|
+
// because prior turns did. Prevents kicking mid-plan text outputs.
|
|
657
|
+
|
|
658
|
+
// Hard turn cap: prevent runaway agents. 60 turns handles complex multi-file projects.
|
|
659
|
+
if (turn >= 60) {
|
|
660
|
+
console.log(` [${agentId}] ⚠️ Turn cap (60) reached — forcing completion`);
|
|
661
|
+
messages.push({ role: 'user', content: 'You have used 60 turns. Provide your final answer now — describe what you built and any important notes. Be concise.' });
|
|
662
|
+
break;
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
// ── Per-turn context trim ────────────────────────────────────────────
|
|
666
|
+
// After large file writes the messages array can accumulate 15K+ tokens
|
|
667
|
+
// making each subsequent Ollama call slower and causing empty responses.
|
|
668
|
+
// When total content exceeds 30K chars (~7.5K tokens), drop middle messages
|
|
669
|
+
// (keep system prompt + first user task + last 8 messages).
|
|
670
|
+
// Threshold lowered from 60K: a single large WRITE_FILE can add 20K chars,
|
|
671
|
+
// causing every subsequent turn to have slow prefill.
|
|
672
|
+
const totalMsgChars = messages.reduce((s, m) => s + (typeof m.content === 'string' ? m.content.length : 0), 0);
|
|
673
|
+
if (totalMsgChars > 30000 && messages.length > 10) {
|
|
674
|
+
const systemMsg = messages[0];
|
|
675
|
+
const firstUserMsg = messages.find(m => m.role === 'user');
|
|
676
|
+
const recentMsgs = messages.slice(-8);
|
|
677
|
+
const trimmed = [systemMsg, firstUserMsg, ...recentMsgs].filter(Boolean);
|
|
678
|
+
// Only trim if it actually reduces messages (avoids trimming to same set)
|
|
679
|
+
if (trimmed.length < messages.length) {
|
|
680
|
+
console.log(` [${agentId}] ✂️ Turn ${turn}: context trim ${messages.length}→${trimmed.length} msgs (${Math.round(totalMsgChars/1000)}KB chars)`);
|
|
681
|
+
messages.length = 0;
|
|
682
|
+
messages.push(...trimmed);
|
|
683
|
+
}
|
|
684
|
+
}
|
|
466
685
|
|
|
467
686
|
this.emit('tool_activity', { agentId, event: 'tool_start', tool: 'model', description: `Thinking…` });
|
|
468
687
|
|
|
@@ -472,6 +691,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
472
691
|
const isOllamaBackend = this.baseUrl.includes('11434') || this.baseUrl.includes('localhost') || this.baseUrl.includes('127.0.0.1');
|
|
473
692
|
const useNativeEndpoint = isOllamaBackend; // all local models use native endpoint
|
|
474
693
|
|
|
694
|
+
const inferenceStart = Date.now();
|
|
475
695
|
let response;
|
|
476
696
|
try {
|
|
477
697
|
|
|
@@ -498,14 +718,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
498
718
|
};
|
|
499
719
|
}
|
|
500
720
|
|
|
721
|
+
// Per-turn inference timeout: 8 minutes. Without this, a huge context (e.g. 37K-char
|
|
722
|
+
// file in messages) can make Ollama spin for 10+ minutes with no output. The context
|
|
723
|
+
// trim (30K char threshold) prevents most cases, but this is a safety valve.
|
|
724
|
+
const turnAbort = new AbortController();
|
|
725
|
+
const turnTimeoutId = setTimeout(() => {
|
|
726
|
+
console.log(` [${agentId}] ⏰ Turn ${turn}: inference timeout (8 min) — aborting and retrying with trimmed context`);
|
|
727
|
+
turnAbort.abort();
|
|
728
|
+
}, 8 * 60 * 1000);
|
|
729
|
+
const combinedSignal = AbortSignal.any
|
|
730
|
+
? AbortSignal.any([controller.signal, turnAbort.signal])
|
|
731
|
+
: turnAbort.signal; // fallback: use turn signal only if any() unavailable
|
|
732
|
+
|
|
501
733
|
response = await fetch(endpoint, {
|
|
502
734
|
method: 'POST',
|
|
503
735
|
headers: { 'Content-Type': 'application/json' },
|
|
504
|
-
signal:
|
|
736
|
+
signal: combinedSignal,
|
|
505
737
|
body: JSON.stringify(requestBody)
|
|
506
738
|
});
|
|
739
|
+
clearTimeout(turnTimeoutId);
|
|
507
740
|
} catch (fetchErr) {
|
|
508
|
-
if (fetchErr.name === 'AbortError')
|
|
741
|
+
if (fetchErr.name === 'AbortError') {
|
|
742
|
+
// If the task-level controller was aborted, exit cleanly
|
|
743
|
+
if (controller.signal.aborted) break;
|
|
744
|
+
// Otherwise this was a turn-level timeout — treat like empty response and retry
|
|
745
|
+
console.log(` [${agentId}] ⏰ Turn ${turn}: inference timed out — forcing context trim and retry`);
|
|
746
|
+
// Trim aggressively: keep system + first user + last 4 messages
|
|
747
|
+
const _sys = messages[0]; const _usr = messages.find(m => m.role === 'user');
|
|
748
|
+
const _recent = messages.slice(-4);
|
|
749
|
+
messages.length = 0; messages.push(_sys, _usr, ..._recent.filter(Boolean));
|
|
750
|
+
continue; // retry this turn with trimmed context
|
|
751
|
+
}
|
|
509
752
|
throw new Error(`Cannot reach local model server at ${this.baseUrl}. Is it running? (${fetchErr.message})`);
|
|
510
753
|
}
|
|
511
754
|
|
|
@@ -529,18 +772,43 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
529
772
|
let inFenceBlock = false; // inside WRITE_FILE code fence — suppress content from streaming
|
|
530
773
|
let fenceDepth = 0; // ``` count since last WRITE_FILE (even=closed, odd=open)
|
|
531
774
|
let rawTokenCount = 0;
|
|
775
|
+
let tokenCapTruncatedFile = false; // true when token cap fired mid-WRITE_FILE fence
|
|
532
776
|
let lastVisibleAt = Date.now(); // track when we last got visible output (for think timeout)
|
|
533
777
|
|
|
534
778
|
const reader = response.body.getReader();
|
|
535
779
|
const decoder = new TextDecoder();
|
|
536
780
|
let buf = '';
|
|
537
781
|
|
|
538
|
-
// No
|
|
539
|
-
//
|
|
782
|
+
// No hard timeout on inference — local model can take as long as it needs.
|
|
783
|
+
// But we DO time out individual reader.read() calls (30s) so a silently-dropped
|
|
784
|
+
// connection never hangs the worker forever. And when Ollama signals done:true we
|
|
785
|
+
// immediately cancel the reader instead of waiting for the HTTP body to close on
|
|
786
|
+
// its own (which can stall indefinitely on keep-alive connections).
|
|
540
787
|
let turnRetry = false;
|
|
541
|
-
while (true) {
|
|
788
|
+
streamLoop: while (true) {
|
|
542
789
|
if (controller.signal.aborted) break;
|
|
543
|
-
|
|
790
|
+
// Time-box each individual read() call. If no bytes arrive for 30s the stream
|
|
791
|
+
// has stalled (Ollama crashed / connection dropped silently) — abort it.
|
|
792
|
+
let _readTimer;
|
|
793
|
+
let readResult;
|
|
794
|
+
try {
|
|
795
|
+
readResult = await Promise.race([
|
|
796
|
+
reader.read(),
|
|
797
|
+
new Promise((_, reject) => {
|
|
798
|
+
_readTimer = setTimeout(() => reject(new Error('stream_read_stall')), 30000);
|
|
799
|
+
})
|
|
800
|
+
]);
|
|
801
|
+
} catch (e) {
|
|
802
|
+
if (e.message === 'stream_read_stall') {
|
|
803
|
+
console.log(` [${agentId}] ⏱️ Stream stalled (no data for 30s) — aborting`);
|
|
804
|
+
reader.cancel().catch(() => {});
|
|
805
|
+
break;
|
|
806
|
+
}
|
|
807
|
+
throw e;
|
|
808
|
+
} finally {
|
|
809
|
+
clearTimeout(_readTimer);
|
|
810
|
+
}
|
|
811
|
+
const { done, value } = readResult;
|
|
544
812
|
if (done) break;
|
|
545
813
|
|
|
546
814
|
buf += decoder.decode(value, { stream: true });
|
|
@@ -556,7 +824,12 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
556
824
|
// Ollama native NDJSON format
|
|
557
825
|
let nativeEvt;
|
|
558
826
|
try { nativeEvt = JSON.parse(line); } catch { continue; }
|
|
559
|
-
if (nativeEvt.done)
|
|
827
|
+
if (nativeEvt.done) {
|
|
828
|
+
// Ollama says generation is complete — cancel the reader and exit now.
|
|
829
|
+
// Do NOT fall back to reader.read() which can hang on keep-alive connections.
|
|
830
|
+
reader.cancel().catch(() => {});
|
|
831
|
+
break streamLoop;
|
|
832
|
+
}
|
|
560
833
|
tokenText = nativeEvt.message?.content ?? null;
|
|
561
834
|
} else {
|
|
562
835
|
// OpenAI SSE format
|
|
@@ -586,21 +859,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
586
859
|
rawTokenCount++;
|
|
587
860
|
streamContent += tokenText;
|
|
588
861
|
|
|
862
|
+
// Per-turn token cap — if a single turn generates >6000 tokens, the model is
|
|
863
|
+
// probably writing multiple large files in one shot or looping. Truncate the stream
|
|
864
|
+
// and let the agent loop handle the (partial) output. Keeps single-turn inference
|
|
865
|
+
// bounded to ~3-5 minutes on local hardware.
|
|
866
|
+
if (rawTokenCount >= 6000) {
|
|
867
|
+
console.log(` [${agentId}] ⚠️ Turn ${turn}: token cap (${rawTokenCount}) — truncating stream`);
|
|
868
|
+
// Close any open code fence so the WRITE_FILE parser can extract partial content.
|
|
869
|
+
// Track whether we truncated mid-write so we can inject a hint after the tool loop.
|
|
870
|
+
if (inFenceBlock && fenceDepth % 2 === 1) {
|
|
871
|
+
streamContent += '\n```\n';
|
|
872
|
+
inFenceBlock = false;
|
|
873
|
+
tokenCapTruncatedFile = true; // set below
|
|
874
|
+
}
|
|
875
|
+
reader.cancel().catch(() => {});
|
|
876
|
+
break streamLoop;
|
|
877
|
+
}
|
|
878
|
+
|
|
589
879
|
// Process token through think + tool_call filters, emit visible text live
|
|
590
880
|
// We scan only the new delta token against the current buffer state
|
|
591
881
|
const chunk = tokenText;
|
|
592
882
|
let visible = '';
|
|
883
|
+
const wasInThinkBlock = inThinkBlock;
|
|
593
884
|
// Simple per-token state machine — handles split tags across tokens by tracking state flags
|
|
594
885
|
if (!inThinkBlock && !inToolCallBlock) {
|
|
595
|
-
// Check if this chunk starts a filtered block
|
|
596
|
-
|
|
886
|
+
// Check if this chunk starts a filtered block.
|
|
887
|
+
// Use `<think` (no closing >) to catch split tokens where `>` arrives separately.
|
|
888
|
+
// `<think` won't false-positive on `</think>` since that starts with `</`.
|
|
889
|
+
if (streamContent.includes('<think') && !streamContent.includes('</think>')) {
|
|
597
890
|
inThinkBlock = true;
|
|
598
|
-
// emit text before the <think> tag
|
|
599
|
-
const before = streamContent.lastIndexOf('<think>');
|
|
600
|
-
// already streamed everything before this point; just suppress from here
|
|
601
891
|
} else if (streamContent.includes('<tool_call>') && !streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
|
|
602
892
|
inToolCallBlock = true;
|
|
603
|
-
// Text before <tool_call> on this same token — already emitted or trivial
|
|
604
893
|
} else if (!inThinkBlock && !inToolCallBlock) {
|
|
605
894
|
visible = chunk;
|
|
606
895
|
}
|
|
@@ -614,6 +903,26 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
614
903
|
inToolCallBlock = false;
|
|
615
904
|
}
|
|
616
905
|
|
|
906
|
+
// Stream think block content live — shown in a collapsible "Thinking…" panel in the dashboard
|
|
907
|
+
{
|
|
908
|
+
let thinkChunk = '';
|
|
909
|
+
if (!wasInThinkBlock && inThinkBlock) {
|
|
910
|
+
// Just entered think block — emit content after the opening <think> tag
|
|
911
|
+
const tagEnd = chunk.indexOf('<think>');
|
|
912
|
+
thinkChunk = tagEnd >= 0 ? chunk.slice(tagEnd + 7) : chunk;
|
|
913
|
+
} else if (wasInThinkBlock && inThinkBlock) {
|
|
914
|
+
// Mid-think block — emit raw chunk (strip stray tag fragments)
|
|
915
|
+
thinkChunk = chunk.replace(/<\/?think>/g, '');
|
|
916
|
+
} else if (wasInThinkBlock && !inThinkBlock) {
|
|
917
|
+
// Just exited think block — emit content before the closing </think> tag
|
|
918
|
+
const tagStart = chunk.indexOf('</think>');
|
|
919
|
+
thinkChunk = tagStart >= 0 ? chunk.slice(0, tagStart) : chunk;
|
|
920
|
+
}
|
|
921
|
+
if (thinkChunk) {
|
|
922
|
+
this.emit('agent_output', { agentId, output: thinkChunk, isThinking: true, isChunk: true });
|
|
923
|
+
}
|
|
924
|
+
}
|
|
925
|
+
|
|
617
926
|
// Scan ALL lines completed in this token for state transitions.
|
|
618
927
|
// Multi-char tokens can contain multiple lines (WRITE_FILE + ``` in same token).
|
|
619
928
|
if (tokenText.includes('\n')) {
|
|
@@ -622,7 +931,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
622
931
|
while (nlIdx !== -1) {
|
|
623
932
|
const lineStart = Math.max(0, streamContent.lastIndexOf('\n', nlIdx - 1)) + 1;
|
|
624
933
|
const line = streamContent.slice(lineStart, nlIdx).trim();
|
|
625
|
-
if (/^(WRITE_FILE|write_file)
|
|
934
|
+
if (/^(WRITE_FILE|write_file)/i.test(line)) {
|
|
626
935
|
inFenceBlock = true; fenceDepth = 0;
|
|
627
936
|
} else if (inFenceBlock && /^```/.test(line)) {
|
|
628
937
|
fenceDepth++;
|
|
@@ -639,16 +948,28 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
639
948
|
const cleanSC = streamContent.replace(/<think>[\s\S]*?<\/think>/g, '');
|
|
640
949
|
const lastNL = cleanSC.lastIndexOf('\n');
|
|
641
950
|
const curLine = cleanSC.slice(lastNL + 1).trimStart();
|
|
642
|
-
|
|
951
|
+
// Suppress as soon as "WRITE_FILE" appears at start of partial line —
|
|
952
|
+
// don't wait for the path to arrive or the word streams char-by-char to the user.
|
|
953
|
+
if (!inFenceBlock && /^(WRITE_FILE|write_file)/i.test(curLine)) {
|
|
643
954
|
inFenceBlock = true; fenceDepth = 0;
|
|
644
955
|
}
|
|
645
|
-
if
|
|
956
|
+
// Only treat as JSON blob if it looks like actual JSON — `[{` or `["` or `[` followed by quote/brace.
|
|
957
|
+
// Avoid false-positive on `[bash result]:`, `[tool result]:`, etc.
|
|
958
|
+
if (!inJsonBlob && !inFenceBlock && (curLine.startsWith('{') || /^\[[\[{"']/.test(curLine))) {
|
|
646
959
|
inJsonBlob = true;
|
|
647
960
|
}
|
|
648
961
|
}
|
|
649
962
|
|
|
650
963
|
// Emit visible content — safety filter removes any ``` or WRITE_FILE lines
|
|
651
|
-
// that slipped through (e.g. partial token at detection boundary)
|
|
964
|
+
// that slipped through (e.g. partial token at detection boundary).
|
|
965
|
+
// If a complete <think>...</think> block arrived in one token (state machine missed it),
|
|
966
|
+
// route its content as a thinking chunk so users can see the agent's reasoning.
|
|
967
|
+
if (visible) {
|
|
968
|
+
visible = visible.replace(/<think>([\s\S]*?)<\/think>/g, (_, content) => {
|
|
969
|
+
if (content.trim()) this.emit('agent_output', { agentId, output: content, isThinking: true, isChunk: true });
|
|
970
|
+
return '';
|
|
971
|
+
}).replace(/<think>[\s\S]*/g, '');
|
|
972
|
+
}
|
|
652
973
|
if (visible && !inThinkBlock && !inToolCallBlock && !inJsonBlob && !inFenceBlock) {
|
|
653
974
|
const safe = visible.split('\n').filter(ln => {
|
|
654
975
|
const t = ln.trimStart();
|
|
@@ -666,13 +987,40 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
666
987
|
if (inThinkBlock && (Date.now() - lastVisibleAt) > 90000 && rawTokenCount > 100) {
|
|
667
988
|
console.log(` [${agentId}] ⏱️ Think timeout (>90s, ${rawTokenCount} tokens) — aborting stream`);
|
|
668
989
|
reader.cancel().catch(() => {});
|
|
669
|
-
break;
|
|
990
|
+
break streamLoop;
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
// Repetition loop detection — catches runaway token loops (e.g. hundreds of </li> repeating).
|
|
994
|
+
// Small local models can get stuck when fed malformed HTML or very large context.
|
|
995
|
+
// Check every 50 tokens after warmup: if any short pattern fills most of the recent output → abort.
|
|
996
|
+
if (rawTokenCount % 50 === 0 && rawTokenCount > 150) {
|
|
997
|
+
const tail = streamContent.slice(-800);
|
|
998
|
+
let loopDetected = false;
|
|
999
|
+
for (let pLen = 4; pLen <= 15; pLen++) {
|
|
1000
|
+
const pat = tail.slice(-pLen);
|
|
1001
|
+
if (!pat.trim()) continue;
|
|
1002
|
+
let count = 0, pos = 0;
|
|
1003
|
+
while ((pos = tail.indexOf(pat, pos)) !== -1) { count++; pos += pLen; }
|
|
1004
|
+
if (count >= 30) { loopDetected = true; break; } // Fix 9: raised from 20 — HTML/CSS files have naturally repetitive short patterns (px;, </div>, etc.)
|
|
1005
|
+
}
|
|
1006
|
+
if (loopDetected) {
|
|
1007
|
+
console.log(` [${agentId}] 🔄 Repetition loop detected at ${rawTokenCount} tokens — aborting stream`);
|
|
1008
|
+
reader.cancel().catch(() => {});
|
|
1009
|
+
break streamLoop;
|
|
1010
|
+
}
|
|
670
1011
|
}
|
|
671
1012
|
}
|
|
672
1013
|
}
|
|
673
1014
|
|
|
674
|
-
|
|
675
|
-
|
|
1015
|
+
const inferenceMs = Date.now() - inferenceStart;
|
|
1016
|
+
console.log(` [${agentId}] 📊 Turn ${turn}: ${rawTokenCount} tokens, ${streamContent.length} chars raw, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}, inference=${(inferenceMs/1000).toFixed(1)}s`);
|
|
1017
|
+
if (rawTokenCount === 0 && inferenceMs > 10000) {
|
|
1018
|
+
console.log(` [${agentId}] ⚠️ Turn ${turn}: Ollama spent ${(inferenceMs/1000).toFixed(1)}s returning 0 tokens — possible OOM, KV cache eviction, or model degenerate state`);
|
|
1019
|
+
}
|
|
1020
|
+
// Log visible content (what the user sees) — helps diagnose planning vs acting
|
|
1021
|
+
if (visibleContent.trim()) console.log(` [${agentId}] 👁️ Visible: ${visibleContent.trim().replace(/\n/g, ' ').slice(0, 300)}`);
|
|
1022
|
+
// Log raw content if no visible (pure tool call turn) — helps diagnose tool format
|
|
1023
|
+
else if (streamContent.trim()) console.log(` [${agentId}] 📝 Raw: ${streamContent.trim().replace(/\n/g, ' ').slice(0, 200)}`);
|
|
676
1024
|
|
|
677
1025
|
// ── Extract tool calls from content ───────────────────────────────────
|
|
678
1026
|
// Try <tool_call> XML tags first (some models emit this format), then fall through
|
|
@@ -712,6 +1060,19 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
712
1060
|
}
|
|
713
1061
|
}
|
|
714
1062
|
|
|
1063
|
+
// Detect model mimicking compaction format: [wrote: /path — N chars, M lines]
|
|
1064
|
+
// This happens after context trim — model sees these summaries and generates them as fake outputs.
|
|
1065
|
+
// The model THINKS it wrote the file but it hasn't. Correct it immediately.
|
|
1066
|
+
if (!parsedTagCalls && Object.keys(streamToolCalls).length === 0 && streamContent) {
|
|
1067
|
+
const fakeWroteMatch = streamContent.match(/\[wrote:\s*([^\s\]]+)[^\]]*\]/i);
|
|
1068
|
+
if (fakeWroteMatch) {
|
|
1069
|
+
const fakePath = fakeWroteMatch[1];
|
|
1070
|
+
console.log(` [${agentId}] ⚠️ Model generated fake [wrote: ...] summary — correcting`);
|
|
1071
|
+
messages.push({ role: 'user', content: `You output "[wrote: ${fakePath}...]" but that is a SUMMARY FORMAT from your context history — you did NOT actually write any file. To actually write a file, you MUST use WRITE_FILE format:\n\nWRITE_FILE ${fakePath}\n\`\`\`\n...complete file content...\n\`\`\`\n\nOutput the full file content now using WRITE_FILE.` });
|
|
1072
|
+
continue;
|
|
1073
|
+
}
|
|
1074
|
+
}
|
|
1075
|
+
|
|
715
1076
|
// Fallback 4: if we found ONLY bash tool calls but content has writing blocks too,
|
|
716
1077
|
// merge them so files get written AND bash runs
|
|
717
1078
|
if (parsedTagCalls && streamContent) {
|
|
@@ -757,10 +1118,18 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
757
1118
|
// ── Push assistant message ────────────────────────────────────────────
|
|
758
1119
|
// All local models now use JSON-in-text format on the native endpoint.
|
|
759
1120
|
// Strip <think>...</think> blocks to avoid burning context on reasoning traces.
|
|
1121
|
+
// Also compact WRITE_FILE fences: replace the file body with a summary line
|
|
1122
|
+
// to prevent large file contents from flooding the context on every future turn.
|
|
760
1123
|
const toolCallsArray = Object.values(streamToolCalls);
|
|
761
1124
|
const hasToolCalls = toolCallsArray.length > 0;
|
|
762
1125
|
const cleanedContent = (streamContent || '')
|
|
763
1126
|
.replace(/<think>[\s\S]*?<\/think>/g, '')
|
|
1127
|
+
// Compact WRITE_FILE fence bodies: replace with a non-fence note so the model
|
|
1128
|
+
// cannot mistake the summary for real file content and echo it back.
|
|
1129
|
+
.replace(/(?:WRITE_FILE|write_file)[:\s]+([^\n]+)\n```[^\n]*\n([\s\S]*?)```/gi, (match, filePath, fileContent) => {
|
|
1130
|
+
const lines = fileContent.split('\n').length;
|
|
1131
|
+
return `[wrote: ${filePath.trim()} — ${fileContent.length} chars, ${lines} lines]`;
|
|
1132
|
+
})
|
|
764
1133
|
.trim();
|
|
765
1134
|
messages.push({ role: 'assistant', content: cleanedContent || '' });
|
|
766
1135
|
|
|
@@ -769,6 +1138,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
769
1138
|
|
|
770
1139
|
// ── Execute tool calls ────────────────────────────────────────────────
|
|
771
1140
|
if (toolCallsArray.length > 0) {
|
|
1141
|
+
let completionCheckedThisTurn = false; // deduplicate _isTaskComplete across tool calls in same turn
|
|
772
1142
|
for (const toolCall of toolCallsArray) {
|
|
773
1143
|
if (controller.signal.aborted) break;
|
|
774
1144
|
|
|
@@ -808,7 +1178,37 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
808
1178
|
});
|
|
809
1179
|
console.log(` [${agentId}] 🔧 ${name}: ${JSON.stringify(parsedArgs).slice(0, 120)}`);
|
|
810
1180
|
toolsUsed.push(name);
|
|
1181
|
+
toolsUsedThisTurn++; // Fix 10: track per-turn for _isTaskComplete gating
|
|
811
1182
|
emptyRetries = 0; // reset on successful tool call
|
|
1183
|
+
recentOutputs.length = 0; // reset repeated-output tracker on any tool execution
|
|
1184
|
+
// Track consecutive browser/screenshot calls on a locally-built app.
|
|
1185
|
+
// After 6 such calls the agent has browsed enough — check if done and stop.
|
|
1186
|
+
if (name === 'browser' || name === 'screenshot_and_describe') localBrowserTurns++;
|
|
1187
|
+
else if (name === 'write_file' || name === 'bash') localBrowserTurns = 0; // reset on real work
|
|
1188
|
+
if (localBrowserTurns >= 6 && toolsUsed.filter(t => t === 'write_file').length > 0) {
|
|
1189
|
+
const originalTask3 = messages.find(m => m.role === 'user')?.content || task;
|
|
1190
|
+
const isDoneBrowse = await this._isTaskComplete(originalTask3, visibleContent || allOutput, controller.signal);
|
|
1191
|
+
if (isDoneBrowse) {
|
|
1192
|
+
console.log(` [${agentId}] ✅ Done after ${localBrowserTurns} browser interactions — stopping`);
|
|
1193
|
+
if (visibleContent) finalContent = visibleContent;
|
|
1194
|
+
taskDoneEarly = true;
|
|
1195
|
+
break;
|
|
1196
|
+
}
|
|
1197
|
+
// Not done yet after 6 browser calls — push a targeted hint rather than silently resetting.
|
|
1198
|
+
// This fires every 6 browser calls to redirect the agent toward evaluation or completion.
|
|
1199
|
+
console.log(` [${agentId}] ⚠️ ${localBrowserTurns} browser interactions without completion — injecting guidance`);
|
|
1200
|
+
messages.push({ role: 'user', content: `You have taken ${localBrowserTurns} screenshots/browser interactions. You need to complete the task or make progress.
|
|
1201
|
+
|
|
1202
|
+
If you are trying to verify that a DYNAMIC feature works (timer counting down, animation playing, real-time updates), STOP using screenshots — they capture a single frozen moment and CANNOT prove motion or state change.
|
|
1203
|
+
|
|
1204
|
+
Use browser evaluate instead to directly check JavaScript state:
|
|
1205
|
+
{"name":"browser","arguments":{"action":"evaluate","script":"document.querySelector('#display').textContent"}}
|
|
1206
|
+
Or to read a value, wait 2 seconds, and compare:
|
|
1207
|
+
{"name":"browser","arguments":{"action":"evaluate","script":"(function(){ return new Promise(r => { var t1 = document.body.innerText; setTimeout(() => r('before: ' + t1.slice(0,50) + ' | after: ' + document.body.innerText.slice(0,50)), 2000); }); })()"}}
|
|
1208
|
+
|
|
1209
|
+
If the CODE is correct and the app LOOKS right, declare the task DONE — you do not need to prove every dynamic behavior via screenshot. State what you verified and what you built, then stop.` });
|
|
1210
|
+
localBrowserTurns = 0; // reset so hint fires again after 6 more if still stuck
|
|
1211
|
+
}
|
|
812
1212
|
|
|
813
1213
|
// Loop detection: catch repeated single calls AND alternating A/B/A/B patterns.
|
|
814
1214
|
// Normalize curl commands: strip sleep prefix so "sleep 3 && curl ...URL" and
|
|
@@ -821,15 +1221,44 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
821
1221
|
recentCalls.push(callKey);
|
|
822
1222
|
if (recentCalls.length > 6) recentCalls.shift();
|
|
823
1223
|
|
|
1224
|
+
// Bash-only window: write_file calls don't contaminate bash loop detection.
|
|
1225
|
+
// A write_file between two bash loops was causing the detector to miss patterns
|
|
1226
|
+
// like curl→cat→nohup→write_file(rewrite)→curl→cat→nohup (server-start loop).
|
|
1227
|
+
if (name === 'bash') {
|
|
1228
|
+
recentBashCalls.push(callKey);
|
|
1229
|
+
if (recentBashCalls.length > 6) recentBashCalls.shift();
|
|
1230
|
+
// Detect echo-append pattern: echo '...' >> file (building file line by line)
|
|
1231
|
+
if (/echo\s+['"]/.test(parsedArgs.command || '') && />>\s*\S/.test(parsedArgs.command || '')) {
|
|
1232
|
+
echoAppendCalls++;
|
|
1233
|
+
if (echoAppendCalls >= 4) {
|
|
1234
|
+
const appendTarget = (parsedArgs.command || '').match(/>>[ ]*(\S+)/)?.[1] || 'the file';
|
|
1235
|
+
console.log(` [${agentId}] ⚠️ echo-append loop (${echoAppendCalls}x) — injecting WRITE_FILE hint`);
|
|
1236
|
+
messages.push({ role: 'user', content: `STOP using echo >> to append code line by line — this wastes turns. You have already called echo >> ${echoAppendCalls} times. Use WRITE_FILE with the COMPLETE content of ${appendTarget} in ONE call instead:\n\nWRITE_FILE /abs/path/to/${appendTarget.split('/').pop()}\n\`\`\`\n...complete file content...\n\`\`\`` });
|
|
1237
|
+
echoAppendCalls = 0; // reset so hint only fires once per burst
|
|
1238
|
+
}
|
|
1239
|
+
} else {
|
|
1240
|
+
echoAppendCalls = 0; // non-echo bash call resets the counter
|
|
1241
|
+
}
|
|
1242
|
+
} else if (name === 'write_file') {
|
|
1243
|
+
// A successful write_file is progress; don't reset entirely but clear bash window
|
|
1244
|
+
// so the loop detector starts fresh for the post-rewrite phase.
|
|
1245
|
+
recentBashCalls.length = 0;
|
|
1246
|
+
echoAppendCalls = 0;
|
|
1247
|
+
}
|
|
1248
|
+
|
|
824
1249
|
// Detect: same call 3x in a row (2x for screenshot — never valid to screenshot without a change)
|
|
825
1250
|
const screenshotLoop = name === 'screenshot_and_describe' && recentCalls.length >= 2 && recentCalls.slice(-2).every(c => c === callKey);
|
|
826
1251
|
const last3Same = screenshotLoop || (recentCalls.length >= 3 && recentCalls.slice(-3).every(c => c === callKey));
|
|
827
|
-
// Detect: alternating A,B,A,B pattern (last 4 calls)
|
|
1252
|
+
// Detect: alternating A,B,A,B pattern (last 4 calls) — check both windows
|
|
828
1253
|
const last4 = recentCalls.slice(-4);
|
|
829
|
-
const
|
|
830
|
-
|
|
1254
|
+
const last4bash = recentBashCalls.slice(-4);
|
|
1255
|
+
const abab = (last4.length === 4 && last4[0] === last4[2] && last4[1] === last4[3] && last4[0] !== last4[1])
|
|
1256
|
+
|| (last4bash.length === 4 && last4bash[0] === last4bash[2] && last4bash[1] === last4bash[3] && last4bash[0] !== last4bash[1]);
|
|
1257
|
+
// Detect: A,B,C,A,B,C pattern (last 6) — check both windows
|
|
831
1258
|
const last6 = recentCalls.slice(-6);
|
|
832
|
-
const
|
|
1259
|
+
const last6bash = recentBashCalls.slice(-6);
|
|
1260
|
+
const abcabc = (last6.length === 6 && last6[0] === last6[3] && last6[1] === last6[4] && last6[2] === last6[5])
|
|
1261
|
+
|| (last6bash.length === 6 && last6bash[0] === last6bash[3] && last6bash[1] === last6bash[4] && last6bash[2] === last6bash[5]);
|
|
833
1262
|
|
|
834
1263
|
if (last3Same || abab || abcabc) {
|
|
835
1264
|
const pattern = last3Same ? 'same call 3x' : abab ? 'A/B/A/B alternating' : 'A/B/C repeating';
|
|
@@ -847,14 +1276,36 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
847
1276
|
const openPort = openPortMatch ? openPortMatch[1] : '????';
|
|
848
1277
|
loopFixMsg += `You are calling 'open http://localhost:${openPort}' repeatedly but the server is not running — opening the browser to a dead port does nothing. You must RESTART THE SERVER first:\n{"name":"bash","arguments":{"command":"pkill -f 'node.*${openPort}' 2>/dev/null; sleep 1; cd YOUR_PROJECT_DIR && nohup /usr/local/bin/node server.js > /tmp/server.log 2>&1 & sleep 3 && curl -s -o /dev/null -w '%{http_code}' http://localhost:${openPort}"}}\nIf curl returns 000, check the crash: bash cat /tmp/server.log. Fix the crash FIRST. Only call 'open' after curl returns 200.`;
|
|
849
1278
|
} else if (name === 'bash' && (loopCmd.includes('curl') || loopCmd.includes('http_code'))) {
|
|
850
|
-
|
|
1279
|
+
// Auto-read crash log now so the hint can include the actual error
|
|
1280
|
+
let crashLogNow = '';
|
|
1281
|
+
try {
|
|
1282
|
+
crashLogNow = String(await this._executeTool('bash', { command: 'cat /tmp/server.log 2>/dev/null | tail -30 || echo "No server.log"' }, workDir, agentId)).trim();
|
|
1283
|
+
} catch {}
|
|
1284
|
+
let serverLoopHint = `The server is stuck in a crash-restart loop — curl keeps returning 000.\n\nLatest crash log:\n${crashLogNow}\n\n`;
|
|
1285
|
+
// If crash log has a SyntaxError, auto-read the code snippet
|
|
1286
|
+
const synMatch = crashLogNow.match(/^(\/[^\n:]+\.(?:js|ts|mjs|cjs)):(\d+)\n/m);
|
|
1287
|
+
if (synMatch && /SyntaxError/.test(crashLogNow)) {
|
|
1288
|
+
const synFile = synMatch[1];
|
|
1289
|
+
const synLine = parseInt(synMatch[2], 10);
|
|
1290
|
+
let snippet = '';
|
|
1291
|
+
try {
|
|
1292
|
+
snippet = String(await this._executeTool('bash', {
|
|
1293
|
+
command: `awk 'NR>=${Math.max(1, synLine - 5)} && NR<=${synLine + 5} {printf "%4d: %s\\n", NR, $0}' "${synFile}" 2>/dev/null`
|
|
1294
|
+
}, workDir, agentId)).trim();
|
|
1295
|
+
} catch {}
|
|
1296
|
+
serverLoopHint += `⚠️ SyntaxError in ${synFile} at line ${synLine}${snippet ? `:\n\`\`\`\n${snippet}\n\`\`\`` : ''}.\n\n`;
|
|
1297
|
+
serverLoopHint += `Fix the syntax error:\n1. write_file to patch only the broken line (do NOT rewrite the whole file unless it is tiny)\n2. Then restart with nohup\nNEVER restart before fixing the syntax error — it will always crash again.`;
|
|
1298
|
+
} else {
|
|
1299
|
+
serverLoopHint += `The error is shown above. Fix the code, then restart. Do NOT call curl or cat again before making a fix.`;
|
|
1300
|
+
}
|
|
1301
|
+
loopFixMsg += serverLoopHint;
|
|
851
1302
|
} else if (loopCmd.includes('npm install')) {
|
|
852
1303
|
loopFixMsg += `npm install is looping — packages likely already installed. Skip it and start the server directly with nohup.`;
|
|
853
1304
|
} else if (name === 'bash' && (loopCmd.includes('/tmp/') && (loopCmd.includes('.js') || loopCmd.includes('node')) && loopCmd.includes('9223'))) {
|
|
854
1305
|
loopFixMsg += `Your Node.js/CDP script is only READING the page — that is why nothing changes. You need to WRITE A NEW SCRIPT THAT CLICKS.\n\nReplace your /tmp script with one that clicks the target element:\n\nWRITE_FILE /tmp/cdp_click.js\n\`\`\`javascript\nconst ws = new WebSocket('ws://localhost:9223/devtools/page/TAB_ID_HERE');\nws.onopen = () => {\n // Click element containing the text you need (change "Filter" to what you see on the page)\n ws.send(JSON.stringify({id:1, method:'Runtime.evaluate', params:{expression: 'Array.from(document.querySelectorAll("a,button,input,span,div,th")).find(el=>el.textContent.trim().includes("Filter"))?.click() || "not found"', returnByValue:true}}));\n};\nws.onmessage = e => { console.log(JSON.parse(e.data)); ws.close(); };\nsetTimeout(() => ws.close(), 5000);\n\`\`\`\n\nThen run: bash → /usr/local/bin/node --experimental-websocket /tmp/cdp_click.js\n\nYou CAN click. You CAN interact. Stop saying you cannot — write the clicking script.`;
|
|
855
1306
|
} else if (name === 'screenshot_and_describe') {
|
|
856
1307
|
const loopPort = (parsedArgs.url || '').match(/:(\d+)/)?.[1] || '????';
|
|
857
|
-
loopFixMsg += `You are calling screenshot_and_describe repeatedly — STOP. Taking the same screenshot over and over changes nothing
|
|
1308
|
+
loopFixMsg += `You are calling screenshot_and_describe repeatedly — STOP. Taking the same screenshot over and over changes nothing.\n\nIf you are trying to verify DYNAMIC behavior (timer running, animation, countdown, live updates): screenshots CANNOT prove this — they capture a frozen moment. Use browser evaluate instead:\n{"name":"browser","arguments":{"action":"evaluate","script":"document.querySelector('#timer-display, .display, #display, [id*=time], [class*=time]')?.textContent || document.body.innerText.slice(0,200)"}}\nOr wait 2s and compare: {"name":"browser","arguments":{"action":"evaluate","script":"(function(){ return new Promise(r => { var t1 = document.body.innerText.slice(0,100); setTimeout(() => r({before:t1, after:document.body.innerText.slice(0,100)}), 2000); }); })()" }}\n\nOtherwise, you have two choices:\nA) If the code is correct and the app looks right — declare the task DONE. You do not need to screenshot every feature.\nB) If something specific is visually wrong — make a code change FIRST, then ONE screenshot to verify.`;
|
|
858
1309
|
} else {
|
|
859
1310
|
loopFixMsg += `Observe the tool results above, identify what is specifically broken, then make a targeted fix. Do not repeat commands that already ran.`;
|
|
860
1311
|
}
|
|
@@ -862,9 +1313,31 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
862
1313
|
messages.push({ role: 'user', content: loopFixMsg });
|
|
863
1314
|
// Don't fully reset — keep 1 entry so next identical call fires after 2 more (not 3)
|
|
864
1315
|
recentCalls.splice(0, recentCalls.length - 1);
|
|
1316
|
+
recentBashCalls.splice(0, recentBashCalls.length - 1);
|
|
865
1317
|
break; // break inner tool loop, let model respond to hint
|
|
866
1318
|
}
|
|
867
1319
|
|
|
1320
|
+
// ── Read-loop detector (cross-turn) ──────────────────────────────
|
|
1321
|
+
// Tracks how many times each file path has been read since the last write_file.
|
|
1322
|
+
// If the agent reads the same file 3+ times without writing anything, it is stuck
|
|
1323
|
+
// in a "read to plan" loop that never produces output — force it to write now.
|
|
1324
|
+
if (name === 'read_file' && parsedArgs.path) {
|
|
1325
|
+
const rp = parsedArgs.path;
|
|
1326
|
+
const readCount = (fileReadCounts.get(rp) || 0) + 1;
|
|
1327
|
+
fileReadCounts.set(rp, readCount);
|
|
1328
|
+
if (readCount >= 3) {
|
|
1329
|
+
const fname = path.basename(rp);
|
|
1330
|
+
console.log(` [${agentId}] 🔁 Read-loop: "${rp}" read ${readCount}x without a write — forcing write`);
|
|
1331
|
+
fileReadCounts.set(rp, 0); // reset so hint can fire again if agent persists
|
|
1332
|
+
messages.push({ role: 'user', content: `STOP. You have read ${fname} ${readCount} times in a row without writing anything. You already have the full file content in your context. Reading it again changes nothing.\n\nSTOP READING. Write your next WRITE_FILE now — put the updated ${fname} content in a fence:\n\nWRITE_FILE ${rp}\n\`\`\`\n...updated content...\n\`\`\`\n\nDo NOT read any more files. Write.` });
|
|
1333
|
+
break; // break inner tool loop
|
|
1334
|
+
}
|
|
1335
|
+
}
|
|
1336
|
+
// Any write_file clears the read counts — fresh slate after actual progress
|
|
1337
|
+
if (name === 'write_file') {
|
|
1338
|
+
fileReadCounts.clear();
|
|
1339
|
+
}
|
|
1340
|
+
|
|
868
1341
|
const result = await this._executeTool(name, parsedArgs, workDir, agentId);
|
|
869
1342
|
|
|
870
1343
|
this.emit('tool_activity', { agentId, event: 'tool_end', tool: name, description: `✓ ${name}` });
|
|
@@ -874,15 +1347,71 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
874
1347
|
this.emit('agent_image', { agentId, image: result });
|
|
875
1348
|
}
|
|
876
1349
|
|
|
877
|
-
// ── Bash: curl
|
|
1350
|
+
// ── Bash: curl result handling ────────────────────────────────────
|
|
878
1351
|
if (name === 'bash') {
|
|
879
1352
|
const resultStr = String(result).trim();
|
|
880
1353
|
const isCurlZero = resultStr === '000' || resultStr.endsWith('\n000') || /\b000$/.test(resultStr);
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
1354
|
+
const isCurl404 = resultStr === '404' || resultStr.endsWith('\n404') || /\b404$/.test(resultStr);
|
|
1355
|
+
const isCurl200 = resultStr === '200' || resultStr.endsWith('\n200') || /\b200$/.test(resultStr);
|
|
1356
|
+
|
|
1357
|
+
if (isCurlZero || isCurl404) {
|
|
1358
|
+
const logRead = await this._executeTool('bash', { command: 'cat /tmp/server.log 2>/dev/null | tail -40 || echo "No server.log found"' }, workDir, agentId);
|
|
1359
|
+
if (isCurlZero) {
|
|
1360
|
+
// If crash log has a SyntaxError with file:line, auto-read the snippet to save the agent
|
|
1361
|
+
// a read_file round-trip and make the fix obvious
|
|
1362
|
+
let syntaxSnippet = '';
|
|
1363
|
+
const synErrMatch = String(logRead).match(/^(\/[^\n:]+\.(?:js|ts|mjs|cjs)):(\d+)\n/m);
|
|
1364
|
+
if (synErrMatch && /SyntaxError/.test(String(logRead))) {
|
|
1365
|
+
const synFile = synErrMatch[1];
|
|
1366
|
+
const synLine = parseInt(synErrMatch[2], 10);
|
|
1367
|
+
try {
|
|
1368
|
+
const snippet = await this._executeTool('bash', {
|
|
1369
|
+
command: `awk 'NR>=${Math.max(1, synLine - 8)} && NR<=${synLine + 8} {printf "%4d: %s\\n", NR, $0}' "${synFile}" 2>/dev/null`
|
|
1370
|
+
}, workDir, agentId);
|
|
1371
|
+
if (snippet && String(snippet).trim()) {
|
|
1372
|
+
syntaxSnippet = `\n\n⚠️ SYNTAX ERROR in ${synFile} near line ${synLine}. The relevant code:\n\`\`\`\n${snippet}\n\`\`\`\nFix the syntax error in that file BEFORE trying to restart.`;
|
|
1373
|
+
}
|
|
1374
|
+
} catch {}
|
|
1375
|
+
}
|
|
1376
|
+
messages.push({ role: 'user', content: `[bash result]: 000 (connection refused — server is NOT running)\n\nCrash log:\n${logRead}${syntaxSnippet}\n\nThe server crashed or never started. Fix the actual error shown above. Do NOT assume it is running. Do NOT change the port. Make a targeted fix to the code then restart.` });
|
|
1377
|
+
} else {
|
|
1378
|
+
messages.push({ role: 'user', content: `[bash result]: 404 (server is running but root route not found)\n\nServer log:\n${logRead}\n\nCommon cause: static files path is wrong. In server.js: (1) express.static must use path.join(__dirname, 'public'); (2) any res.sendFile for the root route must use path.join(__dirname, 'public', 'index.html') — NEVER path.join(__dirname, 'index.html') or relative paths. Fix and restart. Do NOT rewrite the whole file.` });
|
|
1379
|
+
}
|
|
884
1380
|
continue;
|
|
885
1381
|
}
|
|
1382
|
+
|
|
1383
|
+
// ── curl 200: server confirmed running — open in AgentForge browser ──
|
|
1384
|
+
// Platform responsibility: always show the user their app the moment it's live.
|
|
1385
|
+
// Agent does not need to call 'open' — the platform handles it here.
|
|
1386
|
+
if (isCurl200) {
|
|
1387
|
+
const curlCmd = parsedArgs.command || '';
|
|
1388
|
+
const portMatch = curlCmd.match(/localhost:(\d+)/);
|
|
1389
|
+
if (portMatch) {
|
|
1390
|
+
const appUrl = `http://localhost:${portMatch[1]}`;
|
|
1391
|
+
const { opened } = await this._openInBrowser(appUrl, agentId);
|
|
1392
|
+
// ── Register project in global registry so other agents can find it ──
|
|
1393
|
+
try {
|
|
1394
|
+
const REGISTRY = '/tmp/agentforge/projects.json';
|
|
1395
|
+
const registry = existsSync(REGISTRY) ? JSON.parse(readFileSync(REGISTRY, 'utf8')) : {};
|
|
1396
|
+
// Derive a readable project name: prefer Desktop/Projects subdir name, else workDir basename
|
|
1397
|
+
let projectName = path.basename(workDir);
|
|
1398
|
+
const homeDir2 = process.env.HOME || '/tmp';
|
|
1399
|
+
const desktopProjects = `${homeDir2}/Desktop/Projects`;
|
|
1400
|
+
try {
|
|
1401
|
+
// Walk Desktop/Projects for the most recently modified dir — likely the active project
|
|
1402
|
+
const dirs = readdirSync(desktopProjects, { withFileTypes: true })
|
|
1403
|
+
.filter(e => e.isDirectory())
|
|
1404
|
+
.map(e => ({ name: e.name, mtime: statSync(path.join(desktopProjects, e.name)).mtimeMs }))
|
|
1405
|
+
.sort((a, b) => b.mtime - a.mtime);
|
|
1406
|
+
if (dirs.length > 0) projectName = dirs[0].name;
|
|
1407
|
+
} catch {}
|
|
1408
|
+
registry[portMatch[1]] = { port: parseInt(portMatch[1]), path: workDir, agentId, name: projectName, updated: new Date().toISOString() };
|
|
1409
|
+
writeFileSync(REGISTRY, JSON.stringify(registry, null, 2));
|
|
1410
|
+
} catch {}
|
|
1411
|
+
messages.push({ role: 'user', content: `[bash result]: 200 — server is running at ${appUrl}${opened ? '. App opened in browser and screenshot sent to user.' : '.'}\n\nNow call screenshot_and_describe with url:"${appUrl}" and send_to_user:true to verify it looks correct, then iterate to improve it.` });
|
|
1412
|
+
continue;
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
886
1415
|
}
|
|
887
1416
|
|
|
888
1417
|
// ALL models get tool results fed back — no model should run blind.
|
|
@@ -890,12 +1419,90 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
890
1419
|
// must be in context so the model can see what happened and react correctly.
|
|
891
1420
|
{
|
|
892
1421
|
const noThink = '';
|
|
1422
|
+
|
|
1423
|
+
// Deployment URL detection: if bash output contains a public HTTPS URL
|
|
1424
|
+
// from a known hosting platform, extract it, persist it to the project registry,
|
|
1425
|
+
// and tell the agent to report it.
|
|
1426
|
+
if (name === 'bash') {
|
|
1427
|
+
const resultStr = String(result);
|
|
1428
|
+
const deployUrlMatch = resultStr.match(/https:\/\/[a-zA-Z0-9._-]+\.(railway\.app|vercel\.app|netlify\.app|fly\.dev|surge\.sh|pages\.dev|web\.app|github\.io|onrender\.com|up\.railway\.app)[^\s]*/);
|
|
1429
|
+
if (deployUrlMatch) {
|
|
1430
|
+
const deployUrl = deployUrlMatch[0];
|
|
1431
|
+
console.log(` [${agentId}] 🌐 Deployment URL detected: ${deployUrl}`);
|
|
1432
|
+
// Persist the live URL (and Railway project name if available) into the registry
|
|
1433
|
+
// so future agents know the deployed URL without re-running CLI commands.
|
|
1434
|
+
try {
|
|
1435
|
+
const REGISTRY = '/tmp/agentforge/projects.json';
|
|
1436
|
+
const registry = existsSync(REGISTRY) ? JSON.parse(readFileSync(REGISTRY, 'utf8')) : {};
|
|
1437
|
+
const entry = Object.values(registry).find(e => e.path === workDir || e.agentId === agentId);
|
|
1438
|
+
if (entry) {
|
|
1439
|
+
entry.liveUrl = deployUrl;
|
|
1440
|
+
// Capture Railway project name if railway status is available
|
|
1441
|
+
try {
|
|
1442
|
+
const { stdout: statusOut } = await execAsync('railway status 2>/dev/null', { cwd: workDir, timeout: 5000 });
|
|
1443
|
+
const projectMatch = statusOut.match(/Project:\s*(.+)/);
|
|
1444
|
+
if (projectMatch) entry.railwayProject = projectMatch[1].trim();
|
|
1445
|
+
} catch {}
|
|
1446
|
+
const key = Object.keys(registry).find(k => registry[k] === entry);
|
|
1447
|
+
if (key) {
|
|
1448
|
+
registry[key] = entry;
|
|
1449
|
+
writeFileSync(REGISTRY, JSON.stringify(registry, null, 2));
|
|
1450
|
+
console.log(` [${agentId}] 💾 Saved live URL to registry: ${deployUrl}`);
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
} catch {}
|
|
1454
|
+
messages.push({ role: 'user', content: `[bash result]:\n${resultStr.slice(0, 3000)}\n\nDeployment succeeded. The live URL is: ${deployUrl}\n\nReport this URL to the user as your final response.` });
|
|
1455
|
+
continue;
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
|
|
1459
|
+
// After writing files, check if the task is complete — don't just blindly kick "Continue".
|
|
1460
|
+
// Run _isTaskComplete after any write_file call (≥2 tools used so agent has done real work).
|
|
1461
|
+
// Only check once per turn to avoid redundant LLM calls when multiple files are written.
|
|
1462
|
+
// Note: visibleContent may be just "WRITE_FILE" (10 chars) for pure file-write turns — don't
|
|
1463
|
+
// require long visible content here; the write_file result itself is sufficient evidence.
|
|
1464
|
+
if (name === 'write_file' && toolsUsed.length >= 2 && !completionCheckedThisTurn) {
|
|
1465
|
+
completionCheckedThisTurn = true;
|
|
1466
|
+
const originalTask2 = messages.find(m => m.role === 'user')?.content || task;
|
|
1467
|
+
// Use write_file result as context (includes path that was written) + any visible text
|
|
1468
|
+
const completionContext = (visibleContent.length > 10 ? visibleContent + '\n' : '') + 'Just wrote: ' + String(result).slice(0, 500);
|
|
1469
|
+
const isDoneEarly = await this._isTaskComplete(originalTask2, completionContext, controller.signal);
|
|
1470
|
+
if (isDoneEarly) {
|
|
1471
|
+
console.log(` [${agentId}] ✅ Task complete after write_file — stopping`);
|
|
1472
|
+
if (visibleContent) finalContent = visibleContent;
|
|
1473
|
+
taskDoneEarly = true;
|
|
1474
|
+
break; // break inner tool loop; outer loop checks taskDoneEarly
|
|
1475
|
+
}
|
|
1476
|
+
}
|
|
893
1477
|
if (isImageResult) {
|
|
894
1478
|
const base64 = result.replace(/^data:image\/\w+;base64,/, '');
|
|
895
1479
|
messages.push({ role: 'user', content: `[${name} result]: Screenshot captured. Continue with the next step.${noThink}`, images: [base64] });
|
|
896
1480
|
} else {
|
|
897
1481
|
const resultText = isImageResult ? '[Screenshot captured]' : String(result).slice(0, 6000);
|
|
898
|
-
|
|
1482
|
+
// Fix 12/19: after writing an HTML file for a static task, automatically navigate the browser
|
|
1483
|
+
// to the file:// URL so the agent's tab IS on the correct page before the next turn.
|
|
1484
|
+
// Previously we only injected a guidance message, which models often ignored — jumping straight
|
|
1485
|
+
// to screenshot_and_describe with no URL and getting a blank screenshot of the wrong tab.
|
|
1486
|
+
let continueMsg = `Continue with the next step.${noThink}`;
|
|
1487
|
+
if (name === 'write_file' && successfulScreenshots === 0) {
|
|
1488
|
+
const writtenPath = parsedArgs?.path || '';
|
|
1489
|
+
const isHtmlFile = /\.html?$/i.test(writtenPath);
|
|
1490
|
+
const taskLower2 = (messages.find(m => m.role === 'user')?.content || task).toLowerCase();
|
|
1491
|
+
const isStaticTask = isHtmlFile && !/\b(railway|vercel|render|netlify|fly\.io|heroku|deploy|server\.js|express|http\.createserver)\b/.test(taskLower2);
|
|
1492
|
+
if (isStaticTask && writtenPath) {
|
|
1493
|
+
const absolutePath = writtenPath.startsWith('~') ? writtenPath.replace(/^~/, process.env.HOME || '/Users/' + (workDir.split('/')[2] || 'user')) : writtenPath;
|
|
1494
|
+
// Fix 19: auto-navigate the browser tab to the file so it's already loaded.
|
|
1495
|
+
try {
|
|
1496
|
+
await browserAction({ action: 'navigate', url: `file://${absolutePath}` }, agentId);
|
|
1497
|
+
console.log(` [${agentId}] 🌐 Auto-navigated to file://${absolutePath}`);
|
|
1498
|
+
continueMsg = `File written and opened in browser at file://${absolutePath}. Now take a screenshot to verify it looks correct:\n{"name":"browser","arguments":{"action":"screenshot_and_describe","check_for":"the complete app with all required features"}}\n${noThink}`;
|
|
1499
|
+
} catch (navErr) {
|
|
1500
|
+
// Navigation failed — fall back to instruction-only
|
|
1501
|
+
continueMsg = `File written. YOUR NEXT ACTION MUST BE THIS — navigate to the file first, then screenshot:\n1. {"name":"browser","arguments":{"action":"navigate","url":"file://${absolutePath}"}}\n2. {"name":"browser","arguments":{"action":"screenshot_and_describe","check_for":"the complete app"}}\nDO NOT call screenshot_and_describe without url first — you will get a blank screenshot.${noThink}`;
|
|
1502
|
+
}
|
|
1503
|
+
}
|
|
1504
|
+
}
|
|
1505
|
+
messages.push({ role: 'user', content: `[${name} result]:\n${resultText}\n\n${continueMsg}` });
|
|
899
1506
|
|
|
900
1507
|
if (name === 'screenshot_and_describe') {
|
|
901
1508
|
const screenshotResult = String(result);
|
|
@@ -917,26 +1524,45 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
917
1524
|
}
|
|
918
1525
|
// Successful screenshot of a build task — push to make a code change
|
|
919
1526
|
else if (isLocalhost) {
|
|
920
|
-
|
|
1527
|
+
// Catch placeholder/hello world pages on localhost — force the model to keep building
|
|
1528
|
+
const screenshotText = String(result).toLowerCase();
|
|
1529
|
+
const isPlaceholder = (
|
|
1530
|
+
screenshotText.includes('hello world') ||
|
|
1531
|
+
screenshotText.includes('cannot get /') ||
|
|
1532
|
+
(screenshotText.includes('express') && screenshotText.includes('error')) ||
|
|
1533
|
+
// Only match "placeholder" as an unbuilt-page indicator, not Gemini describing
|
|
1534
|
+
// a UI element's placeholder attribute (e.g. "Placeholder Text: Start typing...")
|
|
1535
|
+
/\bplaceholder\s*(page|app|content|site)\b/.test(screenshotText) ||
|
|
1536
|
+
screenshotText.includes('coming soon') ||
|
|
1537
|
+
(screenshotText.includes('blank') && !screenshotText.includes('not blank'))
|
|
1538
|
+
);
|
|
1539
|
+
if (isPlaceholder) {
|
|
1540
|
+
messages.push({ role: 'user', content: `The screenshot shows a placeholder or empty page — the app is not done yet. Continue writing complete working code. Identify which files still need real implementation and write them now.${noThink}` });
|
|
1541
|
+
} else {
|
|
1542
|
+
successfulScreenshots++;
|
|
1543
|
+
if (successfulScreenshots >= 2) {
|
|
1544
|
+
// Agent has confirmed the app works at least twice. Time to wrap up rather
|
|
1545
|
+
// than looping indefinitely on minor improvements.
|
|
1546
|
+
messages.push({ role: 'user', content: `The app is working correctly (confirmed twice). Your task is complete. Write your final reply now: describe what you built, what it does, and how to use it. Do NOT make any more code changes — just reply in text.` });
|
|
1547
|
+
} else {
|
|
1548
|
+
messages.push({ role: 'user', content: `The app is running. If there is one specific thing that is clearly missing or broken, fix it now (read_file → write_file → restart → screenshot). If the app already fulfills all the requirements, skip improvements and write your final reply instead.` });
|
|
1549
|
+
}
|
|
1550
|
+
}
|
|
921
1551
|
}
|
|
922
1552
|
// Successful screenshot of a public URL — agent is doing research, let it reason
|
|
923
1553
|
}
|
|
924
|
-
// Catch placeholder/hello world pages — force the model to keep building
|
|
925
|
-
const screenshotText = String(result).toLowerCase();
|
|
926
|
-
const isPlaceholder = (
|
|
927
|
-
screenshotText.includes('hello world') ||
|
|
928
|
-
screenshotText.includes('cannot get /') ||
|
|
929
|
-
(screenshotText.includes('express') && screenshotText.includes('error')) ||
|
|
930
|
-
screenshotText.includes('placeholder') ||
|
|
931
|
-
screenshotText.includes('coming soon') ||
|
|
932
|
-
(screenshotText.includes('blank') && !screenshotText.includes('not blank'))
|
|
933
|
-
);
|
|
934
|
-
if (isPlaceholder) {
|
|
935
|
-
messages.push({ role: 'user', content: `The screenshot shows a placeholder or empty page — the app is not done yet. Continue writing complete working code. Identify which files still need real implementation and write them now.${noThink}` });
|
|
936
|
-
}
|
|
937
1554
|
}
|
|
938
1555
|
}
|
|
939
1556
|
}
|
|
1557
|
+
// Token cap fired mid-WRITE_FILE — the last file written is truncated.
|
|
1558
|
+
// Alert the agent so it knows to complete the file instead of immediately starting the server.
|
|
1559
|
+
if (tokenCapTruncatedFile) {
|
|
1560
|
+
tokenCapTruncatedFile = false;
|
|
1561
|
+
console.log(` [${agentId}] ⚠️ Token cap truncated a file — injecting continuation hint`);
|
|
1562
|
+
messages.push({ role: 'user', content: `⚠️ Your last response was cut off — the file was only partially written. The server will crash with a SyntaxError.\n\nDo NOT run the server yet. First complete the truncated file: read_file it to see where it was cut, then write_file to add the missing code (closing braces, remaining routes, etc.). Make sure the file is syntactically complete before starting the server.` });
|
|
1563
|
+
tokenCapTruncatedFile = false;
|
|
1564
|
+
}
|
|
1565
|
+
if (taskDoneEarly) break; // completion language detected inside tool loop — stop the turn loop
|
|
940
1566
|
continue; // loop back for next model turn
|
|
941
1567
|
}
|
|
942
1568
|
|
|
@@ -946,13 +1572,78 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
946
1572
|
const hasContent = combined.trim().length > 30;
|
|
947
1573
|
const isEmpty = combined.trim().length === 0;
|
|
948
1574
|
|
|
1575
|
+
// Structural: agent writing "Running command..." or "WRITE_FILE" headers but no actual tool JSON.
|
|
1576
|
+
// Happens when the model plans multiple steps using the header format but forgets the JSON body.
|
|
1577
|
+
const hasFakeHeaders = (streamContent.match(/^Running command\.\.\./gm) || []).length >= 2 ||
|
|
1578
|
+
/^WRITE_FILE\s*$/m.test(streamContent); // WRITE_FILE with no path on same line
|
|
1579
|
+
if (hasFakeHeaders) {
|
|
1580
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: agent writing planning headers without tool calls — showing correct format`);
|
|
1581
|
+
messages.push({ role: 'user', content: `You are writing "Running command..." or "WRITE_FILE" as planning text but not outputting actual tool calls.\n\nSTOP PLANNING. Execute now. First step: create the project directory:\n{"name":"bash","arguments":{"command":"mkdir -p ${projectsDir}/PROJECT_NAME && cd ${projectsDir}/PROJECT_NAME && /usr/local/bin/npm init -y && /usr/local/bin/npm install express"}}\n\nThen write files with ABSOLUTE paths:\nWRITE_FILE ${projectsDir}/PROJECT_NAME/server.js\n\`\`\`\nconst express = require('express');\nconst PORT = process.env.PORT || ${assignedPort};\n// complete file here\n\`\`\`\n\nOutput ONLY the bash JSON tool call right now. Nothing else.` });
|
|
1582
|
+
continue;
|
|
1583
|
+
}
|
|
1584
|
+
|
|
949
1585
|
// Structural: truncated JSON — model started a tool call but stream ended early
|
|
950
1586
|
const hasTruncatedJson = /\{"name"\s*:\s*"(bash|web_fetch|screenshot_and_describe|read_file|write_file|list_directory)"/i.test(streamContent) && Object.keys(streamToolCalls).length === 0;
|
|
951
1587
|
if (hasTruncatedJson) {
|
|
952
|
-
|
|
953
|
-
|
|
1588
|
+
consecutiveTruncations++;
|
|
1589
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: truncated JSON tool call (${consecutiveTruncations}x) — kicking to re-output`);
|
|
1590
|
+
|
|
1591
|
+
// WRITE_FILE called as JSON — model is trying {"name":"WRITE_FILE","path":"...","content":"..."} which
|
|
1592
|
+
// always truncates because file content doesn't fit in a JSON string. Redirect immediately, every time.
|
|
1593
|
+
const isJsonWriteFileCall = /\{"name"\s*:\s*"WRITE_FILE"\s*,\s*"(path|arguments)"/i.test(streamContent);
|
|
1594
|
+
if (isJsonWriteFileCall) {
|
|
1595
|
+
const pathMatch = streamContent.match(/"path"\s*:\s*"([^"]+)"/);
|
|
1596
|
+
const filePath = pathMatch ? pathMatch[1] : '/Users/hamp/Desktop/Projects/PROJECTNAME/filename.js';
|
|
1597
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: WRITE_FILE used as JSON tool call — correcting to fence format`);
|
|
1598
|
+
consecutiveTruncations = 0;
|
|
1599
|
+
// Permanently inject reminder into system message so it survives all context trims
|
|
1600
|
+
if (messages[0] && messages[0].role === 'system' && !messages[0].content.includes('NEVER use {"name":"WRITE_FILE"')) {
|
|
1601
|
+
messages[0] = { ...messages[0], content: messages[0].content + `\n\n⚠️ WRITE_FILE REMINDER (injected after format error): NEVER use {"name":"WRITE_FILE",...} JSON. ALWAYS use the code-fence format:\nWRITE_FILE /absolute/path/to/file\n\`\`\`\nfull file content\n\`\`\`` };
|
|
1602
|
+
}
|
|
1603
|
+
messages.push({ role: 'user', content: `WRITE_FILE is NOT a JSON tool. It uses a code-fence format — the ONLY correct way:\n\nWRITE_FILE ${filePath}\n\`\`\`\n...complete file content here...\n\`\`\`\n\nOutput ONLY the WRITE_FILE fence now. No JSON, no explanation.` });
|
|
1604
|
+
continue;
|
|
1605
|
+
}
|
|
1606
|
+
// Fix 8: bash command that contains WRITE_FILE — model is confusing WRITE_FILE fence with a shell command.
|
|
1607
|
+
const isBashWriteFile = /\{"name"\s*:\s*"bash"[\s\S]{0,300}WRITE_FILE\s+(\/\S+)/i.test(streamContent);
|
|
1608
|
+
// Fix 18: bash command embedding file content via node -e writeFileSync, echo, cat, etc.
|
|
1609
|
+
// These always truncate because the file content doesn't fit in max_tokens.
|
|
1610
|
+
// After 3+ consecutive truncations, escalate with a firm WRITE_FILE redirect.
|
|
1611
|
+
const isBashEmbedFile = /\{"name"\s*:\s*"bash"[\s\S]{0,200}(writeFileSync|echo|cat\s+<<|printf)[\s\S]{0,200}\.(css|html|js|ts|py|json|txt|md)/i.test(streamContent);
|
|
1612
|
+
const fileNameMatch = streamContent.match(/writeFileSync\s*\(\s*['"`]?([^'"`\s,)]+\.[a-z]{1,5})/i)
|
|
1613
|
+
|| streamContent.match(/>\s*['"]?([A-Za-z0-9_.-]+\.(css|html|js|ts|py|json|txt|md))['"]?/i);
|
|
1614
|
+
const fname = fileNameMatch ? fileNameMatch[1] : 'server.js';
|
|
1615
|
+
|
|
1616
|
+
if (isBashWriteFile) {
|
|
1617
|
+
const pathMatch = streamContent.match(/WRITE_FILE\s+(\/[^\s\\'"]+)/);
|
|
1618
|
+
const filePath = pathMatch ? pathMatch[1] : '/path/to/file';
|
|
1619
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: bash+WRITE_FILE pattern — correcting format`);
|
|
1620
|
+
messages.push({ role: 'user', content: `WRITE_FILE is NOT a bash command. Use the WRITE_FILE fence format directly at the top level (outside any bash call):\n\nWRITE_FILE ${filePath}\n\`\`\`\n...complete file content here...\n\`\`\`\n\nOutput ONLY the WRITE_FILE fence now — do NOT wrap it in a bash tool call.` });
|
|
1621
|
+
} else if (isBashEmbedFile || consecutiveTruncations >= 3) {
|
|
1622
|
+
// Large file embedded in bash always truncates. Redirect to WRITE_FILE.
|
|
1623
|
+
const dirMatch = streamContent.match(/cd\s+([\w/~.-]+)/);
|
|
1624
|
+
const dir = dirMatch ? dirMatch[1] : '/absolute/path/to/dir';
|
|
1625
|
+
const truncCount = consecutiveTruncations;
|
|
1626
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: bash-embed-file pattern (${truncCount}x) — redirecting to WRITE_FILE`);
|
|
1627
|
+
consecutiveTruncations = 0; // reset after escalation
|
|
1628
|
+
// Check if the task already had a successful write_file — if so, remind agent the file exists
|
|
1629
|
+
const hadPriorWrite = toolsUsed.filter(t => t === 'write_file').length > 0;
|
|
1630
|
+
const priorWriteHint = hadPriorWrite
|
|
1631
|
+
? `\n\nNOTE: You already wrote a file earlier in this task. Check if you still need to write more files, or if you should instead verify the existing file works.`
|
|
1632
|
+
: '';
|
|
1633
|
+
messages.push({ role: 'user', content: `STOP. Your bash command embeds file content as a string and will ALWAYS be truncated — it cannot work. You have tried this ${truncCount} times in a row.\n\nTo write a file, ALWAYS use WRITE_FILE which handles files of any size:\n\nWRITE_FILE ${dir}/${fname}\n\`\`\`\n...complete file content here...\n\`\`\`${priorWriteHint}` });
|
|
1634
|
+
} else {
|
|
1635
|
+
const isEchoFileWrite = /\{"name"\s*:\s*"bash"[\s\S]{0,300}(echo|cat\s+<<|printf)[\s\S]{0,200}>\s*\S+\.(css|html|js|ts|py|json|txt|md)/i.test(streamContent);
|
|
1636
|
+
if (isEchoFileWrite) {
|
|
1637
|
+
const fnMatch = streamContent.match(/>\s*['"]?(\S+\.(css|html|js|ts|py|json|txt|md))['"]?/i);
|
|
1638
|
+
const fn = fnMatch ? fnMatch[1] : 'the file';
|
|
1639
|
+
messages.push({ role: 'user', content: `Your bash echo/cat command is too large and will always be truncated. You MUST use WRITE_FILE instead — it handles any file size:\n\nWriting ${fn}...\nWRITE_FILE /absolute/path/to/${fn}\n\`\`\`\n...complete file content here...\n\`\`\`` });
|
|
1640
|
+
} else {
|
|
1641
|
+
messages.push({ role: 'user', content: 'Your tool call was cut off. Output the complete JSON on one line now.' });
|
|
1642
|
+
}
|
|
1643
|
+
}
|
|
954
1644
|
continue;
|
|
955
1645
|
}
|
|
1646
|
+
consecutiveTruncations = 0; // reset on any successful parse
|
|
956
1647
|
|
|
957
1648
|
// Structural: empty response — model produced nothing
|
|
958
1649
|
if (isEmpty) {
|
|
@@ -965,23 +1656,150 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
965
1656
|
console.log(` [${agentId}] ⚠️ Turn ${turn}: empty after 3 retries`);
|
|
966
1657
|
}
|
|
967
1658
|
|
|
1659
|
+
// Structural: model echoed tool result / non-JSON bracket text as plain output.
|
|
1660
|
+
// visibleContent=0 despite having raw content means inJsonBlob fired on a false-positive
|
|
1661
|
+
// (e.g. "[bash result]: ..." starts with "[") or model output was all inside <think>.
|
|
1662
|
+
// Either way: no tool calls, nothing visible, task not done — kick it to continue.
|
|
1663
|
+
if (!isEmpty && visibleContent.length === 0 && toolsUsedThisTurn === 0 && toolsUsed.length > 0) {
|
|
1664
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: raw output with 0 visible content and no tool calls — model echoed tool result or thought-only response, kicking to continue`);
|
|
1665
|
+
messages.push({ role: 'user', content: 'You echoed a result instead of making your next tool call. Keep going — call the next tool now.' });
|
|
1666
|
+
continue;
|
|
1667
|
+
}
|
|
1668
|
+
|
|
1669
|
+
// Repeated output detection — context overflow causes model to output same text repeatedly.
|
|
1670
|
+
// Normalize whitespace, strip "I will not"/"task is complete" boilerplate, then compare.
|
|
1671
|
+
// If we see the same output 2+ times in a row with no tool calls, hard-stop.
|
|
1672
|
+
if (hasContent) {
|
|
1673
|
+
const normalizedOutput = combined.trim().replace(/\s+/g, ' ').slice(0, 300);
|
|
1674
|
+
recentOutputs.push(normalizedOutput);
|
|
1675
|
+
if (recentOutputs.length > 4) recentOutputs.shift();
|
|
1676
|
+
// Check: last 3 outputs identical (context maxed — repeating same text)
|
|
1677
|
+
const last3Same = recentOutputs.length >= 3 &&
|
|
1678
|
+
recentOutputs[recentOutputs.length - 1] === recentOutputs[recentOutputs.length - 2] &&
|
|
1679
|
+
recentOutputs[recentOutputs.length - 2] === recentOutputs[recentOutputs.length - 3];
|
|
1680
|
+
if (last3Same) {
|
|
1681
|
+
console.log(` [${agentId}] 🛑 Repeated identical output detected — context likely maxed. Hard-stopping.`);
|
|
1682
|
+
finalContent = combined.trim();
|
|
1683
|
+
break;
|
|
1684
|
+
}
|
|
1685
|
+
}
|
|
1686
|
+
|
|
1687
|
+
// Structural: agent outputting code as chat text instead of writing files.
|
|
1688
|
+
// Detected by markdown code fences (```html/css/js) in visible output.
|
|
1689
|
+
// This happens when tool calls fail repeatedly and the agent falls back to showing code.
|
|
1690
|
+
// Redirect to WRITE_FILE — never accept code dumps as a substitute for file writes.
|
|
1691
|
+
if (hasContent && /```(html|css|js|javascript|typescript|python|json)/i.test(visibleContent)) {
|
|
1692
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: agent dumping code as chat text — redirecting to WRITE_FILE`);
|
|
1693
|
+
messages.push({ role: 'user', content: 'Do NOT show code in chat. You MUST write files to disk using WRITE_FILE:\n\nWriting filename.ext...\nWRITE_FILE /absolute/path/to/filename.ext\n```\n...complete file content here...\n```\n\nWrite every file now.' });
|
|
1694
|
+
continue;
|
|
1695
|
+
}
|
|
1696
|
+
|
|
968
1697
|
// Structural: agent hasn't used any tools yet — it must act before it can answer
|
|
969
1698
|
if (toolsUsed.length === 0 && hasContent) {
|
|
970
|
-
|
|
971
|
-
|
|
1699
|
+
// Ask the LLM whether the text response actually satisfies the task.
|
|
1700
|
+
// If complete, stop. If not, kick into tool use.
|
|
1701
|
+
if (visibleContent.length > 50) {
|
|
1702
|
+
const originalTask = messages.find(m => m.role === 'user')?.content || task;
|
|
1703
|
+
const isDone = await this._isTaskComplete(originalTask, visibleContent, controller.signal);
|
|
1704
|
+
if (isDone) {
|
|
1705
|
+
console.log(` [${agentId}] ✅ Turn ${turn}: text-only response satisfies task — stopping`);
|
|
1706
|
+
if (visibleContent) finalContent = visibleContent;
|
|
1707
|
+
break;
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
noToolKicks++;
|
|
1711
|
+
const refusalPhrases = ['i cannot', 'i am unable', 'i don\'t have', 'i do not have', 'not possible', 'impossible', 'cannot be done', 'outside my', 'beyond my', 'you must use', 'you should use', 'you will need to', 'recommend using'];
|
|
1712
|
+
const isRefusing = refusalPhrases.some(p => visibleContent.toLowerCase().includes(p));
|
|
1713
|
+
// Also catch "I can't do X but I'll build a prototype/concept instead" pivot patterns
|
|
1714
|
+
const isDowngrading = /\b(prototype|concept|simplified|scaled.down|limited version|mock.up|demo|without.*backend|without.*server|front.?end only|client.?side only)\b/i.test(visibleContent) && /\b(i (can'?t|cannot|am unable|won'?t|will not)|not possible|too complex|beyond|outside)\b/i.test(visibleContent);
|
|
1715
|
+
if (isRefusing || isDowngrading) {
|
|
1716
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: agent ${isDowngrading ? 'downgrading task' : 'refusing task'} — overriding`);
|
|
1717
|
+
messages.push({ role: 'user', content: 'Do NOT explain what you cannot do. Do NOT build a prototype or simplified version unless explicitly asked. Build the real thing. You have bash, Node.js, npm, and a full browser. Start executing now — first tool call only, no text.' });
|
|
1718
|
+
} else if (noToolKicks >= 3) {
|
|
1719
|
+
// Agent has been kicked 3+ times and still not calling tools — give an explicit example
|
|
1720
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: no tools after ${noToolKicks} kicks — showing exact format`);
|
|
1721
|
+
messages.push({ role: 'user', content: `STOP writing plans. You have been asked ${noToolKicks} times and have not called a single tool.\n\nHere is exactly what a tool call looks like — output ONLY this, right now:\n{"name":"bash","arguments":{"command":"ls ${workDir}"}}\n\nNothing before it. Nothing after it. No "Running command...", no explanation, no plan. Just that one line of JSON. DO IT NOW.` });
|
|
1722
|
+
} else {
|
|
1723
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: no tools used yet — kicking to act (${noToolKicks})`);
|
|
1724
|
+
messages.push({ role: 'user', content: 'Stop planning. Make your first tool call now. Output only the JSON, nothing else.' });
|
|
1725
|
+
}
|
|
972
1726
|
continue;
|
|
973
1727
|
}
|
|
1728
|
+
noToolKicks = 0; // reset when tools are actually used
|
|
974
1729
|
|
|
975
|
-
//
|
|
976
|
-
//
|
|
1730
|
+
// Mid-task refusal detection — agent used some tools but then refused to continue.
|
|
1731
|
+
// e.g. "I cannot access X" after reading files. Override with capability reminder.
|
|
1732
|
+
// NEVER accept defeat — always push harder with alternative approaches.
|
|
977
1733
|
if (hasContent && toolsUsed.length > 0) {
|
|
1734
|
+
const midRefusalPhrases = [
|
|
1735
|
+
'i cannot', 'i am unable', 'i do not have', "i don't have", 'not possible', 'impossible',
|
|
1736
|
+
'cannot be done', 'outside my capabilities', 'beyond my capabilities', 'you must use',
|
|
1737
|
+
'you will need to', 'you should use',
|
|
1738
|
+
// Deployment give-up patterns
|
|
1739
|
+
'all.*methods failed', 'methods have failed', 'deployment.*failed', 'failed.*deployment',
|
|
1740
|
+
'cannot provide a', 'i must stop', 'have concluded', 'since all', 'every attempt',
|
|
1741
|
+
'all automated', 'i have tried', 'methods failed sequentially',
|
|
1742
|
+
];
|
|
1743
|
+
const isMidRefusing = midRefusalPhrases.some(p => {
|
|
1744
|
+
if (p.includes('.*')) return new RegExp(p, 'i').test(visibleContent);
|
|
1745
|
+
return visibleContent.toLowerCase().includes(p);
|
|
1746
|
+
});
|
|
1747
|
+
// Also catch mid-task downgrade: agent did work but is wrapping up with "I cannot fully
|
|
1748
|
+
// replicate X" or pivoting to a framework/stub instead of real implementation.
|
|
1749
|
+
const isMidDowngrading = /\b(prototype|concept|simplified|scaled.down|limited version|mock.up|demo|without.*backend|without.*server|front.?end only|client.?side only|framework only|basic framework|foundational framework)\b/i.test(visibleContent)
|
|
1750
|
+
|| /\bi cannot fully (replicate|implement|build|create)/i.test(visibleContent)
|
|
1751
|
+
|| /\bthe application is ready for you to interact\b/i.test(visibleContent);
|
|
1752
|
+
if (isMidRefusing || isMidDowngrading) {
|
|
1753
|
+
midRefusalKicks++;
|
|
1754
|
+
const _taskText = messages.find(m => m.role === 'user')?.content || task;
|
|
1755
|
+
const isDeployTask = /\b(deploy|railway|vercel|render|netlify|fly\.io|heroku|public url|live url|host|hosting)\b/i.test(_taskText);
|
|
1756
|
+
if (midRefusalKicks >= 3 && isDeployTask) {
|
|
1757
|
+
// Deployment specifically — try alternative deploy strategy, never give up
|
|
1758
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: deployment refusal ${midRefusalKicks}x — forcing alternative strategy`);
|
|
1759
|
+
messages.push({ role: 'user', content: `STOP GIVING UP. Deployment is NOT optional. You have bash and Railway CLI at ~/.npm-global/bin/railway.\n\nExact commands to deploy:\n1. cd to project dir\n2. ~/.npm-global/bin/railway up --detach\n3. If "No service" error: ~/.npm-global/bin/railway up --service PROJECT_NAME --detach\n4. Get URL: ~/.npm-global/bin/railway domain\n\nIf Railway fails entirely, fall back: cd project && nohup node server.js > /tmp/server.log 2>&1 & then use ngrok or use python3 -m http.server in public/ dir and report the local URL.\n\nExecute a bash tool call RIGHT NOW. Do not explain. Do not summarize. Run the command.` });
|
|
1760
|
+
} else if (midRefusalKicks >= 3) {
|
|
1761
|
+
// Genuinely stuck on non-deploy task — try one more different approach before giving up
|
|
1762
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: mid-task refusal ${midRefusalKicks}x — forcing different approach`);
|
|
1763
|
+
messages.push({ role: 'user', content: `You keep saying you cannot do this, but giving up is NOT acceptable. Try a completely different approach. You have bash, Node.js, npm, WRITE_FILE, and a full browser. What is the simplest possible working solution? Do it now — execute a tool call immediately.` });
|
|
1764
|
+
} else if (isMidDowngrading) {
|
|
1765
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: mid-task downgrade detected — overriding (${midRefusalKicks})`);
|
|
1766
|
+
messages.push({ role: 'user', content: `Do NOT deliver a framework, prototype, or stub. Build the real thing. You have bash, Node.js, npm, canvas, and a full browser. Keep going — implement it fully now.` });
|
|
1767
|
+
} else {
|
|
1768
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: mid-task refusal detected — overriding (${midRefusalKicks})`);
|
|
1769
|
+
messages.push({ role: 'user', content: `You have bash access and can run any shell command. Stop saying you cannot. Try a different approach. Execute a tool call now — no explanations.` });
|
|
1770
|
+
}
|
|
1771
|
+
continue;
|
|
1772
|
+
}
|
|
1773
|
+
}
|
|
1774
|
+
midRefusalKicks = 0; // reset when agent proceeds normally
|
|
1775
|
+
|
|
1776
|
+
// Semantic: ask the LLM whether the task is actually complete.
|
|
1777
|
+
// Fix 10: only fire when the CURRENT turn actually used tools (toolsUsedThisTurn > 0),
|
|
1778
|
+
// OR when many turns have passed (turn >= 5). Using the cumulative toolsUsed.length caused
|
|
1779
|
+
// premature kicks on mid-plan text outputs (agent says "I will click the buttons" → gets
|
|
1780
|
+
// kicked → abandons button-click plan and starts a Node server instead).
|
|
1781
|
+
if (hasContent && (toolsUsedThisTurn > 0 || turn >= 5) && visibleContent.length > 100) {
|
|
1782
|
+
if (successfulScreenshots >= 2) {
|
|
1783
|
+
console.log(` [${agentId}] ✅ Turn ${turn}: app confirmed twice by screenshots — accepting final output`);
|
|
1784
|
+
if (visibleContent) finalContent = visibleContent;
|
|
1785
|
+
break;
|
|
1786
|
+
}
|
|
978
1787
|
const originalTask = messages.find(m => m.role === 'user')?.content || task;
|
|
979
1788
|
const isDone = await this._isTaskComplete(originalTask, combined, controller.signal);
|
|
980
1789
|
if (!isDone) {
|
|
981
|
-
|
|
982
|
-
|
|
1790
|
+
incompleteKicks++;
|
|
1791
|
+
console.log(` [${agentId}] ⚡ Turn ${turn}: LLM says task incomplete — kicking (${incompleteKicks}/3)`);
|
|
1792
|
+
// After 3 consecutive incomplete verdicts, the agent likely has the answer
|
|
1793
|
+
// but is adding self-doubt text. Force-complete to stop the spiral.
|
|
1794
|
+
if (incompleteKicks >= 3) {
|
|
1795
|
+
console.log(` [${agentId}] 🛑 3 incomplete verdicts — forcing completion with current output`);
|
|
1796
|
+
if (visibleContent) finalContent = visibleContent;
|
|
1797
|
+
break;
|
|
1798
|
+
}
|
|
1799
|
+
messages.push({ role: 'user', content: 'The task is not complete yet. Continue making progress.' });
|
|
983
1800
|
continue;
|
|
984
1801
|
}
|
|
1802
|
+
incompleteKicks = 0; // reset on success
|
|
985
1803
|
console.log(` [${agentId}] ✅ Turn ${turn}: LLM confirmed task complete`);
|
|
986
1804
|
}
|
|
987
1805
|
}
|
|
@@ -993,6 +1811,19 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
993
1811
|
}
|
|
994
1812
|
|
|
995
1813
|
if (!finalContent && allOutput) finalContent = allOutput;
|
|
1814
|
+
// Final safety strip — remove any <think> blocks that leaked through the per-token filter
|
|
1815
|
+
if (finalContent) finalContent = finalContent.replace(/<think>[\s\S]*?<\/think>/g, '').replace(/<\/?think>/g, '').trim();
|
|
1816
|
+
|
|
1817
|
+
// Quality gate: reject finalContent that is just a tool header with no real text.
|
|
1818
|
+
// e.g. "WRITE_FILE" or "{\"name\":" — these are tool invocations, not agent replies.
|
|
1819
|
+
if (finalContent) {
|
|
1820
|
+
const fc = finalContent.trim();
|
|
1821
|
+
const isToolHeader = /^WRITE_FILE\b|^READ_FILE\b|^\{"name":|^{"name":/.test(fc) || fc.length < 15;
|
|
1822
|
+
if (isToolHeader) {
|
|
1823
|
+
console.log(` [${agentId}] ⚠️ finalContent looks like a tool header ("${fc.slice(0, 40)}") — requesting summary`);
|
|
1824
|
+
finalContent = '';
|
|
1825
|
+
}
|
|
1826
|
+
}
|
|
996
1827
|
|
|
997
1828
|
// If still no output (model did only tool calls, never wrote text), ask for a summary.
|
|
998
1829
|
// Use only the last 6 messages to avoid context overflow after many tool-call turns.
|
|
@@ -1067,6 +1898,8 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1067
1898
|
|
|
1068
1899
|
const duration = Date.now() - startTime;
|
|
1069
1900
|
this.activeAgents.delete(agentId);
|
|
1901
|
+
this._taskVisionModel = null;
|
|
1902
|
+
this._taskProviderKeys = null;
|
|
1070
1903
|
|
|
1071
1904
|
this.emit('agent_completed', {
|
|
1072
1905
|
agentId,
|
|
@@ -1076,10 +1909,13 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1076
1909
|
});
|
|
1077
1910
|
|
|
1078
1911
|
console.log(`\n✅ [Ollama] Agent ${agentId} completed in ${(duration / 1000).toFixed(2)}s\n`);
|
|
1912
|
+
releaseAgentTab(agentId);
|
|
1079
1913
|
return { success: true, agentId, duration, result: { output: finalContent } };
|
|
1080
1914
|
|
|
1081
1915
|
} catch (err) {
|
|
1082
1916
|
this.activeAgents.delete(agentId);
|
|
1917
|
+
this._taskVisionModel = null;
|
|
1918
|
+
this._taskProviderKeys = null;
|
|
1083
1919
|
|
|
1084
1920
|
if (err.name === 'AbortError' || controller.signal.aborted) {
|
|
1085
1921
|
this.emit('agent_cancelled', { agentId });
|
|
@@ -1114,6 +1950,26 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1114
1950
|
return Array.from(this.activeAgents.values());
|
|
1115
1951
|
}
|
|
1116
1952
|
|
|
1953
|
+
// ─── Open URL in AgentForge browser ──────────────────────────────────────
|
|
1954
|
+
// Single abstraction for navigating the user-facing browser.
|
|
1955
|
+
// Uses browserAction (puppeteer-core) — never raw CDP WebSocket directly.
|
|
1956
|
+
// Called by: bash 'open' intercept, curl 200 auto-launch.
|
|
1957
|
+
async _openInBrowser(url, agentId = 'agent') {
|
|
1958
|
+
try {
|
|
1959
|
+
await browserAction({ action: 'navigate', url }, agentId);
|
|
1960
|
+
await new Promise(r => setTimeout(r, 1500)); // let page render
|
|
1961
|
+
const shot = await browserAction({ action: 'screenshot' }, agentId);
|
|
1962
|
+
if (shot && shot.__screenshot) {
|
|
1963
|
+
this.emit('agent_image', { agentId, image: `data:image/png;base64,${shot.base64}` });
|
|
1964
|
+
}
|
|
1965
|
+
console.log(` [${agentId}] 🌐 Opened ${url} in AgentForge browser`);
|
|
1966
|
+
return { opened: true };
|
|
1967
|
+
} catch (err) {
|
|
1968
|
+
console.log(` [${agentId}] ⚠️ _openInBrowser(${url}): ${err.message}`);
|
|
1969
|
+
return { opened: false };
|
|
1970
|
+
}
|
|
1971
|
+
}
|
|
1972
|
+
|
|
1117
1973
|
// ─── Tool execution ───────────────────────────────────────────────────────
|
|
1118
1974
|
|
|
1119
1975
|
async _executeTool(name, args, workDir, agentId = 'agent') {
|
|
@@ -1144,40 +2000,32 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1144
2000
|
}
|
|
1145
2001
|
}
|
|
1146
2002
|
|
|
1147
|
-
// Intercept
|
|
1148
|
-
//
|
|
2003
|
+
// Intercept bash calls where command is exactly a tool name — model confused tool names
|
|
2004
|
+
// with CLI commands. e.g. {"name":"bash","arguments":{"command":"screenshot_and_describe"}}
|
|
2005
|
+
// Most common pattern: screenshot_and_describe / web_fetch called inside bash.
|
|
2006
|
+
const cmdTrimmed = args.command.trim().replace(/\s+.*$/, ''); // first word only
|
|
2007
|
+
if (cmdTrimmed === 'screenshot_and_describe') {
|
|
2008
|
+
console.log(` [${agentId}] 🔀 bash("screenshot_and_describe") → redirecting to screenshot_and_describe tool`);
|
|
2009
|
+
const urlMatch = args.command.match(/https?:\/\/\S+/);
|
|
2010
|
+
const result = await this._screenshotAndDescribe(urlMatch ? urlMatch[0] : null, null, agentId);
|
|
2011
|
+
if (this._lastScreenshotData) { this.emit('agent_image', { agentId, image: this._lastScreenshotData }); this._lastScreenshotData = null; }
|
|
2012
|
+
return result;
|
|
2013
|
+
}
|
|
2014
|
+
|
|
2015
|
+
// Intercept "open http://..." — navigate the AgentForge browser via _openInBrowser,
|
|
2016
|
+
// then get an AI description so the agent can reason about what it built.
|
|
1149
2017
|
const openUrlMatch = args.command.trim().match(/^open\s+(https?:\/\/\S+)/);
|
|
1150
2018
|
if (openUrlMatch) {
|
|
1151
2019
|
const targetUrl = openUrlMatch[1];
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
const newTabData = await newTabRes.json();
|
|
1156
|
-
const tabWs = new WebSocket(`ws://127.0.0.1:9223/devtools/page/${newTabData.id}`);
|
|
1157
|
-
await new Promise(r => tabWs.on('open', r));
|
|
1158
|
-
await new Promise(r => {
|
|
1159
|
-
let navigated = false;
|
|
1160
|
-
tabWs.send(JSON.stringify({ id: 1, method: 'Page.navigate', params: { url: targetUrl } }));
|
|
1161
|
-
tabWs.on('message', () => { if (!navigated) { navigated = true; tabWs.close(); r(); } });
|
|
1162
|
-
setTimeout(() => { tabWs.close(); r(); }, 3000);
|
|
1163
|
-
});
|
|
1164
|
-
openedViaCDP = true;
|
|
1165
|
-
} catch {
|
|
1166
|
-
// CDP unavailable — fall through to OS open
|
|
1167
|
-
try { await execAsync(`open "${targetUrl}"`); } catch {}
|
|
1168
|
-
}
|
|
1169
|
-
// Auto-screenshot after opening so the agent sees what it built.
|
|
1170
|
-
// Wait for page to load, then call screenshot_and_describe.
|
|
1171
|
-
await new Promise(r => setTimeout(r, 2500));
|
|
2020
|
+
const { opened } = await this._openInBrowser(targetUrl, agentId);
|
|
2021
|
+
// Get AI description for agent context (screenshot already emitted by _openInBrowser)
|
|
2022
|
+
await new Promise(r => setTimeout(r, 800));
|
|
1172
2023
|
try {
|
|
1173
|
-
const
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
send_to_user: true
|
|
1177
|
-
}, workDir, agentId);
|
|
1178
|
-
return `Opened ${targetUrl} in browser${openedViaCDP ? ' (AgentForge browser)' : ''}.\n\nVisual snapshot of what is currently visible:\n${screenshotResult}`;
|
|
2024
|
+
const desc = await this._screenshotAndDescribe(targetUrl, 'the running application', agentId);
|
|
2025
|
+
this._lastScreenshotData = null; // suppress duplicate emit — raw already sent above
|
|
2026
|
+
return `Opened ${targetUrl}${opened ? ' in AgentForge browser' : ''}.\n\nWhat is currently visible:\n${desc}`;
|
|
1179
2027
|
} catch {
|
|
1180
|
-
return `Opened ${targetUrl}
|
|
2028
|
+
return `Opened ${targetUrl}${opened ? ' in AgentForge browser' : ''}.`;
|
|
1181
2029
|
}
|
|
1182
2030
|
}
|
|
1183
2031
|
|
|
@@ -1186,18 +2034,39 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1186
2034
|
let bashCwd = workDir;
|
|
1187
2035
|
const _home = process.env.HOME || '/tmp';
|
|
1188
2036
|
try { if (!existsSync(bashCwd)) bashCwd = _home; } catch { bashCwd = _home; }
|
|
1189
|
-
//
|
|
1190
|
-
//
|
|
2037
|
+
// Inject a PATH that includes the directories needed to find node/npm/python3
|
|
2038
|
+
// regardless of how the worker was started (nohup/launchd strip the user PATH).
|
|
2039
|
+
// process.execPath is the node binary running this worker — its directory always
|
|
2040
|
+
// contains npm too, and is correct on any machine/version/install method.
|
|
2041
|
+
const bashEnv = {
|
|
2042
|
+
...process.env,
|
|
2043
|
+
PATH: [
|
|
2044
|
+
path.dirname(process.execPath), // node + npm, always matches running version
|
|
2045
|
+
'/usr/local/bin', // homebrew, system tools
|
|
2046
|
+
'/usr/local/sbin',
|
|
2047
|
+
process.env.HOME ? `${process.env.HOME}/.npm-global/bin` : '',
|
|
2048
|
+
process.env.PATH || '',
|
|
2049
|
+
].filter(Boolean).join(':'),
|
|
2050
|
+
};
|
|
2051
|
+
// Background commands (ending with &): use spawn with detached+stdio:ignore so the
|
|
2052
|
+
// child process is fully detached from our pipe FDs and returns immediately.
|
|
2053
|
+
// Using execAsync here hangs for the full 120s timeout because the background process
|
|
2054
|
+
// inherits the exec pipe and keeps it open as long as the server runs.
|
|
1191
2055
|
const isBackground = /&\s*$/.test(args.command.trim());
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
2056
|
+
if (isBackground) {
|
|
2057
|
+
// Strip trailing & — spawn will run detached
|
|
2058
|
+
const cmd = args.command.replace(/&\s*$/, '').trim();
|
|
2059
|
+
await new Promise((resolve) => {
|
|
2060
|
+
const child = spawn('/bin/sh', ['-c', cmd], {
|
|
2061
|
+
cwd: bashCwd,
|
|
2062
|
+
env: bashEnv,
|
|
2063
|
+
detached: true,
|
|
2064
|
+
stdio: 'ignore',
|
|
2065
|
+
});
|
|
2066
|
+
child.unref();
|
|
2067
|
+
// Give the process a moment to start up, then read back any log file
|
|
2068
|
+
setTimeout(resolve, 1500);
|
|
2069
|
+
});
|
|
1201
2070
|
let confirmation = 'Background process started.';
|
|
1202
2071
|
try {
|
|
1203
2072
|
const logContent = readFileSync('/tmp/server.log', 'utf-8').trim().split('\n').slice(-3).join('\n');
|
|
@@ -1205,12 +2074,33 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1205
2074
|
} catch { /* no log yet */ }
|
|
1206
2075
|
return confirmation;
|
|
1207
2076
|
}
|
|
1208
|
-
|
|
2077
|
+
const { stdout, stderr } = await execAsync(args.command, {
|
|
2078
|
+
cwd: bashCwd,
|
|
2079
|
+
timeout: 120000,
|
|
2080
|
+
maxBuffer: 1024 * 1024 * 2, // 2MB
|
|
2081
|
+
env: bashEnv,
|
|
2082
|
+
});
|
|
2083
|
+
const rawOut = (stdout + stderr).trim() || '(no output)';
|
|
2084
|
+
// Truncate large outputs to prevent context flooding (e.g. npm install, large file cats)
|
|
2085
|
+
const MAX_BASH_OUTPUT = 3000;
|
|
2086
|
+
if (rawOut.length > MAX_BASH_OUTPUT) {
|
|
2087
|
+
const head = rawOut.slice(0, 500);
|
|
2088
|
+
const tail = rawOut.slice(-2000);
|
|
2089
|
+
return `${head}\n...(${rawOut.length - 2500} chars omitted)...\n${tail}`;
|
|
2090
|
+
}
|
|
2091
|
+
return rawOut;
|
|
1209
2092
|
}
|
|
1210
2093
|
|
|
1211
2094
|
case 'read_file': {
|
|
1212
2095
|
const fp = this._resolvePath(args.path, workDir);
|
|
1213
|
-
|
|
2096
|
+
const fileContent = readFileSync(fp, 'utf-8');
|
|
2097
|
+
const MAX_READ_OUTPUT = 8000;
|
|
2098
|
+
if (fileContent.length > MAX_READ_OUTPUT) {
|
|
2099
|
+
const head = fileContent.slice(0, 3000);
|
|
2100
|
+
const tail = fileContent.slice(-3000);
|
|
2101
|
+
return `${head}\n...(${fileContent.length - 6000} chars omitted — file is ${fileContent.length} chars total)...\n${tail}`;
|
|
2102
|
+
}
|
|
2103
|
+
return fileContent;
|
|
1214
2104
|
}
|
|
1215
2105
|
|
|
1216
2106
|
case 'write_file': {
|
|
@@ -1242,7 +2132,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1242
2132
|
|
|
1243
2133
|
if (target === 'browser') {
|
|
1244
2134
|
// Navigate + screenshot via CDP on agent browser (port 9223)
|
|
1245
|
-
return await this._cdpScreenshot(args.url, tmpFile);
|
|
2135
|
+
return await this._cdpScreenshot(args.url, tmpFile, agentId);
|
|
1246
2136
|
} else {
|
|
1247
2137
|
// Full screen capture
|
|
1248
2138
|
await execAsync(`screencapture -x "${tmpFile}"`);
|
|
@@ -1253,7 +2143,7 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1253
2143
|
}
|
|
1254
2144
|
|
|
1255
2145
|
case 'screenshot_and_describe': {
|
|
1256
|
-
const result = await this._screenshotAndDescribe(args.url, args.check_for);
|
|
2146
|
+
const result = await this._screenshotAndDescribe(args.url, args.check_for, agentId);
|
|
1257
2147
|
// Always send screenshot to user — agent called this tool, user should always see it
|
|
1258
2148
|
if (this._lastScreenshotData) {
|
|
1259
2149
|
this.emit('agent_image', { agentId, image: this._lastScreenshotData });
|
|
@@ -1263,7 +2153,22 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1263
2153
|
}
|
|
1264
2154
|
|
|
1265
2155
|
case 'browser': {
|
|
1266
|
-
|
|
2156
|
+
// Intercept browser→screenshot_and_describe misuse — agent confused the browser action
|
|
2157
|
+
// namespace with the standalone tool name. Redirect to the real vision handler so the
|
|
2158
|
+
// agent gets back a text description it can reason about, not just "Image sent to chat."
|
|
2159
|
+
if (args.action === 'screenshot_and_describe' || args.action === 'describe') {
|
|
2160
|
+
const result = await this._screenshotAndDescribe(args.url || null, args.check_for || null, agentId);
|
|
2161
|
+
if (this._lastScreenshotData) {
|
|
2162
|
+
this.emit('agent_image', { agentId, image: this._lastScreenshotData });
|
|
2163
|
+
this._lastScreenshotData = null;
|
|
2164
|
+
}
|
|
2165
|
+
return result;
|
|
2166
|
+
}
|
|
2167
|
+
const t0 = Date.now();
|
|
2168
|
+
const result = await browserAction(args, agentId);
|
|
2169
|
+
const elapsed = Date.now() - t0;
|
|
2170
|
+
const resultPreview = typeof result === 'string' ? result.slice(0, 200) : (result?.__screenshot ? `[screenshot ${Math.round((result.base64?.length||0)*0.75/1024)}KB]` : JSON.stringify(result).slice(0,200));
|
|
2171
|
+
console.log(` [${agentId}] 🌐 browser(${args.action}) → ${elapsed}ms → ${resultPreview.replace(/\n/g,' ')}`);
|
|
1267
2172
|
if (result && result.__screenshot) {
|
|
1268
2173
|
const imgData = `data:image/png;base64,${result.base64}`;
|
|
1269
2174
|
this.emit('agent_image', { agentId, image: imgData });
|
|
@@ -1281,81 +2186,51 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1281
2186
|
}
|
|
1282
2187
|
|
|
1283
2188
|
// ─── CDP browser screenshot ───────────────────────────────────────────────
|
|
2189
|
+
// Uses the persistent browserAction connection (puppeteer-core) — never raw CDP WebSocket.
|
|
2190
|
+
// This reuses the existing connection to port 9223 with ad blocking already active.
|
|
1284
2191
|
|
|
1285
|
-
async _cdpScreenshot(navigateUrl,
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
const send = (method, params = {}) => new Promise((res, rej) => {
|
|
1299
|
-
const id = msgId++;
|
|
1300
|
-
pending.set(id, { resolve: res, reject: rej });
|
|
1301
|
-
ws.send(JSON.stringify({ id, method, params }));
|
|
1302
|
-
});
|
|
1303
|
-
|
|
1304
|
-
ws.addEventListener('message', (evt) => {
|
|
1305
|
-
const msg = JSON.parse(evt.data);
|
|
1306
|
-
if (msg.id && pending.has(msg.id)) {
|
|
1307
|
-
const { resolve: res, reject: rej } = pending.get(msg.id);
|
|
1308
|
-
pending.delete(msg.id);
|
|
1309
|
-
if (msg.error) rej(new Error(msg.error.message));
|
|
1310
|
-
else res(msg.result);
|
|
1311
|
-
}
|
|
1312
|
-
});
|
|
1313
|
-
|
|
1314
|
-
ws.addEventListener('open', async () => {
|
|
1315
|
-
try {
|
|
1316
|
-
if (navigateUrl) {
|
|
1317
|
-
await send('Page.navigate', { url: navigateUrl });
|
|
1318
|
-
// Wait for page to fully render
|
|
1319
|
-
await new Promise(r => setTimeout(r, 3000));
|
|
1320
|
-
}
|
|
1321
|
-
const { data } = await send('Page.captureScreenshot', { format: 'png' });
|
|
1322
|
-
// Close the temporary tab
|
|
1323
|
-
await send('Target.closeTarget', { targetId: tabId }).catch(() => {});
|
|
1324
|
-
ws.close();
|
|
1325
|
-
resolve(`data:image/png;base64,${data}`);
|
|
1326
|
-
} catch (err) {
|
|
1327
|
-
ws.close();
|
|
1328
|
-
reject(err);
|
|
1329
|
-
}
|
|
1330
|
-
});
|
|
1331
|
-
|
|
1332
|
-
ws.addEventListener('error', (err) => reject(new Error(`CDP WebSocket error: ${err.message}`)));
|
|
1333
|
-
setTimeout(() => { ws.close(); reject(new Error('CDP screenshot timeout')); }, 25000);
|
|
1334
|
-
});
|
|
2192
|
+
async _cdpScreenshot(navigateUrl, _tmpFile, agentId = 'agent') {
|
|
2193
|
+
if (navigateUrl) {
|
|
2194
|
+
await browserAction({ action: 'navigate', url: navigateUrl }, agentId);
|
|
2195
|
+
} else {
|
|
2196
|
+
// No navigation — page may be mid-render (e.g., after press:Enter form submit or JS SPA update)
|
|
2197
|
+
// Wait for JS to finish rendering before snapping
|
|
2198
|
+
await new Promise(r => setTimeout(r, 1500));
|
|
2199
|
+
}
|
|
2200
|
+
const result = await browserAction({ action: 'screenshot' }, agentId);
|
|
2201
|
+
if (result && result.__screenshot) {
|
|
2202
|
+
return `data:image/png;base64,${result.base64}`;
|
|
2203
|
+
}
|
|
2204
|
+
throw new Error('Screenshot returned no image data');
|
|
1335
2205
|
}
|
|
1336
2206
|
|
|
1337
2207
|
// ─── Screenshot + vision analysis ─────────────────────────────────────────
|
|
1338
2208
|
// Takes a screenshot of a URL, then asks the active vision model to describe it.
|
|
1339
2209
|
// Returns a plain-text description the main agent can reason about.
|
|
1340
2210
|
|
|
1341
|
-
async _screenshotAndDescribe(url, checkFor) {
|
|
2211
|
+
async _screenshotAndDescribe(url, checkFor, agentId = 'agent') {
|
|
1342
2212
|
const question = checkFor
|
|
1343
|
-
? `
|
|
1344
|
-
: `Describe
|
|
2213
|
+
? `Look at this web page and specifically find: ${checkFor}. List exactly what you see — exact text, numbers, titles, labels, counts. CRITICAL: Preserve ALL spaces between words exactly as they appear — never merge adjacent words or labels together without a space between them. Also note the background color, any canvas element, or visual errors.`
|
|
2214
|
+
: `Describe this web page in full. List ALL visible text content: headlines, titles, labels, numbers, post titles, scores, counts — copy them exactly as shown. CRITICAL: Preserve ALL spaces between words — never concatenate adjacent text elements without a space. If two pieces of text appear next to each other (e.g. a label like "Posted" next to a value like "22 hr. ago"), always write them with a space between them. Then describe the visual layout: background color, UI elements, canvas, any errors.`;
|
|
1345
2215
|
|
|
1346
|
-
// === Server reachability check —
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
2216
|
+
// === Server reachability check — only for local dev servers ===
|
|
2217
|
+
// Skipped when url is null (current browser tab) or a public site.
|
|
2218
|
+
const isLocalUrl = url && (url.includes('localhost') || url.includes('127.0.0.1') || url.match(/:\d{4,5}/));
|
|
2219
|
+
if (isLocalUrl) {
|
|
2220
|
+
try {
|
|
2221
|
+
await fetch(url, { signal: AbortSignal.timeout(4000) });
|
|
2222
|
+
} catch (reachErr) {
|
|
2223
|
+
const portMatch = url.match(/:(\d+)/);
|
|
2224
|
+
const port = portMatch ? portMatch[1] : '?';
|
|
2225
|
+
return `SERVER IS NOT REACHABLE at ${url} (${reachErr.message}). The server on port ${port} is not running or crashed. You must restart it using bash before taking a screenshot:\n{"name":"bash","arguments":{"command":"pkill -f 'node.*${port}' 2>/dev/null; sleep 1; cd YOUR_PROJECT_DIR && nohup node server.js > /tmp/server.log 2>&1 & sleep 2 && echo started"}}\nCheck /tmp/server.log for errors if it still fails.`;
|
|
2226
|
+
}
|
|
1353
2227
|
}
|
|
1354
2228
|
|
|
1355
|
-
// === HTML dependency audit
|
|
2229
|
+
// === HTML dependency audit — only for local dev servers ===
|
|
1356
2230
|
// Fetches the page HTML and checks for common missing client-side dependencies.
|
|
1357
|
-
//
|
|
2231
|
+
// Skipped for external sites (useless) and null url (current tab).
|
|
1358
2232
|
let auditNotes = '';
|
|
2233
|
+
if (isLocalUrl) {
|
|
1359
2234
|
try {
|
|
1360
2235
|
const htmlRes = await fetch(url, { signal: AbortSignal.timeout(8000) });
|
|
1361
2236
|
const html = await htmlRes.text();
|
|
@@ -1375,46 +2250,31 @@ B16. TEST LIKE A USER: Scroll, click buttons, simulate actions, check different
|
|
|
1375
2250
|
auditNotes = `\n\nHTML DEPENDENCY AUDIT FOUND ISSUES:\n${missing.map(m => '- ' + m).join('\n')}`;
|
|
1376
2251
|
}
|
|
1377
2252
|
} catch {}
|
|
2253
|
+
} // end isLocalUrl audit block
|
|
2254
|
+
|
|
2255
|
+
// === DOM snapshot (when no URL — current page, or file:// URL) ===
|
|
2256
|
+
// Captures all page text regardless of scroll position. Appended alongside the vision
|
|
2257
|
+
// result so the agent always gets DOM content even when results are below the fold.
|
|
2258
|
+
// Also runs for file:// URLs: vision models sometimes misidentify form inputs or static
|
|
2259
|
+
// elements — the DOM snapshot provides ground-truth element types and values alongside
|
|
2260
|
+
// the visual description so the agent can cross-reference and avoid false rewrites.
|
|
2261
|
+
let domSnapshot = '';
|
|
2262
|
+
if (!url || url.startsWith('file://')) {
|
|
2263
|
+
try {
|
|
2264
|
+
const snap = await browserAction({ action: 'snapshot' }, agentId);
|
|
2265
|
+
if (typeof snap === 'string' && snap.length > 200) {
|
|
2266
|
+
domSnapshot = `\n\n--- DOM snapshot (actual element types and values — use this to verify what is really on the page, not just what it looks like) ---\n${snap}`;
|
|
2267
|
+
}
|
|
2268
|
+
} catch {}
|
|
2269
|
+
}
|
|
1378
2270
|
|
|
1379
2271
|
let imageData;
|
|
1380
|
-
const tmpFile = `/tmp/af_verify_${Date.now()}.png`;
|
|
1381
2272
|
|
|
1382
|
-
//
|
|
2273
|
+
// Use the AgentForge browser via browserAction (persistent puppeteer connection, ad blocking active)
|
|
1383
2274
|
try {
|
|
1384
|
-
imageData = await this._cdpScreenshot(url, null);
|
|
1385
|
-
} catch (
|
|
1386
|
-
|
|
1387
|
-
try {
|
|
1388
|
-
const puppeteerModule = process.env.HOME + '/.npm-global/lib/node_modules/puppeteer';
|
|
1389
|
-
const scriptFile = `/tmp/af_pup_${Date.now()}.js`;
|
|
1390
|
-
const nodeScript = `
|
|
1391
|
-
const puppeteer = require(${JSON.stringify(puppeteerModule)});
|
|
1392
|
-
(async () => {
|
|
1393
|
-
const browser = await puppeteer.launch({headless: true, protocolTimeout: 30000, args: ['--no-sandbox','--disable-setuid-sandbox','--disable-gpu','--disable-dev-shm-usage']});
|
|
1394
|
-
const page = await browser.newPage();
|
|
1395
|
-
await page.setDefaultNavigationTimeout(12000);
|
|
1396
|
-
await page.setViewport({width: 1280, height: 900});
|
|
1397
|
-
try {
|
|
1398
|
-
await page.goto(${JSON.stringify(url)}, {waitUntil: 'domcontentloaded', timeout: 12000}).catch(()=>{});
|
|
1399
|
-
await new Promise(r => setTimeout(r, 2500));
|
|
1400
|
-
await page.screenshot({path: ${JSON.stringify(tmpFile)}, fullPage: true});
|
|
1401
|
-
console.log('puppeteer screenshot ok');
|
|
1402
|
-
} finally {
|
|
1403
|
-
await browser.close();
|
|
1404
|
-
}
|
|
1405
|
-
})().then(() => process.exit(0)).catch(e => { console.error(e.message); process.exit(1); });
|
|
1406
|
-
`;
|
|
1407
|
-
writeFileSync(scriptFile, nodeScript);
|
|
1408
|
-
await execAsync(`/usr/local/bin/node "${scriptFile}"`, { timeout: 45000 });
|
|
1409
|
-
await execAsync(`rm -f "${scriptFile}"`).catch(() => {});
|
|
1410
|
-
const raw = readFileSync(tmpFile).toString('base64');
|
|
1411
|
-
await execAsync(`rm -f "${tmpFile}"`).catch(() => {});
|
|
1412
|
-
imageData = `data:image/png;base64,${raw}`;
|
|
1413
|
-
} catch (pupErr) {
|
|
1414
|
-
console.warn(` [screenshot_and_describe] puppeteer failed: ${pupErr.message}`);
|
|
1415
|
-
// No screenshot possible — return audit notes only
|
|
1416
|
-
return `Cannot take screenshot (CDP: ${cdpErr.message}, puppeteer: ${pupErr.message}). ${auditNotes || 'No dependency issues found in HTML. Check server logs for errors.'}`;
|
|
1417
|
-
}
|
|
2275
|
+
imageData = await this._cdpScreenshot(url, null, agentId);
|
|
2276
|
+
} catch (err) {
|
|
2277
|
+
return `Cannot take screenshot: ${err.message}. Is the AgentForge Browser running?${auditNotes}${domSnapshot}`;
|
|
1418
2278
|
}
|
|
1419
2279
|
|
|
1420
2280
|
// Store imageData so caller can emit to user if send_to_user=true
|
|
@@ -1422,39 +2282,95 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
|
|
|
1422
2282
|
|
|
1423
2283
|
const base64 = imageData.replace(/^data:image\/\w+;base64,/, '');
|
|
1424
2284
|
|
|
1425
|
-
//
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
2285
|
+
// Resolve vision backend: use task-level vision model if configured (from modelflow),
|
|
2286
|
+
// otherwise fall back to the agent's primary Ollama model.
|
|
2287
|
+
const taskVisionModel = this._taskVisionModel;
|
|
2288
|
+
const taskGeminiKey = this._taskProviderKeys?.google || null;
|
|
2289
|
+
const isGemini = taskVisionModel && (taskVisionModel.startsWith('google/') || taskVisionModel.startsWith('gemini-'));
|
|
2290
|
+
|
|
2291
|
+
if (isGemini && taskGeminiKey) {
|
|
2292
|
+
// ── Gemini vision via Google AI REST API ──────────────────────────────
|
|
2293
|
+
// Model ID from flow is like "google/gemini-2.5-flash" → strip "google/" prefix
|
|
2294
|
+
const geminiModel = taskVisionModel.startsWith('google/') ? taskVisionModel.slice(7) : taskVisionModel;
|
|
2295
|
+
console.log(` [screenshot_and_describe] Using Gemini vision: ${geminiModel}`);
|
|
2296
|
+
try {
|
|
2297
|
+
const geminiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent?key=${taskGeminiKey}`;
|
|
2298
|
+
const res = await fetch(geminiUrl, {
|
|
2299
|
+
method: 'POST',
|
|
2300
|
+
headers: { 'Content-Type': 'application/json' },
|
|
2301
|
+
body: JSON.stringify({
|
|
2302
|
+
contents: [{
|
|
2303
|
+
parts: [
|
|
2304
|
+
{ text: question },
|
|
2305
|
+
{ inline_data: { mime_type: 'image/png', data: base64 } }
|
|
2306
|
+
]
|
|
2307
|
+
}],
|
|
2308
|
+
generationConfig: { maxOutputTokens: 1024 }
|
|
2309
|
+
}),
|
|
2310
|
+
signal: AbortSignal.timeout(30000)
|
|
2311
|
+
});
|
|
2312
|
+
if (res.ok) {
|
|
2313
|
+
const json = await res.json();
|
|
2314
|
+
const description = json.candidates?.[0]?.content?.parts?.[0]?.text || '';
|
|
2315
|
+
const clean = description.trim();
|
|
2316
|
+
if (clean) {
|
|
2317
|
+
console.log(` [screenshot_and_describe] Gemini: ${clean.slice(0, 200)}`);
|
|
2318
|
+
return `Screenshot analysis of ${url || 'current page'}:\n${clean}${auditNotes}${domSnapshot}`;
|
|
2319
|
+
}
|
|
2320
|
+
} else {
|
|
2321
|
+
const errText = await res.text().catch(() => '');
|
|
2322
|
+
console.warn(` [screenshot_and_describe] Gemini error ${res.status}: ${errText.slice(0, 200)}`);
|
|
2323
|
+
}
|
|
2324
|
+
} catch (err) {
|
|
2325
|
+
console.warn(` [screenshot_and_describe] Gemini vision call failed: ${err.message}`);
|
|
2326
|
+
}
|
|
2327
|
+
} else {
|
|
2328
|
+
// ── Ollama vision (default) ───────────────────────────────────────────
|
|
2329
|
+
try {
|
|
2330
|
+
const res = await fetch(`${this.baseUrl}/api/chat`, {
|
|
2331
|
+
method: 'POST',
|
|
2332
|
+
headers: { 'Content-Type': 'application/json' },
|
|
2333
|
+
body: JSON.stringify({
|
|
2334
|
+
model: this.model,
|
|
2335
|
+
messages: [{ role: 'user', content: question, images: [base64] }],
|
|
2336
|
+
stream: false,
|
|
2337
|
+
options: { num_ctx: 4096 }
|
|
2338
|
+
}),
|
|
2339
|
+
signal: AbortSignal.timeout(120000)
|
|
2340
|
+
});
|
|
1439
2341
|
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
2342
|
+
if (res.ok) {
|
|
2343
|
+
const json = await res.json();
|
|
2344
|
+
const description = json.message?.content || json.response || '';
|
|
2345
|
+
const clean = description.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
|
2346
|
+
if (clean) {
|
|
2347
|
+
console.log(` [screenshot_and_describe] ${clean.slice(0, 200)}`);
|
|
2348
|
+
return `Screenshot analysis of ${url || 'current page'}:\n${clean}${auditNotes}${domSnapshot}`;
|
|
2349
|
+
}
|
|
1447
2350
|
}
|
|
2351
|
+
} catch (err) {
|
|
2352
|
+
console.warn(` [screenshot_and_describe] vision call failed: ${err.message}`);
|
|
1448
2353
|
}
|
|
1449
|
-
} catch (err) {
|
|
1450
|
-
console.warn(` [screenshot_and_describe] vision call failed: ${err.message}`);
|
|
1451
2354
|
}
|
|
1452
2355
|
|
|
1453
|
-
return `Screenshot captured but description unavailable. The app is visible at ${url} — use read_file to check the code and make targeted improvements.${auditNotes}`;
|
|
2356
|
+
return `Screenshot captured but description unavailable. The app is visible at ${url} — use read_file to check the code and make targeted improvements.${auditNotes}${domSnapshot}`;
|
|
1454
2357
|
}
|
|
1455
2358
|
|
|
1456
2359
|
_resolvePath(p, workDir) {
|
|
1457
|
-
|
|
2360
|
+
// Expand ~ to home directory before any other resolution.
|
|
2361
|
+
// path.isAbsolute('~/foo') === false, so without this the path would be
|
|
2362
|
+
// joined with workDir and land in /tmp/agentforge/agents/{id}/~/foo (wrong).
|
|
2363
|
+
if (p.startsWith('~/') || p === '~') {
|
|
2364
|
+
p = p.replace(/^~/, homedir());
|
|
2365
|
+
}
|
|
2366
|
+
if (!path.isAbsolute(p)) return path.join(workDir, p);
|
|
2367
|
+
// Reject paths directly under / (e.g. /index.html, /style.css) — those are filesystem root
|
|
2368
|
+
// and always read-only. Redirect to workDir so the file lands somewhere writable.
|
|
2369
|
+
if (path.dirname(p) === '/') {
|
|
2370
|
+
console.log(` [worker] ⚠️ Path "${p}" is at filesystem root — redirecting to ${workDir}`);
|
|
2371
|
+
return path.join(workDir, path.basename(p));
|
|
2372
|
+
}
|
|
2373
|
+
return p;
|
|
1458
2374
|
}
|
|
1459
2375
|
|
|
1460
2376
|
_toolDesc(name, args) {
|
|
@@ -1472,6 +2388,22 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
|
|
|
1472
2388
|
}
|
|
1473
2389
|
case 'take_screenshot':
|
|
1474
2390
|
return `Screenshot: ${args.url || args.target}`;
|
|
2391
|
+
case 'browser': {
|
|
2392
|
+
const action = args.action || 'browser';
|
|
2393
|
+
if (action === 'navigate' || action === 'open') {
|
|
2394
|
+
try { return `browser → ${new URL(args.url).hostname}`; } catch { return `browser → navigate`; }
|
|
2395
|
+
}
|
|
2396
|
+
if (action === 'snapshot') return 'browser → snapshot page';
|
|
2397
|
+
if (action === 'screenshot') return 'browser → screenshot';
|
|
2398
|
+
if (action === 'click') return `browser → click "${(args.text || args.selector || '').toString().slice(0, 40)}"`;
|
|
2399
|
+
if (action === 'type') return `browser → type into ${(args.selector || 'input').toString().slice(0, 40)}`;
|
|
2400
|
+
if (action === 'tabs') return 'browser → list tabs';
|
|
2401
|
+
if (action === 'evaluate') return 'browser → run JS';
|
|
2402
|
+
if (action === 'scroll') return 'browser → scroll';
|
|
2403
|
+
if (action === 'find_elements') return 'browser → find elements';
|
|
2404
|
+
if (action === 'get_bookmarks') return 'browser → get bookmarks';
|
|
2405
|
+
return `browser → ${action}`;
|
|
2406
|
+
}
|
|
1475
2407
|
default:
|
|
1476
2408
|
return name;
|
|
1477
2409
|
}
|
|
@@ -1522,7 +2454,7 @@ const puppeteer = require(${JSON.stringify(puppeteerModule)});
|
|
|
1522
2454
|
model: this.model,
|
|
1523
2455
|
messages: [
|
|
1524
2456
|
{ role: 'system', content: 'You determine if a task is complete. Reply with only "yes" or "no".' },
|
|
1525
|
-
{ role: 'user', content: `Task: ${task.slice(0,
|
|
2457
|
+
{ role: 'user', content: `Task: ${task.slice(0, 400)}\n\nAgent output (last part):\n${output.slice(-800)}\n\nDid the agent complete ALL requirements of the task? Judge based on evidence of completed actions (files written, commands run, results returned) — NOT based on the agent's own statements about what it can or cannot do. Agent self-assessments and disclaimers are unreliable.\n- For build/server tasks: code must be written AND server must be running locally. Do NOT require cloud deployment (Railway/Vercel/Render/etc.) unless the task explicitly says to deploy or host publicly.\n- For tasks that explicitly mention deploying to Railway/Vercel/Render/Netlify/fly.io/Heroku: there MUST be a live public URL in the output.\n- For research/Q&A tasks: specific facts must be present.\nAnswer "yes" only if ALL stated requirements are done. Answer "no" if ANY required step is missing.` }
|
|
1526
2458
|
],
|
|
1527
2459
|
stream: false,
|
|
1528
2460
|
think: false,
|