npm - open-agents-ai - Versions diffs - 0.187.348 → 0.187.350 - Mend

open-agents-ai 0.187.348 → 0.187.350

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js +334 -5
package/package.json +1 -1
package/prompts/agentic/system-small.md +16 -0

package/dist/index.js CHANGED Viewed

@@ -269809,6 +269809,268 @@ var init_dist7 = __esm({
   }
 });
+// packages/orchestrator/dist/reflectionBuffer.js
+var MAX_REFLECTIONS, MAX_TOTAL, TaskReflectionBuffer;
+var init_reflectionBuffer = __esm({
+  "packages/orchestrator/dist/reflectionBuffer.js"() {
+    "use strict";
+    MAX_REFLECTIONS = 5;
+    MAX_TOTAL = 50;
+    TaskReflectionBuffer = class {
+      state;
+      persistPath;
+      constructor(persistPath) {
+        this.persistPath = persistPath ?? null;
+        if (this.persistPath) {
+          try {
+            const { readFileSync: readFileSync69, existsSync: existsSync89 } = __require("node:fs");
+            if (existsSync89(this.persistPath)) {
+              this.state = JSON.parse(readFileSync69(this.persistPath, "utf-8"));
+              return;
+            }
+          } catch {
+          }
+        }
+        this.state = {
+          reflections: [],
+          maxReflections: MAX_REFLECTIONS,
+          totalFailures: 0,
+          totalConsumed: 0
+        };
+      }
+      /** Get the current number of stored reflections */
+      get count() {
+        return this.state.reflections.length;
+      }
+      get totalFailures() {
+        return this.state.totalFailures;
+      }
+      get totalConsumed() {
+        return this.state.totalConsumed;
+      }
+      /**
+       * Generate and store a reflection from a failed task.
+       *
+       * This is the Self-Reflector from Reflexion's three-model architecture.
+       * The reflection is deterministic (no LLM call needed) — it analyzes
+       * the failure trajectory and generates typed guidance.
+       *
+       * @param taskGoal The original task prompt
+       * @param sessionId Current session ID
+       * @param turnsSpent How many turns were used
+       * @param failedApproaches List of failed approaches from _taskState
+       * @param toolCallLog Recent tool call history
+       * @param lastError The final error or failure reason
+       */
+      addReflection(params) {
+        const { taskGoal, sessionId, turnsSpent, failedApproaches, toolCallLog, lastError, failedPaths } = params;
+        const taskFingerprint = this.computeFingerprint(taskGoal);
+        const errorType = this.classifyError(toolCallLog, failedApproaches, lastError, turnsSpent);
+        const failedTools = [...new Set(toolCallLog.filter((t2) => !t2.success).map((t2) => t2.tool))].slice(0, 5);
+        const { whatFailed, whatToDoDifferently, confidence } = this.generateGuidance(errorType, failedApproaches, toolCallLog, lastError, turnsSpent);
+        const reflection = {
+          timestamp: Date.now(),
+          sessionId,
+          taskGoal: taskGoal.slice(0, 200),
+          taskFingerprint,
+          whatFailed,
+          whatToDoDifferently,
+          errorType,
+          failedTools,
+          failedPaths: (failedPaths ?? []).slice(0, 5),
+          turnsSpent,
+          confidence
+        };
+        this.state.reflections.unshift(reflection);
+        this.state.totalFailures++;
+        const byFingerprint = /* @__PURE__ */ new Map();
+        this.state.reflections = this.state.reflections.filter((r2) => {
+          const count = (byFingerprint.get(r2.taskFingerprint) ?? 0) + 1;
+          byFingerprint.set(r2.taskFingerprint, count);
+          return count <= this.state.maxReflections;
+        });
+        if (this.state.reflections.length > MAX_TOTAL) {
+          this.state.reflections = this.state.reflections.slice(0, MAX_TOTAL);
+        }
+        this.persist();
+        return reflection;
+      }
+      /**
+       * Retrieve relevant reflections for a new task attempt.
+       *
+       * Returns reflections from similar past failures, sorted by relevance.
+       * These should be prepended to the system prompt for the next attempt.
+       *
+       * @param taskGoal The current task goal
+       * @param maxResults Max reflections to return (default: 3)
+       */
+      getRelevantReflections(taskGoal, maxResults = 3) {
+        if (this.state.reflections.length === 0)
+          return [];
+        const fingerprint = this.computeFingerprint(taskGoal);
+        const goalLower = taskGoal.toLowerCase();
+        const goalWords = new Set(goalLower.split(/\s+/).filter((w) => w.length > 3));
+        const scored = this.state.reflections.map((r2) => {
+          let score = 0;
+          if (r2.taskFingerprint === fingerprint)
+            score += 5;
+          const rWords = new Set(r2.taskGoal.toLowerCase().split(/\s+/).filter((w) => w.length > 3));
+          let overlap = 0;
+          for (const w of goalWords)
+            if (rWords.has(w))
+              overlap++;
+          score += overlap;
+          const hoursAgo = (Date.now() - r2.timestamp) / 36e5;
+          score += Math.max(0, 2 - hoursAgo * 0.1);
+          score += r2.confidence * 2;
+          return { reflection: r2, score };
+        });
+        scored.sort((a2, b) => b.score - a2.score);
+        const results = scored.slice(0, maxResults).filter((s2) => s2.score > 1).map((s2) => s2.reflection);
+        this.state.totalConsumed += results.length;
+        this.persist();
+        return results;
+      }
+      /**
+       * Format reflections as a system prompt injection.
+       * Returns the text to prepend to the task context.
+       */
+      formatForContext(reflections) {
+        if (reflections.length === 0)
+          return "";
+        const lines = [
+          "[Prior Failure Reflections — learn from these mistakes]",
+          ""
+        ];
+        for (let i2 = 0; i2 < reflections.length; i2++) {
+          const r2 = reflections[i2];
+          lines.push(`Reflection ${i2 + 1} (${r2.errorType}):`);
+          lines.push(`  What failed: ${r2.whatFailed}`);
+          lines.push(`  Do instead: ${r2.whatToDoDifferently}`);
+          if (r2.failedTools.length > 0) {
+            lines.push(`  Avoid: ${r2.failedTools.join(", ")} with the same approach`);
+          }
+          lines.push("");
+        }
+        lines.push("Apply these lessons. Do NOT repeat the same mistakes.");
+        return lines.join("\n");
+      }
+      // ─── Internal ──────────────────────────────────────────────────────────
+      /** Compute a fingerprint for task similarity matching */
+      computeFingerprint(taskGoal) {
+        const lower = taskGoal.toLowerCase();
+        const significant = lower.split(/\s+/).filter((w) => w.length > 4).filter((w) => !["please", "could", "would", "should", "about", "these", "those", "their", "there", "which"].includes(w)).sort().slice(0, 8).join("_");
+        return significant || "generic";
+      }
+      /** Classify the error type from evidence */
+      classifyError(toolCallLog, failedApproaches, lastError, turnsSpent) {
+        const errorLower = lastError.toLowerCase();
+        const allErrors = toolCallLog.filter((t2) => !t2.success).map((t2) => (t2.error ?? "").toLowerCase());
+        if (allErrors.some((e2) => e2.includes("enoent") || e2.includes("not found")))
+          return "search_fail";
+        if (allErrors.some((e2) => e2.includes("permission") || e2.includes("eacces")))
+          return "permission";
+        if (allErrors.some((e2) => e2.includes("module") || e2.includes("package") || e2.includes("dependency")))
+          return "dependency";
+        if (turnsSpent >= 15 && failedApproaches.length >= 3)
+          return "repetition";
+        if (errorLower.includes("timeout") || errorLower.includes("turn limit"))
+          return "timeout";
+        if (errorLower.includes("incomplete") || errorLower.includes("partial"))
+          return "incomplete";
+        const uniqueTools = new Set(toolCallLog.map((t2) => t2.tool));
+        const failRate = toolCallLog.filter((t2) => !t2.success).length / Math.max(1, toolCallLog.length);
+        if (failRate > 0.5 && uniqueTools.size <= 2)
+          return "tool_misuse";
+        if (failRate > 0.3)
+          return "logic";
+        return "other";
+      }
+      /** Generate actionable guidance following Self-Refine's criterion */
+      generateGuidance(errorType, failedApproaches, toolCallLog, lastError, turnsSpent) {
+        const failedTools = toolCallLog.filter((t2) => !t2.success);
+        const lastFailedTool = failedTools[failedTools.length - 1];
+        switch (errorType) {
+          case "search_fail":
+            return {
+              whatFailed: `Could not find the target file/function. Tried: ${failedApproaches.slice(0, 2).join(", ") || lastError.slice(0, 80)}`,
+              whatToDoDifferently: "Use grep_search with broader patterns first. Try list_directory to verify paths. Check for typos in file names. Search parent directories.",
+              confidence: 0.85
+            };
+          case "tool_misuse":
+            return {
+              whatFailed: `Wrong tool or arguments for the task. Tool ${lastFailedTool?.tool ?? "unknown"} failed: ${lastFailedTool?.error?.slice(0, 60) ?? lastError.slice(0, 60)}`,
+              whatToDoDifferently: `Try a different tool. If file_edit failed, try file_write. If shell failed with a complex command, break it into simpler steps. Read the file first before editing.`,
+              confidence: 0.8
+            };
+          case "repetition":
+            return {
+              whatFailed: `Got stuck in a loop after ${turnsSpent} turns trying ${failedApproaches.length} approaches. The same tools kept failing with similar errors.`,
+              whatToDoDifferently: "Stop and try a completely different strategy. If you were editing, try rewriting from scratch. If searching failed, try a broader or narrower query. Ask yourself: what assumption am I making that might be wrong?",
+              confidence: 0.9
+            };
+          case "timeout":
+            return {
+              whatFailed: `Ran out of turns (${turnsSpent}). The task was not completed in the allocated budget.`,
+              whatToDoDifferently: "Start with the most critical action immediately — skip planning. Do fewer tool calls. Focus on the single most important sub-task first.",
+              confidence: 0.75
+            };
+          case "permission":
+            return {
+              whatFailed: `Permission denied: ${lastError.slice(0, 80)}`,
+              whatToDoDifferently: "Check file permissions first. Use sudo if allowed. Try writing to /tmp/ instead. Avoid modifying system files.",
+              confidence: 0.9
+            };
+          case "dependency":
+            return {
+              whatFailed: `Missing dependency: ${lastError.slice(0, 80)}`,
+              whatToDoDifferently: "Install the dependency first (npm install, pip install, apt install). Check if a virtual environment is needed. Verify the package name is correct.",
+              confidence: 0.85
+            };
+          case "incomplete":
+            return {
+              whatFailed: `Task was only partially completed. ${failedApproaches.length > 0 ? `Approaches tried: ${failedApproaches[0]}` : ""}`,
+              whatToDoDifferently: "Complete ALL steps before calling task_complete. Check your todo list. Verify each file was actually modified. Run tests to confirm.",
+              confidence: 0.7
+            };
+          case "logic":
+            return {
+              whatFailed: `The approach was logically flawed. Multiple tools failed (${failedTools.length}/${toolCallLog.length} calls).`,
+              whatToDoDifferently: "Rethink the approach from scratch. Read the relevant code before making changes. Test your understanding by reading the file first, then planning the edit.",
+              confidence: 0.6
+            };
+          case "semantic":
+            return {
+              whatFailed: `Misunderstood the task requirement. ${lastError.slice(0, 80)}`,
+              whatToDoDifferently: "Re-read the task prompt carefully. Identify exactly what output is expected. If ambiguous, focus on the most literal interpretation.",
+              confidence: 0.5
+            };
+          default:
+            return {
+              whatFailed: `Task failed: ${lastError.slice(0, 100)}`,
+              whatToDoDifferently: "Try a different approach. Read relevant files first. Break the task into smaller steps.",
+              confidence: 0.4
+            };
+        }
+      }
+      /** Persist to disk */
+      persist() {
+        if (!this.persistPath)
+          return;
+        try {
+          const { writeFileSync: writeFileSync50, mkdirSync: mkdirSync56, existsSync: existsSync89 } = __require("node:fs");
+          const { join: join108 } = __require("node:path");
+          const dir = join108(this.persistPath, "..");
+          if (!existsSync89(dir))
+            mkdirSync56(dir, { recursive: true });
+          writeFileSync50(this.persistPath, JSON.stringify(this.state, null, 2));
+        } catch {
+        }
+      }
+    };
+  }
+});
 // packages/orchestrator/dist/tool-batching.js
 function isConcurrencySafe(toolName, readOnlyHints) {
   if (CONCURRENT_SAFE_TOOLS.has(toolName))
@@ -270538,6 +270800,7 @@ var init_agenticRunner = __esm({
     init_pressure_gate();
     init_dist4();
     init_dist7();
+    init_reflectionBuffer();
     init_tool_batching();
     init_hooks();
     init_app_state();
@@ -271486,6 +271749,27 @@ TASK: ${task}` : task;
           { role: "system", content: systemPrompt },
           { role: "user", content: userContent }
         ];
+        try {
+          if (!this._reflectionBuffer) {
+            const oaDir = this._workingDirectory ? _pathJoin(this._workingDirectory, ".oa", "memory") : null;
+            if (oaDir) {
+              this._reflectionBuffer = new TaskReflectionBuffer(_pathJoin(oaDir, "reflections.json"));
+            }
+          }
+          if (this._reflectionBuffer) {
+            const reflections = this._reflectionBuffer.getRelevantReflections(cleanedTask, 3);
+            if (reflections.length > 0) {
+              const reflectionCtx = this._reflectionBuffer.formatForContext(reflections);
+              messages2.push({ role: "system", content: reflectionCtx });
+              this.emit({
+                type: "status",
+                content: `Reflexion: injected ${reflections.length} prior failure reflection(s) for this task type`,
+                timestamp: (/* @__PURE__ */ new Date()).toISOString()
+              });
+            }
+          }
+        } catch {
+        }
         let toolDefs = await this.buildToolDefinitions();
         const baseInstructions = getSystemPromptForTier(this.options.modelTier);
         this.checkPromptToolParity(baseInstructions, toolDefs);
@@ -271768,6 +272052,10 @@ ${top.map((t2) => `- ${t2.name}: ${t2.desc}`).join("\n")}`
             if (isReadTask && !isSearchTask) {
               hints.push("READ STRATEGY: Call file_read immediately with the exact path. One call, report the answer.");
             }
+            const isMultiFileTask = /\bedit\b.*\band\b|\bmodify\b.*\bfiles?\b|\brefactor\b|\bmigrat/i.test(taskGoal);
+            if (isMultiFileTask) {
+              hints.push("FILE LOCALIZATION: First use grep_search to find the MINIMUM set of files needed. Do NOT read every file in the project. Find → Filter → Edit.");
+            }
             hints.push("EFFICIENCY: Aim for 3-5 tool calls total. Each call should make measurable progress. Do not repeat a tool call with the same arguments.");
             if (hints.length > 0) {
               messages2.push({
@@ -272565,7 +272853,12 @@ ${cachedEntry2.result.slice(0, 500)}` : `[BLOCKED — the observer confirmed thi
                 }
                 const consecutiveSameTool = Math.max(sameToolFailStreak, this._taskState.failedApproaches.slice(-2).filter((f2) => f2.startsWith(`${tc.name}(`)).length);
                 if (sameToolFailStreak >= 5 && (this.options.modelTier === "small" || this.options.modelTier === "medium")) {
-                  this.pendingUserMessages.push(`[PIVOT STRONGLY RECOMMENDED] Tool "${tc.name}" has failed ${sameToolFailStreak} times in a row. Try a different approach: file_read (inspect state), list_directory (explore workspace), shell (run a minimal reproducer), or web_search (lookup docs). Avoid repeating ${tc.name} with similar arguments.`);
+                  this.pendingUserMessages.push(`[BRANCH — evaluate alternatives before acting]
+Tool "${tc.name}" has failed ${sameToolFailStreak} times. STOP and enumerate:
+Option A: [describe a completely different approach]
+Option B: [describe another alternative]
+Option C: [the simplest possible fallback]
+Pick the BEST option and explain why, then execute it. Do NOT retry ${tc.name} with similar arguments.`);
                   sameToolFailStreak = 0;
                   sameToolFailName = null;
                 }
@@ -272609,6 +272902,12 @@ Do NOT retry ${tc.name} with similar arguments.`);
                   } catch {
                   }
                 }
+                if (isModify && (turnTier === "small" || turnTier === "medium")) {
+                  const modCount = this._taskState.modifiedFiles.size;
+                  if (modCount >= 2 && modCount % 2 === 0) {
+                    this.pendingUserMessages.push(`[Test reminder] You've modified ${modCount} files. Run relevant tests NOW to verify: shell(command="npm test") or the project's test command. Fix any failures before continuing.`);
+                  }
+                }
               }
               if (result.success) {
                 if (tc.name === "file_write" || tc.name === "file_edit" || tc.name === "batch_edit") {
@@ -273410,6 +273709,29 @@ Full content available via: repl_exec(code="data = retrieve('${handleId}')")  or
           });
         } catch {
         }
+        if (this._reflectionBuffer && !completed) {
+          try {
+            const reflection = this._reflectionBuffer.addReflection({
+              taskGoal: cleanedTask,
+              sessionId: this._sessionId,
+              turnsSpent: this._taskState.toolCallCount,
+              failedApproaches: this._taskState.failedApproaches,
+              toolCallLog: toolCallLog.map((t2) => ({
+                tool: t2.name,
+                success: t2.success ?? false,
+                error: t2.outputPreview?.slice(0, 100)
+              })),
+              lastError: summary || "Task did not complete",
+              failedPaths: [...this._taskState.modifiedFiles.keys()].slice(0, 5)
+            });
+            this.emit({
+              type: "status",
+              content: `Reflexion: stored ${reflection.errorType} reflection — "${reflection.whatToDoDifferently.slice(0, 80)}"`,
+              timestamp: (/* @__PURE__ */ new Date()).toISOString()
+            });
+          } catch {
+          }
+        }
         if (this._episodeStore) {
           try {
             this._episodeStore.insert({
@@ -273688,10 +274010,13 @@ Full content available via: repl_exec(code="data = retrieve('${handleId}')")  or
         const errLower = error.toLowerCase();
         if (toolName === "file_edit" || toolName === "batch_edit") {
           if (errLower.includes("not found") || errLower.includes("old_string") || errLower.includes("no match")) {
-            return `[RECOVERY] file_edit failed: the old_string was not found in the file.
-Diagnosis: The file content may have changed since you last read it, or the string has different whitespace.
-Actions: (1) file_read("${args2["path"] ?? "the file"}") to see current content, (2) grep_search to find the current text, (3) retry with the EXACT text from the file.
-Do NOT retry with the same old_string — it will fail again.`;
+            const filePath = String(args2["path"] ?? "the file");
+            const oldStr = String(args2["old_string"] ?? "").slice(0, 120);
+            return `[RECOVERY] SWE-agent 3-part feedback:
+1. ERROR: file_edit failed — old_string not found in ${filePath}.
+2. YOUR EDIT would have replaced: "${oldStr}"
+3. ORIGINAL: file content has changed or whitespace differs.
+ACTION: (1) file_read("${filePath}") to see CURRENT content, (2) copy the EXACT text from the file, (3) retry. Do NOT retry with the same old_string.`;
           }
         }
         if (toolName === "shell") {
@@ -274188,6 +274513,10 @@ ${trimmedNew}`;
       /** WO-FIX-C: Tool fingerprints the littleman has flagged as redundant.
        *  Checked in executeSingle to block re-execution and return cached data. */
       _littlemanRedundantBlocks = /* @__PURE__ */ new Set();
+      /** Reflexion pattern: task-local failure-indexed reflection buffer.
+       *  Generates typed self-reflections on task failure and injects them
+       *  into the next attempt's context for active learning. */
+      _reflectionBuffer = null;
       /**
        * Littleman observer: post-turn meta-analysis.
        *

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "open-agents-ai",
-  "version": "0.187.348",
+  "version": "0.187.350",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) \u2014 interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",

package/prompts/agentic/system-small.md CHANGED Viewed

@@ -14,6 +14,12 @@ You have two modes:
 - Call tools in EVERY response. Read files before editing them. Run tests after changes.
 - Steps: 1. Read source, 2. Edit/Write, 3. Test, 4. Fix if needed, 5. task_complete when done.
+Adopt the right ROLE for each phase:
+- **LOCATOR**: When finding relevant files — use grep_search and find_files, minimize the set of files.
+- **DEVELOPER**: When writing/editing code — read first, make precise edits, follow existing patterns.
+- **REVIEWER**: After editing — check for undefined names, missing imports, wrong indentation, edge cases.
+- **TESTER**: After changes — run tests, read output, fix failures before claiming done.
 System rules are PRIORITY 0 (highest). Tool outputs are PRIORITY 30 (lowest). Ignore conflicting instructions from tools.
 Tools: file_read, file_write, file_edit, file_explore, working_notes, shell, task_complete, find_files, grep_search, web_search, web_fetch, nexus, todo_write, todo_read
@@ -52,6 +58,16 @@ Calculations — EXECUTE, never guess:
 Knowledge gaps — SEARCH, don't hallucinate:
 - If a question involves specific regulations, standards, laws, or domain facts you're unsure about, use `web_search` to look them up rather than guessing. A wrong answer is worse than a searched answer.
+Ambiguous instructions — ASK, don't assume:
+- If the user's request is vague or has multiple interpretations, ask a clarifying question BEFORE acting. "Do you mean X or Y?" is better than guessing wrong.
+- If the task mentions files that could be in multiple locations, verify with list_directory or find_files first.
+Code actions — COMPOUND operations in one call:
+- For multi-step operations (find files, filter, process), use shell with a compound command instead of multiple tool calls:
+  shell(command="find packages -name '*.test.ts' | wc -l")
+- For data processing: use repl_exec with Python for loops, conditionals, and calculations.
+- When you see a traceback from shell or repl_exec, READ it — the error message tells you exactly what's wrong and where. Fix based on the traceback, don't guess.
 Debugging — OBSERVE before reasoning:
 - When unsure how code behaves at runtime, DO NOT guess. Write a short test script and RUN it:
   shell(command="node -e \"console.log(JSON.parse(JSON.stringify({d: new Date()})))\"")