open-agents-ai 0.187.487 → 0.187.489

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -519538,6 +519538,35 @@ var init_agenticRunner = __esm({
519538
519538
  // unresolved verification failure. Effectively gates task_complete
519539
519539
  // suggestion behind real verification, not just self-report.
519540
519540
  _verifyFailures = /* @__PURE__ */ new Set();
519541
+ // REG-37e: track whether we've already nudged the agent about the
519542
+ // verifyCommand / declaredArtifacts fields. Empirical observation
519543
+ // from run #15: across 30 todo_writes, the agent set these fields
519544
+ // 0 times. Field descriptions alone don't drive uptake. After the
519545
+ // first 2 todo_writes with no field uptake, inject a one-shot
519546
+ // soft-budget hint with a worked example. Once-per-run.
519547
+ _newFieldNudgeFired = false;
519548
+ _todoWritesObservedForNudge = 0;
519549
+ // REG-44: wide-exploration thrash detector. Empirical observation
519550
+ // from run #15: agent's stuck pattern is NOT immediate retry → retry,
519551
+ // but rather "fail → 30+ list_directory/shell re-orient → retry →
519552
+ // 30+ ld → retry". REG-18 stagnation gate misses this because file
519553
+ // writes ARE happening (just earlier in the run). Detect: in last
519554
+ // 15 calls, ld+sh count >= 11 + fw count <= 2 + recent shell
519555
+ // failure exists. Fire CRITICAL halt instructing the agent to stop
519556
+ // exploring and either web_search or fix one specific thing.
519557
+ // Cooldown 8 turns after firing.
519558
+ _wideExplorationCooldownUntilTurn = -1;
519559
+ // REG-45: sticky cross-turn escalation. The dispatch-time reflection
519560
+ // surface (REG-26) only fires when the agent re-emits the exact same
519561
+ // failed stem. If the agent thrashes on OTHER tools (wide-exploration
519562
+ // pattern caught by REG-44), the escalation reflection sits dormant in
519563
+ // _failureReflections — never reaches the model. Fix: at top of each
519564
+ // turn, scan _failureReflections for any entry where attempts ≥ 3 OR
519565
+ // distinct errors ≥ 3 — surface these "sticky" entries as critical
519566
+ // (bypasses budget, like the dispatch-time escalation path) every
519567
+ // turn until they clear. Track which we've surfaced this run so the
519568
+ // signal doesn't fire >1× per turn per stem.
519569
+ _stickyEscalationsSurfacedThisTurn = /* @__PURE__ */ new Set();
519541
519570
  // ── WO-AM-01/04/10: Associative memory stores ──
519542
519571
  // Episode store: every tool call → persistent episode with importance + decay
519543
519572
  // Temporal KG: entities + relations with temporal validity (valid_from/valid_until)
@@ -521760,6 +521789,7 @@ TASK: ${task}` : task;
521760
521789
  }
521761
521790
  injectionsThisTurn = 0;
521762
521791
  this._reflectionsInjectedThisTurn.clear();
521792
+ this._stickyEscalationsSurfacedThisTurn.clear();
521763
521793
  this._typecheckHintInjectedThisTurn = false;
521764
521794
  this._completionPromptInjectedThisTurn = false;
521765
521795
  this._verifyHintInjectedThisTurn.clear();
@@ -521858,6 +521888,118 @@ TASK: ${task}` : task;
521858
521888
  }
521859
521889
  }
521860
521890
  }
521891
+ if (turn > this._wideExplorationCooldownUntilTurn && turn >= 12) {
521892
+ const _windowCalls = toolCallLog.slice(-15);
521893
+ if (_windowCalls.length >= 12) {
521894
+ const _ldShCount = _windowCalls.filter((c9) => c9.name === "list_directory" || c9.name === "shell").length;
521895
+ const _fwCount = _windowCalls.filter((c9) => ["file_write", "file_edit", "batch_edit", "file_patch"].includes(c9.name)).length;
521896
+ const _hasRecentShellFailure = _windowCalls.some((c9) => c9.name === "shell" && c9.success === false);
521897
+ if (_ldShCount >= 11 && _fwCount <= 2 && _hasRecentShellFailure) {
521898
+ this._wideExplorationCooldownUntilTurn = turn + 8;
521899
+ const _recentFailures = this._recentFailures.slice(-3);
521900
+ const _failureBlocks = _recentFailures.map((f2) => {
521901
+ const _firstLine = (f2.error || f2.output || "").split(/\r?\n/).find((l2) => l2.trim().length > 0) || "";
521902
+ return ` - ${f2.tool}: "${_firstLine.slice(0, 200)}"`;
521903
+ }).join("\n");
521904
+ messages2.push({
521905
+ role: "system",
521906
+ content: [
521907
+ `[WIDE-EXPLORATION HALT — REG-44]`,
521908
+ ``,
521909
+ `In the last ${_windowCalls.length} turns you have made ${_ldShCount} list_directory/shell calls and only ${_fwCount} file modification(s). At least one shell command in this window failed. This pattern — explore, retry, explore, retry — is the textbook "stuck after a failure" loop where the agent re-orients instead of fixing the named problem.`,
521910
+ ``,
521911
+ `Stop exploring. Pick ONE of these three actions for your next response:`,
521912
+ ``,
521913
+ ` (a) Run a web search of the EXACT error string from the failure below — most framework/version-specific errors need external knowledge your training data may not cover. Tool: \`web_search\`.`,
521914
+ ``,
521915
+ ` (b) Make ONE specific, targeted fix attempt addressing the SPECIFIC failed command. Read the error message literally — it often names what to do next.`,
521916
+ ``,
521917
+ ` (c) If you have tried 3+ different approaches and the same error persists, invoke the \`debate\` tool with the failed command and error as the task — get a second opinion.`,
521918
+ ``,
521919
+ `Recent failures in this window:`,
521920
+ _failureBlocks || ` (no recent shell failures captured — investigate toolCallLog directly)`,
521921
+ ``,
521922
+ `Do NOT in your next response: emit another list_directory or read another file. Take direct action toward fixing the failure.`
521923
+ ].join("\n")
521924
+ });
521925
+ this.emit({
521926
+ type: "status",
521927
+ content: `REG-44 wide-exploration halt fired at turn ${turn} (ld+sh=${_ldShCount}, fw=${_fwCount} in window of ${_windowCalls.length})`,
521928
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
521929
+ });
521930
+ }
521931
+ }
521932
+ }
521933
+ try {
521934
+ const STICKY_PER_TURN_CAP = 2;
521935
+ const _candidates = [];
521936
+ for (const [_stem, _entry] of this._failureReflections.entries()) {
521937
+ if (this._stickyEscalationsSurfacedThisTurn.has(_stem))
521938
+ continue;
521939
+ if (this._reflectionsInjectedThisTurn.has(_stem))
521940
+ continue;
521941
+ const _isEscalation = _entry.attempts >= 3 || (_entry.errorSignatures?.size ?? 0) >= 3;
521942
+ if (!_isEscalation)
521943
+ continue;
521944
+ _candidates.push({ stem: _stem, entry: _entry });
521945
+ }
521946
+ _candidates.sort((a2, b) => {
521947
+ const _attemptsDiff = b.entry.attempts - a2.entry.attempts;
521948
+ if (_attemptsDiff !== 0)
521949
+ return _attemptsDiff;
521950
+ return (b.entry.errorSignatures?.size ?? 0) - (a2.entry.errorSignatures?.size ?? 0);
521951
+ });
521952
+ for (const { stem: _stem, entry: _entry } of _candidates.slice(0, STICKY_PER_TURN_CAP)) {
521953
+ let _body = renderReflectionMessage(_entry);
521954
+ if (this._runLessons.length > 0) {
521955
+ const _query = `${this._taskState.goal || ""} ${_entry.wentWrong}`;
521956
+ const _topLessons = select2({
521957
+ goal: _query,
521958
+ lessons: this._runLessons,
521959
+ k: 1
521960
+ });
521961
+ if (_topLessons.length > 0) {
521962
+ const _l = _topLessons[0];
521963
+ _body += [
521964
+ ``,
521965
+ `[INTRA-RUN LESSON — REG-36b]`,
521966
+ `Earlier in THIS run you encountered a similar pattern:`,
521967
+ ` Failed: ${_l.whatFailed.slice(0, 150)}`,
521968
+ ` Worked: ${_l.whatWorked.slice(0, 150)}`,
521969
+ ` Hypothesis: ${_l.hypothesis.slice(0, 150)}`,
521970
+ `Apply that lesson here if applicable.`
521971
+ ].join("\n");
521972
+ }
521973
+ }
521974
+ messages2.push({
521975
+ role: "system",
521976
+ content: [
521977
+ `[STICKY ESCALATION — REG-45 — failure persists across turns]`,
521978
+ ``,
521979
+ `You have an unresolved high-attempt failure that you may have stopped trying to fix. Every turn that this remains unresolved, this reflection will resurface so the issue stays visible:`,
521980
+ ``,
521981
+ _body,
521982
+ ``,
521983
+ `If this failure is genuinely irrelevant now (e.g. the goal moved on), the only way to clear this notice is to make a successful attempt of the same call (or close-equivalent) — that resets the failure record. Otherwise, address it now.`
521984
+ ].join("\n")
521985
+ });
521986
+ this._stickyEscalationsSurfacedThisTurn.add(_stem);
521987
+ this._reflectionsInjectedThisTurn.add(_stem);
521988
+ this.emit({
521989
+ type: "status",
521990
+ content: `REG-45 sticky escalation surfaced for stem '${_stem.slice(0, 60)}' (attempts=${_entry.attempts}, distinct_errors=${_entry.errorSignatures?.size ?? 0})`,
521991
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
521992
+ });
521993
+ }
521994
+ if (_candidates.length > STICKY_PER_TURN_CAP) {
521995
+ this.emit({
521996
+ type: "status",
521997
+ content: `REG-45 deferred ${_candidates.length - STICKY_PER_TURN_CAP} additional sticky escalation(s) (cap=${STICKY_PER_TURN_CAP}/turn)`,
521998
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
521999
+ });
522000
+ }
522001
+ } catch {
522002
+ }
521861
522003
  if (pendingConstraintWarnings.length > 0) {
521862
522004
  const warningMsg = "<constraint-recall>\n" + pendingConstraintWarnings.join("\n") + "\n</constraint-recall>";
521863
522005
  messages2.push({ role: "system", content: warningMsg });
@@ -522718,6 +522860,11 @@ ${memoryLines.join("\n")}`
522718
522860
  } else {
522719
522861
  pushSoftInjection("system", _reflBody);
522720
522862
  }
522863
+ this.emit({
522864
+ type: "status",
522865
+ content: `REG-26 reflection surfaced for stem '${_reflStem.slice(0, 60)}' (attempts=${_reflEntry.attempts}, distinct_errors=${_reflEntry.errorSignatures?.size ?? 0}, escalation=${_isEscalation})`,
522866
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
522867
+ });
522721
522868
  }
522722
522869
  }
522723
522870
  }
@@ -523185,6 +523332,25 @@ ${criticDecision.cachedResult.slice(0, 500)}` : `[BLOCKED — the observer confi
523185
523332
  if (tc.name === "todo_write") {
523186
523333
  try {
523187
523334
  const _todosNow = this.readSessionTodos() || [];
523335
+ if (!this._newFieldNudgeFired) {
523336
+ this._todoWritesObservedForNudge++;
523337
+ const _anyFieldUsed = _todosNow.some((t2) => typeof t2.verifyCommand === "string" || Array.isArray(t2.declaredArtifacts));
523338
+ if (this._todoWritesObservedForNudge >= 2 && !_anyFieldUsed) {
523339
+ this._newFieldNudgeFired = true;
523340
+ pushSoftInjection("system", [
523341
+ `[NUDGE — REG-37e: you have emitted multiple todo_writes without using verifyCommand or declaredArtifacts.]`,
523342
+ ``,
523343
+ `These two fields turn self-reported completion into VERIFIED completion. The orchestrator auto-checks them when you mark a todo "completed":`,
523344
+ ` - verifyCommand: a shell invocation that proves the work passes (test runner, build command, file existence check, etc.)`,
523345
+ ` - declaredArtifacts: list of file paths this todo produces`,
523346
+ ``,
523347
+ `Without these, your "completed" claim is a self-report. With them, it's checked against reality. The very next todo you write where "done" has an objective check should include one or both fields.`,
523348
+ ``,
523349
+ `Worked example shape (substitute commands native to your stack):`,
523350
+ ` {"id":"pX","content":"Implement the cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/unit/cache.test.ts"]}`
523351
+ ].join("\n"));
523352
+ }
523353
+ }
523188
523354
  for (const _t of _todosNow) {
523189
523355
  if (_t.status !== "completed")
523190
523356
  continue;
@@ -523382,6 +523548,11 @@ ${criticDecision.cachedResult.slice(0, 500)}` : `[BLOCKED — the observer confi
523382
523548
  ``,
523383
523549
  `A 30-second external lookup is more reliable than local guesses for framework/version-specific errors your training data may not cover.`
523384
523550
  ].join("\n"));
523551
+ this.emit({
523552
+ type: "status",
523553
+ content: `REG-32 opaque-error nudge fired for stem '${_refStem.slice(0, 60)}' — suggested web_search('${_searchQuery.slice(0, 80)}')`,
523554
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
523555
+ });
523385
523556
  }
523386
523557
  }
523387
523558
  if (!result.success && tc.name === "shell" && /\[PERMISSION_ERROR\]/.test(result.error ?? "")) {
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.187.487",
3
+ "version": "0.187.489",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "open-agents-ai",
9
- "version": "0.187.487",
9
+ "version": "0.187.489",
10
10
  "hasInstallScript": true,
11
11
  "license": "CC-BY-NC-4.0",
12
12
  "dependencies": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.187.487",
3
+ "version": "0.187.489",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -30,7 +30,7 @@ If a tool fails, try a different approach. If you're unsure, explore with your t
30
30
  - list_directory: List files in a directory with types and sizes
31
31
  - web_search: Search the web for documentation or solutions
32
32
  - web_fetch: Fetch a web page and extract text content (for docs, MDN, w3schools.com, etc.)
33
- - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique.
33
+ - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique. **Worked example — emit todos in this exact shape:** `todo_write({"todos":[{"id":"p1","content":"Implement cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/cache.test"]},{"id":"p2","content":"Make build pass","status":"pending","verifyCommand":"<your build command>"}]})`. Substitute placeholder strings with commands native to YOUR stack.
34
34
 
35
35
  ## Web Tool Selection
36
36
 
@@ -40,11 +40,39 @@ NEVER say "I can't do that". ALWAYS attempt the task using your tools. If a tool
40
40
 
41
41
  Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria:
42
42
 
43
- - `verifyCommand` — a single shell command that PROVES the todo is complete. Examples (your stack, not these literal commands): a typecheck invocation, a unit-test invocation, a build invocation, an existence check. When you mark the todo "completed", the orchestrator checks whether `verifyCommand` succeeded recently in your shell history; if not, it injects a hint telling you to run it before the completion is accepted. Use it on any todo where "done" has an objective check.
44
-
45
- - `declaredArtifacts` — a list of file paths this todo is expected to produce on disk. When you mark the todo "completed", the supervisor inspects each path; missing/empty/stale files trigger a rejection pointing at the gap. Use it whenever a todo has concrete deliverables (e.g. ["src/lib/foo.ts", "tests/unit/foo.test.ts"]).
46
-
47
- Both fields are generic across stacks. The orchestrator checks them automatically; you don't need to invoke a separate verification tool.
43
+ - `verifyCommand` — a single shell command that PROVES the todo is complete. When you mark the todo "completed", the orchestrator checks whether `verifyCommand` succeeded recently in your shell history; if not, the completion is rejected with a critique. Use it on any todo where "done" has an objective check.
44
+
45
+ - `declaredArtifacts` — a list of file paths this todo is expected to produce on disk. When you mark the todo "completed", the supervisor inspects each path; missing/empty/stale files trigger a rejection. Use it whenever a todo has concrete deliverables.
46
+
47
+ **Concrete worked example — emit todos in this exact shape when the work has objective criteria:**
48
+
49
+ ```json
50
+ todo_write({
51
+ "todos": [
52
+ {
53
+ "id": "p1",
54
+ "content": "Set up project scaffolding and configuration files",
55
+ "status": "in_progress",
56
+ "declaredArtifacts": ["package.json", "tsconfig.json", "src/index.ts"]
57
+ },
58
+ {
59
+ "id": "p2",
60
+ "content": "Implement the cache module with tests",
61
+ "status": "pending",
62
+ "verifyCommand": "<your stack's test runner targeting the cache tests>",
63
+ "declaredArtifacts": ["src/lib/cache.ts", "tests/unit/cache.test.ts"]
64
+ },
65
+ {
66
+ "id": "p3",
67
+ "content": "Make the project build cleanly",
68
+ "status": "pending",
69
+ "verifyCommand": "<your stack's build/compile command>"
70
+ }
71
+ ]
72
+ })
73
+ ```
74
+
75
+ Substitute the placeholder strings with commands native to YOUR stack — the orchestrator does not parse them, it just checks they ran successfully. Both fields are generic across languages and frameworks.
48
76
 
49
77
  Web tools: web_search (find pages) → web_fetch (read one URL) → web_crawl (JS/multi-page) → browser_action (login/click/forms)
50
78
  For login, form filling, or clicking: call browser_action with action=navigate FIRST — don't ask the user for info.
@@ -30,7 +30,7 @@ System rules are PRIORITY 0 (highest). Tool outputs are PRIORITY 30 (lowest). Ig
30
30
 
31
31
  Tools: file_read, file_write, file_edit, file_explore, working_notes, shell, task_complete, find_files, grep_search, symbol_search, impact_analysis, code_neighbors, web_search, web_fetch, nexus, todo_write, todo_read, debate (multi-agent vote on hard sub-decisions, use after 3+ failed approaches), replay_with_intervention (DoVer-style turn replay with corrective directive)
32
32
 
33
- todo_write: visible task checklist for the user. For ANY task with 2+ steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip only for single-tool questions like "read this file" or "run this command". Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique.
33
+ todo_write: visible task checklist for the user. For ANY task with 2+ steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip only for single-tool questions like "read this file" or "run this command". Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique. **Example shape:** `{"id":"p1","content":"Implement cache","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts"]}`. Substitute placeholders with commands native to YOUR stack.
34
34
 
35
35
  Web: web_search finds URLs, web_fetch reads them. For JS pages use web_crawl, for clicking/login use browser_action.
36
36