jeo-code 0.1.0 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +160 -0
- package/README.ko.md +160 -0
- package/README.md +115 -297
- package/README.zh.md +160 -0
- package/package.json +11 -6
- package/scripts/install.sh +28 -28
- package/scripts/uninstall.sh +17 -15
- package/src/AGENTS.md +50 -0
- package/src/agent/AGENTS.md +49 -0
- package/src/agent/bash-fixups.ts +103 -0
- package/src/agent/compaction.ts +410 -19
- package/src/agent/config-schema.ts +119 -5
- package/src/agent/context-files.ts +314 -17
- package/src/agent/dev/AGENTS.md +36 -0
- package/src/agent/dev/advanced-analyzer.ts +12 -0
- package/src/agent/dev/evolution-bridge.ts +82 -0
- package/src/agent/dev/evolution-logger.ts +41 -0
- package/src/agent/dev/self-analysis.ts +64 -0
- package/src/agent/dev/self-improve.ts +24 -0
- package/src/agent/dev/spec-automation.ts +49 -0
- package/src/agent/engine.ts +804 -54
- package/src/agent/hooks.ts +273 -0
- package/src/agent/loop.ts +21 -1
- package/src/agent/memory.ts +201 -0
- package/src/agent/model-recency.ts +32 -0
- package/src/agent/output-minimizer.ts +108 -0
- package/src/agent/output-util.ts +64 -0
- package/src/agent/plan.ts +187 -0
- package/src/agent/seed.ts +52 -0
- package/src/agent/session.ts +235 -21
- package/src/agent/state.ts +286 -39
- package/src/agent/step-budget.ts +232 -0
- package/src/agent/subagents.ts +223 -26
- package/src/agent/task-tool.ts +272 -0
- package/src/agent/todo-tool.ts +87 -0
- package/src/agent/tokenizer.ts +117 -0
- package/src/agent/tool-registry.ts +54 -0
- package/src/agent/tools.ts +562 -103
- package/src/agent/web-search.ts +538 -0
- package/src/ai/AGENTS.md +44 -0
- package/src/ai/index.ts +1 -0
- package/src/ai/model-catalog-compat.ts +3 -1
- package/src/ai/model-catalog.ts +74 -9
- package/src/ai/model-discovery.ts +215 -17
- package/src/ai/model-manager.ts +346 -32
- package/src/ai/model-picker.ts +1 -1
- package/src/ai/model-registry.ts +4 -2
- package/src/ai/pricing.ts +84 -0
- package/src/ai/provider-registry.ts +23 -0
- package/src/ai/provider-status.ts +60 -16
- package/src/ai/providers/AGENTS.md +42 -0
- package/src/ai/providers/anthropic.ts +250 -31
- package/src/ai/providers/antigravity.ts +219 -0
- package/src/ai/providers/errors.ts +15 -1
- package/src/ai/providers/gemini.ts +196 -13
- package/src/ai/providers/ollama.ts +37 -7
- package/src/ai/providers/openai-responses.ts +173 -0
- package/src/ai/providers/openai.ts +64 -12
- package/src/ai/sse.ts +4 -1
- package/src/ai/types.ts +18 -1
- package/src/auth/AGENTS.md +41 -0
- package/src/auth/callback-server.ts +6 -1
- package/src/auth/flows/AGENTS.md +32 -0
- package/src/auth/flows/antigravity.ts +151 -0
- package/src/auth/flows/google-project.ts +190 -0
- package/src/auth/flows/google.ts +39 -18
- package/src/auth/flows/index.ts +15 -5
- package/src/auth/flows/openai.ts +2 -2
- package/src/auth/oauth.ts +8 -0
- package/src/auth/refresh.ts +44 -27
- package/src/auth/storage.ts +149 -26
- package/src/auth/types.ts +1 -1
- package/src/autopilot.ts +362 -0
- package/src/bun-imports.d.ts +4 -0
- package/src/cli/AGENTS.md +39 -0
- package/src/cli/runner.ts +148 -14
- package/src/cli.ts +13 -4
- package/src/commands/AGENTS.md +40 -0
- package/src/commands/approve.ts +62 -3
- package/src/commands/auth.ts +167 -25
- package/src/commands/chat.ts +37 -8
- package/src/commands/deep-interview.ts +633 -175
- package/src/commands/doctor.ts +84 -37
- package/src/commands/evolve-core.ts +18 -0
- package/src/commands/evolve.ts +2 -1
- package/src/commands/export.ts +176 -0
- package/src/commands/gjc.ts +52 -0
- package/src/commands/launch.ts +3549 -240
- package/src/commands/mcp.ts +3 -3
- package/src/commands/ooo-seed.ts +19 -0
- package/src/commands/ralplan.ts +253 -35
- package/src/commands/resume.ts +1 -1
- package/src/commands/session.ts +183 -0
- package/src/commands/setup-helpers.ts +10 -3
- package/src/commands/setup.ts +57 -16
- package/src/commands/skills.ts +78 -18
- package/src/commands/state.ts +198 -0
- package/src/commands/status.ts +84 -0
- package/src/commands/team.ts +340 -212
- package/src/commands/ultragoal.ts +122 -61
- package/src/commands/update.ts +244 -0
- package/src/ledger.ts +270 -0
- package/src/mcp/AGENTS.md +38 -0
- package/src/mcp/server.ts +115 -14
- package/src/mcp/tools.ts +42 -22
- package/src/md-modules.d.ts +4 -0
- package/src/prompts/AGENTS.md +41 -0
- package/src/prompts/agents/AGENTS.md +35 -0
- package/src/prompts/agents/architect.md +35 -0
- package/src/prompts/agents/critic.md +37 -0
- package/src/prompts/agents/executor.md +36 -0
- package/src/prompts/agents/planner.md +37 -0
- package/src/prompts/skills/AGENTS.md +36 -0
- package/src/prompts/skills/deep-dive/AGENTS.md +31 -0
- package/src/prompts/skills/deep-dive/SKILL.md +13 -0
- package/src/prompts/skills/deep-interview/AGENTS.md +31 -0
- package/src/prompts/skills/deep-interview/SKILL.md +12 -0
- package/src/prompts/skills/gjc/AGENTS.md +31 -0
- package/src/prompts/skills/gjc/SKILL.md +15 -0
- package/src/prompts/skills/ralplan/AGENTS.md +31 -0
- package/src/prompts/skills/ralplan/SKILL.md +11 -0
- package/src/prompts/skills/team/AGENTS.md +31 -0
- package/src/prompts/skills/team/SKILL.md +11 -0
- package/src/prompts/skills/ultragoal/AGENTS.md +31 -0
- package/src/prompts/skills/ultragoal/SKILL.md +11 -0
- package/src/skills/AGENTS.md +38 -0
- package/src/skills/catalog.ts +565 -31
- package/src/tui/AGENTS.md +43 -0
- package/src/tui/app.ts +1181 -92
- package/src/tui/components/AGENTS.md +42 -0
- package/src/tui/components/ascii-art.ts +257 -15
- package/src/tui/components/autocomplete.ts +98 -16
- package/src/tui/components/autopilot-status.ts +65 -0
- package/src/tui/components/category-index.ts +49 -0
- package/src/tui/components/code-view.ts +54 -11
- package/src/tui/components/color.ts +171 -2
- package/src/tui/components/config-panel.ts +82 -15
- package/src/tui/components/duration.ts +38 -0
- package/src/tui/components/evolution.ts +3 -3
- package/src/tui/components/footer.ts +91 -42
- package/src/tui/components/forge.ts +426 -31
- package/src/tui/components/hints.ts +54 -0
- package/src/tui/components/hud.ts +73 -0
- package/src/tui/components/index.ts +4 -0
- package/src/tui/components/input-box.ts +150 -0
- package/src/tui/components/layout.ts +11 -3
- package/src/tui/components/live-model-picker.ts +108 -0
- package/src/tui/components/markdown-table.ts +140 -0
- package/src/tui/components/markdown-text.ts +97 -0
- package/src/tui/components/meter.ts +4 -1
- package/src/tui/components/model-picker.ts +3 -2
- package/src/tui/components/provider-picker.ts +3 -2
- package/src/tui/components/section.ts +70 -0
- package/src/tui/components/select-list.ts +40 -10
- package/src/tui/components/skill-picker.ts +25 -0
- package/src/tui/components/slash.ts +244 -21
- package/src/tui/components/status.ts +272 -11
- package/src/tui/components/step-timeline.ts +218 -0
- package/src/tui/components/stream.ts +26 -9
- package/src/tui/components/themes.ts +212 -6
- package/src/tui/components/todo-card.ts +47 -0
- package/src/tui/components/tool-list.ts +58 -12
- package/src/tui/components/transcript.ts +120 -0
- package/src/tui/components/update-box.ts +31 -0
- package/src/tui/components/welcome.ts +162 -0
- package/src/tui/components/width.ts +163 -0
- package/src/tui/monitoring/AGENTS.md +31 -0
- package/src/tui/monitoring/hud-view.ts +55 -0
- package/src/tui/renderer.ts +112 -3
- package/src/tui/terminal.ts +40 -33
- package/src/util/AGENTS.md +39 -0
- package/src/util/clipboard-image.ts +118 -0
- package/src/util/env.ts +12 -0
- package/src/util/provider-error.ts +78 -0
- package/src/util/retry.ts +91 -6
- package/src/util/update-check.ts +64 -0
- package/src/commands/models.ts +0 -104
package/src/agent/engine.ts
CHANGED
|
@@ -1,16 +1,39 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Reusable agentic tool-call loop — the shared core behind `
|
|
3
|
-
* (per-task executor) and `
|
|
2
|
+
* Reusable agentic tool-call loop — the shared core behind `jeo team`
|
|
3
|
+
* (per-task executor) and `jeo launch` (interactive coding agent).
|
|
4
4
|
*
|
|
5
5
|
* The model is driven in JSON tool-call mode: each step it emits exactly one
|
|
6
6
|
* `{ "tool": "...", "arguments": { ... } }` object; the engine dispatches it,
|
|
7
7
|
* appends the result to history, and continues until the model calls `done`
|
|
8
8
|
* or the step budget is exhausted.
|
|
9
9
|
*/
|
|
10
|
-
import
|
|
10
|
+
import * as fs from "node:fs/promises";
|
|
11
|
+
import * as path from "node:path";
|
|
12
|
+
import type { Message } from "./loop";
|
|
11
13
|
import { extractJsonObject } from "./json";
|
|
12
|
-
import { readTool, writeTool, editTool, bashTool, findTool, searchTool, type ToolResult } from "./tools";
|
|
14
|
+
import { readTool, writeTool, editTool, bashTool, findTool, searchTool, lsTool, type ToolResult } from "./tools";
|
|
15
|
+
import { webSearchTool, setWebSearchActiveModel } from "./web-search";
|
|
16
|
+
import { friendlyProviderError, isContextOverflowError, isRefusalError } from "../util/provider-error";
|
|
17
|
+
import { isRateLimitError } from "../util/retry";
|
|
18
|
+
import { runPreToolHooks, runPostTurnHooks } from "./hooks";
|
|
19
|
+
import { minimizeToolOutput } from "./output-minimizer";
|
|
20
|
+
import { StepBudget, dynamicStepBudgetConfig, resolveStepBudgetConfig, hashSignature, type StepBudgetConfig } from "./step-budget";
|
|
21
|
+
import { historyTokens, trimToolResultsInPlace } from "./compaction";
|
|
22
|
+
import { jeoEnv } from "../util/env";
|
|
13
23
|
|
|
24
|
+
|
|
25
|
+
/**
 * Forward one model request to `callLlm`, resolving the `./loop` module with a
 * dynamic `import()` at call time rather than through a static value import.
 *
 * @param history Conversation to send to the model.
 * @param options Per-call settings, handed to `callLlm` unchanged.
 * @returns Whatever `callLlm` resolves to.
 */
async function invokeCallLlm(
  history: Message[],
  options: {
    jsonMode: boolean;
    model?: string;
    maxTokens?: number;
    signal?: AbortSignal;
    onUsage?: (u: { inputTokens?: number; outputTokens?: number }) => void;
    onRetry?: (attempt: number, err: unknown, delayMs: number) => void;
    onToken?: (delta: string) => void;
  },
): Promise<string> {
  const { callLlm } = await import("./loop");
  return callLlm(history, options);
}
|
|
14
37
|
export interface ToolInvocation {
|
|
15
38
|
tool: string;
|
|
16
39
|
arguments?: Record<string, any>;
|
|
@@ -20,47 +43,122 @@ export type ToolHandler = (args: Record<string, any>, cwd: string) => Promise<To
|
|
|
20
43
|
|
|
21
44
|
/** The default executor toolset (read / write / edit / bash / find / search / ls / web_search). */
|
|
22
45
|
export const DEFAULT_TOOLS: Record<string, ToolHandler> = {
|
|
23
|
-
read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange, cwd),
|
|
46
|
+
read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange ?? a.range, cwd, !!a.raw),
|
|
24
47
|
write: (a, cwd) => writeTool(a.filePath ?? a.path, a.content ?? "", cwd),
|
|
25
48
|
edit: (a, cwd) => editTool(a.filePath ?? a.path, a.editBlock ?? a.edit ?? "", cwd),
|
|
26
|
-
bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined),
|
|
49
|
+
bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined, typeof a.cwd === "string" ? a.cwd : (typeof a.subdir === "string" ? a.subdir : undefined), a.env && typeof a.env === "object" ? a.env : undefined),
|
|
27
50
|
find: (a, cwd) => findTool(a.globPattern ?? a.pattern, cwd),
|
|
28
|
-
search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd),
|
|
51
|
+
search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd, !!(a.ignoreCase ?? a.i), { before: a.before, after: a.after, context: a.context, maxMatches: a.maxMatches }),
|
|
52
|
+
ls: (a, cwd) => lsTool(a.dirPath ?? a.path ?? a.dir ?? ".", cwd),
|
|
53
|
+
web_search: (a, cwd) => webSearchTool(a, cwd),
|
|
29
54
|
};
|
|
30
55
|
|
|
31
56
|
/** Tool-protocol description injected into the system prompt. */
|
|
32
57
|
export const TOOL_PROTOCOL = [
|
|
33
|
-
"You have these tools (call exactly ONE per step):",
|
|
34
|
-
"1. read {filePath, lineRange?}
|
|
58
|
+
"You have these tools (call exactly ONE per step, or batch multiple independent calls):",
|
|
59
|
+
"1. read {filePath, lineRange?, raw?} — read a file; lines are prefixed `LINEhh|` (hh = 2-char content anchor; the | is a separator, not file bytes)",
|
|
35
60
|
"2. write {filePath, content} — create/overwrite a file",
|
|
36
|
-
"3. edit {filePath, editBlock} — ≔A..B replace lines; ≔A+ insert after line A; ≔$ append EOF (payload on next line)",
|
|
37
|
-
"4. bash {command, timeoutMs?}
|
|
61
|
+
"3. edit {filePath, editBlock} — ≔A..B replace lines (append read anchors for safety: ≔12ab..15cd — rejected with fresh content if the lines changed); ≔A+ insert after line A; ≔$ append EOF (payload on next line). NEVER copy the `LINEhh|` prefixes into SEARCH blocks or payloads",
|
|
62
|
+
"4. bash {command, timeoutMs?, cwd?, env?} — run a shell command (cwd: subdir; env: extra vars)",
|
|
38
63
|
"5. find {globPattern} — find files by name",
|
|
39
|
-
"6. search {pattern, globPattern?}
|
|
40
|
-
"7.
|
|
64
|
+
"6. search {pattern, globPattern?, ignoreCase?, context?, maxMatches?} — grep (context: N lines around each match)",
|
|
65
|
+
"7. ls {dirPath} — list a directory's entries (dirs first)",
|
|
66
|
+
"8. web_search {query, recency?, limit?} — search the web (Anthropic-native: synthesized answer + sources + citations)",
|
|
67
|
+
"9. done {reason?} — call when the task is fully implemented AND verified",
|
|
68
|
+
"",
|
|
69
|
+
"Reply with STRICT JSON only — no code fences. You MAY include an optional leading",
|
|
70
|
+
'"reasoning" string (one short sentence on your plan) before "tool":',
|
|
71
|
+
'{ "reasoning": "<one short sentence>", "tool": "<name>", "arguments": { ... } }',
|
|
72
|
+
"",
|
|
73
|
+
"Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
|
|
74
|
+
'{ "reasoning": "<one short sentence>", "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
|
|
75
|
+
"Batch only independent calls; NEVER batch 'done', and NEVER put a mutating tool (write/edit/bash) after another mutating tool in one batch whose inputs depend on the earlier one.",
|
|
76
|
+
].join("\n");
|
|
77
|
+
|
|
78
|
+
/** Restricted protocol for read-only subagent roles (planner/architect/critic):
|
|
79
|
+
* advertises only the non-mutating tools so the model does not waste steps
|
|
80
|
+
* calling write/edit/bash, which `subagentToolset` has physically removed. */
|
|
81
|
+
export const READONLY_TOOL_PROTOCOL = [
|
|
82
|
+
"You have these READ-ONLY tools (call exactly ONE per step, or batch multiple independent calls):",
|
|
83
|
+
"1. read {filePath, lineRange?} — read a file (lineRange: \"a-b\", \"a-\", \"a\", \"a+n\", or multi \"a-b,c-d\")",
|
|
84
|
+
"2. find {globPattern} — find files by name",
|
|
85
|
+
"3. search {pattern, globPattern?, ignoreCase?} — grep for a pattern",
|
|
86
|
+
"4. ls {dirPath} — list a directory's entries",
|
|
87
|
+
"5. web_search {query, recency?, limit?} — search the web (answer + sources + citations)",
|
|
88
|
+
"6. done {reason?} — call when your review/analysis is complete",
|
|
41
89
|
"",
|
|
42
90
|
"Reply with STRICT JSON only — no prose, no code fences:",
|
|
43
91
|
'{ "tool": "<name>", "arguments": { ... } }',
|
|
92
|
+
"",
|
|
93
|
+
"Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
|
|
94
|
+
'{ "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
|
|
95
|
+
"Batch only independent calls; NEVER batch 'done'.",
|
|
96
|
+
].join("\n");
|
|
97
|
+
|
|
98
|
+
/** gjc-inherited working discipline (plan/gjc-inheritance.md B3): the completion
|
|
99
|
+
* contract and tool-priority rules distilled from gjc's system prompt — compact
|
|
100
|
+
* (<300 tokens) per the pi-mono budget so the core prompt stays lean. */
|
|
101
|
+
export const WORKING_DISCIPLINE = [
|
|
102
|
+
"Working discipline:",
|
|
103
|
+
"- Correctness first, maintainability second, brevity third. Prefer boring, explicit code.",
|
|
104
|
+
"- Never present partial work as complete; never suppress tests or warnings to make code pass.",
|
|
105
|
+
"- Never fabricate tool results or test outcomes; verification claims must match what was actually run.",
|
|
106
|
+
"- Never ship stubs, placeholders, or TODO-only code as a delivered feature.",
|
|
107
|
+
"- Never substitute the requested problem with an easier adjacent one.",
|
|
108
|
+
"- Update directly affected callsites, tests, and docs — or state why they are unchanged.",
|
|
109
|
+
"- Reuse existing patterns; parallel conventions are prohibited. Fix problems at their source.",
|
|
110
|
+
"- You are not alone in the repository: treat unexpected changes as user work; never revert or delete them.",
|
|
111
|
+
"- Re-read before acting if a tool fails or a file may have changed.",
|
|
112
|
+
"- Prefer dedicated tools over shell pipelines: read (not cat), search (not grep), edit (not sed).",
|
|
44
113
|
].join("\n");
|
|
45
114
|
|
|
46
|
-
/**
 * Assemble the system prompt for a tool-calling agent.
 *
 * The prompt is four sections separated by blank lines: a two-line introduction
 * naming the role, the tool protocol, the shared `WORKING_DISCIPLINE` rules, and
 * the closing verification directive.
 *
 * @param role Role description interpolated into "You are the …".
 * @param protocol Tool-protocol text to advertise; defaults to `TOOL_PROTOCOL`.
 * @param verificationDirective Final line of the prompt.
 * @returns The complete system prompt.
 */
export function executorSystemPrompt(
  role = "Executor Agent, a senior software developer",
  protocol: string = TOOL_PROTOCOL,
  verificationDirective = "Always verify (run tests / execute the program) before calling done.",
): string {
  const intro = `You are the ${role}.\nAccomplish the user's request by calling tools and verifying your work.`;
  return `${intro}\n\n${protocol}\n\n${WORKING_DISCIPLINE}\n\n${verificationDirective}`;
}
|
|
54
128
|
|
|
55
129
|
export interface AgentLoopEvents {
|
|
56
|
-
onStep?(step: number): void
|
|
130
|
+
onStep?(step: number): void | Promise<void>;
|
|
57
131
|
onAssistant?(raw: string, invocation: ToolInvocation | null): void;
|
|
58
132
|
onToolResult?(tool: string, success: boolean, output: string): void;
|
|
59
|
-
|
|
133
|
+
/** Transient progress notice (e.g. "rate limited — retrying in Ns"); NOT a terminal error. */
|
|
134
|
+
onNotice?(message: string): void;
|
|
135
|
+
/** Cumulative token usage after each LLM call — drives live usage meters. */
|
|
136
|
+
onUsage?(usage: { inputTokens: number; outputTokens: number }): void;
|
|
137
|
+
/** Accumulated streamed model response so far — drives the live reasoning view. Only
|
|
138
|
+
* requested when a consumer sets it (the engine streams solely for the TUI). */
|
|
139
|
+
onModelStream?(textSoFar: string): void;
|
|
140
|
+
/** Step-budget change (gjc-style retry flow): the limit was extended because the
|
|
141
|
+
* turn is making progress. `limit` is the new max; `reason` is display-ready. */
|
|
142
|
+
onBudget?(limit: number, reason: string): void;
|
|
143
|
+
/** Consulted when a lone `done` arrives. Return a corrective message to bounce
|
|
144
|
+
* the done ONCE (e.g. "todo list still shows unfinished items — update it
|
|
145
|
+
* first"); return null to let the turn finish. The engine guarantees at most
|
|
146
|
+
* one bounce per turn, so a stubborn model can never loop here. */
|
|
147
|
+
onBeforeDone?(reason: string): string | null;
|
|
60
148
|
}
|
|
61
149
|
|
|
62
150
|
export interface AgentLoopOptions {
|
|
151
|
+
/** Optional system prompt: prepended to `history` when it has no system message. */
|
|
152
|
+
systemPrompt?: string;
|
|
153
|
+
/** Mid-turn context budget (estimated tokens). When the in-turn history grows
|
|
154
|
+
* past this, the OLDEST tool-result bodies are deterministically elided so a
|
|
155
|
+
* long turn cannot snowball into multi-million-token prompts. Default 80k. */
|
|
156
|
+
maxHistoryTokens?: number;
|
|
63
157
|
cwd: string;
|
|
158
|
+
/** Base step budget (default 15). Non-finite or `<= 0` selects the DYNAMIC budget:
|
|
159
|
+
* the budget keeps extending while the recent tool window shows NOVEL progress,
|
|
160
|
+
* a stalled or cycling turn consolidates a final wrap-up, and a large finite
|
|
161
|
+
* safety cap (`DYNAMIC_HARD_CAP`, default 600) guarantees termination. */
|
|
64
162
|
maxSteps?: number;
|
|
65
163
|
model?: string;
|
|
66
164
|
/** Max generation tokens per step (drives the thinking budget). */
|
|
@@ -68,6 +166,9 @@ export interface AgentLoopOptions {
|
|
|
68
166
|
tools?: Record<string, ToolHandler>;
|
|
69
167
|
signal?: AbortSignal;
|
|
70
168
|
events?: AgentLoopEvents;
|
|
169
|
+
/** Step-budget overrides (gjc-style retry flow). `{ maxExtensions: 0 }` restores the
|
|
170
|
+
* legacy fixed counter — used by bounded subagent delegation. */
|
|
171
|
+
budget?: Partial<StepBudgetConfig>;
|
|
71
172
|
}
|
|
72
173
|
|
|
73
174
|
export interface AgentLoopResult {
|
|
@@ -78,28 +179,150 @@ export interface AgentLoopResult {
|
|
|
78
179
|
usage?: { inputTokens: number; outputTokens: number };
|
|
79
180
|
}
|
|
80
181
|
|
|
182
|
+
/**
 * Env-tunable output budget (plan/gjc-inheritance.md B10, carried over from gjc's
 * settings-driven output handling): JEO_TOOL_OUTPUT_MAX caps the model-visible
 * tool result; the spill threshold tracks it so anything truncated stays
 * artifact-recoverable.
 *
 * @returns The configured value truncated to an integer when it is a finite
 *   number within [500, 200000]; otherwise the default of 4000.
 */
function envOutputMax(): number {
  const fallback = 4_000;
  const configured = Number(jeoEnv("TOOL_OUTPUT_MAX") ?? "");
  if (!Number.isFinite(configured)) return fallback;
  if (configured < 500 || configured > 200_000) return fallback;
  return Math.trunc(configured);
}
/** Cap for model-visible tool output, resolved once when this module is loaded. */
export const TOOL_OUTPUT_MAX = envOutputMax();
|
|
190
|
+
|
|
191
|
+
/**
 * Wall-clock budget for ONE agent turn (ms). JEO_TURN_MAX_MS overrides; 0 disables.
 * Default 30 minutes: long autonomous runs stay alive, while a turn that spins in
 * "thinking" (huge contexts, endless extensions) is guaranteed to terminate into
 * the consolidation wrap-up instead of running for hours.
 *
 * @param env Environment map handed to `jeoEnv`; defaults to `process.env`.
 * @returns The override truncated to an integer when it is a finite number >= 0;
 *   otherwise 30 minutes. An unset, empty, or whitespace-only override is treated
 *   as "not configured".
 */
export function turnMaxMs(env: Record<string, string | undefined> = process.env): number {
  const DEFAULT_TURN_MS = 30 * 60 * 1000;
  const raw = jeoEnv("TURN_MAX_MS", env);
  // Trim before the emptiness check: Number("   ") is 0, and 0 means "disabled",
  // so a whitespace-only value would otherwise silently switch the budget off.
  const text = raw == null ? "" : String(raw).trim();
  if (text === "") return DEFAULT_TURN_MS;
  const parsed = Number(text);
  if (Number.isFinite(parsed) && parsed >= 0) return Math.trunc(parsed);
  return DEFAULT_TURN_MS;
}
|
|
203
|
+
|
|
81
204
|
/**
 * Cap a tool result fed back to the model, keeping both ends: the head holds the
 * start (e.g. a file's top / a command's invocation) and the tail holds what's
 * usually decisive (test summaries, the final error). A pure head-cut loses that.
 *
 * The head receives 60% of `max` (rounded down) and the tail the remainder; the
 * two are joined by a marker line stating how many characters were dropped, so
 * the returned string is slightly longer than `max`. Neither cut is allowed to
 * land inside a UTF-16 surrogate pair: the head shrinks by one unit, or the tail
 * starts one unit later, so no lone surrogate reaches the model.
 *
 * @param s Full tool output.
 * @param max Target size in UTF-16 code units; defaults to `TOOL_OUTPUT_MAX`.
 * @returns `s` unchanged when it already fits, otherwise head + marker + tail.
 */
export function truncateToolOutput(s: string, max = TOOL_OUTPUT_MAX): string {
  if (s.length <= max) return s;
  const head = Math.floor(max * 0.6);
  const tail = max - head;

  const isHighSurrogate = (i: number): boolean => {
    const c = s.charCodeAt(i);
    return c >= 0xd800 && c <= 0xdbff;
  };
  const isLowSurrogate = (i: number): boolean => {
    const c = s.charCodeAt(i);
    return c >= 0xdc00 && c <= 0xdfff;
  };
  // True when position `i` sits between the two halves of a surrogate pair.
  const splitsPair = (i: number): boolean =>
    i > 0 && i < s.length && isHighSurrogate(i - 1) && isLowSurrogate(i);

  let headEnd = head;
  let tailStart = s.length - tail;
  if (splitsPair(headEnd)) headEnd -= 1;
  if (splitsPair(tailStart)) tailStart += 1;

  // Equals `s.length - max` whenever no boundary had to move.
  const dropped = tailStart - headEnd;
  return `${s.slice(0, headEnd)}\n…(${dropped} chars truncated)…\n${s.slice(tailStart)}`;
}
|
|
92
215
|
|
|
216
|
+
/** Tool output larger than this is spilled to a recoverable artifact file. Aligned
|
|
217
|
+
* with `truncateToolOutput`'s cap so that whenever the model-visible result drops
|
|
218
|
+
* content, the full output is recoverable via the artifact. */
|
|
219
|
+
export const TOOL_SPILL_THRESHOLD = TOOL_OUTPUT_MAX;
|
|
220
|
+
|
|
221
|
+
/** Most recent tool-result artifacts to keep; older ones are pruned on each spill. */
export const MAX_TOOL_ARTIFACTS = 50;

/**
 * Best-effort retention: keep the newest `MAX_TOOL_ARTIFACTS` entries in `dir`,
 * delete the rest.
 *
 * Every filesystem call swallows its own error, so this never rejects: an
 * unreadable directory is treated as empty, an entry that cannot be stat'ed is
 * given mtime 0 (and is therefore pruned first), and a failed delete is ignored.
 *
 * @param dir Directory holding the spilled artifacts.
 */
async function pruneToolArtifacts(dir: string): Promise<void> {
  const files = await fs.readdir(dir).catch(() => [] as string[]);
  if (files.length <= MAX_TOOL_ARTIFACTS) return;
  const stamped = await Promise.all(
    files.map(async f => ({ f, m: (await fs.stat(path.join(dir, f)).catch(() => null))?.mtimeMs ?? 0 })),
  );
  // Newest first. Entries with equal mtimes (coarse filesystem timestamps, rapid
  // spills) are ordered by name descending: spilled names start with a
  // `Date.now()` stamp, so survival no longer depends on readdir order.
  stamped.sort((a, b) => b.m - a.m || (a.f < b.f ? 1 : a.f > b.f ? -1 : 0));
  for (const { f } of stamped.slice(MAX_TOOL_ARTIFACTS)) {
    await fs.rm(path.join(dir, f), { force: true }).catch(() => {});
  }
}

/**
 * Write an oversized tool result verbatim under `.jeo/artifacts/tool-results/` and
 * return the workspace-relative path (for the model to `read`). Best-effort: throws
 * are caught by the caller, which simply omits the artifact note.
 *
 * The file name is `<Date.now()>-<random suffix>-<tool>.txt`, where the tool name
 * is reduced to `[a-zA-Z0-9_-]` (other characters become `_`), capped at 32
 * characters, and replaced by `tool` when nothing is left.
 *
 * @param tool Name of the tool that produced the output.
 * @param output Full, untruncated tool output.
 * @param cwd Workspace root under which the `.jeo` directory is created.
 * @returns Path of the written artifact, relative to `cwd`.
 */
export async function spillToolResult(tool: string, output: string, cwd: string): Promise<string> {
  const dir = path.join(cwd, ".jeo", "artifacts", "tool-results");
  await fs.mkdir(dir, { recursive: true });
  const safeTool = tool.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 32) || "tool";
  const stamp = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const rel = path.join(".jeo", "artifacts", "tool-results", `${stamp}-${safeTool}.txt`);
  await fs.writeFile(path.join(cwd, rel), output, "utf-8");
  // Retention so a long session can't grow the artifact dir without bound.
  await pruneToolArtifacts(dir);
  return rel;
}
|
|
253
|
+
|
|
254
|
+
/**
 * Levenshtein distance between two strings, compared one UTF-16 code unit at a
 * time. Intended for small inputs such as tool or command names.
 *
 * A single DP row is reused: before column `j` is updated it still holds the
 * value from the previous row, and `diagonal` carries the previous row's value
 * for column `j - 1`.
 *
 * @returns Minimum number of single-unit insertions, deletions, and substitutions
 *   that turn `a` into `b`.
 */
function editDistance(a: string, b: string): number {
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const row: number[] = [];
  for (let j = 0; j <= b.length; j++) row.push(j);

  for (let i = 1; i <= a.length; i++) {
    let diagonal = row[0];
    row[0] = i;
    for (let j = 1; j <= b.length; j++) {
      const above = row[j];
      const substitution = diagonal + (a[i - 1] === b[j - 1] ? 0 : 1);
      row[j] = Math.min(above + 1, row[j - 1] + 1, substitution);
      diagonal = above;
    }
  }
  return row[b.length];
}
|
|
271
|
+
|
|
272
|
+
/**
 * Suggest the known tool name closest to an unrecognised one.
 *
 * Matching is case-insensitive on the trimmed input. A name equal to the input
 * is returned immediately. Otherwise each candidate is scored: 1 when either
 * string is a prefix of the other, else their edit distance. The lowest score
 * wins (the first candidate on ties) and is returned only if it is at most 2.
 *
 * @param name Tool name as supplied by the model.
 * @param known Registered tool names, returned in their original casing.
 * @returns The best match, or `undefined` for blank input or when nothing is
 *   close enough.
 */
export function nearestToolName(name: string, known: string[]): string | undefined {
  const wanted = name.trim().toLowerCase();
  if (wanted.length === 0) return undefined;

  let closest: string | undefined;
  let closestScore = Infinity;
  for (const candidate of known) {
    const lowered = candidate.toLowerCase();
    if (lowered === wanted) return candidate;
    const sharesPrefix = lowered.startsWith(wanted) || wanted.startsWith(lowered);
    const score = sharesPrefix ? 1 : editDistance(wanted, lowered);
    if (score < closestScore) {
      closestScore = score;
      closest = candidate;
    }
  }

  if (closestScore > 2) return undefined;
  return closest;
}
|
|
93
286
|
/**
|
|
94
287
|
* Drive `history` through the tool-call loop, mutating it in place so callers
|
|
95
288
|
* (e.g. an interactive REPL) can keep the conversation across multiple turns.
|
|
96
289
|
*/
|
|
97
290
|
export async function runAgentLoop(history: Message[], opts: AgentLoopOptions): Promise<AgentLoopResult> {
|
|
98
291
|
const { cwd } = opts;
|
|
292
|
+
// Active-model gate for web_search's provider chain (gjc parity): the chain
|
|
293
|
+
// prefers the active model's native search backend, never credential-scanning.
|
|
294
|
+
setWebSearchActiveModel(opts.model);
|
|
295
|
+
// Honor an explicit system prompt for callers that build history without one.
|
|
296
|
+
if (opts.systemPrompt && history[0]?.role !== "system") {
|
|
297
|
+
history.unshift({ role: "system", content: opts.systemPrompt });
|
|
298
|
+
}
|
|
99
299
|
const tools = opts.tools ?? DEFAULT_TOOLS;
|
|
100
300
|
const maxSteps = opts.maxSteps ?? 15;
|
|
301
|
+
// gjc-style retry flow: the step limit is a flexible BUDGET, not a bare counter.
|
|
302
|
+
// While the recent window shows real progress the budget extends itself; a stalled
|
|
303
|
+
// turn fails fast into the consolidation wrap-up. An explicit positive maxSteps
|
|
304
|
+
// keeps the bounded flow (base + capped extensions); a non-finite / non-positive
|
|
305
|
+
// maxSteps selects the DYNAMIC budget — extensions keep flowing while NOVEL
|
|
306
|
+
// progress continues, a stalled/cycling window consolidates, and a large finite
|
|
307
|
+
// safety cap (default 600 steps) guarantees the turn always terminates.
|
|
308
|
+
const budget = new StepBudget(
|
|
309
|
+
Number.isFinite(maxSteps) && maxSteps > 0
|
|
310
|
+
? resolveStepBudgetConfig(maxSteps, process.env, opts.budget)
|
|
311
|
+
: dynamicStepBudgetConfig(process.env, opts.budget),
|
|
312
|
+
);
|
|
313
|
+
// Why the loop stopped at the limit — folded into the consolidation message.
|
|
314
|
+
let budgetStopReason = "";
|
|
101
315
|
const ev = opts.events ?? {};
|
|
316
|
+
const maxHistoryTokens = Math.max(10_000, opts.maxHistoryTokens ?? 80_000);
|
|
102
317
|
|
|
318
|
+
// Wall-clock turn budget — the definitive "never sits in thinking forever"
|
|
319
|
+
// guarantee. Step budgets bound the COUNT of model calls; this bounds their total
|
|
320
|
+
// TIME: a turn that crosses it stops at the next loop boundary and consolidates a
|
|
321
|
+
// wrap-up instead of spinning for hours under a generous dynamic step cap.
|
|
322
|
+
const turnStartedAt = Date.now();
|
|
323
|
+
const turnBudgetMs = turnMaxMs();
|
|
324
|
+
// "steps" | "time" — drives honest wording in the consolidation message.
|
|
325
|
+
let stopKind: "steps" | "time" = "steps";
|
|
103
326
|
let step = 1;
|
|
104
327
|
const acc = { inputTokens: 0, outputTokens: 0 };
|
|
105
328
|
let sawUsage = false;
|
|
@@ -111,98 +334,625 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
|
|
|
111
334
|
// calls (bad edits, failing commands) would otherwise burn the whole step budget.
|
|
112
335
|
const MAX_FAILURES = 5;
|
|
113
336
|
let consecutiveFailures = 0;
|
|
337
|
+
// done-verification guard (plan/gjc-inheritance.md B4, a lightweight adaptation of gjc's ultragoal-guard):
|
|
338
|
+
// a turn that MUTATED files but shows no verification signal gets ONE pushback on
|
|
339
|
+
// `done` — run the relevant test/build, or call done again (the escape hatch for
|
|
340
|
+
// doc/config changes where verification is genuinely not applicable).
|
|
341
|
+
let sawMutation = false;
|
|
342
|
+
let sawVerification = false;
|
|
343
|
+
let donePushbackUsed = false;
|
|
344
|
+
// Caller-owned done gate (onBeforeDone) — also strictly once per turn.
|
|
345
|
+
let beforeDoneNudgeUsed = false;
|
|
346
|
+
// F1 (round 4): the run-command of the most recent post-turn hook FAILURE whose
|
|
347
|
+
// diagnostics the model saw but has not yet resolved (a later clean hook run
|
|
348
|
+
// clears it). The done guard treats this as "verification missing" — the hook
|
|
349
|
+
// exit code is the strongest correctness signal in the loop.
|
|
350
|
+
let pendingHookFailure: string | null = null;
|
|
351
|
+
// Round-6 #4: ONE reactive recovery when the PROVIDER reports context overflow
|
|
352
|
+
// (authoritative where the local estimate drifted — images, tokenizer mismatch).
|
|
353
|
+
let contextOverflowRetryUsed = false;
|
|
354
|
+
// Refusal recovery budget: a safety refusal (HTTP 200, no content) on routine
|
|
355
|
+
// coding work is usually a transient false-positive. Retry the SAME step once
|
|
356
|
+
// as-is, then once more with an explicit re-grounding note; only a third
|
|
357
|
+
// refusal in the turn surfaces the (friendly) error. Bounded per turn so a
|
|
358
|
+
// genuinely refused request can never burn billed calls in a loop.
|
|
359
|
+
const MAX_REFUSAL_RETRIES = 3;
|
|
360
|
+
let refusalRetries = 0;
|
|
361
|
+
const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
|
|
114
362
|
let lastSig = "";
|
|
115
363
|
let repeatCount = 0;
|
|
116
|
-
|
|
364
|
+
// Cycle guard (the A↔B ping-pong the exact-repeat guard cannot see): the recent
|
|
365
|
+
// executed step signatures, as fixed-size digests. When a full window cycles
|
|
366
|
+
// through ≤2 distinct calls, bounce ONCE with an explicit correction; a spin that
|
|
367
|
+
// persists through the correction stops the turn.
|
|
368
|
+
const CYCLE_WINDOW = 6;
|
|
369
|
+
const recentStepSigs: string[] = [];
|
|
370
|
+
let cycleBounceUsed = false;
|
|
371
|
+
// Invalid-tool-call guard: a model that returns JSON without a usable `tool`
|
|
372
|
+
// field can't drive the loop at all — surface that clearly instead of looping.
|
|
373
|
+
let invalidToolCalls = 0;
|
|
374
|
+
// Prose-bounce guard: after this many invalid-JSON corrections, salvage the
|
|
375
|
+
// model's text as the final answer instead of burning the whole step budget.
|
|
376
|
+
const MAX_PARSE_BOUNCES = 2;
|
|
377
|
+
let parseFailures = 0;
|
|
378
|
+
while (true) {
|
|
379
|
+
if (turnBudgetMs > 0 && Date.now() - turnStartedAt > turnBudgetMs) {
|
|
380
|
+
stopKind = "time";
|
|
381
|
+
budgetStopReason = `turn wall-clock budget of ${Math.round(turnBudgetMs / 60_000)}m exceeded (JEO_TURN_MAX_MS) without done`;
|
|
382
|
+
break;
|
|
383
|
+
}
|
|
384
|
+
if (step > budget.limit()) {
|
|
385
|
+
const decision = budget.tryExtend();
|
|
386
|
+
if (!decision.extend) {
|
|
387
|
+
budgetStopReason = decision.reason;
|
|
388
|
+
break;
|
|
389
|
+
}
|
|
390
|
+
// One surface per sink: budget-aware consumers get onBudget; others the notice.
|
|
391
|
+
if (ev.onBudget) ev.onBudget(decision.limit, decision.reason);
|
|
392
|
+
else ev.onNotice?.(decision.reason);
|
|
393
|
+
}
|
|
117
394
|
if (opts.signal?.aborted) {
|
|
118
395
|
return finish({ done: false, steps: step - 1, doneReason: "Cancelled." });
|
|
119
396
|
}
|
|
120
|
-
ev.onStep?.(step);
|
|
397
|
+
await ev.onStep?.(step);
|
|
398
|
+
|
|
399
|
+
// MID-TURN context guard: a single long turn (60+ steps) otherwise grows the
|
|
400
|
+
// history without bound — turn-boundary compaction never runs inside a turn,
|
|
401
|
+
// and field evidence shows multi-million-token prompts degrading the model
|
|
402
|
+
// into repeat loops while cost compounds. Deterministically elide the OLDEST
|
|
403
|
+
// tool-result bodies once the estimate crosses the budget; recent evidence
|
|
404
|
+
// and all assistant/user content stay intact.
|
|
405
|
+
if (historyTokens(history) > maxHistoryTokens) {
|
|
406
|
+
const res = trimToolResultsInPlace(history, { budgetTokens: maxHistoryTokens });
|
|
407
|
+
if (res.trimmed > 0) {
|
|
408
|
+
ev.onNotice?.(`context guard: elided ${res.trimmed} older tool result(s) mid-turn (~${Math.round(res.tokens / 1000)}k tokens kept)`);
|
|
409
|
+
}
|
|
410
|
+
}
|
|
121
411
|
|
|
412
|
+
// Stream the response into the live reasoning view ONLY when a consumer is attached
|
|
413
|
+
// (a TUI). Non-interactive/test callers leave onModelStream unset → a single
|
|
414
|
+
// non-streaming call(), unchanged. The accumulated text is still parsed as one JSON
|
|
415
|
+
// tool call below, so streaming changes nothing about loop semantics.
|
|
416
|
+
let streamBuf = "";
|
|
417
|
+
const onToken = ev.onModelStream
|
|
418
|
+
? (delta: string) => { streamBuf += delta; ev.onModelStream!(streamBuf); }
|
|
419
|
+
: undefined;
|
|
122
420
|
let responseText: string;
|
|
123
421
|
try {
|
|
124
|
-
responseText = await
|
|
422
|
+
responseText = await invokeCallLlm(history, {
|
|
125
423
|
jsonMode: true,
|
|
126
424
|
model: opts.model,
|
|
127
425
|
maxTokens: opts.maxTokens,
|
|
128
426
|
signal: opts.signal,
|
|
129
427
|
onUsage: u => { acc.inputTokens += u.inputTokens ?? 0; acc.outputTokens += u.outputTokens ?? 0; sawUsage = true; },
|
|
428
|
+
onToken,
|
|
429
|
+
// Make provider auto-retry visible: previously a rate-limited call sat in a
|
|
430
|
+
// silent backoff wait, then surfaced "auto-retry was exhausted" with no trace
|
|
431
|
+
// of the retries that DID happen.
|
|
432
|
+
onRetry: (attempt, err, delayMs) => {
|
|
433
|
+
const wait = Math.max(1, Math.round(delayMs / 1000));
|
|
434
|
+
const what = isRateLimitError(err) ? "rate limited (HTTP 429)" : "transient provider error";
|
|
435
|
+
ev.onNotice?.(`${what} — auto-retry #${attempt} in ${wait}s`);
|
|
436
|
+
},
|
|
130
437
|
});
|
|
131
438
|
} catch (err) {
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
//
|
|
439
|
+
// Reactive context recovery: trim older tool results in place and retry the
|
|
440
|
+
// SAME step once. The provider's overflow signal beats the local estimate;
|
|
441
|
+
// a second overflow (or nothing left to trim) surfaces the friendly error.
|
|
442
|
+
if (isContextOverflowError(err) && !contextOverflowRetryUsed) {
|
|
443
|
+
contextOverflowRetryUsed = true;
|
|
444
|
+
// keepRecent 2 (vs the proactive guard's 8): the provider already REJECTED
|
|
445
|
+
// this prompt — freeing real space beats keeping evidence that can be re-run.
|
|
446
|
+
const res = trimToolResultsInPlace(history, { budgetTokens: Math.max(1, Math.floor(maxHistoryTokens / 2)), keepRecent: 2 });
|
|
447
|
+
if (res.trimmed > 0) {
|
|
448
|
+
ev.onNotice?.(`provider reported context overflow — elided ${res.trimmed} older tool result(s), retrying once`);
|
|
449
|
+
continue; // free retry: the step counter is unchanged
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
// Reactive refusal recovery (the "stop_reason=refusal" dead turn). Anthropic's
|
|
453
|
+
// contract: a refusal means the streaming classifier tripped on the CURRENT
|
|
454
|
+
// conversation content, and the context must be RESET before continuing —
|
|
455
|
+
// resending the same prompt keeps refusing deterministically. Ladder:
|
|
456
|
+
// 1) plain resend — covers a transient classifier flake (the OAuth payload
|
|
457
|
+
// also rotates its per-request user id, which alone can clear a trip);
|
|
458
|
+
// 2) classifier reset — elide tool-result bodies (the usual trigger is
|
|
459
|
+
// freshly-read file/search content, not the task itself) and append a
|
|
460
|
+
// NEUTRAL continuation note. The note deliberately never mentions the
|
|
461
|
+
// safety layer: arguing with the filter reads as a jailbreak attempt
|
|
462
|
+
// and escalates instead of recovering.
|
|
463
|
+
// 3) guidance strip — with tool results already gone, the remaining
|
|
464
|
+
// classifier-trigger candidate is the repo-authored prose injected
|
|
465
|
+
// into the SYSTEM prompt (<project_context> — AGENTS.md / rules can
|
|
466
|
+
// contain text that trips content filters even though the task is
|
|
467
|
+
// routine). Strip that block for the rest of the turn and retry once;
|
|
468
|
+
// core instructions stay intact. Field case: `$gjc init` inside a
|
|
469
|
+
// repo whose guidance files refuse-trip the OAuth classifier.
|
|
470
|
+
if (isRefusalError(err) && refusalRetries < MAX_REFUSAL_RETRIES) {
|
|
471
|
+
refusalRetries++;
|
|
472
|
+
if (refusalRetries === 1) {
|
|
473
|
+
ev.onNotice?.("provider refused the last call (no content) — retrying the same step");
|
|
474
|
+
continue; // free resend: the step counter is unchanged
|
|
475
|
+
}
|
|
476
|
+
if (refusalRetries === 2) {
|
|
477
|
+
const res = trimToolResultsInPlace(history, { budgetTokens: 0, keepRecent: 0 });
|
|
478
|
+
ev.onNotice?.(
|
|
479
|
+
res.trimmed > 0
|
|
480
|
+
? `provider refused again — reset ${res.trimmed} tool result(s) from the context and retrying (refusals require a context reset)`
|
|
481
|
+
: "provider refused again — continuing with a fresh instruction",
|
|
482
|
+
);
|
|
483
|
+
history.push({
|
|
484
|
+
role: "user",
|
|
485
|
+
content:
|
|
486
|
+
"(continuation) The previous response returned no content and older tool outputs were elided from this conversation. " +
|
|
487
|
+
"Re-assess the task from the remaining context and reply with exactly one JSON tool call " +
|
|
488
|
+
'{"tool":"<name>","arguments":{...}} — re-run any tool whose output you still need, ' +
|
|
489
|
+
'or send {"tool":"done","arguments":{"reason":"<summary>"}} if the task is finished.',
|
|
490
|
+
});
|
|
491
|
+
step++;
|
|
492
|
+
continue;
|
|
493
|
+
}
|
|
494
|
+
const sys = history[0];
|
|
495
|
+
if (sys?.role === "system" && sys.content.includes("<project_context>")) {
|
|
496
|
+
const stripped = sys.content.replace(/\n*<project_context>[\s\S]*?<\/project_context>/, "").trimEnd();
|
|
497
|
+
history[0] = { ...sys, content: stripped }; // replace, never mutate (identity caches)
|
|
498
|
+
ev.onNotice?.("provider refused a third time — removed project-context guidance from the system prompt and retrying once more");
|
|
499
|
+
continue; // same step, reduced system prompt
|
|
500
|
+
}
|
|
501
|
+
// Nothing left to strip — fall through to the friendly terminal error
|
|
502
|
+
// instead of burning an identical billed call.
|
|
503
|
+
}
|
|
504
|
+
const message = friendlyProviderError(err);
|
|
505
|
+
// The error IS the turn's doneReason and every caller displays that — emitting a
|
|
506
|
+
// separate error event here printed the same message twice (live stream + reply).
|
|
135
507
|
return finish({ done: false, steps: step, doneReason: `Error: ${message}` });
|
|
136
508
|
}
|
|
509
|
+
if (sawUsage) ev.onUsage?.({ ...acc });
|
|
137
510
|
|
|
138
|
-
let invocation:
|
|
511
|
+
let invocation: any;
|
|
139
512
|
try {
|
|
140
|
-
invocation = extractJsonObject<
|
|
513
|
+
invocation = extractJsonObject<any>(responseText);
|
|
141
514
|
} catch (err) {
|
|
142
|
-
// Not valid tool-call JSON — show the model the error and let it retry.
|
|
143
515
|
ev.onAssistant?.(responseText, null);
|
|
516
|
+
// Prose salvage: a reply with no JSON object at all is a chat-style final
|
|
517
|
+
// answer, not a malformed tool call. Bouncing it back only made the model
|
|
518
|
+
// apologize for the format — and that apology surfaced as the visible reply.
|
|
519
|
+
// Same salvage after repeated bounces: the text we have IS the best answer.
|
|
520
|
+
const trimmed = responseText.trim();
|
|
521
|
+
parseFailures++;
|
|
522
|
+
if (trimmed && (!trimmed.includes("{") || parseFailures > MAX_PARSE_BOUNCES)) {
|
|
523
|
+
history.push({ role: "assistant", content: responseText });
|
|
524
|
+
return finish({ done: true, steps: step, doneReason: trimmed });
|
|
525
|
+
}
|
|
144
526
|
history.push({ role: "assistant", content: responseText });
|
|
145
527
|
history.push({
|
|
146
528
|
role: "user",
|
|
147
529
|
content:
|
|
148
530
|
`Your last reply was not a valid tool call (${(err as Error).message}). ` +
|
|
149
|
-
`
|
|
531
|
+
`Do NOT apologize or explain the formatting mistake. If that reply was your final answer, ` +
|
|
532
|
+
`resend it as {"tool":"done","arguments":{"reason":"<that answer, verbatim>"}}; ` +
|
|
533
|
+
`otherwise reply with exactly one JSON tool call: {"tool":"<name>","arguments":{...}}.`,
|
|
534
|
+
});
|
|
535
|
+
step++;
|
|
536
|
+
continue;
|
|
537
|
+
}
|
|
538
|
+
// A successfully parsed reply ends any bounce streak: MAX_PARSE_BOUNCES is a
|
|
539
|
+
// CONSECUTIVE-failure salvage, not a cumulative one — without this reset a long
|
|
540
|
+
// turn accumulated scattered parse slips and prematurely salvaged mid-task prose.
|
|
541
|
+
parseFailures = 0;
|
|
542
|
+
|
|
543
|
+
// Normalize to an invocation list
|
|
544
|
+
let toolCalls: { tool: string; arguments?: Record<string, any> }[] = [];
|
|
545
|
+
if (invocation && typeof invocation === "object") {
|
|
546
|
+
if (Array.isArray(invocation.tools)) {
|
|
547
|
+
const isValidBatch = invocation.tools.length > 0 && invocation.tools.every(
|
|
548
|
+
(t: any) => t && typeof t === "object" && typeof t.tool === "string" && t.tool.trim().length > 0
|
|
549
|
+
);
|
|
550
|
+
if (isValidBatch) {
|
|
551
|
+
toolCalls = invocation.tools.map((t: any) => ({
|
|
552
|
+
tool: t.tool.trim(),
|
|
553
|
+
arguments: t.arguments
|
|
554
|
+
}));
|
|
555
|
+
}
|
|
556
|
+
} else if (typeof invocation.tool === "string" && invocation.tool.trim().length > 0) {
|
|
557
|
+
toolCalls = [{
|
|
558
|
+
tool: invocation.tool.trim(),
|
|
559
|
+
arguments: invocation.arguments
|
|
560
|
+
}];
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
if (toolCalls.length === 0) {
|
|
565
|
+
invalidToolCalls++;
|
|
566
|
+
if (invalidToolCalls >= MAX_REPEAT) {
|
|
567
|
+
return finish({
|
|
568
|
+
done: false,
|
|
569
|
+
steps: step,
|
|
570
|
+
doneReason: `Stopped: the model returned no valid tool call ${MAX_REPEAT}× (a JSON reply with no valid "tool" or "tools" field). The selected model may be too small to follow the JSON tool protocol — switch to a stronger model with /model.`,
|
|
571
|
+
});
|
|
572
|
+
}
|
|
573
|
+
history.push({ role: "assistant", content: responseText });
|
|
574
|
+
history.push({
|
|
575
|
+
role: "user",
|
|
576
|
+
content: `Your last reply had no "tool" or "tools" field. Reply with exactly one JSON object, e.g. {"tool":"find","arguments":{"globPattern":"src/**"}} or {"tools":[{"tool":"read","arguments":{"filePath":"src/main.ts"}}, ...]}.`,
|
|
150
577
|
});
|
|
151
578
|
step++;
|
|
152
579
|
continue;
|
|
153
580
|
}
|
|
581
|
+
invalidToolCalls = 0;
|
|
154
582
|
|
|
155
|
-
|
|
583
|
+
if (toolCalls.length > 6) {
|
|
584
|
+
ev.onNotice?.(`Too many tool calls in batch (${toolCalls.length}); capping at 6 and dropping the rest.`);
|
|
585
|
+
toolCalls = toolCalls.slice(0, 6);
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
ev.onAssistant?.(responseText, toolCalls[0]);
|
|
156
589
|
|
|
157
|
-
if (
|
|
158
|
-
|
|
590
|
+
if (toolCalls.length === 1 && toolCalls[0].tool === "done") {
|
|
591
|
+
if (sawMutation && (!sawVerification || pendingHookFailure !== null) && !donePushbackUsed) {
|
|
592
|
+
donePushbackUsed = true; // second done always passes — escape hatch
|
|
593
|
+
history.push({ role: "assistant", content: responseText });
|
|
594
|
+
history.push({
|
|
595
|
+
role: "user",
|
|
596
|
+
content: pendingHookFailure !== null
|
|
597
|
+
? `Your latest mutation left the post-turn hook "${pendingHookFailure}" FAILING (non-zero exit) — its diagnostics were shown in the tool result above. ` +
|
|
598
|
+
"Fix the reported problems (the hook re-runs on your next mutation), then call done. " +
|
|
599
|
+
"If the hook failure is a false positive, call done again and say why in the reason."
|
|
600
|
+
: "You modified files this turn but ran NO verification (no test/build/typecheck command succeeded). " +
|
|
601
|
+
"Run the narrowest command that proves your change works, then call done. " +
|
|
602
|
+
"If verification is genuinely not applicable (docs/config-only change), call done again and say why in the reason.",
|
|
603
|
+
});
|
|
604
|
+
step++;
|
|
605
|
+
continue;
|
|
606
|
+
}
|
|
607
|
+
// Caller-owned done gate (e.g. stale-todo reconciliation): ONE bounded
|
|
608
|
+
// bounce, then any later done passes — field case: a 28-step turn ended
|
|
609
|
+
// [DONE] with the Todos checklist still showing 1 in-progress + 4 pending
|
|
610
|
+
// because nothing ever forced a status update.
|
|
611
|
+
if (!beforeDoneNudgeUsed && ev.onBeforeDone) {
|
|
612
|
+
const nudge = ev.onBeforeDone((toolCalls[0].arguments?.reason as string) ?? "");
|
|
613
|
+
if (nudge) {
|
|
614
|
+
beforeDoneNudgeUsed = true;
|
|
615
|
+
history.push({ role: "assistant", content: responseText });
|
|
616
|
+
history.push({ role: "user", content: nudge });
|
|
617
|
+
ev.onNotice?.("done deferred once — final plan reconciliation requested");
|
|
618
|
+
step++;
|
|
619
|
+
continue;
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
return finish({ done: true, steps: step, doneReason: (toolCalls[0].arguments?.reason as string) ?? "" });
|
|
159
623
|
}
|
|
160
624
|
|
|
161
|
-
//
|
|
162
|
-
|
|
625
|
+
// Anti-spin guard, checked BEFORE execution: a repeated identical step must
|
|
626
|
+
// not run its calls again — a repeated mutating bash/edit must not execute
|
|
627
|
+
// a third time merely to be detected.
|
|
628
|
+
// - 2nd identical step → ONE corrective bounce (skip execution, tell the
|
|
629
|
+
// model its previous identical call already ran and to either act
|
|
630
|
+
// differently or call done). Field evidence: long turns died here right
|
|
631
|
+
// after a SUCCESSFUL write because nothing ever told the model to stop
|
|
632
|
+
// repeating — a recovery prompt resolves that without killing the turn.
|
|
633
|
+
// - 3rd identical step (repeated through the explicit correction) → stop.
|
|
634
|
+
const callSigs = toolCalls.map(c => `${c.tool}:${JSON.stringify(c.arguments ?? {})}`);
|
|
635
|
+
// Fixed-size digest of the whole step — `write` signatures embed entire file
|
|
636
|
+
// bodies, so the repeat/cycle guards compare digests, not megabyte strings.
|
|
637
|
+
const sig = hashSignature(callSigs.join(" | "));
|
|
163
638
|
if (sig === lastSig) repeatCount++;
|
|
164
639
|
else {
|
|
165
640
|
repeatCount = 1;
|
|
166
641
|
lastSig = sig;
|
|
167
642
|
}
|
|
643
|
+
if (repeatCount === 2) {
|
|
644
|
+
const what = toolCalls.length === 1 ? `'${toolCalls[0].tool}' call` : "tool batch";
|
|
645
|
+
history.push({ role: "assistant", content: responseText });
|
|
646
|
+
history.push({
|
|
647
|
+
role: "user",
|
|
648
|
+
content:
|
|
649
|
+
`You just repeated the EXACT same ${what} you already ran in the previous step — it was not re-executed. ` +
|
|
650
|
+
`Its result has not changed. If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
|
|
651
|
+
`otherwise take a DIFFERENT next action (verify the result, move to the next file, or fix something new).`,
|
|
652
|
+
});
|
|
653
|
+
ev.onNotice?.(`repeated ${what} skipped — asked the model to act differently or call done`);
|
|
654
|
+
step++;
|
|
655
|
+
continue;
|
|
656
|
+
}
|
|
168
657
|
if (repeatCount >= MAX_REPEAT) {
|
|
658
|
+
const what = toolCalls.length === 1 ? `the same '${toolCalls[0].tool}' call` : "the same tool calls";
|
|
169
659
|
return finish({
|
|
170
660
|
done: false,
|
|
171
661
|
steps: step,
|
|
172
|
-
doneReason: `Stopped: repeated
|
|
662
|
+
doneReason: `Stopped: repeated ${what} ${MAX_REPEAT}× even after an explicit correction (the model never signaled done).`,
|
|
173
663
|
});
|
|
174
664
|
}
|
|
175
665
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
666
|
+
// Cycle guard: an A↔B (or A↔B↔C-minus-one) alternation never trips the
|
|
667
|
+
// exact-repeat guard above — each step differs from its immediate predecessor —
|
|
668
|
+
// yet it is the same spin (field case: re-reading one file and re-running one
|
|
669
|
+
// command forever, "thinking" never ends). Detect a full recent window that
|
|
670
|
+
// cycles through ≤2 distinct step signatures: ONE corrective bounce (skip
|
|
671
|
+
// execution — a repeated mutating call must not run again merely to be
|
|
672
|
+
// detected), then stop if the spin survives the explicit correction.
|
|
673
|
+
recentStepSigs.push(sig);
|
|
674
|
+
if (recentStepSigs.length > CYCLE_WINDOW) recentStepSigs.shift();
|
|
675
|
+
if (recentStepSigs.length === CYCLE_WINDOW && new Set(recentStepSigs).size <= 2) {
|
|
676
|
+
if (!cycleBounceUsed) {
|
|
677
|
+
cycleBounceUsed = true;
|
|
678
|
+
recentStepSigs.length = 0; // fresh window: the correction earns a real retry
|
|
679
|
+
history.push({ role: "assistant", content: responseText });
|
|
680
|
+
history.push({
|
|
681
|
+
role: "user",
|
|
682
|
+
content:
|
|
683
|
+
`You are cycling through the same ${new Set(callSigs).size <= 1 ? "tool call" : "tool calls"} you already ran in recent steps — this call was NOT re-executed and its result has not changed. ` +
|
|
684
|
+
`If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
|
|
685
|
+
`otherwise take a genuinely DIFFERENT next action (a new file, a new command, or a fix you have not tried).`,
|
|
686
|
+
});
|
|
687
|
+
ev.onNotice?.("tool-call cycle detected — skipped execution and asked the model to act differently or call done");
|
|
688
|
+
step++;
|
|
689
|
+
continue;
|
|
690
|
+
}
|
|
691
|
+
return finish({
|
|
692
|
+
done: false,
|
|
693
|
+
steps: step,
|
|
694
|
+
doneReason: `Stopped: the model cycled through the same tool calls for ${CYCLE_WINDOW} consecutive steps even after an explicit correction (it never signaled done).`,
|
|
695
|
+
});
|
|
186
696
|
}
|
|
187
697
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
});
|
|
698
|
+
// Runs ONE tool call and reports `{ success, output }`. Every failure mode
// (batched `done`, unknown tool, pre-tool hook veto, handler error or throw)
// is folded into the returned pair, so a handler exception never escapes.
const executeTool = async (call: { tool: string; arguments?: Record<string, any> }) => {
  const name = call.tool;
  const failed = (output: string) => ({ success: false, output });

  // `done` is only honoured as the sole call of a step; inside a batch it is
  // answered with an error so the model resends it alone.
  if (name === "done") {
    return failed("Error: 'done' can only be called as the single tool invocation, not in a batch. Please send 'done' alone.");
  }

  const handler = tools[name];
  if (!handler) {
    // Unknown name: offer the closest registered tool plus the full list.
    const suggestion = nearestToolName(name, Object.keys(tools));
    const hint = suggestion ? ` Did you mean "${suggestion}"?` : "";
    return failed(`Unknown tool: ${name}.${hint} Available: ${Object.keys(tools).join(", ")}, done.`);
  }

  // Pre-tool hooks run first and may veto the call before the handler is reached.
  const gate = await runPreToolHooks(cwd, name, call.arguments ?? {}, opts.signal, ev.onNotice);
  if (gate.vetoed) {
    return failed(gate.error + (gate.output ? `\n${gate.output}` : ""));
  }

  try {
    const outcome = await handler(call.arguments ?? {}, cwd);
    // Success, or a failure with no error text: the raw output is the result.
    if (outcome.success || !outcome.error) {
      return { success: outcome.success, output: outcome.output };
    }
    // Failure with error text: lead with the error, keep any output below it.
    const detail = outcome.output ? `${outcome.error}\n${outcome.output}` : outcome.error;
    return { success: outcome.success, output: detail };
  } catch (err: any) {
    return failed(err?.message || String(err));
  }
};
|
|
739
|
+
|
|
740
|
+
const READONLY_TOOLS = new Set(["read", "find", "search", "ls", "web_search"]);
|
|
741
|
+
const WRITE_TOOLS = new Set(["write", "edit"]);
|
|
742
|
+
// Batch grouping → concurrency plan (plan/gjc-inheritance.md cycle 12):
|
|
743
|
+
// read group — consecutive read-only calls run in parallel (safe).
|
|
744
|
+
// write group — consecutive write/edit calls to DISTINCT files run in
|
|
745
|
+
// parallel; a same-file (or path-less) collision opens a
|
|
746
|
+
// sequential boundary so ordered edits to one file stay ordered.
|
|
747
|
+
// exclusive group — bash (and anything else) always runs alone, in order.
|
|
748
|
+
// Reads and writes never share a group, so a read can never race a write.
|
|
749
|
+
type ToolGroup = {
|
|
750
|
+
kind: "read" | "write" | "exclusive";
|
|
751
|
+
calls: { tool: string; arguments?: Record<string, any>; index: number }[];
|
|
752
|
+
files?: Set<string>;
|
|
753
|
+
};
|
|
754
|
+
const groups: ToolGroup[] = [];
|
|
755
|
+
// Collision key for a write/edit call: the target path resolved against `cwd`
// and lower-cased, or null when the call carries no usable path. Resolving
// makes `./x.ts`, `x.ts` and `src/../x.ts` share ONE key; lower-casing does the
// same for case variants on a case-insensitive filesystem (the macOS default).
// Without that, two spellings of one file would run in parallel and the second
// write would silently clobber the first. On a case-sensitive filesystem the
// folding only serializes two genuinely distinct files, which is safe.
const targetFile = (call: { arguments?: Record<string, any> }): string | null => {
  const raw = call.arguments?.filePath ?? call.arguments?.path;
  if (typeof raw !== "string" || raw.trim() === "") return null;
  return path.resolve(cwd, raw).toLowerCase();
};
|
|
764
|
+
for (let i = 0; i < toolCalls.length; i++) {
|
|
765
|
+
const entry = { ...toolCalls[i], index: i };
|
|
766
|
+
const last = groups[groups.length - 1];
|
|
767
|
+
if (READONLY_TOOLS.has(entry.tool)) {
|
|
768
|
+
if (last && last.kind === "read") last.calls.push(entry);
|
|
769
|
+
else groups.push({ kind: "read", calls: [entry] });
|
|
770
|
+
} else if (WRITE_TOOLS.has(entry.tool)) {
|
|
771
|
+
const file = targetFile(entry);
|
|
772
|
+
if (last && last.kind === "write" && file !== null && !last.files!.has(file)) {
|
|
773
|
+
last.calls.push(entry);
|
|
774
|
+
last.files!.add(file);
|
|
775
|
+
} else {
|
|
776
|
+
groups.push({ kind: "write", calls: [entry], files: new Set(file !== null ? [file] : []) });
|
|
777
|
+
}
|
|
778
|
+
} else {
|
|
779
|
+
groups.push({ kind: "exclusive", calls: [entry] });
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
const results: { success: boolean; output: string; executed: boolean }[] = Array.from(
|
|
784
|
+
{ length: toolCalls.length },
|
|
785
|
+
() => ({ success: false, output: "", executed: false })
|
|
786
|
+
);
|
|
787
|
+
|
|
788
|
+
let aborted = false;
|
|
789
|
+
for (const group of groups) {
|
|
790
|
+
if (opts.signal?.aborted) {
|
|
791
|
+
aborted = true;
|
|
792
|
+
break;
|
|
793
|
+
}
|
|
794
|
+
if (group.calls.length > 1) {
|
|
795
|
+
// read OR distinct-file write group → run concurrently.
|
|
796
|
+
await Promise.all(group.calls.map(async (call) => {
|
|
797
|
+
const res = await executeTool(call);
|
|
798
|
+
results[call.index] = { ...res, executed: true };
|
|
799
|
+
}));
|
|
800
|
+
} else {
|
|
801
|
+
const call = group.calls[0];
|
|
802
|
+
const res = await executeTool(call);
|
|
803
|
+
results[call.index] = { ...res, executed: true };
|
|
804
|
+
}
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
// Turns the finished calls at `indices` into one history exchange: the
// assistant's raw reply followed by a single user message holding one result
// block per call. For each call, in order: emit onToolResult, minimize and
// truncate the output (spilling oversized output to an artifact), run the
// post-turn hooks, and update the pending-hook-failure state read by the done
// guard.
const processAndPushResults = async (indices: number[]) => {
  // A whole-project hook that matches every call in a batch yields the same
  // diagnostics each time: print them once, cross-reference the repeats.
  const shownHookFeedback = new Set<string>();
  const blocks: string[] = [];

  for (const idx of indices) {
    const { tool, arguments: callArgs } = toolCalls[idx];
    const outcome = results[idx];

    ev.onToolResult?.(tool, outcome.success, outcome.output);

    let body = truncateToolOutput(minimizeToolOutput(outcome.output, tool).text);
    if (outcome.output.length > TOOL_SPILL_THRESHOLD) {
      // Best-effort spill: a failed write just means no artifact pointer.
      const artifact = await spillToolResult(tool, outcome.output, cwd).catch(() => null);
      if (artifact) {
        body += `\n[full output (${outcome.output.length} chars) saved to ${artifact} — read it for the elided middle]`;
      }
    }

    const hookRun = await runPostTurnHooks(
      cwd,
      tool,
      callArgs ?? {},
      outcome.success,
      outcome.output,
      opts.signal,
      ev.onNotice
    );
    const diags = hookRun.diags;
    // Any diagnostics mark the hook as failing (the last one names it); a run
    // that executed hooks and produced none clears the pending failure.
    if (diags.length > 0) {
      pendingHookFailure = diags[diags.length - 1].run;
    } else if (hookRun.ran > 0) {
      pendingHookFailure = null;
    }

    // Hook diagnostics ride along on THIS call's block so the model can
    // self-correct; the tool's own ok/fail status is left untouched.
    const status = outcome.success ? "ok" : "fail";
    const parts = [`Tool [${tool}] result (${status}):\n${body}`];
    for (const d of diags) {
      const key = `${d.run}\u0000${d.output}`;
      if (shownHookFeedback.has(key)) {
        parts.push(`[post-turn hook "${d.run}" — exit ${d.exitCode}: same diagnostics as above]`);
        continue;
      }
      shownHookFeedback.add(key);
      parts.push(`[post-turn hook "${d.run}" — exit ${d.exitCode}]:\n${truncateToolOutput(d.output)}`);
    }
    blocks.push(parts.join("\n"));
  }

  history.push({ role: "assistant", content: responseText });
  history.push({ role: "user", content: blocks.join("\n\n") });
};
|
|
864
|
+
|
|
865
|
+
if (aborted) {
|
|
866
|
+
const executedIndices = results.map((r, i) => r.executed ? i : -1).filter(i => i !== -1);
|
|
867
|
+
if (executedIndices.length > 0) {
|
|
868
|
+
await processAndPushResults(executedIndices);
|
|
869
|
+
}
|
|
870
|
+
return finish({ done: false, steps: step, doneReason: "Cancelled." });
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
const allIndices = toolCalls.map((_, i) => i);
|
|
874
|
+
await processAndPushResults(allIndices);
|
|
875
|
+
|
|
876
|
+
// Score the budget window per CALL, not per batch: a batch of five failing
|
|
877
|
+
// edits plus one trivial successful read must not look like a progressing
|
|
878
|
+
// step to the extension heuristic (that loophole earned endless extensions).
|
|
879
|
+
for (let i = 0; i < toolCalls.length; i++) {
|
|
880
|
+
if (results[i].executed) budget.record(callSigs[i], results[i].success);
|
|
881
|
+
}
|
|
882
|
+
// done-verification guard bookkeeping: write/edit successes mark the turn as
|
|
883
|
+
// mutating; a successful bash whose command/output looks like a test/build run
|
|
884
|
+
// counts as verification. A verification AFTER the last mutation is what the
|
|
885
|
+
// done guard wants, but order-insensitive tracking keeps it one-pushback simple.
|
|
886
|
+
for (let i = 0; i < toolCalls.length; i++) {
|
|
887
|
+
if (!results[i].executed || !results[i].success) continue;
|
|
888
|
+
const t = toolCalls[i].tool;
|
|
889
|
+
if (t === "write" || t === "edit") sawMutation = true;
|
|
890
|
+
else if (t === "bash") {
|
|
891
|
+
const cmd = String(toolCalls[i].arguments?.command ?? "");
|
|
892
|
+
if (VERIFY_SIGNAL_RE.test(cmd) || VERIFY_SIGNAL_RE.test(results[i].output.slice(0, 2000))) sawVerification = true;
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
// F6 (round 4 architect, Low): judge the step by its NON-TRIVIAL calls — a
|
|
896
|
+
// batch of read(ok)+edit(fail) repeated with varying targets previously
|
|
897
|
+
// never tripped MAX_FAILURES because the trivial read reset the streak.
|
|
898
|
+
// Read-only-only steps keep the old any-success rule.
|
|
899
|
+
const nonTrivial = toolCalls
|
|
900
|
+
.map((c, i) => ({ tool: c.tool, r: results[i] }))
|
|
901
|
+
.filter(x => !READONLY_TOOLS.has(x.tool) && x.r.executed);
|
|
902
|
+
const stepSuccess = nonTrivial.length > 0
|
|
903
|
+
? nonTrivial.some(x => x.r.success)
|
|
904
|
+
: results.some(r => r.success);
|
|
905
|
+
|
|
906
|
+
if (stepSuccess) {
|
|
196
907
|
consecutiveFailures = 0;
|
|
197
908
|
} else if (++consecutiveFailures >= MAX_FAILURES) {
|
|
909
|
+
const isSingle = toolCalls.length === 1;
|
|
910
|
+
const stopMsg = isSingle
|
|
911
|
+
? `Stopped: ${MAX_FAILURES} consecutive failing tool calls (last '${toolCalls[0].tool}'); the model could not recover.`
|
|
912
|
+
: `Stopped: ${MAX_FAILURES} consecutive failing tool steps; the model could not recover.`;
|
|
198
913
|
return finish({
|
|
199
914
|
done: false,
|
|
200
915
|
steps: step,
|
|
201
|
-
doneReason:
|
|
916
|
+
doneReason: stopMsg,
|
|
202
917
|
});
|
|
203
918
|
}
|
|
204
919
|
step++;
|
|
205
920
|
}
|
|
206
921
|
|
|
207
|
-
|
|
922
|
+
// Budget exhausted without `done` (step limit declined a further extension, or
|
|
923
|
+
// the turn wall-clock budget fired). Instead of dying with a bare limit error,
|
|
924
|
+
// dynamically CONSOLIDATE: one final no-tools model call summarizes what was
|
|
925
|
+
// accomplished, key findings, and what remains — a useful wrap-up, not a failure.
|
|
926
|
+
const extInfo = budget.extensionsUsed() > 0 ? ` after ${budget.extensionsUsed()} extension(s)` : "";
|
|
927
|
+
const stopInfo = budgetStopReason ? `; ${budgetStopReason}` : "";
|
|
928
|
+
const budgetLabel = stopKind === "time"
|
|
929
|
+
? `turn time budget of ${Math.round(turnBudgetMs / 60_000)}m reached`
|
|
930
|
+
: `step budget of ${budget.limit()} reached`;
|
|
931
|
+
try {
|
|
932
|
+
if (!opts.signal?.aborted) {
|
|
933
|
+
const wrapUp = await invokeCallLlm(
|
|
934
|
+
[
|
|
935
|
+
...history,
|
|
936
|
+
{
|
|
937
|
+
role: "user",
|
|
938
|
+
content:
|
|
939
|
+
"The budget for this turn is exhausted. Do NOT call any tool. " +
|
|
940
|
+
"Reply with plain prose (no JSON): consolidate what you accomplished this turn, " +
|
|
941
|
+
"the key findings/changes so far, and what remains to be done next.",
|
|
942
|
+
},
|
|
943
|
+
],
|
|
944
|
+
{ jsonMode: false, model: opts.model, maxTokens: opts.maxTokens, signal: opts.signal },
|
|
945
|
+
);
|
|
946
|
+
const consolidated = wrapUp.trim();
|
|
947
|
+
if (consolidated) {
|
|
948
|
+
history.push({ role: "assistant", content: consolidated });
|
|
949
|
+
return finish({
|
|
950
|
+
done: false,
|
|
951
|
+
steps: budget.limit(),
|
|
952
|
+
doneReason: `${consolidated}\n\n(${budgetLabel}${extInfo}${stopInfo} — consolidated wrap-up above; continue with a follow-up request)`,
|
|
953
|
+
});
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
} catch { /* wrap-up is best-effort; fall through to the plain budget message */ }
|
|
957
|
+
return finish({ done: false, steps: stopKind === "time" ? step : budget.limit(), doneReason: budgetStopReason ? `(${budgetLabel}${extInfo} — ${budgetStopReason})` : undefined });
|
|
208
958
|
}
|