jeo-code 0.1.0 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/README.ja.md +160 -0
  2. package/README.ko.md +160 -0
  3. package/README.md +115 -297
  4. package/README.zh.md +160 -0
  5. package/package.json +11 -6
  6. package/scripts/install.sh +28 -28
  7. package/scripts/uninstall.sh +17 -15
  8. package/src/AGENTS.md +50 -0
  9. package/src/agent/AGENTS.md +49 -0
  10. package/src/agent/bash-fixups.ts +103 -0
  11. package/src/agent/compaction.ts +410 -19
  12. package/src/agent/config-schema.ts +119 -5
  13. package/src/agent/context-files.ts +314 -17
  14. package/src/agent/dev/AGENTS.md +36 -0
  15. package/src/agent/dev/advanced-analyzer.ts +12 -0
  16. package/src/agent/dev/evolution-bridge.ts +82 -0
  17. package/src/agent/dev/evolution-logger.ts +41 -0
  18. package/src/agent/dev/self-analysis.ts +64 -0
  19. package/src/agent/dev/self-improve.ts +24 -0
  20. package/src/agent/dev/spec-automation.ts +49 -0
  21. package/src/agent/engine.ts +808 -54
  22. package/src/agent/hooks.ts +273 -0
  23. package/src/agent/loop.ts +21 -1
  24. package/src/agent/memory.ts +201 -0
  25. package/src/agent/model-recency.ts +32 -0
  26. package/src/agent/output-minimizer.ts +108 -0
  27. package/src/agent/output-util.ts +64 -0
  28. package/src/agent/plan.ts +187 -0
  29. package/src/agent/seed.ts +52 -0
  30. package/src/agent/session.ts +235 -21
  31. package/src/agent/state.ts +286 -39
  32. package/src/agent/step-budget.ts +232 -0
  33. package/src/agent/subagents.ts +223 -26
  34. package/src/agent/task-tool.ts +272 -0
  35. package/src/agent/todo-tool.ts +87 -0
  36. package/src/agent/tokenizer.ts +117 -0
  37. package/src/agent/tool-registry.ts +54 -0
  38. package/src/agent/tools.ts +624 -103
  39. package/src/agent/web-search.ts +538 -0
  40. package/src/ai/AGENTS.md +44 -0
  41. package/src/ai/index.ts +1 -0
  42. package/src/ai/model-catalog-compat.ts +3 -1
  43. package/src/ai/model-catalog.ts +74 -9
  44. package/src/ai/model-discovery.ts +215 -17
  45. package/src/ai/model-manager.ts +346 -32
  46. package/src/ai/model-picker.ts +1 -1
  47. package/src/ai/model-registry.ts +4 -2
  48. package/src/ai/pricing.ts +84 -0
  49. package/src/ai/provider-registry.ts +23 -0
  50. package/src/ai/provider-status.ts +60 -16
  51. package/src/ai/providers/AGENTS.md +42 -0
  52. package/src/ai/providers/anthropic.ts +250 -31
  53. package/src/ai/providers/antigravity.ts +219 -0
  54. package/src/ai/providers/errors.ts +15 -1
  55. package/src/ai/providers/gemini.ts +196 -13
  56. package/src/ai/providers/ollama.ts +37 -7
  57. package/src/ai/providers/openai-responses.ts +173 -0
  58. package/src/ai/providers/openai.ts +64 -12
  59. package/src/ai/sse.ts +4 -1
  60. package/src/ai/types.ts +18 -1
  61. package/src/auth/AGENTS.md +41 -0
  62. package/src/auth/callback-server.ts +6 -1
  63. package/src/auth/flows/AGENTS.md +32 -0
  64. package/src/auth/flows/antigravity.ts +151 -0
  65. package/src/auth/flows/google-project.ts +190 -0
  66. package/src/auth/flows/google.ts +39 -18
  67. package/src/auth/flows/index.ts +15 -5
  68. package/src/auth/flows/openai.ts +2 -2
  69. package/src/auth/oauth.ts +8 -0
  70. package/src/auth/refresh.ts +44 -27
  71. package/src/auth/storage.ts +149 -26
  72. package/src/auth/types.ts +1 -1
  73. package/src/autopilot.ts +362 -0
  74. package/src/bun-imports.d.ts +4 -0
  75. package/src/cli/AGENTS.md +39 -0
  76. package/src/cli/runner.ts +148 -14
  77. package/src/cli.ts +13 -4
  78. package/src/commands/AGENTS.md +40 -0
  79. package/src/commands/approve.ts +62 -3
  80. package/src/commands/auth.ts +167 -25
  81. package/src/commands/chat.ts +37 -8
  82. package/src/commands/deep-interview.ts +633 -175
  83. package/src/commands/doctor.ts +84 -37
  84. package/src/commands/evolve-core.ts +18 -0
  85. package/src/commands/evolve.ts +2 -1
  86. package/src/commands/export.ts +176 -0
  87. package/src/commands/gjc.ts +52 -0
  88. package/src/commands/launch.ts +3549 -240
  89. package/src/commands/mcp.ts +3 -3
  90. package/src/commands/ooo-seed.ts +19 -0
  91. package/src/commands/ralplan.ts +253 -35
  92. package/src/commands/resume.ts +1 -1
  93. package/src/commands/session.ts +183 -0
  94. package/src/commands/setup-helpers.ts +10 -3
  95. package/src/commands/setup.ts +57 -16
  96. package/src/commands/skills.ts +78 -18
  97. package/src/commands/state.ts +198 -0
  98. package/src/commands/status.ts +84 -0
  99. package/src/commands/team.ts +340 -212
  100. package/src/commands/ultragoal.ts +122 -61
  101. package/src/commands/update.ts +244 -0
  102. package/src/ledger.ts +270 -0
  103. package/src/mcp/AGENTS.md +38 -0
  104. package/src/mcp/server.ts +115 -14
  105. package/src/mcp/tools.ts +42 -22
  106. package/src/md-modules.d.ts +4 -0
  107. package/src/prompts/AGENTS.md +41 -0
  108. package/src/prompts/agents/AGENTS.md +35 -0
  109. package/src/prompts/agents/architect.md +35 -0
  110. package/src/prompts/agents/critic.md +37 -0
  111. package/src/prompts/agents/executor.md +36 -0
  112. package/src/prompts/agents/planner.md +37 -0
  113. package/src/prompts/skills/AGENTS.md +36 -0
  114. package/src/prompts/skills/deep-dive/AGENTS.md +31 -0
  115. package/src/prompts/skills/deep-dive/SKILL.md +13 -0
  116. package/src/prompts/skills/deep-interview/AGENTS.md +31 -0
  117. package/src/prompts/skills/deep-interview/SKILL.md +12 -0
  118. package/src/prompts/skills/gjc/AGENTS.md +31 -0
  119. package/src/prompts/skills/gjc/SKILL.md +15 -0
  120. package/src/prompts/skills/ralplan/AGENTS.md +31 -0
  121. package/src/prompts/skills/ralplan/SKILL.md +11 -0
  122. package/src/prompts/skills/team/AGENTS.md +31 -0
  123. package/src/prompts/skills/team/SKILL.md +11 -0
  124. package/src/prompts/skills/ultragoal/AGENTS.md +31 -0
  125. package/src/prompts/skills/ultragoal/SKILL.md +11 -0
  126. package/src/skills/AGENTS.md +38 -0
  127. package/src/skills/catalog.ts +565 -31
  128. package/src/tui/AGENTS.md +43 -0
  129. package/src/tui/app.ts +1181 -92
  130. package/src/tui/components/AGENTS.md +42 -0
  131. package/src/tui/components/ascii-art.ts +257 -15
  132. package/src/tui/components/autocomplete.ts +98 -16
  133. package/src/tui/components/autopilot-status.ts +65 -0
  134. package/src/tui/components/category-index.ts +49 -0
  135. package/src/tui/components/code-view.ts +54 -11
  136. package/src/tui/components/color.ts +171 -2
  137. package/src/tui/components/config-panel.ts +82 -15
  138. package/src/tui/components/duration.ts +38 -0
  139. package/src/tui/components/evolution.ts +3 -3
  140. package/src/tui/components/footer.ts +91 -42
  141. package/src/tui/components/forge.ts +426 -31
  142. package/src/tui/components/hints.ts +54 -0
  143. package/src/tui/components/hud.ts +73 -0
  144. package/src/tui/components/index.ts +4 -0
  145. package/src/tui/components/input-box.ts +150 -0
  146. package/src/tui/components/layout.ts +11 -3
  147. package/src/tui/components/live-model-picker.ts +108 -0
  148. package/src/tui/components/markdown-table.ts +140 -0
  149. package/src/tui/components/markdown-text.ts +97 -0
  150. package/src/tui/components/meter.ts +4 -1
  151. package/src/tui/components/model-picker.ts +3 -2
  152. package/src/tui/components/provider-picker.ts +3 -2
  153. package/src/tui/components/section.ts +70 -0
  154. package/src/tui/components/select-list.ts +40 -10
  155. package/src/tui/components/skill-picker.ts +25 -0
  156. package/src/tui/components/slash.ts +244 -21
  157. package/src/tui/components/status.ts +272 -11
  158. package/src/tui/components/step-timeline.ts +218 -0
  159. package/src/tui/components/stream.ts +26 -9
  160. package/src/tui/components/themes.ts +212 -6
  161. package/src/tui/components/todo-card.ts +47 -0
  162. package/src/tui/components/tool-list.ts +58 -12
  163. package/src/tui/components/transcript.ts +120 -0
  164. package/src/tui/components/update-box.ts +31 -0
  165. package/src/tui/components/welcome.ts +162 -0
  166. package/src/tui/components/width.ts +163 -0
  167. package/src/tui/monitoring/AGENTS.md +31 -0
  168. package/src/tui/monitoring/hud-view.ts +55 -0
  169. package/src/tui/renderer.ts +112 -3
  170. package/src/tui/terminal.ts +40 -33
  171. package/src/util/AGENTS.md +39 -0
  172. package/src/util/clipboard-image.ts +118 -0
  173. package/src/util/env.ts +12 -0
  174. package/src/util/provider-error.ts +78 -0
  175. package/src/util/retry.ts +91 -6
  176. package/src/util/update-check.ts +64 -0
  177. package/src/commands/models.ts +0 -104
@@ -1,16 +1,39 @@
1
1
  /**
2
- * Reusable agentic tool-call loop — the shared core behind `joc team`
3
- * (per-task executor) and `joc launch` (interactive coding agent).
2
+ * Reusable agentic tool-call loop — the shared core behind `jeo team`
3
+ * (per-task executor) and `jeo launch` (interactive coding agent).
4
4
  *
5
5
  * The model is driven in JSON tool-call mode: each step it emits exactly one
6
6
  * `{ "tool": "...", "arguments": { ... } }` object; the engine dispatches it,
7
7
  * appends the result to history, and continues until the model calls `done`
8
8
  * or the step budget is exhausted.
9
9
  */
10
- import { callLlm, type Message } from "./loop";
10
+ import * as fs from "node:fs/promises";
11
+ import * as path from "node:path";
12
+ import type { Message } from "./loop";
11
13
  import { extractJsonObject } from "./json";
12
- import { readTool, writeTool, editTool, bashTool, findTool, searchTool, type ToolResult } from "./tools";
14
+ import { readTool, writeTool, editTool, bashTool, findTool, searchTool, lsTool, mkdirTool, deleteTool, type ToolResult } from "./tools";
15
+ import { webSearchTool, setWebSearchActiveModel } from "./web-search";
16
+ import { friendlyProviderError, isContextOverflowError, isRefusalError } from "../util/provider-error";
17
+ import { isRateLimitError } from "../util/retry";
18
+ import { runPreToolHooks, runPostTurnHooks } from "./hooks";
19
+ import { minimizeToolOutput } from "./output-minimizer";
20
+ import { StepBudget, dynamicStepBudgetConfig, resolveStepBudgetConfig, hashSignature, type StepBudgetConfig } from "./step-budget";
21
+ import { historyTokens, trimToolResultsInPlace } from "./compaction";
22
+ import { jeoEnv } from "../util/env";
13
23
 
24
+
25
+ async function invokeCallLlm(history: Message[], options: {
26
+ jsonMode: boolean;
27
+ model?: string;
28
+ maxTokens?: number;
29
+ signal?: AbortSignal;
30
+ onUsage?: (u: { inputTokens?: number; outputTokens?: number }) => void;
31
+ onRetry?: (attempt: number, err: unknown, delayMs: number) => void;
32
+ onToken?: (delta: string) => void;
33
+ }): Promise<string> {
34
+ const mod = await import("./loop");
35
+ return mod.callLlm(history, options);
36
+ }
14
37
  export interface ToolInvocation {
15
38
  tool: string;
16
39
  arguments?: Record<string, any>;
@@ -20,47 +43,126 @@ export type ToolHandler = (args: Record<string, any>, cwd: string) => Promise<To
20
43
 
21
44
  /** The default executor toolset (read / write / edit / bash / find / search). */
22
45
  export const DEFAULT_TOOLS: Record<string, ToolHandler> = {
23
- read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange, cwd),
46
+ read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange ?? a.range, cwd, !!a.raw),
24
47
  write: (a, cwd) => writeTool(a.filePath ?? a.path, a.content ?? "", cwd),
25
48
  edit: (a, cwd) => editTool(a.filePath ?? a.path, a.editBlock ?? a.edit ?? "", cwd),
26
- bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined),
49
+ bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined, typeof a.cwd === "string" ? a.cwd : (typeof a.subdir === "string" ? a.subdir : undefined), a.env && typeof a.env === "object" ? a.env : undefined),
27
50
  find: (a, cwd) => findTool(a.globPattern ?? a.pattern, cwd),
28
- search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd),
51
+ search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd, !!(a.ignoreCase ?? a.i), { before: a.before, after: a.after, context: a.context, maxMatches: a.maxMatches }),
52
+ ls: (a, cwd) => lsTool(a.dirPath ?? a.path ?? a.dir ?? ".", cwd),
53
+ mkdir: (a, cwd) => mkdirTool(a.dirPath ?? a.path ?? a.dir, cwd),
54
+ delete: (a, cwd) => deleteTool(a.path ?? a.filePath ?? a.targetPath ?? a.dirPath, cwd, !!(a.recursive ?? a.r)),
55
+ web_search: (a, cwd) => webSearchTool(a, cwd),
29
56
  };
30
57
 
31
58
  /** Tool-protocol description injected into the system prompt. */
32
59
  export const TOOL_PROTOCOL = [
33
- "You have these tools (call exactly ONE per step):",
34
- "1. read {filePath, lineRange?} — read a file (lineRange: \"start-end\", \"start-\", or \"start\")",
60
+ "You have these tools (call exactly ONE per step, or batch multiple independent calls):",
61
+ "1. read {filePath, lineRange?, raw?} — read a file; lines are prefixed `LINEhh|` (hh = 2-char content anchor; the | is a separator, not file bytes)",
35
62
  "2. write {filePath, content} — create/overwrite a file",
36
- "3. edit {filePath, editBlock} — ≔A..B replace lines; ≔A+ insert after line A; ≔$ append EOF (payload on next line)",
37
- "4. bash {command, timeoutMs?} — run a shell command (tests, build, mkdir, ...); timeoutMs default 120000",
63
+ "3. edit {filePath, editBlock} — ≔A..B replace lines (append read anchors for safety: ≔12ab..15cd — rejected with fresh content if the lines changed); ≔A+ insert after line A; ≔$ append EOF (payload on next line). NEVER copy the `LINEhh|` prefixes into SEARCH blocks or payloads",
64
+ "4. bash {command, timeoutMs?, cwd?, env?} — run a shell command (cwd: subdir; env: extra vars)",
38
65
  "5. find {globPattern} — find files by name",
39
- "6. search {pattern, globPattern?} — grep for a pattern",
40
- "7. done {reason?} — call when the task is fully implemented AND verified",
66
+ "6. search {pattern, globPattern?, ignoreCase?, context?, maxMatches?} — grep (context: N lines around each match)",
67
+ "7. ls {dirPath} — list a directory's entries (dirs first)",
68
+ "8. mkdir {dirPath} — create a directory (parents included; idempotent)",
69
+ "9. delete {path, recursive?} — remove a file (or directory with recursive:true)",
70
+ "10. web_search {query, recency?, limit?} — search the web (Anthropic-native: synthesized answer + sources + citations)",
71
+ "11. done {reason?} — call when the task is fully implemented AND verified",
72
+ "",
73
+ "Reply with STRICT JSON only — no code fences. You MAY include an optional leading",
74
+ '"reasoning" string (one short sentence on your plan) before "tool":',
75
+ '{ "reasoning": "<one short sentence>", "tool": "<name>", "arguments": { ... } }',
76
+ "",
77
+ "Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
78
+ '{ "reasoning": "<one short sentence>", "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
79
+ "Batch only independent calls; NEVER batch 'done', and NEVER put a mutating tool (write/edit/bash) after another mutating tool in one batch whose inputs depend on the earlier one.",
80
+ ].join("\n");
81
+
82
+ /** Restricted protocol for read-only subagent roles (planner/architect/critic):
83
+ * advertises only the non-mutating tools so the model does not waste steps
84
+ * calling write/edit/bash, which `subagentToolset` has physically removed. */
85
+ export const READONLY_TOOL_PROTOCOL = [
86
+ "You have these READ-ONLY tools (call exactly ONE per step, or batch multiple independent calls):",
87
+ "1. read {filePath, lineRange?} — read a file (lineRange: \"a-b\", \"a-\", \"a\", \"a+n\", or multi \"a-b,c-d\")",
88
+ "2. find {globPattern} — find files by name",
89
+ "3. search {pattern, globPattern?, ignoreCase?} — grep for a pattern",
90
+ "4. ls {dirPath} — list a directory's entries",
91
+ "5. web_search {query, recency?, limit?} — search the web (answer + sources + citations)",
92
+ "6. done {reason?} — call when your review/analysis is complete",
41
93
  "",
42
94
  "Reply with STRICT JSON only — no prose, no code fences:",
43
95
  '{ "tool": "<name>", "arguments": { ... } }',
96
+ "",
97
+ "Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
98
+ '{ "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
99
+ "Batch only independent calls; NEVER batch 'done'.",
100
+ ].join("\n");
101
+
102
+ /** gjc-inherited working discipline (plan/gjc-inheritance.md B3): the completion
103
+ * contract and tool-priority rules distilled from gjc's system prompt — compact
104
+ * (<300 tokens) per the pi-mono budget so the core prompt stays lean. */
105
+ export const WORKING_DISCIPLINE = [
106
+ "Working discipline:",
107
+ "- Correctness first, maintainability second, brevity third. Prefer boring, explicit code.",
108
+ "- Never present partial work as complete; never suppress tests or warnings to make code pass.",
109
+ "- Never fabricate tool results or test outcomes; verification claims must match what was actually run.",
110
+ "- Never ship stubs, placeholders, or TODO-only code as a delivered feature.",
111
+ "- Never substitute the requested problem with an easier adjacent one.",
112
+ "- Update directly affected callsites, tests, and docs — or state why they are unchanged.",
113
+ "- Reuse existing patterns; parallel conventions are prohibited. Fix problems at their source.",
114
+ "- You are not alone in the repository: treat unexpected changes as user work; never revert or delete them.",
115
+ "- Re-read before acting if a tool fails or a file may have changed.",
116
+ "- Prefer dedicated tools over shell pipelines: read (not cat), search (not grep), edit (not sed).",
44
117
  ].join("\n");
45
118
 
46
- export function executorSystemPrompt(role = "Executor Agent, a senior software developer"): string {
119
+ export function executorSystemPrompt(
120
+ role = "Executor Agent, a senior software developer",
121
+ protocol: string = TOOL_PROTOCOL,
122
+ verificationDirective = "Always verify (run tests / execute the program) before calling done.",
123
+ ): string {
47
124
  return (
48
125
  `You are the ${role}.\n` +
49
126
  `Accomplish the user's request by calling tools and verifying your work.\n\n` +
50
- `${TOOL_PROTOCOL}\n\n` +
51
- `Always verify (run tests / execute the program) before calling done.`
127
+ `${protocol}\n\n` +
128
+ `${WORKING_DISCIPLINE}\n\n` +
129
+ verificationDirective
52
130
  );
53
131
  }
54
132
 
55
133
  export interface AgentLoopEvents {
56
- onStep?(step: number): void;
134
+ onStep?(step: number): void | Promise<void>;
57
135
  onAssistant?(raw: string, invocation: ToolInvocation | null): void;
58
136
  onToolResult?(tool: string, success: boolean, output: string): void;
59
- onError?(message: string): void;
137
+ /** Transient progress notice (e.g. "rate limited — retrying in Ns"); NOT a terminal error. */
138
+ onNotice?(message: string): void;
139
+ /** Cumulative token usage after each LLM call — drives live usage meters. */
140
+ onUsage?(usage: { inputTokens: number; outputTokens: number }): void;
141
+ /** Accumulated streamed model response so far — drives the live reasoning view. Only
142
+ * requested when a consumer sets it (the engine streams solely for the TUI). */
143
+ onModelStream?(textSoFar: string): void;
144
+ /** Step-budget change (gjc-style retry flow): the limit was extended because the
145
+ * turn is making progress. `limit` is the new max; `reason` is display-ready. */
146
+ onBudget?(limit: number, reason: string): void;
147
+ /** Consulted when a lone `done` arrives. Return a corrective message to bounce
148
+ * the done ONCE (e.g. "todo list still shows unfinished items — update it
149
+ * first"); return null to let the turn finish. The engine guarantees at most
150
+ * one bounce per turn, so a stubborn model can never loop here. */
151
+ onBeforeDone?(reason: string): string | null;
60
152
  }
61
153
 
62
154
  export interface AgentLoopOptions {
155
+ /** Optional system prompt: prepended to `history` when it has no system message. */
156
+ systemPrompt?: string;
157
+ /** Mid-turn context budget (estimated tokens). When the in-turn history grows
158
+ * past this, the OLDEST tool-result bodies are deterministically elided so a
159
+ * long turn cannot snowball into multi-million-token prompts. Default 80k. */
160
+ maxHistoryTokens?: number;
63
161
  cwd: string;
162
+ /** Base step budget (default 15). Non-finite or `<= 0` selects the DYNAMIC budget:
163
+ * the budget keeps extending while the recent tool window shows NOVEL progress,
164
+ * a stalled or cycling turn consolidates a final wrap-up, and a large finite
165
+ * safety cap (`DYNAMIC_HARD_CAP`, default 600) guarantees termination. */
64
166
  maxSteps?: number;
65
167
  model?: string;
66
168
  /** Max generation tokens per step (drives the thinking budget). */
@@ -68,6 +170,9 @@ export interface AgentLoopOptions {
68
170
  tools?: Record<string, ToolHandler>;
69
171
  signal?: AbortSignal;
70
172
  events?: AgentLoopEvents;
173
+ /** Step-budget overrides (gjc-style retry flow). `{ maxExtensions: 0 }` restores the
174
+ * legacy fixed counter — used by bounded subagent delegation. */
175
+ budget?: Partial<StepBudgetConfig>;
71
176
  }
72
177
 
73
178
  export interface AgentLoopResult {
@@ -78,28 +183,150 @@ export interface AgentLoopResult {
78
183
  usage?: { inputTokens: number; outputTokens: number };
79
184
  }
80
185
 
186
+ /** Env-tunable output budget (plan/gjc-inheritance.md B10, gjc settings-driven
187
+ * output handling 계승): JEO_TOOL_OUTPUT_MAX caps the model-visible tool result;
188
+ * the spill threshold tracks it so anything truncated stays artifact-recoverable. */
189
+ function envOutputMax(): number {
190
+ const raw = Number(jeoEnv("TOOL_OUTPUT_MAX") ?? "");
191
+ return Number.isFinite(raw) && raw >= 500 && raw <= 200_000 ? Math.trunc(raw) : 4_000;
192
+ }
193
+ export const TOOL_OUTPUT_MAX = envOutputMax();
194
+
195
+ /** Wall-clock budget for ONE agent turn (ms). JEO_TURN_MAX_MS overrides; 0 disables.
196
+ * Default 30 minutes: long autonomous runs stay alive, while a turn that spins in
197
+ * "thinking" (huge contexts, endless extensions) is guaranteed to terminate into
198
+ * the consolidation wrap-up instead of running for hours. */
199
+ export function turnMaxMs(env: Record<string, string | undefined> = process.env): number {
200
+ const raw = jeoEnv("TURN_MAX_MS", env);
201
+ if (raw !== undefined && raw !== "") {
202
+ const n = Number(raw);
203
+ if (Number.isFinite(n) && n >= 0) return Math.trunc(n);
204
+ }
205
+ return 30 * 60 * 1000;
206
+ }
207
+
81
208
  /**
82
209
  * Cap a tool result fed back to the model, keeping both ends: the head holds the
83
210
  * start (e.g. a file's top / a command's invocation) and the tail holds what's
84
211
  * usually decisive (test summaries, the final error). A pure head-cut loses that.
85
212
  */
86
- export function truncateToolOutput(s: string, max = 4000): string {
213
+ export function truncateToolOutput(s: string, max = TOOL_OUTPUT_MAX): string {
87
214
  if (s.length <= max) return s;
88
215
  const head = Math.floor(max * 0.6);
89
216
  const tail = max - head;
90
217
  return `${s.slice(0, head)}\n…(${s.length - max} chars truncated)…\n${s.slice(s.length - tail)}`;
91
218
  }
92
219
 
220
+ /** Tool output larger than this is spilled to a recoverable artifact file. Aligned
221
+ * with `truncateToolOutput`'s cap so that whenever the model-visible result drops
222
+ * content, the full output is recoverable via the artifact. */
223
+ export const TOOL_SPILL_THRESHOLD = TOOL_OUTPUT_MAX;
224
+
225
+ /**
226
+ * Write an oversized tool result verbatim under `.jeo/artifacts/tool-results/` and
227
+ * return the workspace-relative path (for the model to `read`). Best-effort: throws
228
+ * are caught by the caller, which simply omits the artifact note.
229
+ */
230
+ /** Most recent tool-result artifacts to keep; older ones are pruned on each spill. */
231
+ export const MAX_TOOL_ARTIFACTS = 50;
232
+
233
+ /** Best-effort retention: keep the newest `MAX_TOOL_ARTIFACTS` files in `dir`, delete the rest. */
234
+ async function pruneToolArtifacts(dir: string): Promise<void> {
235
+ const files = await fs.readdir(dir).catch(() => [] as string[]);
236
+ if (files.length <= MAX_TOOL_ARTIFACTS) return;
237
+ const stamped = await Promise.all(
238
+ files.map(async f => ({ f, m: (await fs.stat(path.join(dir, f)).catch(() => null))?.mtimeMs ?? 0 })),
239
+ );
240
+ stamped.sort((a, b) => b.m - a.m); // newest first
241
+ for (const { f } of stamped.slice(MAX_TOOL_ARTIFACTS)) {
242
+ await fs.rm(path.join(dir, f), { force: true }).catch(() => {});
243
+ }
244
+ }
245
+
246
+ export async function spillToolResult(tool: string, output: string, cwd: string): Promise<string> {
247
+ const dir = path.join(cwd, ".jeo", "artifacts", "tool-results");
248
+ await fs.mkdir(dir, { recursive: true });
249
+ const safeTool = tool.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 32) || "tool";
250
+ const stamp = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
251
+ const rel = path.join(".jeo", "artifacts", "tool-results", `${stamp}-${safeTool}.txt`);
252
+ await fs.writeFile(path.join(cwd, rel), output, "utf-8");
253
+ // Retention so a long session can't grow the artifact dir without bound.
254
+ await pruneToolArtifacts(dir);
255
+ return rel;
256
+ }
257
+
258
+ /** Levenshtein distance (small inputs: tool/command names). */
259
+ function editDistance(a: string, b: string): number {
260
+ const m = a.length, n = b.length;
261
+ if (m === 0) return n;
262
+ if (n === 0) return m;
263
+ let prev = Array.from({ length: n + 1 }, (_, i) => i);
264
+ let cur = new Array<number>(n + 1).fill(0);
265
+ for (let i = 1; i <= m; i++) {
266
+ cur[0] = i;
267
+ for (let j = 1; j <= n; j++) {
268
+ const cost = a[i - 1] === b[j - 1] ? 0 : 1;
269
+ cur[j] = Math.min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost);
270
+ }
271
+ [prev, cur] = [cur, prev];
272
+ }
273
+ return prev[n];
274
+ }
275
+
276
+ /** Nearest known tool name for an unknown call: exact, prefix, or edit distance ≤ 2. */
277
+ export function nearestToolName(name: string, known: string[]): string | undefined {
278
+ const want = name.trim().toLowerCase();
279
+ if (!want) return undefined;
280
+ let best: string | undefined;
281
+ let bestD = Infinity;
282
+ for (const k of known) {
283
+ const kl = k.toLowerCase();
284
+ if (kl === want) return k;
285
+ const d = kl.startsWith(want) || want.startsWith(kl) ? 1 : editDistance(want, kl);
286
+ if (d < bestD) { bestD = d; best = k; }
287
+ }
288
+ return bestD <= 2 ? best : undefined;
289
+ }
93
290
  /**
94
291
  * Drive `history` through the tool-call loop, mutating it in place so callers
95
292
  * (e.g. an interactive REPL) can keep the conversation across multiple turns.
96
293
  */
97
294
  export async function runAgentLoop(history: Message[], opts: AgentLoopOptions): Promise<AgentLoopResult> {
98
295
  const { cwd } = opts;
296
+ // Active-model gate for web_search's provider chain (gjc parity): the chain
297
+ // prefers the active model's native search backend, never credential-scanning.
298
+ setWebSearchActiveModel(opts.model);
299
+ // Honor an explicit system prompt for callers that build history without one.
300
+ if (opts.systemPrompt && history[0]?.role !== "system") {
301
+ history.unshift({ role: "system", content: opts.systemPrompt });
302
+ }
99
303
  const tools = opts.tools ?? DEFAULT_TOOLS;
100
304
  const maxSteps = opts.maxSteps ?? 15;
305
+ // gjc-style retry flow: the step limit is a flexible BUDGET, not a bare counter.
306
+ // While the recent window shows real progress the budget extends itself; a stalled
307
+ // turn fails fast into the consolidation wrap-up. An explicit positive maxSteps
308
+ // keeps the bounded flow (base + capped extensions); a non-finite / non-positive
309
+ // maxSteps selects the DYNAMIC budget — extensions keep flowing while NOVEL
310
+ // progress continues, a stalled/cycling window consolidates, and a large finite
311
+ // safety cap (default 600 steps) guarantees the turn always terminates.
312
+ const budget = new StepBudget(
313
+ Number.isFinite(maxSteps) && maxSteps > 0
314
+ ? resolveStepBudgetConfig(maxSteps, process.env, opts.budget)
315
+ : dynamicStepBudgetConfig(process.env, opts.budget),
316
+ );
317
+ // Why the loop stopped at the limit — folded into the consolidation message.
318
+ let budgetStopReason = "";
101
319
  const ev = opts.events ?? {};
320
+ const maxHistoryTokens = Math.max(10_000, opts.maxHistoryTokens ?? 80_000);
102
321
 
322
+ // Wall-clock turn budget — the definitive "never sits in thinking forever"
323
+ // guarantee. Step budgets bound the COUNT of model calls; this bounds their total
324
+ // TIME: a turn that crosses it stops at the next loop boundary and consolidates a
325
+ // wrap-up instead of spinning for hours under a generous dynamic step cap.
326
+ const turnStartedAt = Date.now();
327
+ const turnBudgetMs = turnMaxMs();
328
+ // "steps" | "time" — drives honest wording in the consolidation message.
329
+ let stopKind: "steps" | "time" = "steps";
103
330
  let step = 1;
104
331
  const acc = { inputTokens: 0, outputTokens: 0 };
105
332
  let sawUsage = false;
@@ -111,98 +338,625 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
111
338
  // calls (bad edits, failing commands) would otherwise burn the whole step budget.
112
339
  const MAX_FAILURES = 5;
113
340
  let consecutiveFailures = 0;
341
+ // done-verification guard (plan/gjc-inheritance.md B4, a lightweight successor of the gjc ultragoal-guard):
342
+ // a turn that MUTATED files but shows no verification signal gets ONE pushback on
343
+ // `done` — run the relevant test/build, or call done again (the escape hatch for
344
+ // doc/config changes where verification is genuinely not applicable).
345
+ let sawMutation = false;
346
+ let sawVerification = false;
347
+ let donePushbackUsed = false;
348
+ // Caller-owned done gate (onBeforeDone) — also strictly once per turn.
349
+ let beforeDoneNudgeUsed = false;
350
+ // F1 (round 4): the run-command of the most recent post-turn hook FAILURE whose
351
+ // diagnostics the model saw but has not yet resolved (a later clean hook run
352
+ // clears it). The done guard treats this as "verification missing" — the hook
353
+ // exit code is the strongest correctness signal in the loop.
354
+ let pendingHookFailure: string | null = null;
355
+ // Round-6 #4: ONE reactive recovery when the PROVIDER reports context overflow
356
+ // (authoritative where the local estimate drifted — images, tokenizer mismatch).
357
+ let contextOverflowRetryUsed = false;
358
+ // Refusal recovery budget: a safety refusal (HTTP 200, no content) on routine
359
+ // coding work is usually a transient false-positive. Resend the SAME step once
360
+ // as-is, once after a context reset plus a neutral note, and once without
361
+ // <project_context> if present; then surface the (friendly) error. Bounded per turn so a
362
+ // genuinely refused request can never burn billed calls in a loop.
363
+ const MAX_REFUSAL_RETRIES = 3;
364
+ let refusalRetries = 0;
365
+ const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
114
366
  let lastSig = "";
115
367
  let repeatCount = 0;
116
- while (step <= maxSteps) {
368
+ // Cycle guard (the A↔B ping-pong the exact-repeat guard cannot see): the recent
369
+ // executed step signatures, as fixed-size digests. When a full window cycles
370
+ // through ≤2 distinct calls, bounce ONCE with an explicit correction; a spin that
371
+ // persists through the correction stops the turn.
372
+ const CYCLE_WINDOW = 6;
373
+ const recentStepSigs: string[] = [];
374
+ let cycleBounceUsed = false;
375
+ // Invalid-tool-call guard: a model that returns JSON without a usable `tool`
376
+ // field can't drive the loop at all — surface that clearly instead of looping.
377
+ let invalidToolCalls = 0;
378
+ // Prose-bounce guard: after this many invalid-JSON corrections, salvage the
379
+ // model's text as the final answer instead of burning the whole step budget.
380
+ const MAX_PARSE_BOUNCES = 2;
381
+ let parseFailures = 0;
382
+ while (true) {
383
+ if (turnBudgetMs > 0 && Date.now() - turnStartedAt > turnBudgetMs) {
384
+ stopKind = "time";
385
+ budgetStopReason = `turn wall-clock budget of ${Math.round(turnBudgetMs / 60_000)}m exceeded (JEO_TURN_MAX_MS) without done`;
386
+ break;
387
+ }
388
+ if (step > budget.limit()) {
389
+ const decision = budget.tryExtend();
390
+ if (!decision.extend) {
391
+ budgetStopReason = decision.reason;
392
+ break;
393
+ }
394
+ // One surface per sink: budget-aware consumers get onBudget; others the notice.
395
+ if (ev.onBudget) ev.onBudget(decision.limit, decision.reason);
396
+ else ev.onNotice?.(decision.reason);
397
+ }
117
398
  if (opts.signal?.aborted) {
118
399
  return finish({ done: false, steps: step - 1, doneReason: "Cancelled." });
119
400
  }
120
- ev.onStep?.(step);
401
+ await ev.onStep?.(step);
402
+
403
+ // MID-TURN context guard: a single long turn (60+ steps) otherwise grows the
404
+ // history without bound — turn-boundary compaction never runs inside a turn,
405
+ // and field evidence shows multi-million-token prompts degrading the model
406
+ // into repeat loops while cost compounds. Deterministically elide the OLDEST
407
+ // tool-result bodies once the estimate crosses the budget; recent evidence
408
+ // and all assistant/user content stay intact.
409
+ if (historyTokens(history) > maxHistoryTokens) {
410
+ const res = trimToolResultsInPlace(history, { budgetTokens: maxHistoryTokens });
411
+ if (res.trimmed > 0) {
412
+ ev.onNotice?.(`context guard: elided ${res.trimmed} older tool result(s) mid-turn (~${Math.round(res.tokens / 1000)}k tokens kept)`);
413
+ }
414
+ }
121
415
 
416
+ // Stream the response into the live reasoning view ONLY when a consumer is attached
417
+ // (a TUI). Non-interactive/test callers leave onModelStream unset → a single
418
+ // non-streaming call(), unchanged. The accumulated text is still parsed as one JSON
419
+ // tool call below, so streaming changes nothing about loop semantics.
420
+ let streamBuf = "";
421
+ const onToken = ev.onModelStream
422
+ ? (delta: string) => { streamBuf += delta; ev.onModelStream!(streamBuf); }
423
+ : undefined;
122
424
  let responseText: string;
123
425
  try {
124
- responseText = await callLlm(history, {
426
+ responseText = await invokeCallLlm(history, {
125
427
  jsonMode: true,
126
428
  model: opts.model,
127
429
  maxTokens: opts.maxTokens,
128
430
  signal: opts.signal,
129
431
  onUsage: u => { acc.inputTokens += u.inputTokens ?? 0; acc.outputTokens += u.outputTokens ?? 0; sawUsage = true; },
432
+ onToken,
433
+ // Make provider auto-retry visible: previously a rate-limited call sat in a
434
+ // silent backoff wait, then surfaced "auto-retry was exhausted" with no trace
435
+ // of the retries that DID happen.
436
+ onRetry: (attempt, err, delayMs) => {
437
+ const wait = Math.max(1, Math.round(delayMs / 1000));
438
+ const what = isRateLimitError(err) ? "rate limited (HTTP 429)" : "transient provider error";
439
+ ev.onNotice?.(`${what} — auto-retry #${attempt} in ${wait}s`);
440
+ },
130
441
  });
131
442
  } catch (err) {
132
- const message = (err as Error).message;
133
- ev.onError?.(message);
134
- // Surface the real cause so callers don't print a misleading "step limit" message.
443
+ // Reactive context recovery: trim older tool results in place and retry the
444
+ // SAME step once. The provider's overflow signal beats the local estimate;
445
+ // a second overflow (or nothing left to trim) surfaces the friendly error.
446
+ if (isContextOverflowError(err) && !contextOverflowRetryUsed) {
447
+ contextOverflowRetryUsed = true;
448
+ // keepRecent 2 (vs the proactive guard's 8): the provider already REJECTED
449
+ // this prompt — freeing real space beats keeping evidence that can be re-run.
450
+ const res = trimToolResultsInPlace(history, { budgetTokens: Math.max(1, Math.floor(maxHistoryTokens / 2)), keepRecent: 2 });
451
+ if (res.trimmed > 0) {
452
+ ev.onNotice?.(`provider reported context overflow — elided ${res.trimmed} older tool result(s), retrying once`);
453
+ continue; // free retry: the step counter is unchanged
454
+ }
455
+ }
456
+ // Reactive refusal recovery (the "stop_reason=refusal" dead turn). Anthropic's
457
+ // contract: a refusal means the streaming classifier tripped on the CURRENT
458
+ // conversation content, and the context must be RESET before continuing —
459
+ // resending the same prompt keeps refusing deterministically. Ladder:
460
+ // 1) plain resend — covers a transient classifier flake (the OAuth payload
461
+ // also rotates its per-request user id, which alone can clear a trip);
462
+ // 2) classifier reset — elide tool-result bodies (the usual trigger is
463
+ // freshly-read file/search content, not the task itself) and append a
464
+ // NEUTRAL continuation note. The note deliberately never mentions the
465
+ // safety layer: arguing with the filter reads as a jailbreak attempt
466
+ // and escalates instead of recovering.
467
+ // 3) guidance strip — with tool results already gone, the remaining
468
+ // classifier-trigger candidate is the repo-authored prose injected
469
+ // into the SYSTEM prompt (<project_context> — AGENTS.md / rules can
470
+ // contain text that trips content filters even though the task is
471
+ // routine). Strip that block for the rest of the turn and retry once;
472
+ // core instructions stay intact. Field case: `$gjc init` inside a
473
+ // repo whose guidance files refuse-trip the OAuth classifier.
474
+ if (isRefusalError(err) && refusalRetries < MAX_REFUSAL_RETRIES) {
475
+ refusalRetries++;
476
+ if (refusalRetries === 1) {
477
+ ev.onNotice?.("provider refused the last call (no content) — retrying the same step");
478
+ continue; // free resend: the step counter is unchanged
479
+ }
480
+ if (refusalRetries === 2) {
481
+ const res = trimToolResultsInPlace(history, { budgetTokens: 0, keepRecent: 0 });
482
+ ev.onNotice?.(
483
+ res.trimmed > 0
484
+ ? `provider refused again — reset ${res.trimmed} tool result(s) from the context and retrying (refusals require a context reset)`
485
+ : "provider refused again — continuing with a fresh instruction",
486
+ );
487
+ history.push({
488
+ role: "user",
489
+ content:
490
+ "(continuation) The previous response returned no content and older tool outputs were elided from this conversation. " +
491
+ "Re-assess the task from the remaining context and reply with exactly one JSON tool call " +
492
+ '{"tool":"<name>","arguments":{...}} — re-run any tool whose output you still need, ' +
493
+ 'or send {"tool":"done","arguments":{"reason":"<summary>"}} if the task is finished.',
494
+ });
495
+ step++;
496
+ continue;
497
+ }
498
+ const sys = history[0];
499
+ if (sys?.role === "system" && sys.content.includes("<project_context>")) {
500
+ const stripped = sys.content.replace(/\n*<project_context>[\s\S]*?<\/project_context>/, "").trimEnd();
501
+ history[0] = { ...sys, content: stripped }; // replace, never mutate (identity caches)
502
+ ev.onNotice?.("provider refused a third time — removed project-context guidance from the system prompt and retrying once more");
503
+ continue; // same step, reduced system prompt
504
+ }
505
+ // Nothing left to strip — fall through to the friendly terminal error
506
+ // instead of burning an identical billed call.
507
+ }
508
+ const message = friendlyProviderError(err);
509
+ // The error IS the turn's doneReason and every caller displays that — emitting a
510
+ // separate error event here printed the same message twice (live stream + reply).
135
511
  return finish({ done: false, steps: step, doneReason: `Error: ${message}` });
136
512
  }
513
+ if (sawUsage) ev.onUsage?.({ ...acc });
137
514
 
138
- let invocation: ToolInvocation;
515
+ let invocation: any;
139
516
  try {
140
- invocation = extractJsonObject<ToolInvocation>(responseText);
517
+ invocation = extractJsonObject<any>(responseText);
141
518
  } catch (err) {
142
- // Not valid tool-call JSON — show the model the error and let it retry.
143
519
  ev.onAssistant?.(responseText, null);
520
+ // Prose salvage: a reply with no JSON object at all is a chat-style final
521
+ // answer, not a malformed tool call. Bouncing it back only made the model
522
+ // apologize for the format — and that apology surfaced as the visible reply.
523
+ // Same salvage after repeated bounces: the text we have IS the best answer.
524
+ const trimmed = responseText.trim();
525
+ parseFailures++;
526
+ if (trimmed && (!trimmed.includes("{") || parseFailures > MAX_PARSE_BOUNCES)) {
527
+ history.push({ role: "assistant", content: responseText });
528
+ return finish({ done: true, steps: step, doneReason: trimmed });
529
+ }
144
530
  history.push({ role: "assistant", content: responseText });
145
531
  history.push({
146
532
  role: "user",
147
533
  content:
148
534
  `Your last reply was not a valid tool call (${(err as Error).message}). ` +
149
- `Reply with exactly one JSON object: {"tool":"<name>","arguments":{...}}.`,
535
+ `Do NOT apologize or explain the formatting mistake. If that reply was your final answer, ` +
536
+ `resend it as {"tool":"done","arguments":{"reason":"<that answer, verbatim>"}}; ` +
537
+ `otherwise reply with exactly one JSON tool call: {"tool":"<name>","arguments":{...}}.`,
538
+ });
539
+ step++;
540
+ continue;
541
+ }
542
+ // A successfully parsed reply ends any bounce streak: MAX_PARSE_BOUNCES is a
543
+ // CONSECUTIVE-failure salvage, not a cumulative one — without this reset a long
544
+ // turn accumulated scattered parse slips and prematurely salvaged mid-task prose.
545
+ parseFailures = 0;
546
+
547
+ // Normalize to an invocation list
548
+ let toolCalls: { tool: string; arguments?: Record<string, any> }[] = [];
549
+ if (invocation && typeof invocation === "object") {
550
+ if (Array.isArray(invocation.tools)) {
551
+ const isValidBatch = invocation.tools.length > 0 && invocation.tools.every(
552
+ (t: any) => t && typeof t === "object" && typeof t.tool === "string" && t.tool.trim().length > 0
553
+ );
554
+ if (isValidBatch) {
555
+ toolCalls = invocation.tools.map((t: any) => ({
556
+ tool: t.tool.trim(),
557
+ arguments: t.arguments
558
+ }));
559
+ }
560
+ } else if (typeof invocation.tool === "string" && invocation.tool.trim().length > 0) {
561
+ toolCalls = [{
562
+ tool: invocation.tool.trim(),
563
+ arguments: invocation.arguments
564
+ }];
565
+ }
566
+ }
567
+
568
+ if (toolCalls.length === 0) {
569
+ invalidToolCalls++;
570
+ if (invalidToolCalls >= MAX_REPEAT) {
571
+ return finish({
572
+ done: false,
573
+ steps: step,
574
+ doneReason: `Stopped: the model returned no valid tool call ${MAX_REPEAT}× (a JSON reply with no valid "tool" or "tools" field). The selected model may be too small to follow the JSON tool protocol — switch to a stronger model with /model.`,
575
+ });
576
+ }
577
+ history.push({ role: "assistant", content: responseText });
578
+ history.push({
579
+ role: "user",
580
+ content: `Your last reply had no "tool" or "tools" field. Reply with exactly one JSON object, e.g. {"tool":"find","arguments":{"globPattern":"src/**"}} or {"tools":[{"tool":"read","arguments":{"filePath":"src/main.ts"}}, ...]}.`,
150
581
  });
151
582
  step++;
152
583
  continue;
153
584
  }
585
+ invalidToolCalls = 0;
154
586
 
155
- ev.onAssistant?.(responseText, invocation);
587
+ if (toolCalls.length > 6) {
588
+ ev.onNotice?.(`Too many tool calls in batch (${toolCalls.length}); capping at 6 and dropping the rest.`);
589
+ toolCalls = toolCalls.slice(0, 6);
590
+ }
591
+
592
+ ev.onAssistant?.(responseText, toolCalls[0]);
156
593
 
157
- if (invocation.tool === "done") {
158
- return finish({ done: true, steps: step, doneReason: (invocation.arguments?.reason as string) ?? "" });
594
+ if (toolCalls.length === 1 && toolCalls[0].tool === "done") {
595
+ if (sawMutation && (!sawVerification || pendingHookFailure !== null) && !donePushbackUsed) {
596
+ donePushbackUsed = true; // second done always passes — escape hatch
597
+ history.push({ role: "assistant", content: responseText });
598
+ history.push({
599
+ role: "user",
600
+ content: pendingHookFailure !== null
601
+ ? `Your latest mutation left the post-turn hook "${pendingHookFailure}" FAILING (non-zero exit) — its diagnostics were shown in the tool result above. ` +
602
+ "Fix the reported problems (the hook re-runs on your next mutation), then call done. " +
603
+ "If the hook failure is a false positive, call done again and say why in the reason."
604
+ : "You modified files this turn but ran NO verification (no test/build/typecheck command succeeded). " +
605
+ "Run the narrowest command that proves your change works, then call done. " +
606
+ "If verification is genuinely not applicable (docs/config-only change), call done again and say why in the reason.",
607
+ });
608
+ step++;
609
+ continue;
610
+ }
611
+ // Caller-owned done gate (e.g. stale-todo reconciliation): ONE bounded
612
+ // bounce, then any later done passes — field case: a 28-step turn ended
613
+ // [DONE] with the Todos checklist still showing 1 in-progress + 4 pending
614
+ // because nothing ever forced a status update.
615
+ if (!beforeDoneNudgeUsed && ev.onBeforeDone) {
616
+ const nudge = ev.onBeforeDone((toolCalls[0].arguments?.reason as string) ?? "");
617
+ if (nudge) {
618
+ beforeDoneNudgeUsed = true;
619
+ history.push({ role: "assistant", content: responseText });
620
+ history.push({ role: "user", content: nudge });
621
+ ev.onNotice?.("done deferred once — final plan reconciliation requested");
622
+ step++;
623
+ continue;
624
+ }
625
+ }
626
+ return finish({ done: true, steps: step, doneReason: (toolCalls[0].arguments?.reason as string) ?? "" });
159
627
  }
160
628
 
161
- // Detect repeated identical tool calls (no forward progress).
162
- const sig = `${invocation.tool}:${JSON.stringify(invocation.arguments ?? {})}`;
629
+ // Anti-spin guard, checked BEFORE execution: a repeated identical step must
630
+ // not run its calls again — a repeated mutating bash/edit must not execute
631
+ // a third time merely to be detected.
632
+ // - 2nd identical step → ONE corrective bounce (skip execution, tell the
633
+ // model its previous identical call already ran and to either act
634
+ // differently or call done). Field evidence: long turns died here right
635
+ // after a SUCCESSFUL write because nothing ever told the model to stop
636
+ // repeating — a recovery prompt resolves that without killing the turn.
637
+ // - 3rd identical step (repeated through the explicit correction) → stop.
638
+ const callSigs = toolCalls.map(c => `${c.tool}:${JSON.stringify(c.arguments ?? {})}`);
639
+ // Fixed-size digest of the whole step — `write` signatures embed entire file
640
+ // bodies, so the repeat/cycle guards compare digests, not megabyte strings.
641
+ const sig = hashSignature(callSigs.join(" | "));
163
642
  if (sig === lastSig) repeatCount++;
164
643
  else {
165
644
  repeatCount = 1;
166
645
  lastSig = sig;
167
646
  }
647
+ if (repeatCount === 2) {
648
+ const what = toolCalls.length === 1 ? `'${toolCalls[0].tool}' call` : "tool batch";
649
+ history.push({ role: "assistant", content: responseText });
650
+ history.push({
651
+ role: "user",
652
+ content:
653
+ `You just repeated the EXACT same ${what} you already ran in the previous step — it was not re-executed. ` +
654
+ `Its result has not changed. If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
655
+ `otherwise take a DIFFERENT next action (verify the result, move to the next file, or fix something new).`,
656
+ });
657
+ ev.onNotice?.(`repeated ${what} skipped — asked the model to act differently or call done`);
658
+ step++;
659
+ continue;
660
+ }
168
661
  if (repeatCount >= MAX_REPEAT) {
662
+ const what = toolCalls.length === 1 ? `the same '${toolCalls[0].tool}' call` : "the same tool calls";
169
663
  return finish({
170
664
  done: false,
171
665
  steps: step,
172
- doneReason: `Stopped: repeated the same '${invocation.tool}' call ${MAX_REPEAT}× with no new progress (the model never signaled done).`,
666
+ doneReason: `Stopped: repeated ${what} ${MAX_REPEAT}× even after an explicit correction (the model never signaled done).`,
173
667
  });
174
668
  }
175
669
 
176
- const handler = tools[invocation.tool];
177
- let success: boolean;
178
- let output: string;
179
- if (!handler) {
180
- success = false;
181
- output = `Unknown tool: ${invocation.tool}. Available: ${Object.keys(tools).join(", ")}, done.`;
182
- } else {
183
- const res = await handler(invocation.arguments ?? {}, cwd);
184
- success = res.success;
185
- output = res.success ? res.output : (res.error ? (res.output ? `${res.error}\n${res.output}` : res.error) : res.output);
670
+ // Cycle guard: an A↔B (or A↔B↔C-minus-one) alternation never trips the
671
+ // exact-repeat guard above — each step differs from its immediate predecessor —
672
+ // yet it is the same spin (field case: re-reading one file and re-running one
673
+ // command forever, "thinking" never ends). Detect a full recent window that
674
+ // cycles through ≤2 distinct step signatures: ONE corrective bounce (skip
675
+ // execution — a repeated mutating call must not run again merely to be
676
+ // detected), then stop if the spin survives the explicit correction.
677
+ recentStepSigs.push(sig);
678
+ if (recentStepSigs.length > CYCLE_WINDOW) recentStepSigs.shift();
679
+ if (recentStepSigs.length === CYCLE_WINDOW && new Set(recentStepSigs).size <= 2) {
680
+ if (!cycleBounceUsed) {
681
+ cycleBounceUsed = true;
682
+ recentStepSigs.length = 0; // fresh window: the correction earns a real retry
683
+ history.push({ role: "assistant", content: responseText });
684
+ history.push({
685
+ role: "user",
686
+ content:
687
+ `You are cycling through the same ${new Set(callSigs).size <= 1 ? "tool call" : "tool calls"} you already ran in recent steps — this call was NOT re-executed and its result has not changed. ` +
688
+ `If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
689
+ `otherwise take a genuinely DIFFERENT next action (a new file, a new command, or a fix you have not tried).`,
690
+ });
691
+ ev.onNotice?.("tool-call cycle detected — skipped execution and asked the model to act differently or call done");
692
+ step++;
693
+ continue;
694
+ }
695
+ return finish({
696
+ done: false,
697
+ steps: step,
698
+ doneReason: `Stopped: the model cycled through the same tool calls for ${CYCLE_WINDOW} consecutive steps even after an explicit correction (it never signaled done).`,
699
+ });
186
700
  }
187
701
 
188
- ev.onToolResult?.(invocation.tool, success, output);
189
- history.push({ role: "assistant", content: responseText });
190
- history.push({
191
- role: "user",
192
- content: `Tool [${invocation.tool}] result (${success ? "ok" : "fail"}):\n${truncateToolOutput(output)}`,
193
- });
702
+ // Helper to execute a single tool call
703
+ const executeTool = async (call: { tool: string; arguments?: Record<string, any> }) => {
704
+ const { tool, arguments: args } = call;
705
+ let success: boolean;
706
+ let output: string;
194
707
 
195
- if (success) {
708
+ if (tool === "done") {
709
+ success = false;
710
+ output = "Error: 'done' can only be called as the single tool invocation, not in a batch. Please send 'done' alone.";
711
+ } else {
712
+ const handler = tools[tool];
713
+ if (!handler) {
714
+ success = false;
715
+ const suggestion = nearestToolName(tool, Object.keys(tools));
716
+ const hint = suggestion ? ` Did you mean "${suggestion}"?` : "";
717
+ output = `Unknown tool: ${tool}.${hint} Available: ${Object.keys(tools).join(", ")}, done.`;
718
+ } else {
719
+ const preHookResult = await runPreToolHooks(
720
+ cwd,
721
+ tool,
722
+ args ?? {},
723
+ opts.signal,
724
+ ev.onNotice
725
+ );
726
+ if (preHookResult.vetoed) {
727
+ success = false;
728
+ output = preHookResult.error + (preHookResult.output ? `\n${preHookResult.output}` : "");
729
+ } else {
730
+ try {
731
+ const res = await handler(args ?? {}, cwd);
732
+ success = res.success;
733
+ output = res.success ? res.output : (res.error ? (res.output ? `${res.error}\n${res.output}` : res.error) : res.output);
734
+ } catch (err: any) {
735
+ success = false;
736
+ output = err?.message || String(err);
737
+ }
738
+ }
739
+ }
740
+ }
741
+ return { success, output };
742
+ };
743
+
744
+ const READONLY_TOOLS = new Set(["read", "find", "search", "ls", "web_search"]);
745
+ const WRITE_TOOLS = new Set(["write", "edit"]);
746
+ // Batch grouping → concurrency plan (plan/gjc-inheritance.md cycle 12):
747
+ // read group — consecutive read-only calls run in parallel (safe).
748
+ // write group — consecutive write/edit calls to DISTINCT files run in
749
+ // parallel; a same-file (or path-less) collision opens a
750
+ // sequential boundary so ordered edits to one file stay ordered.
751
+ // exclusive group — bash (and anything else) always runs alone, in order.
752
+ // Reads and writes never share a group, so a read can never race a write.
753
+ type ToolGroup = {
754
+ kind: "read" | "write" | "exclusive";
755
+ calls: { tool: string; arguments?: Record<string, any>; index: number }[];
756
+ files?: Set<string>;
757
+ };
758
+ const groups: ToolGroup[] = [];
759
+ // Dedup key = RESOLVED, case-folded path (F3): `./x.ts` vs `x.ts` vs
760
+ // `src/../x.ts` — and case variants on the (default case-insensitive) macOS
761
+ // FS — must collapse to ONE key, or two spellings of the same file run in
762
+ // parallel and the second write silently clobbers the first. Folding case on
763
+ // a case-sensitive FS merely serializes two genuinely-distinct files — safe.
764
+ const targetFile = (call: { arguments?: Record<string, any> }): string | null => {
765
+ const p = call.arguments?.filePath ?? call.arguments?.path;
766
+ return typeof p === "string" && p.trim() !== "" ? path.resolve(cwd, p).toLowerCase() : null;
767
+ };
768
+ for (let i = 0; i < toolCalls.length; i++) {
769
+ const entry = { ...toolCalls[i], index: i };
770
+ const last = groups[groups.length - 1];
771
+ if (READONLY_TOOLS.has(entry.tool)) {
772
+ if (last && last.kind === "read") last.calls.push(entry);
773
+ else groups.push({ kind: "read", calls: [entry] });
774
+ } else if (WRITE_TOOLS.has(entry.tool)) {
775
+ const file = targetFile(entry);
776
+ if (last && last.kind === "write" && file !== null && !last.files!.has(file)) {
777
+ last.calls.push(entry);
778
+ last.files!.add(file);
779
+ } else {
780
+ groups.push({ kind: "write", calls: [entry], files: new Set(file !== null ? [file] : []) });
781
+ }
782
+ } else {
783
+ groups.push({ kind: "exclusive", calls: [entry] });
784
+ }
785
+ }
786
+
787
+ const results: { success: boolean; output: string; executed: boolean }[] = Array.from(
788
+ { length: toolCalls.length },
789
+ () => ({ success: false, output: "", executed: false })
790
+ );
791
+
792
+ let aborted = false;
793
+ for (const group of groups) {
794
+ if (opts.signal?.aborted) {
795
+ aborted = true;
796
+ break;
797
+ }
798
+ if (group.calls.length > 1) {
799
+ // read OR distinct-file write group → run concurrently.
800
+ await Promise.all(group.calls.map(async (call) => {
801
+ const res = await executeTool(call);
802
+ results[call.index] = { ...res, executed: true };
803
+ }));
804
+ } else {
805
+ const call = group.calls[0];
806
+ const res = await executeTool(call);
807
+ results[call.index] = { ...res, executed: true };
808
+ }
809
+ }
810
+
811
+ const processAndPushResults = async (indices: number[]) => {
812
+ const resultBlocks: string[] = [];
813
+ // Per-batch dedup of post-turn hook diagnostics: a whole-project `tsc` hook
814
+ // matching every edit in a batch yields identical output N times — show it
815
+ // once, cross-reference the rest (cycle 13).
816
+ const seenHookFeedback = new Set<string>();
817
+ for (const idx of indices) {
818
+ const call = toolCalls[idx];
819
+ const res = results[idx];
820
+
821
+ ev.onToolResult?.(call.tool, res.success, res.output);
822
+
823
+ const minimized = minimizeToolOutput(res.output, call.tool);
824
+ const visible = minimized.text;
825
+ let resultBody = truncateToolOutput(visible);
826
+ if (res.output.length > TOOL_SPILL_THRESHOLD) {
827
+ const artifact = await spillToolResult(call.tool, res.output, cwd).catch(() => null);
828
+ if (artifact) {
829
+ resultBody += `\n[full output (${res.output.length} chars) saved to ${artifact} — read it for the elided middle]`;
830
+ }
831
+ }
832
+
833
+ const { diags: hookDiags, ran: hooksRan } = await runPostTurnHooks(
834
+ cwd,
835
+ call.tool,
836
+ call.arguments ?? {},
837
+ res.success,
838
+ res.output,
839
+ opts.signal,
840
+ ev.onNotice
841
+ );
842
+ // F1: a red hook becomes a pending failure the done guard enforces; a
843
+ // later hook run that completes CLEAN (ran > 0, zero diags) clears it.
844
+ if (hookDiags.length > 0) pendingHookFailure = hookDiags[hookDiags.length - 1].run;
845
+ else if (hooksRan > 0) pendingHookFailure = null;
846
+
847
+ // Append non-zero-exit hook diagnostics to THIS tool's result block so the
848
+ // model can self-correct. The tool's own ok/fail is unchanged (guard).
849
+ let resultBlock = `Tool [${call.tool}] result (${res.success ? "ok" : "fail"}):\n${resultBody}`;
850
+ for (const d of hookDiags) {
851
+ const key = `${d.run}\u0000${d.output}`;
852
+ if (seenHookFeedback.has(key)) {
853
+ resultBlock += `\n[post-turn hook "${d.run}" — exit ${d.exitCode}: same diagnostics as above]`;
854
+ } else {
855
+ seenHookFeedback.add(key);
856
+ resultBlock += `\n[post-turn hook "${d.run}" — exit ${d.exitCode}]:\n${truncateToolOutput(d.output)}`;
857
+ }
858
+ }
859
+ resultBlocks.push(resultBlock);
860
+ }
861
+
862
+ history.push({ role: "assistant", content: responseText });
863
+ history.push({
864
+ role: "user",
865
+ content: resultBlocks.join("\n\n"),
866
+ });
867
+ };
868
+
869
+ if (aborted) {
870
+ const executedIndices = results.map((r, i) => r.executed ? i : -1).filter(i => i !== -1);
871
+ if (executedIndices.length > 0) {
872
+ await processAndPushResults(executedIndices);
873
+ }
874
+ return finish({ done: false, steps: step, doneReason: "Cancelled." });
875
+ }
876
+
877
+ const allIndices = toolCalls.map((_, i) => i);
878
+ await processAndPushResults(allIndices);
879
+
880
+ // Score the budget window per CALL, not per batch: a batch of five failing
881
+ // edits plus one trivial successful read must not look like a progressing
882
+ // step to the extension heuristic (that loophole earned endless extensions).
883
+ for (let i = 0; i < toolCalls.length; i++) {
884
+ if (results[i].executed) budget.record(callSigs[i], results[i].success);
885
+ }
886
+ // done-verification guard bookkeeping: write/edit successes mark the turn as
887
+ // mutating; a successful bash whose command/output looks like a test/build run
888
+ // counts as verification. A verification AFTER the last mutation is what the
889
+ // done guard wants, but order-insensitive tracking keeps it one-pushback simple.
890
+ for (let i = 0; i < toolCalls.length; i++) {
891
+ if (!results[i].executed || !results[i].success) continue;
892
+ const t = toolCalls[i].tool;
893
+ if (t === "write" || t === "edit") sawMutation = true;
894
+ else if (t === "bash") {
895
+ const cmd = String(toolCalls[i].arguments?.command ?? "");
896
+ if (VERIFY_SIGNAL_RE.test(cmd) || VERIFY_SIGNAL_RE.test(results[i].output.slice(0, 2000))) sawVerification = true;
897
+ }
898
+ }
899
+ // F6 (round 4 architect, Low): judge the step by its NON-TRIVIAL calls — a
900
+ // batch of read(ok)+edit(fail) repeated with varying targets previously
901
+ // never tripped MAX_FAILURES because the trivial read reset the streak.
902
+ // Read-only-only steps keep the old any-success rule.
903
+ const nonTrivial = toolCalls
904
+ .map((c, i) => ({ tool: c.tool, r: results[i] }))
905
+ .filter(x => !READONLY_TOOLS.has(x.tool) && x.r.executed);
906
+ const stepSuccess = nonTrivial.length > 0
907
+ ? nonTrivial.some(x => x.r.success)
908
+ : results.some(r => r.success);
909
+
910
+ if (stepSuccess) {
196
911
  consecutiveFailures = 0;
197
912
  } else if (++consecutiveFailures >= MAX_FAILURES) {
913
+ const isSingle = toolCalls.length === 1;
914
+ const stopMsg = isSingle
915
+ ? `Stopped: ${MAX_FAILURES} consecutive failing tool calls (last '${toolCalls[0].tool}'); the model could not recover.`
916
+ : `Stopped: ${MAX_FAILURES} consecutive failing tool steps; the model could not recover.`;
198
917
  return finish({
199
918
  done: false,
200
919
  steps: step,
201
- doneReason: `Stopped: ${MAX_FAILURES} consecutive failing tool calls (last '${invocation.tool}'); the model could not recover.`,
920
+ doneReason: stopMsg,
202
921
  });
203
922
  }
204
923
  step++;
205
924
  }
206
925
 
207
- return finish({ done: false, steps: maxSteps });
926
+ // Budget exhausted without `done` (step limit declined a further extension, or
927
+ // the turn wall-clock budget fired). Instead of dying with a bare limit error,
928
+ // dynamically CONSOLIDATE: one final no-tools model call summarizes what was
929
+ // accomplished, key findings, and what remains — a useful wrap-up, not a failure.
930
+ const extInfo = budget.extensionsUsed() > 0 ? ` after ${budget.extensionsUsed()} extension(s)` : "";
931
+ const stopInfo = budgetStopReason ? `; ${budgetStopReason}` : "";
932
+ const budgetLabel = stopKind === "time"
933
+ ? `turn time budget of ${Math.round(turnBudgetMs / 60_000)}m reached`
934
+ : `step budget of ${budget.limit()} reached`;
935
+ try {
936
+ if (!opts.signal?.aborted) {
937
+ const wrapUp = await invokeCallLlm(
938
+ [
939
+ ...history,
940
+ {
941
+ role: "user",
942
+ content:
943
+ "The budget for this turn is exhausted. Do NOT call any tool. " +
944
+ "Reply with plain prose (no JSON): consolidate what you accomplished this turn, " +
945
+ "the key findings/changes so far, and what remains to be done next.",
946
+ },
947
+ ],
948
+ { jsonMode: false, model: opts.model, maxTokens: opts.maxTokens, signal: opts.signal },
949
+ );
950
+ const consolidated = wrapUp.trim();
951
+ if (consolidated) {
952
+ history.push({ role: "assistant", content: consolidated });
953
+ return finish({
954
+ done: false,
955
+ steps: budget.limit(),
956
+ doneReason: `${consolidated}\n\n(${budgetLabel}${extInfo}${stopInfo} — consolidated wrap-up above; continue with a follow-up request)`,
957
+ });
958
+ }
959
+ }
960
+ } catch { /* wrap-up is best-effort; fall through to the plain budget message */ }
961
+ return finish({ done: false, steps: stopKind === "time" ? step : budget.limit(), doneReason: budgetStopReason ? `(${budgetLabel}${extInfo} — ${budgetStopReason})` : undefined });
208
962
  }