jeo-code 0.1.0 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/README.ja.md +160 -0
  2. package/README.ko.md +160 -0
  3. package/README.md +115 -297
  4. package/README.zh.md +160 -0
  5. package/package.json +11 -6
  6. package/scripts/install.sh +28 -28
  7. package/scripts/uninstall.sh +17 -15
  8. package/src/AGENTS.md +50 -0
  9. package/src/agent/AGENTS.md +49 -0
  10. package/src/agent/bash-fixups.ts +103 -0
  11. package/src/agent/compaction.ts +410 -19
  12. package/src/agent/config-schema.ts +119 -5
  13. package/src/agent/context-files.ts +314 -17
  14. package/src/agent/dev/AGENTS.md +36 -0
  15. package/src/agent/dev/advanced-analyzer.ts +12 -0
  16. package/src/agent/dev/evolution-bridge.ts +82 -0
  17. package/src/agent/dev/evolution-logger.ts +41 -0
  18. package/src/agent/dev/self-analysis.ts +64 -0
  19. package/src/agent/dev/self-improve.ts +24 -0
  20. package/src/agent/dev/spec-automation.ts +49 -0
  21. package/src/agent/engine.ts +804 -54
  22. package/src/agent/hooks.ts +273 -0
  23. package/src/agent/loop.ts +21 -1
  24. package/src/agent/memory.ts +201 -0
  25. package/src/agent/model-recency.ts +32 -0
  26. package/src/agent/output-minimizer.ts +108 -0
  27. package/src/agent/output-util.ts +64 -0
  28. package/src/agent/plan.ts +187 -0
  29. package/src/agent/seed.ts +52 -0
  30. package/src/agent/session.ts +235 -21
  31. package/src/agent/state.ts +286 -39
  32. package/src/agent/step-budget.ts +232 -0
  33. package/src/agent/subagents.ts +223 -26
  34. package/src/agent/task-tool.ts +272 -0
  35. package/src/agent/todo-tool.ts +87 -0
  36. package/src/agent/tokenizer.ts +117 -0
  37. package/src/agent/tool-registry.ts +54 -0
  38. package/src/agent/tools.ts +562 -103
  39. package/src/agent/web-search.ts +538 -0
  40. package/src/ai/AGENTS.md +44 -0
  41. package/src/ai/index.ts +1 -0
  42. package/src/ai/model-catalog-compat.ts +3 -1
  43. package/src/ai/model-catalog.ts +74 -9
  44. package/src/ai/model-discovery.ts +215 -17
  45. package/src/ai/model-manager.ts +346 -32
  46. package/src/ai/model-picker.ts +1 -1
  47. package/src/ai/model-registry.ts +4 -2
  48. package/src/ai/pricing.ts +84 -0
  49. package/src/ai/provider-registry.ts +23 -0
  50. package/src/ai/provider-status.ts +60 -16
  51. package/src/ai/providers/AGENTS.md +42 -0
  52. package/src/ai/providers/anthropic.ts +250 -31
  53. package/src/ai/providers/antigravity.ts +219 -0
  54. package/src/ai/providers/errors.ts +15 -1
  55. package/src/ai/providers/gemini.ts +196 -13
  56. package/src/ai/providers/ollama.ts +37 -7
  57. package/src/ai/providers/openai-responses.ts +173 -0
  58. package/src/ai/providers/openai.ts +64 -12
  59. package/src/ai/sse.ts +4 -1
  60. package/src/ai/types.ts +18 -1
  61. package/src/auth/AGENTS.md +41 -0
  62. package/src/auth/callback-server.ts +6 -1
  63. package/src/auth/flows/AGENTS.md +32 -0
  64. package/src/auth/flows/antigravity.ts +151 -0
  65. package/src/auth/flows/google-project.ts +190 -0
  66. package/src/auth/flows/google.ts +39 -18
  67. package/src/auth/flows/index.ts +15 -5
  68. package/src/auth/flows/openai.ts +2 -2
  69. package/src/auth/oauth.ts +8 -0
  70. package/src/auth/refresh.ts +44 -27
  71. package/src/auth/storage.ts +149 -26
  72. package/src/auth/types.ts +1 -1
  73. package/src/autopilot.ts +362 -0
  74. package/src/bun-imports.d.ts +4 -0
  75. package/src/cli/AGENTS.md +39 -0
  76. package/src/cli/runner.ts +148 -14
  77. package/src/cli.ts +13 -4
  78. package/src/commands/AGENTS.md +40 -0
  79. package/src/commands/approve.ts +62 -3
  80. package/src/commands/auth.ts +167 -25
  81. package/src/commands/chat.ts +37 -8
  82. package/src/commands/deep-interview.ts +633 -175
  83. package/src/commands/doctor.ts +84 -37
  84. package/src/commands/evolve-core.ts +18 -0
  85. package/src/commands/evolve.ts +2 -1
  86. package/src/commands/export.ts +176 -0
  87. package/src/commands/gjc.ts +52 -0
  88. package/src/commands/launch.ts +3549 -240
  89. package/src/commands/mcp.ts +3 -3
  90. package/src/commands/ooo-seed.ts +19 -0
  91. package/src/commands/ralplan.ts +253 -35
  92. package/src/commands/resume.ts +1 -1
  93. package/src/commands/session.ts +183 -0
  94. package/src/commands/setup-helpers.ts +10 -3
  95. package/src/commands/setup.ts +57 -16
  96. package/src/commands/skills.ts +78 -18
  97. package/src/commands/state.ts +198 -0
  98. package/src/commands/status.ts +84 -0
  99. package/src/commands/team.ts +340 -212
  100. package/src/commands/ultragoal.ts +122 -61
  101. package/src/commands/update.ts +244 -0
  102. package/src/ledger.ts +270 -0
  103. package/src/mcp/AGENTS.md +38 -0
  104. package/src/mcp/server.ts +115 -14
  105. package/src/mcp/tools.ts +42 -22
  106. package/src/md-modules.d.ts +4 -0
  107. package/src/prompts/AGENTS.md +41 -0
  108. package/src/prompts/agents/AGENTS.md +35 -0
  109. package/src/prompts/agents/architect.md +35 -0
  110. package/src/prompts/agents/critic.md +37 -0
  111. package/src/prompts/agents/executor.md +36 -0
  112. package/src/prompts/agents/planner.md +37 -0
  113. package/src/prompts/skills/AGENTS.md +36 -0
  114. package/src/prompts/skills/deep-dive/AGENTS.md +31 -0
  115. package/src/prompts/skills/deep-dive/SKILL.md +13 -0
  116. package/src/prompts/skills/deep-interview/AGENTS.md +31 -0
  117. package/src/prompts/skills/deep-interview/SKILL.md +12 -0
  118. package/src/prompts/skills/gjc/AGENTS.md +31 -0
  119. package/src/prompts/skills/gjc/SKILL.md +15 -0
  120. package/src/prompts/skills/ralplan/AGENTS.md +31 -0
  121. package/src/prompts/skills/ralplan/SKILL.md +11 -0
  122. package/src/prompts/skills/team/AGENTS.md +31 -0
  123. package/src/prompts/skills/team/SKILL.md +11 -0
  124. package/src/prompts/skills/ultragoal/AGENTS.md +31 -0
  125. package/src/prompts/skills/ultragoal/SKILL.md +11 -0
  126. package/src/skills/AGENTS.md +38 -0
  127. package/src/skills/catalog.ts +565 -31
  128. package/src/tui/AGENTS.md +43 -0
  129. package/src/tui/app.ts +1181 -92
  130. package/src/tui/components/AGENTS.md +42 -0
  131. package/src/tui/components/ascii-art.ts +257 -15
  132. package/src/tui/components/autocomplete.ts +98 -16
  133. package/src/tui/components/autopilot-status.ts +65 -0
  134. package/src/tui/components/category-index.ts +49 -0
  135. package/src/tui/components/code-view.ts +54 -11
  136. package/src/tui/components/color.ts +171 -2
  137. package/src/tui/components/config-panel.ts +82 -15
  138. package/src/tui/components/duration.ts +38 -0
  139. package/src/tui/components/evolution.ts +3 -3
  140. package/src/tui/components/footer.ts +91 -42
  141. package/src/tui/components/forge.ts +426 -31
  142. package/src/tui/components/hints.ts +54 -0
  143. package/src/tui/components/hud.ts +73 -0
  144. package/src/tui/components/index.ts +4 -0
  145. package/src/tui/components/input-box.ts +150 -0
  146. package/src/tui/components/layout.ts +11 -3
  147. package/src/tui/components/live-model-picker.ts +108 -0
  148. package/src/tui/components/markdown-table.ts +140 -0
  149. package/src/tui/components/markdown-text.ts +97 -0
  150. package/src/tui/components/meter.ts +4 -1
  151. package/src/tui/components/model-picker.ts +3 -2
  152. package/src/tui/components/provider-picker.ts +3 -2
  153. package/src/tui/components/section.ts +70 -0
  154. package/src/tui/components/select-list.ts +40 -10
  155. package/src/tui/components/skill-picker.ts +25 -0
  156. package/src/tui/components/slash.ts +244 -21
  157. package/src/tui/components/status.ts +272 -11
  158. package/src/tui/components/step-timeline.ts +218 -0
  159. package/src/tui/components/stream.ts +26 -9
  160. package/src/tui/components/themes.ts +212 -6
  161. package/src/tui/components/todo-card.ts +47 -0
  162. package/src/tui/components/tool-list.ts +58 -12
  163. package/src/tui/components/transcript.ts +120 -0
  164. package/src/tui/components/update-box.ts +31 -0
  165. package/src/tui/components/welcome.ts +162 -0
  166. package/src/tui/components/width.ts +163 -0
  167. package/src/tui/monitoring/AGENTS.md +31 -0
  168. package/src/tui/monitoring/hud-view.ts +55 -0
  169. package/src/tui/renderer.ts +112 -3
  170. package/src/tui/terminal.ts +40 -33
  171. package/src/util/AGENTS.md +39 -0
  172. package/src/util/clipboard-image.ts +118 -0
  173. package/src/util/env.ts +12 -0
  174. package/src/util/provider-error.ts +78 -0
  175. package/src/util/retry.ts +91 -6
  176. package/src/util/update-check.ts +64 -0
  177. package/src/commands/models.ts +0 -104
@@ -1,16 +1,39 @@
1
1
  /**
2
- * Reusable agentic tool-call loop — the shared core behind `joc team`
3
- * (per-task executor) and `joc launch` (interactive coding agent).
2
+ * Reusable agentic tool-call loop — the shared core behind `jeo team`
3
+ * (per-task executor) and `jeo launch` (interactive coding agent).
4
4
  *
5
5
  * The model is driven in JSON tool-call mode: each step it emits exactly one
6
6
  * `{ "tool": "...", "arguments": { ... } }` object; the engine dispatches it,
7
7
  * appends the result to history, and continues until the model calls `done`
8
8
  * or the step budget is exhausted.
9
9
  */
10
- import { callLlm, type Message } from "./loop";
10
+ import * as fs from "node:fs/promises";
11
+ import * as path from "node:path";
12
+ import type { Message } from "./loop";
11
13
  import { extractJsonObject } from "./json";
12
- import { readTool, writeTool, editTool, bashTool, findTool, searchTool, type ToolResult } from "./tools";
14
+ import { readTool, writeTool, editTool, bashTool, findTool, searchTool, lsTool, type ToolResult } from "./tools";
15
+ import { webSearchTool, setWebSearchActiveModel } from "./web-search";
16
+ import { friendlyProviderError, isContextOverflowError, isRefusalError } from "../util/provider-error";
17
+ import { isRateLimitError } from "../util/retry";
18
+ import { runPreToolHooks, runPostTurnHooks } from "./hooks";
19
+ import { minimizeToolOutput } from "./output-minimizer";
20
+ import { StepBudget, dynamicStepBudgetConfig, resolveStepBudgetConfig, hashSignature, type StepBudgetConfig } from "./step-budget";
21
+ import { historyTokens, trimToolResultsInPlace } from "./compaction";
22
+ import { jeoEnv } from "../util/env";
13
23
 
24
+
25
/**
 * Thin indirection over `callLlm` from `./loop`.
 *
 * The module is resolved through a dynamic `import()` at call time rather than a
 * static value import (only the `Message` type is imported statically).
 * NOTE(review): presumably this avoids an import cycle or keeps `./loop`
 * lazily loaded — confirm before replacing it with a static import.
 *
 * @param history Conversation messages, forwarded unchanged to `callLlm`.
 * @param options Per-call settings, forwarded unchanged to `callLlm`.
 * @returns Whatever string `callLlm` resolves with.
 */
async function invokeCallLlm(
  history: Message[],
  options: {
    jsonMode: boolean;
    model?: string;
    maxTokens?: number;
    signal?: AbortSignal;
    onUsage?: (u: { inputTokens?: number; outputTokens?: number }) => void;
    onRetry?: (attempt: number, err: unknown, delayMs: number) => void;
    onToken?: (delta: string) => void;
  },
): Promise<string> {
  const loopModule = await import("./loop");
  return loopModule.callLlm(history, options);
}
14
37
  export interface ToolInvocation {
15
38
  tool: string;
16
39
  arguments?: Record<string, any>;
@@ -20,47 +43,122 @@ export type ToolHandler = (args: Record<string, any>, cwd: string) => Promise<To
20
43
 
21
44
  /** The default executor toolset (read / write / edit / bash / find / search / ls / web_search). */
22
45
  export const DEFAULT_TOOLS: Record<string, ToolHandler> = {
23
- read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange, cwd),
46
+ read: (a, cwd) => readTool(a.filePath ?? a.path, a.lineRange ?? a.range, cwd, !!a.raw),
24
47
  write: (a, cwd) => writeTool(a.filePath ?? a.path, a.content ?? "", cwd),
25
48
  edit: (a, cwd) => editTool(a.filePath ?? a.path, a.editBlock ?? a.edit ?? "", cwd),
26
- bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined),
49
+ bash: (a, cwd) => bashTool(a.command ?? a.cmd, cwd, typeof a.timeoutMs === "number" ? a.timeoutMs : undefined, typeof a.cwd === "string" ? a.cwd : (typeof a.subdir === "string" ? a.subdir : undefined), a.env && typeof a.env === "object" ? a.env : undefined),
27
50
  find: (a, cwd) => findTool(a.globPattern ?? a.pattern, cwd),
28
- search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd),
51
+ search: (a, cwd) => searchTool(a.pattern, a.globPattern ?? "*", cwd, !!(a.ignoreCase ?? a.i), { before: a.before, after: a.after, context: a.context, maxMatches: a.maxMatches }),
52
+ ls: (a, cwd) => lsTool(a.dirPath ?? a.path ?? a.dir ?? ".", cwd),
53
+ web_search: (a, cwd) => webSearchTool(a, cwd),
29
54
  };
30
55
 
31
56
  /** Tool-protocol description injected into the system prompt. */
32
57
  export const TOOL_PROTOCOL = [
33
- "You have these tools (call exactly ONE per step):",
34
- "1. read {filePath, lineRange?} — read a file (lineRange: \"start-end\", \"start-\", or \"start\")",
58
+ "You have these tools (call exactly ONE per step, or batch multiple independent calls):",
59
+ "1. read {filePath, lineRange?, raw?} — read a file; lines are prefixed `LINEhh|` (hh = 2-char content anchor; the | is a separator, not file bytes)",
35
60
  "2. write {filePath, content} — create/overwrite a file",
36
- "3. edit {filePath, editBlock} — ≔A..B replace lines; ≔A+ insert after line A; ≔$ append EOF (payload on next line)",
37
- "4. bash {command, timeoutMs?} — run a shell command (tests, build, mkdir, ...); timeoutMs default 120000",
61
+ "3. edit {filePath, editBlock} — ≔A..B replace lines (append read anchors for safety: ≔12ab..15cd — rejected with fresh content if the lines changed); ≔A+ insert after line A; ≔$ append EOF (payload on next line). NEVER copy the `LINEhh|` prefixes into SEARCH blocks or payloads",
62
+ "4. bash {command, timeoutMs?, cwd?, env?} — run a shell command (cwd: subdir; env: extra vars)",
38
63
  "5. find {globPattern} — find files by name",
39
- "6. search {pattern, globPattern?} — grep for a pattern",
40
- "7. done {reason?} — call when the task is fully implemented AND verified",
64
+ "6. search {pattern, globPattern?, ignoreCase?, context?, maxMatches?} — grep (context: N lines around each match)",
65
+ "7. ls {dirPath} — list a directory's entries (dirs first)",
66
+ "8. web_search {query, recency?, limit?} — search the web (Anthropic-native: synthesized answer + sources + citations)",
67
+ "9. done {reason?} — call when the task is fully implemented AND verified",
68
+ "",
69
+ "Reply with STRICT JSON only — no code fences. You MAY include an optional leading",
70
+ '"reasoning" string (one short sentence on your plan) before "tool":',
71
+ '{ "reasoning": "<one short sentence>", "tool": "<name>", "arguments": { ... } }',
72
+ "",
73
+ "Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
74
+ '{ "reasoning": "<one short sentence>", "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
75
+ "Batch only independent calls; NEVER batch 'done', and NEVER put a mutating tool (write/edit/bash) after another mutating tool in one batch whose inputs depend on the earlier one.",
76
+ ].join("\n");
77
+
78
+ /** Restricted protocol for read-only subagent roles (planner/architect/critic):
79
+ * advertises only the non-mutating tools so the model does not waste steps
80
+ * calling write/edit/bash, which `subagentToolset` has physically removed. */
81
+ export const READONLY_TOOL_PROTOCOL = [
82
+ "You have these READ-ONLY tools (call exactly ONE per step, or batch multiple independent calls):",
83
+ "1. read {filePath, lineRange?} — read a file (lineRange: \"a-b\", \"a-\", \"a\", \"a+n\", or multi \"a-b,c-d\")",
84
+ "2. find {globPattern} — find files by name",
85
+ "3. search {pattern, globPattern?, ignoreCase?} — grep for a pattern",
86
+ "4. ls {dirPath} — list a directory's entries",
87
+ "5. web_search {query, recency?, limit?} — search the web (answer + sources + citations)",
88
+ "6. done {reason?} — call when your review/analysis is complete",
41
89
  "",
42
90
  "Reply with STRICT JSON only — no prose, no code fences:",
43
91
  '{ "tool": "<name>", "arguments": { ... } }',
92
+ "",
93
+ "Alternatively, you may batch up to 6 independent calls in a single turn using the following format:",
94
+ '{ "tools": [{ "tool": "<name>", "arguments": { ... } }, ...] }',
95
+ "Batch only independent calls; NEVER batch 'done'.",
96
+ ].join("\n");
97
+
98
+ /** gjc-inherited working discipline (plan/gjc-inheritance.md B3): the completion
99
+ * contract and tool-priority rules distilled from gjc's system prompt — compact
100
+ * (<300 tokens) per the pi-mono budget so the core prompt stays lean. */
101
+ export const WORKING_DISCIPLINE = [
102
+ "Working discipline:",
103
+ "- Correctness first, maintainability second, brevity third. Prefer boring, explicit code.",
104
+ "- Never present partial work as complete; never suppress tests or warnings to make code pass.",
105
+ "- Never fabricate tool results or test outcomes; verification claims must match what was actually run.",
106
+ "- Never ship stubs, placeholders, or TODO-only code as a delivered feature.",
107
+ "- Never substitute the requested problem with an easier adjacent one.",
108
+ "- Update directly affected callsites, tests, and docs — or state why they are unchanged.",
109
+ "- Reuse existing patterns; parallel conventions are prohibited. Fix problems at their source.",
110
+ "- You are not alone in the repository: treat unexpected changes as user work; never revert or delete them.",
111
+ "- Re-read before acting if a tool fails or a file may have changed.",
112
+ "- Prefer dedicated tools over shell pipelines: read (not cat), search (not grep), edit (not sed).",
44
113
  ].join("\n");
45
114
 
46
/**
 * Build the system prompt for an executor-style agent.
 *
 * The prompt is four sections separated by blank lines: a two-line role header,
 * the tool protocol, the shared `WORKING_DISCIPLINE` rules, and a closing
 * verification directive.
 *
 * @param role Role description interpolated into "You are the …".
 * @param protocol Tool-protocol text to advertise (defaults to `TOOL_PROTOCOL`).
 * @param verificationDirective Final instruction appended after the discipline rules.
 * @returns The assembled system prompt.
 */
export function executorSystemPrompt(
  role = "Executor Agent, a senior software developer",
  protocol: string = TOOL_PROTOCOL,
  verificationDirective = "Always verify (run tests / execute the program) before calling done.",
): string {
  const header = `You are the ${role}.\nAccomplish the user's request by calling tools and verifying your work.`;
  return `${header}\n\n${protocol}\n\n${WORKING_DISCIPLINE}\n\n${verificationDirective}`;
}
54
128
 
55
129
  export interface AgentLoopEvents {
56
- onStep?(step: number): void;
130
+ onStep?(step: number): void | Promise<void>;
57
131
  onAssistant?(raw: string, invocation: ToolInvocation | null): void;
58
132
  onToolResult?(tool: string, success: boolean, output: string): void;
59
- onError?(message: string): void;
133
+ /** Transient progress notice (e.g. "rate limited — retrying in Ns"); NOT a terminal error. */
134
+ onNotice?(message: string): void;
135
+ /** Cumulative token usage after each LLM call — drives live usage meters. */
136
+ onUsage?(usage: { inputTokens: number; outputTokens: number }): void;
137
+ /** Accumulated streamed model response so far — drives the live reasoning view. Only
138
+ * requested when a consumer sets it (the engine streams solely for the TUI). */
139
+ onModelStream?(textSoFar: string): void;
140
+ /** Step-budget change (gjc-style retry flow): the limit was extended because the
141
+ * turn is making progress. `limit` is the new max; `reason` is display-ready. */
142
+ onBudget?(limit: number, reason: string): void;
143
+ /** Consulted when a lone `done` arrives. Return a corrective message to bounce
144
+ * the done ONCE (e.g. "todo list still shows unfinished items — update it
145
+ * first"); return null to let the turn finish. The engine guarantees at most
146
+ * one bounce per turn, so a stubborn model can never loop here. */
147
+ onBeforeDone?(reason: string): string | null;
60
148
  }
61
149
 
62
150
  export interface AgentLoopOptions {
151
+ /** Optional system prompt: prepended to `history` when it has no system message. */
152
+ systemPrompt?: string;
153
+ /** Mid-turn context budget (estimated tokens). When the in-turn history grows
154
+ * past this, the OLDEST tool-result bodies are deterministically elided so a
155
+ * long turn cannot snowball into multi-million-token prompts. Default 80k. */
156
+ maxHistoryTokens?: number;
63
157
  cwd: string;
158
+ /** Base step budget (default 15). Non-finite or `<= 0` selects the DYNAMIC budget:
159
+ * the budget keeps extending while the recent tool window shows NOVEL progress,
160
+ * a stalled or cycling turn consolidates a final wrap-up, and a large finite
161
+ * safety cap (`DYNAMIC_HARD_CAP`, default 600) guarantees termination. */
64
162
  maxSteps?: number;
65
163
  model?: string;
66
164
  /** Max generation tokens per step (drives the thinking budget). */
@@ -68,6 +166,9 @@ export interface AgentLoopOptions {
68
166
  tools?: Record<string, ToolHandler>;
69
167
  signal?: AbortSignal;
70
168
  events?: AgentLoopEvents;
169
+ /** Step-budget overrides (gjc-style retry flow). `{ maxExtensions: 0 }` restores the
170
+ * legacy fixed counter — used by bounded subagent delegation. */
171
+ budget?: Partial<StepBudgetConfig>;
71
172
  }
72
173
 
73
174
  export interface AgentLoopResult {
@@ -78,28 +179,150 @@ export interface AgentLoopResult {
78
179
  usage?: { inputTokens: number; outputTokens: number };
79
180
  }
80
181
 
182
/**
 * Env-tunable output budget (plan/gjc-inheritance.md B10; inherits gjc's
 * settings-driven output handling): JEO_TOOL_OUTPUT_MAX caps the model-visible
 * tool result; the spill threshold tracks it so anything truncated stays
 * artifact-recoverable.
 *
 * @returns The configured cap truncated to an integer when it is a finite number
 *   in [500, 200000]; otherwise the default of 4000.
 */
function envOutputMax(): number {
  const parsed = Number(jeoEnv("TOOL_OUTPUT_MAX") ?? "");
  const withinBounds = Number.isFinite(parsed) && parsed >= 500 && parsed <= 200_000;
  if (!withinBounds) {
    // Unset, empty, non-numeric, or out-of-range values all fall back to the default.
    return 4_000;
  }
  return Math.trunc(parsed);
}
189
+ export const TOOL_OUTPUT_MAX = envOutputMax();
190
+
191
/**
 * Wall-clock budget for ONE agent turn, in milliseconds.
 *
 * JEO_TURN_MAX_MS overrides the default; 0 disables the budget. The default is
 * 30 minutes: long autonomous runs stay alive, while a turn that spins in
 * "thinking" (huge contexts, endless extensions) is guaranteed to terminate into
 * the consolidation wrap-up instead of running for hours.
 *
 * @param env Environment map to read the override from (defaults to `process.env`).
 * @returns The override truncated to an integer when it is a finite number >= 0;
 *   otherwise 30 minutes.
 */
export function turnMaxMs(env: Record<string, string | undefined> = process.env): number {
  const fallbackMs = 30 * 60 * 1000;
  const raw = jeoEnv("TURN_MAX_MS", env);
  if (raw === undefined || raw === "") return fallbackMs;
  const parsed = Number(raw);
  // Negative or non-numeric overrides are ignored rather than treated as "disabled".
  return Number.isFinite(parsed) && parsed >= 0 ? Math.trunc(parsed) : fallbackMs;
}
203
+
81
204
/**
 * Cap a tool result fed back to the model, keeping both ends: the head holds the
 * start (e.g. a file's top / a command's invocation) and the tail holds what's
 * usually decisive (test summaries, the final error). A pure head-cut loses that.
 *
 * Roughly 60% of the budget goes to the head and the rest to the tail, joined by
 * a marker line stating how many characters were dropped (so the returned string
 * is slightly longer than `max`).
 *
 * @param s Full tool output.
 * @param max Character budget for the kept content. NaN and negative values are
 *   treated as 0 and fractional values are floored, so the slice arithmetic can
 *   never keep the wrong span.
 * @returns `s` unchanged when it fits, otherwise head + marker + tail.
 */
export function truncateToolOutput(s: string, max = TOOL_OUTPUT_MAX): string {
  const cap = Number.isNaN(max) ? 0 : Math.max(0, Math.floor(max));
  if (s.length <= cap) return s;

  let headEnd = Math.floor(cap * 0.6);
  let tailStart = s.length - (cap - headEnd);

  // Never cut through a UTF-16 surrogate pair: a lone surrogate in the output is
  // garbage to the model. Drop the dangling half instead of keeping it.
  const isHighSurrogate = (code: number) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number) => code >= 0xdc00 && code <= 0xdfff;
  if (headEnd > 0 && isHighSurrogate(s.charCodeAt(headEnd - 1))) headEnd -= 1;
  if (tailStart < s.length && isLowSurrogate(s.charCodeAt(tailStart))) tailStart += 1;

  const omitted = tailStart - headEnd;
  return `${s.slice(0, headEnd)}\n…(${omitted} chars truncated)…\n${s.slice(tailStart)}`;
}
92
215
 
216
+ /** Tool output larger than this is spilled to a recoverable artifact file. Aligned
217
+ * with `truncateToolOutput`'s cap so that whenever the model-visible result drops
218
+ * content, the full output is recoverable via the artifact. */
219
+ export const TOOL_SPILL_THRESHOLD = TOOL_OUTPUT_MAX;
220
+
221
/** Most recent tool-result artifacts to keep; older ones are pruned on each spill. */
export const MAX_TOOL_ARTIFACTS = 50;

/**
 * Best-effort retention: keep the newest `MAX_TOOL_ARTIFACTS` entries in `dir`
 * and delete the rest. Every filesystem error (unreadable dir, failed stat,
 * failed delete) is swallowed on purpose — retention must never fail a spill.
 *
 * Entries are ranked by mtime, newest first; an entry whose stat fails ranks as
 * oldest. Equal mtimes are broken by filename, descending, so the outcome does
 * not depend on `readdir` order (artifact names start with a millisecond stamp).
 */
async function pruneToolArtifacts(dir: string): Promise<void> {
  const names = await fs.readdir(dir).catch(() => [] as string[]);
  if (names.length <= MAX_TOOL_ARTIFACTS) return;
  const entries = await Promise.all(
    names.map(async name => {
      const info = await fs.stat(path.join(dir, name)).catch(() => null);
      return { name, mtimeMs: info?.mtimeMs ?? 0 };
    }),
  );
  entries.sort((a, b) => {
    if (a.mtimeMs !== b.mtimeMs) return b.mtimeMs - a.mtimeMs; // newest first
    if (a.name === b.name) return 0;
    return a.name < b.name ? 1 : -1; // deterministic tie-break: larger name first
  });
  for (const { name } of entries.slice(MAX_TOOL_ARTIFACTS)) {
    await fs.rm(path.join(dir, name), { force: true }).catch(() => {});
  }
}

/**
 * Write an oversized tool result verbatim under `.jeo/artifacts/tool-results/` and
 * return the workspace-relative path (for the model to `read`). Best-effort: throws
 * are caught by the caller, which simply omits the artifact note.
 *
 * @param tool Tool name; sanitized to `[a-zA-Z0-9_-]`, capped at 32 chars, and used
 *   as the filename suffix ("tool" when nothing survives sanitizing).
 * @param output Full tool output to persist as UTF-8.
 * @param cwd Workspace root the artifact directory is created under.
 * @returns Path of the written artifact, relative to `cwd`.
 */
export async function spillToolResult(tool: string, output: string, cwd: string): Promise<string> {
  const dir = path.join(cwd, ".jeo", "artifacts", "tool-results");
  await fs.mkdir(dir, { recursive: true });
  const safeTool = tool.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 32) || "tool";
  // Timestamp + random suffix keeps names unique across same-millisecond spills.
  const stamp = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const rel = path.join(".jeo", "artifacts", "tool-results", `${stamp}-${safeTool}.txt`);
  await fs.writeFile(path.join(cwd, rel), output, "utf-8");
  // Retention so a long session can't grow the artifact dir without bound.
  await pruneToolArtifacts(dir);
  return rel;
}
253
+
254
/**
 * Levenshtein distance between two strings, compared by UTF-16 code unit.
 * Intended for small inputs such as tool/command names.
 *
 * Uses a single rolling row: before each outer iteration `row[j]` holds the
 * distance between the processed prefix of `a` and the first `j` units of `b`.
 *
 * @returns Minimum number of single-unit insertions, deletions and substitutions
 *   turning `a` into `b`.
 */
function editDistance(a: string, b: string): number {
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const row: number[] = [];
  for (let j = 0; j <= b.length; j++) row.push(j);

  for (let i = 0; i < a.length; i++) {
    // `diagonal` carries the previous row's value one column to the left.
    let diagonal = row[0];
    row[0] = i + 1;
    for (let j = 0; j < b.length; j++) {
      const above = row[j + 1];
      const substitution = diagonal + (a[i] === b[j] ? 0 : 1);
      row[j + 1] = Math.min(above + 1, row[j] + 1, substitution);
      diagonal = above;
    }
  }
  return row[b.length];
}
+
272
/**
 * Suggest the known tool name closest to an unknown one.
 *
 * Matching is case-insensitive on the trimmed input. An exact match wins
 * outright. Otherwise each candidate is scored: 1 when either name is a prefix
 * of the other, else the edit distance. The first candidate with the lowest
 * score is returned, provided that score is at most 2.
 *
 * @param name Tool name the model asked for.
 * @param known Registered tool names, returned in their original casing.
 * @returns The best suggestion, or `undefined` for blank input or no close match.
 */
export function nearestToolName(name: string, known: string[]): string | undefined {
  const want = name.trim().toLowerCase();
  if (want.length === 0) return undefined;

  const exact = known.find(candidate => candidate.toLowerCase() === want);
  if (exact !== undefined) return exact;

  let winner: string | undefined;
  let winnerScore = Infinity;
  for (const candidate of known) {
    const lowered = candidate.toLowerCase();
    const sharesPrefix = lowered.startsWith(want) || want.startsWith(lowered);
    const score = sharesPrefix ? 1 : editDistance(want, lowered);
    // Strict comparison keeps the earliest candidate on ties.
    if (score < winnerScore) {
      winnerScore = score;
      winner = candidate;
    }
  }
  return winnerScore <= 2 ? winner : undefined;
}
93
286
  /**
94
287
  * Drive `history` through the tool-call loop, mutating it in place so callers
95
288
  * (e.g. an interactive REPL) can keep the conversation across multiple turns.
96
289
  */
97
290
  export async function runAgentLoop(history: Message[], opts: AgentLoopOptions): Promise<AgentLoopResult> {
98
291
  const { cwd } = opts;
292
+ // Active-model gate for web_search's provider chain (gjc parity): the chain
293
+ // prefers the active model's native search backend, never credential-scanning.
294
+ setWebSearchActiveModel(opts.model);
295
+ // Honor an explicit system prompt for callers that build history without one.
296
+ if (opts.systemPrompt && history[0]?.role !== "system") {
297
+ history.unshift({ role: "system", content: opts.systemPrompt });
298
+ }
99
299
  const tools = opts.tools ?? DEFAULT_TOOLS;
100
300
  const maxSteps = opts.maxSteps ?? 15;
301
+ // gjc-style retry flow: the step limit is a flexible BUDGET, not a bare counter.
302
+ // While the recent window shows real progress the budget extends itself; a stalled
303
+ // turn fails fast into the consolidation wrap-up. An explicit positive maxSteps
304
+ // keeps the bounded flow (base + capped extensions); a non-finite / non-positive
305
+ // maxSteps selects the DYNAMIC budget — extensions keep flowing while NOVEL
306
+ // progress continues, a stalled/cycling window consolidates, and a large finite
307
+ // safety cap (default 600 steps) guarantees the turn always terminates.
308
+ const budget = new StepBudget(
309
+ Number.isFinite(maxSteps) && maxSteps > 0
310
+ ? resolveStepBudgetConfig(maxSteps, process.env, opts.budget)
311
+ : dynamicStepBudgetConfig(process.env, opts.budget),
312
+ );
313
+ // Why the loop stopped at the limit — folded into the consolidation message.
314
+ let budgetStopReason = "";
101
315
  const ev = opts.events ?? {};
316
+ const maxHistoryTokens = Math.max(10_000, opts.maxHistoryTokens ?? 80_000);
102
317
 
318
+ // Wall-clock turn budget — the definitive "never sits in thinking forever"
319
+ // guarantee. Step budgets bound the COUNT of model calls; this bounds their total
320
+ // TIME: a turn that crosses it stops at the next loop boundary and consolidates a
321
+ // wrap-up instead of spinning for hours under a generous dynamic step cap.
322
+ const turnStartedAt = Date.now();
323
+ const turnBudgetMs = turnMaxMs();
324
+ // "steps" | "time" — drives honest wording in the consolidation message.
325
+ let stopKind: "steps" | "time" = "steps";
103
326
  let step = 1;
104
327
  const acc = { inputTokens: 0, outputTokens: 0 };
105
328
  let sawUsage = false;
@@ -111,98 +334,625 @@ export async function runAgentLoop(history: Message[], opts: AgentLoopOptions):
111
334
  // calls (bad edits, failing commands) would otherwise burn the whole step budget.
112
335
  const MAX_FAILURES = 5;
113
336
  let consecutiveFailures = 0;
337
+ // done-verification guard (plan/gjc-inheritance.md B4, a lightweight port of gjc's ultragoal-guard):
338
+ // a turn that MUTATED files but shows no verification signal gets ONE pushback on
339
+ // `done` — run the relevant test/build, or call done again (the escape hatch for
340
+ // doc/config changes where verification is genuinely not applicable).
341
+ let sawMutation = false;
342
+ let sawVerification = false;
343
+ let donePushbackUsed = false;
344
+ // Caller-owned done gate (onBeforeDone) — also strictly once per turn.
345
+ let beforeDoneNudgeUsed = false;
346
+ // F1 (round 4): the run-command of the most recent post-turn hook FAILURE whose
347
+ // diagnostics the model saw but has not yet resolved (a later clean hook run
348
+ // clears it). The done guard treats this as "verification missing" — the hook
349
+ // exit code is the strongest correctness signal in the loop.
350
+ let pendingHookFailure: string | null = null;
351
+ // Round-6 #4: ONE reactive recovery when the PROVIDER reports context overflow
352
+ // (authoritative where the local estimate drifted — images, tokenizer mismatch).
353
+ let contextOverflowRetryUsed = false;
354
+ // Refusal recovery budget: a safety refusal (HTTP 200, no content) on routine
355
+ // coding work is usually a transient false-positive. Retry the SAME step once
356
+ // as-is, then once more after eliding tool results and adding a neutral continuation
357
+ // note, then once more with <project_context> stripped from the system prompt (when present);
358
+ // a refusal past that ladder surfaces the (friendly) error. Bounded per turn so a genuinely refused request can never burn billed calls in a loop.
359
+ const MAX_REFUSAL_RETRIES = 3;
360
+ let refusalRetries = 0;
361
+ const VERIFY_SIGNAL_RE = /\b(test|tests|tsc|typecheck|lint|build|check|spec|pytest|vitest|jest)\b/i;
114
362
  let lastSig = "";
115
363
  let repeatCount = 0;
116
- while (step <= maxSteps) {
364
+ // Cycle guard (the A↔B ping-pong the exact-repeat guard cannot see): the recent
365
+ // executed step signatures, as fixed-size digests. When a full window cycles
366
+ // through ≤2 distinct calls, bounce ONCE with an explicit correction; a spin that
367
+ // persists through the correction stops the turn.
368
+ const CYCLE_WINDOW = 6;
369
+ const recentStepSigs: string[] = [];
370
+ let cycleBounceUsed = false;
371
+ // Invalid-tool-call guard: a model that returns JSON without a usable `tool`
372
+ // field can't drive the loop at all — surface that clearly instead of looping.
373
+ let invalidToolCalls = 0;
374
+ // Prose-bounce guard: after this many invalid-JSON corrections, salvage the
375
+ // model's text as the final answer instead of burning the whole step budget.
376
+ const MAX_PARSE_BOUNCES = 2;
377
+ let parseFailures = 0;
378
+ while (true) {
379
+ if (turnBudgetMs > 0 && Date.now() - turnStartedAt > turnBudgetMs) {
380
+ stopKind = "time";
381
+ budgetStopReason = `turn wall-clock budget of ${Math.round(turnBudgetMs / 60_000)}m exceeded (JEO_TURN_MAX_MS) without done`;
382
+ break;
383
+ }
384
+ if (step > budget.limit()) {
385
+ const decision = budget.tryExtend();
386
+ if (!decision.extend) {
387
+ budgetStopReason = decision.reason;
388
+ break;
389
+ }
390
+ // One surface per sink: budget-aware consumers get onBudget; others the notice.
391
+ if (ev.onBudget) ev.onBudget(decision.limit, decision.reason);
392
+ else ev.onNotice?.(decision.reason);
393
+ }
117
394
  if (opts.signal?.aborted) {
118
395
  return finish({ done: false, steps: step - 1, doneReason: "Cancelled." });
119
396
  }
120
- ev.onStep?.(step);
397
+ await ev.onStep?.(step);
398
+
399
+ // MID-TURN context guard: a single long turn (60+ steps) otherwise grows the
400
+ // history without bound — turn-boundary compaction never runs inside a turn,
401
+ // and field evidence shows multi-million-token prompts degrading the model
402
+ // into repeat loops while cost compounds. Deterministically elide the OLDEST
403
+ // tool-result bodies once the estimate crosses the budget; recent evidence
404
+ // and all assistant/user content stay intact.
405
+ if (historyTokens(history) > maxHistoryTokens) {
406
+ const res = trimToolResultsInPlace(history, { budgetTokens: maxHistoryTokens });
407
+ if (res.trimmed > 0) {
408
+ ev.onNotice?.(`context guard: elided ${res.trimmed} older tool result(s) mid-turn (~${Math.round(res.tokens / 1000)}k tokens kept)`);
409
+ }
410
+ }
121
411
 
412
+ // Stream the response into the live reasoning view ONLY when a consumer is attached
413
+ // (a TUI). Non-interactive/test callers leave onModelStream unset → a single
414
+ // non-streaming call(), unchanged. The accumulated text is still parsed as one JSON
415
+ // tool call below, so streaming changes nothing about loop semantics.
416
+ let streamBuf = "";
417
+ const onToken = ev.onModelStream
418
+ ? (delta: string) => { streamBuf += delta; ev.onModelStream!(streamBuf); }
419
+ : undefined;
122
420
  let responseText: string;
123
421
  try {
124
- responseText = await callLlm(history, {
422
+ responseText = await invokeCallLlm(history, {
125
423
  jsonMode: true,
126
424
  model: opts.model,
127
425
  maxTokens: opts.maxTokens,
128
426
  signal: opts.signal,
129
427
  onUsage: u => { acc.inputTokens += u.inputTokens ?? 0; acc.outputTokens += u.outputTokens ?? 0; sawUsage = true; },
428
+ onToken,
429
+ // Make provider auto-retry visible: previously a rate-limited call sat in a
430
+ // silent backoff wait, then surfaced "auto-retry was exhausted" with no trace
431
+ // of the retries that DID happen.
432
+ onRetry: (attempt, err, delayMs) => {
433
+ const wait = Math.max(1, Math.round(delayMs / 1000));
434
+ const what = isRateLimitError(err) ? "rate limited (HTTP 429)" : "transient provider error";
435
+ ev.onNotice?.(`${what} — auto-retry #${attempt} in ${wait}s`);
436
+ },
130
437
  });
131
438
  } catch (err) {
132
- const message = (err as Error).message;
133
- ev.onError?.(message);
134
- // Surface the real cause so callers don't print a misleading "step limit" message.
439
+ // Reactive context recovery: trim older tool results in place and retry the
440
+ // SAME step once. The provider's overflow signal beats the local estimate;
441
+ // a second overflow (or nothing left to trim) surfaces the friendly error.
442
+ if (isContextOverflowError(err) && !contextOverflowRetryUsed) {
443
+ contextOverflowRetryUsed = true;
444
+ // keepRecent 2 (vs the proactive guard's 8): the provider already REJECTED
445
+ // this prompt — freeing real space beats keeping evidence that can be re-run.
446
+ const res = trimToolResultsInPlace(history, { budgetTokens: Math.max(1, Math.floor(maxHistoryTokens / 2)), keepRecent: 2 });
447
+ if (res.trimmed > 0) {
448
+ ev.onNotice?.(`provider reported context overflow — elided ${res.trimmed} older tool result(s), retrying once`);
449
+ continue; // free retry: the step counter is unchanged
450
+ }
451
+ }
452
+ // Reactive refusal recovery (the "stop_reason=refusal" dead turn). Anthropic's
453
+ // contract: a refusal means the streaming classifier tripped on the CURRENT
454
+ // conversation content, and the context must be RESET before continuing —
455
+ // resending the same prompt keeps refusing deterministically. Ladder:
456
+ // 1) plain resend — covers a transient classifier flake (the OAuth payload
457
+ // also rotates its per-request user id, which alone can clear a trip);
458
+ // 2) classifier reset — elide tool-result bodies (the usual trigger is
459
+ // freshly-read file/search content, not the task itself) and append a
460
+ // NEUTRAL continuation note. The note deliberately never mentions the
461
+ // safety layer: arguing with the filter reads as a jailbreak attempt
462
+ // and escalates instead of recovering.
463
+ // 3) guidance strip — with tool results already gone, the remaining
464
+ // classifier-trigger candidate is the repo-authored prose injected
465
+ // into the SYSTEM prompt (<project_context> — AGENTS.md / rules can
466
+ // contain text that trips content filters even though the task is
467
+ // routine). Strip that block for the rest of the turn and retry once;
468
+ // core instructions stay intact. Field case: `$gjc init` inside a
469
+ // repo whose guidance files refuse-trip the OAuth classifier.
470
+ if (isRefusalError(err) && refusalRetries < MAX_REFUSAL_RETRIES) {
471
+ refusalRetries++;
472
+ if (refusalRetries === 1) {
473
+ ev.onNotice?.("provider refused the last call (no content) — retrying the same step");
474
+ continue; // free resend: the step counter is unchanged
475
+ }
476
+ if (refusalRetries === 2) {
477
+ const res = trimToolResultsInPlace(history, { budgetTokens: 0, keepRecent: 0 });
478
+ ev.onNotice?.(
479
+ res.trimmed > 0
480
+ ? `provider refused again — reset ${res.trimmed} tool result(s) from the context and retrying (refusals require a context reset)`
481
+ : "provider refused again — continuing with a fresh instruction",
482
+ );
483
+ history.push({
484
+ role: "user",
485
+ content:
486
+ "(continuation) The previous response returned no content and older tool outputs were elided from this conversation. " +
487
+ "Re-assess the task from the remaining context and reply with exactly one JSON tool call " +
488
+ '{"tool":"<name>","arguments":{...}} — re-run any tool whose output you still need, ' +
489
+ 'or send {"tool":"done","arguments":{"reason":"<summary>"}} if the task is finished.',
490
+ });
491
+ step++;
492
+ continue;
493
+ }
494
+ const sys = history[0];
495
+ if (sys?.role === "system" && sys.content.includes("<project_context>")) {
496
+ const stripped = sys.content.replace(/\n*<project_context>[\s\S]*?<\/project_context>/, "").trimEnd();
497
+ history[0] = { ...sys, content: stripped }; // replace, never mutate (identity caches)
498
+ ev.onNotice?.("provider refused a third time — removed project-context guidance from the system prompt and retrying once more");
499
+ continue; // same step, reduced system prompt
500
+ }
501
+ // Nothing left to strip — fall through to the friendly terminal error
502
+ // instead of burning an identical billed call.
503
+ }
504
+ const message = friendlyProviderError(err);
505
+ // The error IS the turn's doneReason and every caller displays that — emitting a
506
+ // separate error event here printed the same message twice (live stream + reply).
135
507
  return finish({ done: false, steps: step, doneReason: `Error: ${message}` });
136
508
  }
509
+ if (sawUsage) ev.onUsage?.({ ...acc });
137
510
 
138
- let invocation: ToolInvocation;
511
+ let invocation: any;
139
512
  try {
140
- invocation = extractJsonObject<ToolInvocation>(responseText);
513
+ invocation = extractJsonObject<any>(responseText);
141
514
  } catch (err) {
142
- // Not valid tool-call JSON — show the model the error and let it retry.
143
515
  ev.onAssistant?.(responseText, null);
516
+ // Prose salvage: a reply with no JSON object at all is a chat-style final
517
+ // answer, not a malformed tool call. Bouncing it back only made the model
518
+ // apologize for the format — and that apology surfaced as the visible reply.
519
+ // Same salvage after repeated bounces: the text we have IS the best answer.
520
+ const trimmed = responseText.trim();
521
+ parseFailures++;
522
+ if (trimmed && (!trimmed.includes("{") || parseFailures > MAX_PARSE_BOUNCES)) {
523
+ history.push({ role: "assistant", content: responseText });
524
+ return finish({ done: true, steps: step, doneReason: trimmed });
525
+ }
144
526
  history.push({ role: "assistant", content: responseText });
145
527
  history.push({
146
528
  role: "user",
147
529
  content:
148
530
  `Your last reply was not a valid tool call (${(err as Error).message}). ` +
149
- `Reply with exactly one JSON object: {"tool":"<name>","arguments":{...}}.`,
531
+ `Do NOT apologize or explain the formatting mistake. If that reply was your final answer, ` +
532
+ `resend it as {"tool":"done","arguments":{"reason":"<that answer, verbatim>"}}; ` +
533
+ `otherwise reply with exactly one JSON tool call: {"tool":"<name>","arguments":{...}}.`,
534
+ });
535
+ step++;
536
+ continue;
537
+ }
538
+ // A successfully parsed reply ends any bounce streak: MAX_PARSE_BOUNCES is a
539
+ // CONSECUTIVE-failure salvage, not a cumulative one — without this reset a long
540
+ // turn accumulated scattered parse slips and prematurely salvaged mid-task prose.
541
+ parseFailures = 0;
542
+
543
+ // Normalize to an invocation list
544
+ let toolCalls: { tool: string; arguments?: Record<string, any> }[] = [];
545
+ if (invocation && typeof invocation === "object") {
546
+ if (Array.isArray(invocation.tools)) {
547
+ const isValidBatch = invocation.tools.length > 0 && invocation.tools.every(
548
+ (t: any) => t && typeof t === "object" && typeof t.tool === "string" && t.tool.trim().length > 0
549
+ );
550
+ if (isValidBatch) {
551
+ toolCalls = invocation.tools.map((t: any) => ({
552
+ tool: t.tool.trim(),
553
+ arguments: t.arguments
554
+ }));
555
+ }
556
+ } else if (typeof invocation.tool === "string" && invocation.tool.trim().length > 0) {
557
+ toolCalls = [{
558
+ tool: invocation.tool.trim(),
559
+ arguments: invocation.arguments
560
+ }];
561
+ }
562
+ }
563
+
564
+ if (toolCalls.length === 0) {
565
+ invalidToolCalls++;
566
+ if (invalidToolCalls >= MAX_REPEAT) {
567
+ return finish({
568
+ done: false,
569
+ steps: step,
570
+ doneReason: `Stopped: the model returned no valid tool call ${MAX_REPEAT}× (a JSON reply with no valid "tool" or "tools" field). The selected model may be too small to follow the JSON tool protocol — switch to a stronger model with /model.`,
571
+ });
572
+ }
573
+ history.push({ role: "assistant", content: responseText });
574
+ history.push({
575
+ role: "user",
576
+ content: `Your last reply had no "tool" or "tools" field. Reply with exactly one JSON object, e.g. {"tool":"find","arguments":{"globPattern":"src/**"}} or {"tools":[{"tool":"read","arguments":{"filePath":"src/main.ts"}}, ...]}.`,
150
577
  });
151
578
  step++;
152
579
  continue;
153
580
  }
581
+ invalidToolCalls = 0;
154
582
 
155
- ev.onAssistant?.(responseText, invocation);
583
+ if (toolCalls.length > 6) {
584
+ ev.onNotice?.(`Too many tool calls in batch (${toolCalls.length}); capping at 6 and dropping the rest.`);
585
+ toolCalls = toolCalls.slice(0, 6);
586
+ }
587
+
588
+ ev.onAssistant?.(responseText, toolCalls[0]);
156
589
 
157
- if (invocation.tool === "done") {
158
- return finish({ done: true, steps: step, doneReason: (invocation.arguments?.reason as string) ?? "" });
590
+ if (toolCalls.length === 1 && toolCalls[0].tool === "done") {
591
+ if (sawMutation && (!sawVerification || pendingHookFailure !== null) && !donePushbackUsed) {
592
+ donePushbackUsed = true; // second done always passes — escape hatch
593
+ history.push({ role: "assistant", content: responseText });
594
+ history.push({
595
+ role: "user",
596
+ content: pendingHookFailure !== null
597
+ ? `Your latest mutation left the post-turn hook "${pendingHookFailure}" FAILING (non-zero exit) — its diagnostics were shown in the tool result above. ` +
598
+ "Fix the reported problems (the hook re-runs on your next mutation), then call done. " +
599
+ "If the hook failure is a false positive, call done again and say why in the reason."
600
+ : "You modified files this turn but ran NO verification (no test/build/typecheck command succeeded). " +
601
+ "Run the narrowest command that proves your change works, then call done. " +
602
+ "If verification is genuinely not applicable (docs/config-only change), call done again and say why in the reason.",
603
+ });
604
+ step++;
605
+ continue;
606
+ }
607
+ // Caller-owned done gate (e.g. stale-todo reconciliation): ONE bounded
608
+ // bounce, then any later done passes — field case: a 28-step turn ended
609
+ // [DONE] with the Todos checklist still showing 1 in-progress + 4 pending
610
+ // because nothing ever forced a status update.
611
+ if (!beforeDoneNudgeUsed && ev.onBeforeDone) {
612
+ const nudge = ev.onBeforeDone((toolCalls[0].arguments?.reason as string) ?? "");
613
+ if (nudge) {
614
+ beforeDoneNudgeUsed = true;
615
+ history.push({ role: "assistant", content: responseText });
616
+ history.push({ role: "user", content: nudge });
617
+ ev.onNotice?.("done deferred once — final plan reconciliation requested");
618
+ step++;
619
+ continue;
620
+ }
621
+ }
622
+ return finish({ done: true, steps: step, doneReason: (toolCalls[0].arguments?.reason as string) ?? "" });
159
623
  }
160
624
 
161
- // Detect repeated identical tool calls (no forward progress).
162
- const sig = `${invocation.tool}:${JSON.stringify(invocation.arguments ?? {})}`;
625
+ // Anti-spin guard, checked BEFORE execution: a repeated identical step must
626
+ // not run its calls again — a repeated mutating bash/edit must not execute
627
+ // a third time merely to be detected.
628
+ // - 2nd identical step → ONE corrective bounce (skip execution, tell the
629
+ // model its previous identical call already ran and to either act
630
+ // differently or call done). Field evidence: long turns died here right
631
+ // after a SUCCESSFUL write because nothing ever told the model to stop
632
+ // repeating — a recovery prompt resolves that without killing the turn.
633
+ // - 3rd identical step (repeated through the explicit correction) → stop.
634
+ const callSigs = toolCalls.map(c => `${c.tool}:${JSON.stringify(c.arguments ?? {})}`);
635
+ // Fixed-size digest of the whole step — `write` signatures embed entire file
636
+ // bodies, so the repeat/cycle guards compare digests, not megabyte strings.
637
+ const sig = hashSignature(callSigs.join(" | "));
163
638
  if (sig === lastSig) repeatCount++;
164
639
  else {
165
640
  repeatCount = 1;
166
641
  lastSig = sig;
167
642
  }
643
+ if (repeatCount === 2) {
644
+ const what = toolCalls.length === 1 ? `'${toolCalls[0].tool}' call` : "tool batch";
645
+ history.push({ role: "assistant", content: responseText });
646
+ history.push({
647
+ role: "user",
648
+ content:
649
+ `You just repeated the EXACT same ${what} you already ran in the previous step — it was not re-executed. ` +
650
+ `Its result has not changed. If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
651
+ `otherwise take a DIFFERENT next action (verify the result, move to the next file, or fix something new).`,
652
+ });
653
+ ev.onNotice?.(`repeated ${what} skipped — asked the model to act differently or call done`);
654
+ step++;
655
+ continue;
656
+ }
168
657
  if (repeatCount >= MAX_REPEAT) {
658
+ const what = toolCalls.length === 1 ? `the same '${toolCalls[0].tool}' call` : "the same tool calls";
169
659
  return finish({
170
660
  done: false,
171
661
  steps: step,
172
- doneReason: `Stopped: repeated the same '${invocation.tool}' call ${MAX_REPEAT}× with no new progress (the model never signaled done).`,
662
+ doneReason: `Stopped: repeated ${what} ${MAX_REPEAT}× even after an explicit correction (the model never signaled done).`,
173
663
  });
174
664
  }
175
665
 
176
- const handler = tools[invocation.tool];
177
- let success: boolean;
178
- let output: string;
179
- if (!handler) {
180
- success = false;
181
- output = `Unknown tool: ${invocation.tool}. Available: ${Object.keys(tools).join(", ")}, done.`;
182
- } else {
183
- const res = await handler(invocation.arguments ?? {}, cwd);
184
- success = res.success;
185
- output = res.success ? res.output : (res.error ? (res.output ? `${res.error}\n${res.output}` : res.error) : res.output);
666
+ // Cycle guard: an A↔B (or A↔B↔C-minus-one) alternation never trips the
667
+ // exact-repeat guard above — each step differs from its immediate predecessor —
668
+ // yet it is the same spin (field case: re-reading one file and re-running one
669
+ // command forever, "thinking" never ends). Detect a full recent window that
670
+ // cycles through ≤2 distinct step signatures: ONE corrective bounce (skip
671
+ // execution — a repeated mutating call must not run again merely to be
672
+ // detected), then stop if the spin survives the explicit correction.
673
+ recentStepSigs.push(sig);
674
+ if (recentStepSigs.length > CYCLE_WINDOW) recentStepSigs.shift();
675
+ if (recentStepSigs.length === CYCLE_WINDOW && new Set(recentStepSigs).size <= 2) {
676
+ if (!cycleBounceUsed) {
677
+ cycleBounceUsed = true;
678
+ recentStepSigs.length = 0; // fresh window: the correction earns a real retry
679
+ history.push({ role: "assistant", content: responseText });
680
+ history.push({
681
+ role: "user",
682
+ content:
683
+ `You are cycling through the same ${new Set(callSigs).size <= 1 ? "tool call" : "tool calls"} you already ran in recent steps — this call was NOT re-executed and its result has not changed. ` +
684
+ `If the task is complete, reply {"tool":"done","arguments":{"reason":"<summary of what was accomplished>"}}; ` +
685
+ `otherwise take a genuinely DIFFERENT next action (a new file, a new command, or a fix you have not tried).`,
686
+ });
687
+ ev.onNotice?.("tool-call cycle detected — skipped execution and asked the model to act differently or call done");
688
+ step++;
689
+ continue;
690
+ }
691
+ return finish({
692
+ done: false,
693
+ steps: step,
694
+ doneReason: `Stopped: the model cycled through the same tool calls for ${CYCLE_WINDOW} consecutive steps even after an explicit correction (it never signaled done).`,
695
+ });
186
696
  }
187
697
 
188
- ev.onToolResult?.(invocation.tool, success, output);
189
- history.push({ role: "assistant", content: responseText });
190
- history.push({
191
- role: "user",
192
- content: `Tool [${invocation.tool}] result (${success ? "ok" : "fail"}):\n${truncateToolOutput(output)}`,
193
- });
698
+ // Helper to execute a single tool call
699
+ const executeTool = async (call: { tool: string; arguments?: Record<string, any> }) => {
700
+ const { tool, arguments: args } = call;
701
+ let success: boolean;
702
+ let output: string;
194
703
 
195
- if (success) {
704
+ if (tool === "done") {
705
+ success = false;
706
+ output = "Error: 'done' can only be called as the single tool invocation, not in a batch. Please send 'done' alone.";
707
+ } else {
708
+ const handler = tools[tool];
709
+ if (!handler) {
710
+ success = false;
711
+ const suggestion = nearestToolName(tool, Object.keys(tools));
712
+ const hint = suggestion ? ` Did you mean "${suggestion}"?` : "";
713
+ output = `Unknown tool: ${tool}.${hint} Available: ${Object.keys(tools).join(", ")}, done.`;
714
+ } else {
715
+ const preHookResult = await runPreToolHooks(
716
+ cwd,
717
+ tool,
718
+ args ?? {},
719
+ opts.signal,
720
+ ev.onNotice
721
+ );
722
+ if (preHookResult.vetoed) {
723
+ success = false;
724
+ output = preHookResult.error + (preHookResult.output ? `\n${preHookResult.output}` : "");
725
+ } else {
726
+ try {
727
+ const res = await handler(args ?? {}, cwd);
728
+ success = res.success;
729
+ output = res.success ? res.output : (res.error ? (res.output ? `${res.error}\n${res.output}` : res.error) : res.output);
730
+ } catch (err: any) {
731
+ success = false;
732
+ output = err?.message || String(err);
733
+ }
734
+ }
735
+ }
736
+ }
737
+ return { success, output };
738
+ };
739
+
740
+ const READONLY_TOOLS = new Set(["read", "find", "search", "ls", "web_search"]);
741
+ const WRITE_TOOLS = new Set(["write", "edit"]);
742
+ // Batch grouping → concurrency plan (plan/gjc-inheritance.md cycle 12):
743
+ // read group — consecutive read-only calls run in parallel (safe).
744
+ // write group — consecutive write/edit calls to DISTINCT files run in
745
+ // parallel; a same-file (or path-less) collision opens a
746
+ // sequential boundary so ordered edits to one file stay ordered.
747
+ // exclusive group — bash (and anything else) always runs alone, in order.
748
+ // Reads and writes never share a group, so a read can never race a write.
749
+ type ToolGroup = {
750
+ kind: "read" | "write" | "exclusive";
751
+ calls: { tool: string; arguments?: Record<string, any>; index: number }[];
752
+ files?: Set<string>;
753
+ };
754
+ const groups: ToolGroup[] = [];
755
+ // Dedup key = RESOLVED, case-folded path (F3): `./x.ts` vs `x.ts` vs
756
+ // `src/../x.ts` — and case variants on the (default case-insensitive) macOS
757
+ // FS — must collapse to ONE key, or two spellings of the same file run in
758
+ // parallel and the second write silently clobbers the first. Folding case on
759
+ // a case-sensitive FS merely serializes two genuinely-distinct files — safe.
760
+ const targetFile = (call: { arguments?: Record<string, any> }): string | null => {
761
+ const p = call.arguments?.filePath ?? call.arguments?.path;
762
+ return typeof p === "string" && p.trim() !== "" ? path.resolve(cwd, p).toLowerCase() : null;
763
+ };
764
+ for (let i = 0; i < toolCalls.length; i++) {
765
+ const entry = { ...toolCalls[i], index: i };
766
+ const last = groups[groups.length - 1];
767
+ if (READONLY_TOOLS.has(entry.tool)) {
768
+ if (last && last.kind === "read") last.calls.push(entry);
769
+ else groups.push({ kind: "read", calls: [entry] });
770
+ } else if (WRITE_TOOLS.has(entry.tool)) {
771
+ const file = targetFile(entry);
772
+ if (last && last.kind === "write" && file !== null && !last.files!.has(file)) {
773
+ last.calls.push(entry);
774
+ last.files!.add(file);
775
+ } else {
776
+ groups.push({ kind: "write", calls: [entry], files: new Set(file !== null ? [file] : []) });
777
+ }
778
+ } else {
779
+ groups.push({ kind: "exclusive", calls: [entry] });
780
+ }
781
+ }
782
+
783
+ const results: { success: boolean; output: string; executed: boolean }[] = Array.from(
784
+ { length: toolCalls.length },
785
+ () => ({ success: false, output: "", executed: false })
786
+ );
787
+
788
+ let aborted = false;
789
+ for (const group of groups) {
790
+ if (opts.signal?.aborted) {
791
+ aborted = true;
792
+ break;
793
+ }
794
+ if (group.calls.length > 1) {
795
+ // read OR distinct-file write group → run concurrently.
796
+ await Promise.all(group.calls.map(async (call) => {
797
+ const res = await executeTool(call);
798
+ results[call.index] = { ...res, executed: true };
799
+ }));
800
+ } else {
801
+ const call = group.calls[0];
802
+ const res = await executeTool(call);
803
+ results[call.index] = { ...res, executed: true };
804
+ }
805
+ }
806
+
807
+ const processAndPushResults = async (indices: number[]) => {
808
+ const resultBlocks: string[] = [];
809
+ // Per-batch dedup of post-turn hook diagnostics: a whole-project `tsc` hook
810
+ // matching every edit in a batch yields identical output N times — show it
811
+ // once, cross-reference the rest (cycle 13).
812
+ const seenHookFeedback = new Set<string>();
813
+ for (const idx of indices) {
814
+ const call = toolCalls[idx];
815
+ const res = results[idx];
816
+
817
+ ev.onToolResult?.(call.tool, res.success, res.output);
818
+
819
+ const minimized = minimizeToolOutput(res.output, call.tool);
820
+ const visible = minimized.text;
821
+ let resultBody = truncateToolOutput(visible);
822
+ if (res.output.length > TOOL_SPILL_THRESHOLD) {
823
+ const artifact = await spillToolResult(call.tool, res.output, cwd).catch(() => null);
824
+ if (artifact) {
825
+ resultBody += `\n[full output (${res.output.length} chars) saved to ${artifact} — read it for the elided middle]`;
826
+ }
827
+ }
828
+
829
+ const { diags: hookDiags, ran: hooksRan } = await runPostTurnHooks(
830
+ cwd,
831
+ call.tool,
832
+ call.arguments ?? {},
833
+ res.success,
834
+ res.output,
835
+ opts.signal,
836
+ ev.onNotice
837
+ );
838
+ // F1: a red hook becomes a pending failure the done guard enforces; a
839
+ // later hook run that completes CLEAN (ran > 0, zero diags) clears it.
840
+ if (hookDiags.length > 0) pendingHookFailure = hookDiags[hookDiags.length - 1].run;
841
+ else if (hooksRan > 0) pendingHookFailure = null;
842
+
843
+ // Append non-zero-exit hook diagnostics to THIS tool's result block so the
844
+ // model can self-correct. The tool's own ok/fail is unchanged (guard).
845
+ let resultBlock = `Tool [${call.tool}] result (${res.success ? "ok" : "fail"}):\n${resultBody}`;
846
+ for (const d of hookDiags) {
847
+ const key = `${d.run}\u0000${d.output}`;
848
+ if (seenHookFeedback.has(key)) {
849
+ resultBlock += `\n[post-turn hook "${d.run}" — exit ${d.exitCode}: same diagnostics as above]`;
850
+ } else {
851
+ seenHookFeedback.add(key);
852
+ resultBlock += `\n[post-turn hook "${d.run}" — exit ${d.exitCode}]:\n${truncateToolOutput(d.output)}`;
853
+ }
854
+ }
855
+ resultBlocks.push(resultBlock);
856
+ }
857
+
858
+ history.push({ role: "assistant", content: responseText });
859
+ history.push({
860
+ role: "user",
861
+ content: resultBlocks.join("\n\n"),
862
+ });
863
+ };
864
+
865
+ if (aborted) {
866
+ const executedIndices = results.map((r, i) => r.executed ? i : -1).filter(i => i !== -1);
867
+ if (executedIndices.length > 0) {
868
+ await processAndPushResults(executedIndices);
869
+ }
870
+ return finish({ done: false, steps: step, doneReason: "Cancelled." });
871
+ }
872
+
873
+ const allIndices = toolCalls.map((_, i) => i);
874
+ await processAndPushResults(allIndices);
875
+
876
+ // Score the budget window per CALL, not per batch: a batch of five failing
877
+ // edits plus one trivial successful read must not look like a progressing
878
+ // step to the extension heuristic (that loophole earned endless extensions).
879
+ for (let i = 0; i < toolCalls.length; i++) {
880
+ if (results[i].executed) budget.record(callSigs[i], results[i].success);
881
+ }
882
+ // done-verification guard bookkeeping: write/edit successes mark the turn as
883
+ // mutating; a successful bash whose command/output looks like a test/build run
884
+ // counts as verification. A verification AFTER the last mutation is what the
885
+ // done guard wants, but order-insensitive tracking keeps it one-pushback simple.
886
+ for (let i = 0; i < toolCalls.length; i++) {
887
+ if (!results[i].executed || !results[i].success) continue;
888
+ const t = toolCalls[i].tool;
889
+ if (t === "write" || t === "edit") sawMutation = true;
890
+ else if (t === "bash") {
891
+ const cmd = String(toolCalls[i].arguments?.command ?? "");
892
+ if (VERIFY_SIGNAL_RE.test(cmd) || VERIFY_SIGNAL_RE.test(results[i].output.slice(0, 2000))) sawVerification = true;
893
+ }
894
+ }
895
+ // F6 (round 4 architect, Low): judge the step by its NON-TRIVIAL calls — a
896
+ // batch of read(ok)+edit(fail) repeated with varying targets previously
897
+ // never tripped MAX_FAILURES because the trivial read reset the streak.
898
+ // Read-only-only steps keep the old any-success rule.
899
+ const nonTrivial = toolCalls
900
+ .map((c, i) => ({ tool: c.tool, r: results[i] }))
901
+ .filter(x => !READONLY_TOOLS.has(x.tool) && x.r.executed);
902
+ const stepSuccess = nonTrivial.length > 0
903
+ ? nonTrivial.some(x => x.r.success)
904
+ : results.some(r => r.success);
905
+
906
+ if (stepSuccess) {
196
907
  consecutiveFailures = 0;
197
908
  } else if (++consecutiveFailures >= MAX_FAILURES) {
909
+ const isSingle = toolCalls.length === 1;
910
+ const stopMsg = isSingle
911
+ ? `Stopped: ${MAX_FAILURES} consecutive failing tool calls (last '${toolCalls[0].tool}'); the model could not recover.`
912
+ : `Stopped: ${MAX_FAILURES} consecutive failing tool steps; the model could not recover.`;
198
913
  return finish({
199
914
  done: false,
200
915
  steps: step,
201
- doneReason: `Stopped: ${MAX_FAILURES} consecutive failing tool calls (last '${invocation.tool}'); the model could not recover.`,
916
+ doneReason: stopMsg,
202
917
  });
203
918
  }
204
919
  step++;
205
920
  }
206
921
 
207
- return finish({ done: false, steps: maxSteps });
922
+ // Budget exhausted without `done` (step limit declined a further extension, or
923
+ // the turn wall-clock budget fired). Instead of dying with a bare limit error,
924
+ // dynamically CONSOLIDATE: one final no-tools model call summarizes what was
925
+ // accomplished, key findings, and what remains — a useful wrap-up, not a failure.
926
+ const extInfo = budget.extensionsUsed() > 0 ? ` after ${budget.extensionsUsed()} extension(s)` : "";
927
+ const stopInfo = budgetStopReason ? `; ${budgetStopReason}` : "";
928
+ const budgetLabel = stopKind === "time"
929
+ ? `turn time budget of ${Math.round(turnBudgetMs / 60_000)}m reached`
930
+ : `step budget of ${budget.limit()} reached`;
931
+ try {
932
+ if (!opts.signal?.aborted) {
933
+ const wrapUp = await invokeCallLlm(
934
+ [
935
+ ...history,
936
+ {
937
+ role: "user",
938
+ content:
939
+ "The budget for this turn is exhausted. Do NOT call any tool. " +
940
+ "Reply with plain prose (no JSON): consolidate what you accomplished this turn, " +
941
+ "the key findings/changes so far, and what remains to be done next.",
942
+ },
943
+ ],
944
+ { jsonMode: false, model: opts.model, maxTokens: opts.maxTokens, signal: opts.signal },
945
+ );
946
+ const consolidated = wrapUp.trim();
947
+ if (consolidated) {
948
+ history.push({ role: "assistant", content: consolidated });
949
+ return finish({
950
+ done: false,
951
+ steps: budget.limit(),
952
+ doneReason: `${consolidated}\n\n(${budgetLabel}${extInfo}${stopInfo} — consolidated wrap-up above; continue with a follow-up request)`,
953
+ });
954
+ }
955
+ }
956
+ } catch { /* wrap-up is best-effort; fall through to the plain budget message */ }
957
+ return finish({ done: false, steps: stopKind === "time" ? step : budget.limit(), doneReason: budgetStopReason ? `(${budgetLabel}${extInfo} — ${budgetStopReason})` : undefined });
208
958
  }