@pentoshi/clai 0.13.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/bin/clai.mjs +25 -0
  2. package/dist/agent/runner.d.ts +31 -1
  3. package/dist/agent/runner.js +416 -28
  4. package/dist/agent/runner.js.map +1 -1
  5. package/dist/commands/update.js +1 -1
  6. package/dist/commands/update.js.map +1 -1
  7. package/dist/llm/anthropic.js +31 -12
  8. package/dist/llm/anthropic.js.map +1 -1
  9. package/dist/llm/capabilities.d.ts +13 -0
  10. package/dist/llm/capabilities.js +107 -24
  11. package/dist/llm/capabilities.js.map +1 -1
  12. package/dist/llm/gemini.js +17 -4
  13. package/dist/llm/gemini.js.map +1 -1
  14. package/dist/llm/http.d.ts +12 -1
  15. package/dist/llm/http.js +50 -25
  16. package/dist/llm/http.js.map +1 -1
  17. package/dist/llm/ollama.js +16 -8
  18. package/dist/llm/ollama.js.map +1 -1
  19. package/dist/modes/agent.d.ts +2 -1
  20. package/dist/modes/agent.js.map +1 -1
  21. package/dist/modes/ask.d.ts +2 -1
  22. package/dist/modes/ask.js +5 -1
  23. package/dist/modes/ask.js.map +1 -1
  24. package/dist/os/cwd.d.ts +30 -0
  25. package/dist/os/cwd.js +76 -0
  26. package/dist/os/cwd.js.map +1 -0
  27. package/dist/os/detect.js +2 -1
  28. package/dist/os/detect.js.map +1 -1
  29. package/dist/prompts/index.d.ts +1 -1
  30. package/dist/prompts/index.js +66 -21
  31. package/dist/prompts/index.js.map +1 -1
  32. package/dist/repl.d.ts +10 -0
  33. package/dist/repl.js +258 -28
  34. package/dist/repl.js.map +1 -1
  35. package/dist/safety/classifier.js +121 -26
  36. package/dist/safety/classifier.js.map +1 -1
  37. package/dist/safety/patterns.d.ts +26 -0
  38. package/dist/safety/patterns.js +167 -0
  39. package/dist/safety/patterns.js.map +1 -1
  40. package/dist/store/config.js +2 -1
  41. package/dist/store/config.js.map +1 -1
  42. package/dist/store/history.js +19 -5
  43. package/dist/store/history.js.map +1 -1
  44. package/dist/store/plan.d.ts +43 -0
  45. package/dist/store/plan.js +201 -0
  46. package/dist/store/plan.js.map +1 -0
  47. package/dist/store/project.js +3 -2
  48. package/dist/store/project.js.map +1 -1
  49. package/dist/tools/capabilities.js +6 -1
  50. package/dist/tools/capabilities.js.map +1 -1
  51. package/dist/tools/fs.js +3 -2
  52. package/dist/tools/fs.js.map +1 -1
  53. package/dist/tools/image.d.ts +13 -0
  54. package/dist/tools/image.js +81 -0
  55. package/dist/tools/image.js.map +1 -0
  56. package/dist/tools/jobs.js +2 -1
  57. package/dist/tools/jobs.js.map +1 -1
  58. package/dist/tools/pdf.d.ts +18 -0
  59. package/dist/tools/pdf.js +200 -0
  60. package/dist/tools/pdf.js.map +1 -0
  61. package/dist/tools/registry.js +79 -7
  62. package/dist/tools/registry.js.map +1 -1
  63. package/dist/tools/shell.js +3 -2
  64. package/dist/tools/shell.js.map +1 -1
  65. package/dist/types.d.ts +16 -0
  66. package/dist/ui/keys.d.ts +1 -0
  67. package/dist/ui/keys.js +4 -0
  68. package/dist/ui/keys.js.map +1 -1
  69. package/dist/ui/mentions.d.ts +32 -1
  70. package/dist/ui/mentions.js +304 -27
  71. package/dist/ui/mentions.js.map +1 -1
  72. package/dist/ui/plan-pane.d.ts +19 -0
  73. package/dist/ui/plan-pane.js +101 -0
  74. package/dist/ui/plan-pane.js.map +1 -0
  75. package/package.json +4 -1
package/bin/clai.mjs CHANGED
@@ -1,2 +1,27 @@
1
1
  #!/usr/bin/env node
2
// Startup guard for a deleted / inaccessible working directory. This must run
// BEFORE anything from dist is imported: when clai is launched (or elevated
// via `sudo`) from a folder that no longer exists, process.cwd() throws ENOENT
// (uv_cwd) and the CLI used to crash at module-load. Moving to a directory
// that definitely exists keeps startup — and every later spawn — working.
const cwdIsUsable = () => {
  try {
    process.cwd();
    return true;
  } catch {
    return false;
  }
};

if (!cwdIsUsable()) {
  // Fallback locations in order of preference; unset env vars are dropped.
  const fallbacks = [
    process.env.HOME,
    process.env.USERPROFILE,
    process.env.TMPDIR,
    "/tmp",
    "/",
  ].filter((dir) => Boolean(dir));

  // `some` stops at the first directory we manage to enter; a failed chdir
  // simply moves on to the next candidate.
  fallbacks.some((dir) => {
    try {
      process.chdir(dir);
      return true;
    } catch {
      return false;
    }
  });
}

await import('../dist/index.js');
@@ -1,4 +1,4 @@
1
- import type { ChatMessage, ProviderId, ToolCall, ToolResult } from "../types.js";
1
+ import type { ChatMessage, ChatImage, ProviderId, ToolCall, ToolResult } from "../types.js";
2
2
  export interface SessionPolicy {
3
3
  /** Tools the user authorized once during this REPL session. Not persisted. */
4
4
  allow: Set<string>;
@@ -6,6 +6,12 @@ export interface SessionPolicy {
6
6
  pentestAuthorized: {
7
7
  value: boolean;
8
8
  };
9
+ /** Stable id used to scope the session's plan/tasks in the plan store. */
10
+ sessionId: string;
11
+ /** When true, the agent must follow its approved plan (set by /implement). */
12
+ planApproved: {
13
+ value: boolean;
14
+ };
9
15
  }
10
16
  export declare function createSessionPolicy(): SessionPolicy;
11
17
  export interface AgentRunOptions {
@@ -15,6 +21,7 @@ export interface AgentRunOptions {
15
21
  autoConfirm?: boolean | undefined;
16
22
  maxSteps?: number | undefined;
17
23
  signal?: AbortSignal | undefined;
24
+ images?: ChatImage[] | undefined;
18
25
  onToolStart?: ((call: ToolCall) => void) | undefined;
19
26
  onToolResult?: ((call: ToolCall, result: ToolResult) => void) | undefined;
20
27
  session?: SessionPolicy | undefined;
@@ -30,6 +37,29 @@ export interface ParseToolCallOptions {
30
37
  strict?: boolean | undefined;
31
38
  }
32
39
  export declare function parseToolCall(text: string, options?: ParseToolCallOptions): ToolCall | undefined;
40
+ /**
41
+ * When a model means to call a tool but emits ONLY a bare JSON object —
42
+ * either a proper {"name","args"} that the strict matchers missed, or a bare
43
+ * args object like {"path":"file.pdf"} with the wrapper/fence dropped — this
44
+ * recognizes it. Returns:
45
+ * - { call } when the object is a complete {name, args} tool call, or
46
+ * - { argsOnly: true } when it looks like a bare args object (so the caller
47
+ * can nudge the model to re-emit a properly named, fenced tool call).
48
+ * Returns undefined for anything that is plainly a normal prose/JSON answer.
49
+ */
50
+ export declare function recognizeBareToolJson(text: string): {
51
+ call?: ToolCall;
52
+ argsOnly?: boolean;
53
+ } | undefined;
54
+ /**
55
+ * Detect an opened-but-unparseable tool call. This happens when the model's
56
+ * output is truncated by the token limit mid-JSON: we see the ```tool fence
57
+ * (or a bare {"name":"...","args" prefix) open, but parseToolCall returns
58
+ * undefined because the JSON never closed. Without this, the broken block
59
+ * leaks to the screen as a "final answer" and the requested action (e.g. a
60
+ * multi-file fs.writeMany scaffold) silently never runs.
61
+ */
62
+ export declare function looksLikeTruncatedToolCall(text: string): boolean;
33
63
  /**
34
64
  * Decide whether this turn should get a generous step budget because it is
35
65
  * a multi-file build, a continuation of one, or a "it's not done yet" nudge.
@@ -19,12 +19,25 @@ import { ensureProviderConfigured } from "../commands/providers.js";
19
19
  import { rememberThinkingFromText, renderThinkingSummary, } from "../ui/thinking.js";
20
20
  import { renderMarkdown, indentAndWrapText } from "../ui/markdown.js";
21
21
  import { startThinkingSpinner } from "../ui/spinner.js";
22
+ import { safeCwd } from "../os/cwd.js";
22
23
  import { analyzeTask } from "./task-analyzer.js";
23
24
  import { LoopGuard } from "./loop-guard.js";
25
+ import { createPlan, loadPlan, savePlan, markTask, } from "../store/plan.js";
26
+ import { renderPlanChecklist, renderPlanSidePane } from "../ui/plan-pane.js";
27
/**
 * Choose how to draw the plan for the current terminal.
 *
 * When stdout is a TTY, renderPlanSidePane is asked for a side pane using the
 * terminal's column count (0 when unknown). If stdout is not a TTY, or the
 * side-pane renderer returns nothing, the inline checklist is used instead.
 */
function renderPlanForTerminal(plan) {
    let pane;
    if (process.stdout.isTTY) {
        pane = renderPlanSidePane(plan, process.stdout.columns ?? 0);
    }
    return pane ?? renderPlanChecklist(plan);
}
24
35
  export function createSessionPolicy() {
25
36
  return {
26
37
  allow: new Set(),
27
38
  pentestAuthorized: { value: false },
39
+ sessionId: `sess-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`,
40
+ planApproved: { value: false },
28
41
  };
29
42
  }
30
43
  function tryParseCall(raw) {
@@ -139,6 +152,147 @@ export function parseToolCall(text, options = {}) {
139
152
  }
140
153
  return undefined;
141
154
  }
155
// Argument keys that the built-in tools accept. Used to recognize when a
// model emitted a bare args object (e.g. {"path":"file.pdf"}) — intending a
// tool call but forgetting the {"name","args"} wrapper and the ```tool fence.
//
// Keep this in sync with the args the tool handlers actually read: the plan
// tools read "detail" and "kind" (plan.create) and "note" (task.update) in
// addition to goal/tasks/taskId/state, so those keys are listed here too.
// Without them, a bare plan/task args object that carries one of those keys
// is not recognized and leaks to the screen as a final answer.
const TOOL_ARG_KEYS = new Set([
    "command",
    "path",
    "paths",
    "url",
    "query",
    "target",
    "pattern",
    "tool",
    "tools",
    "files",
    "content",
    "calls",
    "record",
    "ports",
    "profile",
    "id",
    "lang",
    "dpi",
    "psm",
    "recursive",
    "oldText",
    "newText",
    "expectedReplacements",
    // plan.create / task.update
    "goal",
    "detail",
    "kind",
    "tasks",
    "taskId",
    "state",
    "note",
    "method",
    "body",
    "headers",
    "maxBytes",
    "maxResults",
    "cwd",
    "name",
    "concurrency",
]);
195
/**
 * Unwrap text that consists of exactly one Markdown code fence.
 *
 * If the trimmed input starts with ``` (optionally followed by a letters-only
 * language tag) and ends with ```, the text between the fences is returned,
 * trimmed. Anything else comes back as the original text, trimmed.
 */
function stripLoneFence(text) {
    const trimmed = text.trim();
    const match = /^```[a-zA-Z]*\s*\n?([\s\S]*?)\n?```$/.exec(trimmed);
    const body = match === null ? text : match[1] ?? text;
    return body.trim();
}
205
/**
 * Recognize a model reply that is nothing but a JSON object meant as a tool
 * call, with the ```tool fence (and possibly the {"name","args"} wrapper)
 * missing.
 *
 * The reply is first unwrapped from a single surrounding code fence, and must
 * then be one JSON object spanning the whole text.
 *
 * Returns:
 *  - { call } when tryParseCall accepts the object as a complete tool call;
 *  - { argsOnly: true } when the object has between 1 and 6 keys and every
 *    key is listed in TOOL_ARG_KEYS, so the caller can ask the model to
 *    re-emit a properly named, fenced tool call;
 *  - undefined for everything else (prose, arrays, invalid JSON, objects
 *    with unknown or too many keys).
 */
export function recognizeBareToolJson(text) {
    const candidate = stripLoneFence(text);
    const spansWholeObject = candidate.startsWith("{") && candidate.endsWith("}");
    if (!spansWholeObject) {
        return undefined;
    }
    let value;
    try {
        value = JSON.parse(candidate);
    }
    catch {
        return undefined;
    }
    const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isPlainObject) {
        return undefined;
    }
    // A full {name, args} call that the earlier matchers did not pick up:
    // recover it as-is.
    const recovered = tryParseCall(candidate);
    if (recovered) {
        return { call: recovered };
    }
    // Otherwise treat it as bare args only when it is small and made up
    // exclusively of known tool-argument keys.
    const argNames = Object.keys(value);
    const plausibleSize = argNames.length >= 1 && argNames.length <= 6;
    if (plausibleSize && argNames.every((argName) => TOOL_ARG_KEYS.has(argName))) {
        return { argsOnly: true };
    }
    return undefined;
}
246
/**
 * Report whether the text contains a tool call that was started but never
 * finished — typically because the model hit its token limit mid-JSON.
 *
 * Two signals are checked, in order:
 *  1. a ```tool fence opens but no later ``` follows it;
 *  2. a {"name":"...","args" object begins but its braces never balance
 *     (braces inside JSON strings and escaped characters are ignored).
 *
 * Callers use this to ask the model for a retry instead of showing the broken
 * block as a final answer while the requested action silently never runs.
 */
export function looksLikeTruncatedToolCall(text) {
    // Signal 1: opened fence without any closing fence after it.
    const fenceOpened = /```tool\s*\n?/i.test(text);
    const fenceClosed = /```tool[\s\S]*?```/i.test(text);
    if (fenceOpened && !fenceClosed) {
        return true;
    }
    // Signal 2: locate the start of a tool-call object, then walk it.
    const objectStart = text.search(/\{\s*"name"\s*:\s*"[A-Za-z][\w.]*"\s*,\s*"args"/);
    if (objectStart < 0) {
        return false;
    }
    let openBraces = 0;
    let insideString = false;
    let skipNext = false;
    for (const ch of text.slice(objectStart)) {
        if (skipNext) {
            // Character following a backslash: never structural.
            skipNext = false;
            continue;
        }
        if (ch === "\\") {
            skipNext = true;
            continue;
        }
        if (ch === '"') {
            insideString = !insideString;
            continue;
        }
        if (insideString) {
            continue;
        }
        if (ch === "{") {
            openBraces += 1;
        }
        else if (ch === "}") {
            openBraces -= 1;
            if (openBraces === 0) {
                // The object closed properly, so nothing was cut off.
                return false;
            }
        }
    }
    // Ran out of text before the outermost brace closed.
    return true;
}
142
296
  /** Extract the text before the tool call block for display purposes */
143
297
  function textBeforeToolCall(text) {
144
298
  const patterns = [
@@ -177,13 +331,17 @@ function formatToolArgs(call) {
177
331
  if (call.name === "fs.writeMany") {
178
332
  const files = Array.isArray(call.args.files) ? call.args.files : [];
179
333
  const names = files
180
- .map((f) => f && typeof f === "object" ? String(f.path ?? "") : "")
334
+ .map((f) => f && typeof f === "object"
335
+ ? String(f.path ?? "")
336
+ : "")
181
337
  .filter(Boolean);
182
338
  const preview = names.slice(0, 4).join(", ");
183
339
  return `${names.length} file(s)${preview ? `: ${preview}${names.length > 4 ? ", …" : ""}` : ""}`;
184
340
  }
185
341
  if (call.name === "fs.search")
186
342
  return String(call.args.pattern ?? "");
343
+ if (call.name === "image.ocr" || call.name === "pdf.read")
344
+ return String(call.args.path ?? "");
187
345
  if (call.name === "http.fetch" || call.name === "web.fetch")
188
346
  return String(call.args.url ?? "");
189
347
  if (call.name === "web.search")
@@ -191,7 +349,7 @@ function formatToolArgs(call) {
191
349
  if (call.name === "pkg.install")
192
350
  return String(call.args.tool ?? "");
193
351
  if (call.name === "fs.list")
194
- return String(call.args.path ?? process.cwd());
352
+ return String(call.args.path ?? safeCwd());
195
353
  return JSON.stringify(call.args);
196
354
  }
197
355
  const VOLATILE_SIGNAL_RE = /\b(?:current(?:ly)?|latest|newest|today|now|right now|live|recent|breaking|news|release[sd]?|version|prices?|stocks?|market|rates?|weather|forecast|elections?|results?|rankings?|standings?|stats?|cve|advis(?:ory|ories)|vulnerabilit(?:y|ies))\b/i;
@@ -378,6 +536,132 @@ async function confirmToolExecution(call, autoConfirm, session) {
378
536
  default: true,
379
537
  });
380
538
  }
539
/**
 * Compose the system-prompt section that describes the session's active plan.
 *
 * The section lists the goal and status, the plan's free-form detail (when
 * non-blank), one numbered line per task with its id and state, and a closing
 * instruction: execute the plan task by task when `approved` is true,
 * otherwise refine it and wait for /implement.
 */
function planContextMessage(plan, approved) {
    const header = `ACTIVE PLAN for this session (goal: ${plan.goal}, status: ${plan.status}):`;
    const section = [header];
    const detail = plan.detail.trim();
    if (detail) {
        section.push(detail);
    }
    section.push("Tasks:");
    plan.tasks.forEach((task, index) => {
        section.push(` ${index + 1}. [${task.id}] (${task.state}) ${task.title}`);
    });
    const instruction = approved
        ? "The user APPROVED this plan. Execute it task by task NOW: before starting a task call " +
            'task.update with {"taskId":"<id>","state":"in_progress"}, do the work with real tool calls, ' +
            'then call task.update {"taskId":"<id>","state":"done"} (or "failed"/"skipped" with a note). ' +
            "Actually run installs and start servers — never claim something ran without a successful tool call. " +
            "When all tasks are done, verify and give a final summary."
        : "This plan is NOT yet approved. If the user is refining it, update it with plan.create again. " +
            "Do NOT execute tasks until the user runs /implement.";
    section.push(instruction);
    return section.join("\n");
}
562
/** Build the result object for a plan tool call that did not succeed. */
function planToolFailure(display, modelNote) {
    return { handled: true, ok: false, display: chalk.red(display), modelNote };
}
/**
 * Save the plan and report whether the write succeeded. A rejected save is
 * turned into `false` instead of being swallowed, so callers never tell the
 * model or the user that something was stored when it was not.
 */
function persistPlanSafely(plan) {
    return savePlan(plan).then(() => true, () => false);
}
/** plan.create: validate the args, build the plan, save it and render it. */
async function runPlanCreate(call, session) {
    const goal = typeof call.args.goal === "string" ? call.args.goal : "";
    const detail = typeof call.args.detail === "string" ? call.args.detail : "";
    const kind = typeof call.args.kind === "string" ? call.args.kind : "general";
    const rawTasks = Array.isArray(call.args.tasks) ? call.args.tasks : [];
    // Only plain string titles are accepted; anything else is dropped.
    const taskTitles = rawTasks
        .map((t) => (typeof t === "string" ? t : ""))
        .filter(Boolean);
    if (!goal || taskTitles.length === 0) {
        return planToolFailure(" ✗ plan.create needs a non-empty goal and at least one task title\n", "plan.create failed: provide a string goal and a non-empty tasks array of step titles.");
    }
    const plan = createPlan({
        sessionId: session.sessionId,
        goal,
        detail,
        taskTitles,
        kind,
    });
    // The plan is only reachable later (Ctrl+P, /implement, task.update) through
    // the store, so an unsaved plan must be reported as a failure rather than
    // announced as "Plan saved".
    if (!(await persistPlanSafely(plan))) {
        return planToolFailure(" ✗ plan.create: could not save the plan\n", "plan.create failed: the plan could not be saved, so no new plan is active. " +
            "Tell the user about the failure or call plan.create again.");
    }
    // A freshly (re)created plan resets approval — the user must /implement.
    session.planApproved.value = false;
    const checklist = renderPlanForTerminal(plan);
    const display = chalk.cyan(" ● planning\n") +
        checklist +
        "\n" +
        chalk.dim(" ✦ plan created — press Ctrl+P to view it, or type /implement to approve and run it\n");
    return {
        handled: true,
        ok: true,
        display,
        modelNote: `Plan saved with ${plan.tasks.length} task(s). STOP here and wait. ` +
            "Do NOT start executing tasks until the user approves with /implement. " +
            "When approved you will receive a message telling you to begin; then work task by task, " +
            "calling task.update to mark each in_progress before and done after you finish it.",
    };
}
/** task.update: change one task's state in the session's stored plan. */
async function runTaskUpdate(call, session) {
    // A load error is treated the same as "no plan exists".
    const plan = await loadPlan(session.sessionId).catch(() => undefined);
    if (!plan) {
        return planToolFailure(" ✗ task.update: no active plan — call plan.create first\n", "task.update failed: there is no active plan. Call plan.create first.");
    }
    const taskId = typeof call.args.taskId === "string" ? call.args.taskId : "";
    const stateRaw = typeof call.args.state === "string" ? call.args.state : "";
    const note = typeof call.args.note === "string" ? call.args.note : undefined;
    const validStates = [
        "pending",
        "in_progress",
        "done",
        "failed",
        "skipped",
    ];
    if (!validStates.includes(stateRaw)) {
        return planToolFailure(` ✗ task.update: state must be one of ${validStates.join(", ")}\n`, `task.update failed: state must be one of ${validStates.join(", ")}.`);
    }
    const ok = markTask(plan, taskId, stateRaw, note);
    if (!ok) {
        const ids = plan.tasks.map((t) => t.id).join(", ");
        return planToolFailure(` ✗ task.update: unknown taskId "${taskId}" (have: ${ids})\n`, `task.update failed: unknown taskId. Valid ids: ${ids}.`);
    }
    // The first task update moves the plan out of draft/approved.
    if (plan.status === "draft" || plan.status === "approved") {
        plan.status = "in_progress";
    }
    // The plan is complete once no task is pending or in progress.
    const allDone = plan.tasks.every((t) => t.state === "done" || t.state === "skipped" || t.state === "failed");
    if (allDone)
        plan.status = "completed";
    // If the write fails, the stored plan still has the old task state; say so
    // instead of claiming the task was updated.
    if (!(await persistPlanSafely(plan))) {
        return planToolFailure(" ✗ task.update: could not save the updated plan\n", "task.update failed: the change could not be saved, so the stored plan is unchanged. " +
            "Retry the same task.update call.");
    }
    const checklist = renderPlanForTerminal(plan);
    return {
        handled: true,
        ok: true,
        display: checklist + "\n",
        modelNote: allDone
            ? "Task updated. ALL tasks are now finished. Verify the result and give your final summary."
            : "Task updated. Continue with the next pending task.",
    };
}
/**
 * Handle plan.create / task.update inline. These are session-scoped and
 * persisted via the plan store so the user can view the plan (Ctrl+P) and
 * the agent keeps it in context across the whole session.
 *
 * @param call    The parsed tool call. "plan.create" creates a plan; any
 *                other name is handled as task.update.
 * @param session The session policy; supplies the session id and the
 *                planApproved flag, which is reset when a plan is created.
 * @param ctx     Currently unused.
 * @returns `{ handled: true, ok, display, modelNote }` — `display` is the
 *          terminal output, `modelNote` the text fed back to the model.
 *          `ok` is false for invalid args, a missing plan, an unknown task
 *          id, or a plan that could not be saved.
 */
async function handlePlanTool(call, session, ctx) {
    void ctx;
    if (call.name === "plan.create") {
        return runPlanCreate(call, session);
    }
    return runTaskUpdate(call, session);
}
381
665
  export async function runAgentLoop(prompt, options = {}) {
382
666
  const config = getConfig();
383
667
  const maxSteps = options.maxSteps ?? 30;
@@ -391,17 +675,39 @@ export async function runAgentLoop(prompt, options = {}) {
391
675
  if (freshWebSearchRequired) {
392
676
  systemSections.push(freshnessGuardMessage());
393
677
  }
394
- const fullSystemPrompt = systemSections.join("\n\n");
395
- const messages = [
396
- { role: "system", content: fullSystemPrompt },
397
- ...(options.history ?? []),
398
- { role: "user", content: prompt },
399
- ];
400
678
  let provider = options.provider ?? config.defaultProvider;
401
679
  await ensureProviderConfigured(provider);
402
680
  let model = options.model ?? config.defaultModel;
403
681
  let lastAnswer = "";
404
682
  const session = options.session ?? createSessionPolicy();
683
+ // ── Active plan context ────────────────────────────────────────────
684
+ // If this session already has a plan, inject it so the model keeps it in
685
+ // context. When the user has approved it (via /implement) we instruct the
686
+ // agent to execute task by task; otherwise the agent should refine/wait.
687
+ const activePlan = await loadPlan(session.sessionId).catch(() => undefined);
688
+ if (activePlan) {
689
+ systemSections.push(planContextMessage(activePlan, session.planApproved.value));
690
+ }
691
+ const fullSystemPrompt = systemSections.join("\n\n");
692
+ const userMessage = { role: "user", content: prompt };
693
+ if (options.images && options.images.length > 0) {
694
+ userMessage.images = options.images;
695
+ }
696
+ const messages = [
697
+ { role: "system", content: fullSystemPrompt },
698
+ ...(options.history ?? []),
699
+ userMessage,
700
+ ];
701
+ const recoveryUserMessage = (content) => {
702
+ const message = { role: "user", content };
703
+ if (options.images && options.images.length > 0) {
704
+ // Some OpenAI-compatible gateways/models attend most strongly to the
705
+ // latest user turn. Keep the image attached on recovery nudges so a
706
+ // thinking-only retry does not degrade into OCR/tool guessing.
707
+ message.images = options.images;
708
+ }
709
+ return message;
710
+ };
405
711
  // Track recent tool calls to detect models stuck in a loop calling the
406
712
  // same tool with the same arguments over and over (e.g. pentest.recon
407
713
  // called 3× on the same target without summarizing).
@@ -409,6 +715,13 @@ export async function runAgentLoop(prompt, options = {}) {
409
715
  // Track consecutive thinking-only responses so we can nudge the model
410
716
  // to actually act instead of silently returning an empty answer.
411
717
  let emptyVisibleRetries = 0;
718
+ // Track tool calls truncated by the token limit so we can ask the model
719
+ // to retry in smaller pieces instead of leaking broken JSON as an answer.
720
+ let truncatedToolRetries = 0;
721
+ // Track bare-args JSON tool calls (missing the {name,args} wrapper / fence)
722
+ // so we can nudge the model to re-emit a proper fenced call a few times
723
+ // before giving up, instead of leaking the JSON as a final answer.
724
+ let bareToolJsonRetries = 0;
412
725
  // For volatile live-info prompts, make one corrective pass if a model
413
726
  // ignores the freshness guard and tries to answer from stale memory.
414
727
  let sawFreshWebSearch = false;
@@ -472,9 +785,11 @@ export async function runAgentLoop(prompt, options = {}) {
472
785
  temperature: 0.2,
473
786
  // Reasoning models can spend a lot on hidden thinking; give
474
787
  // them headroom so the visible answer / tool call isn't
475
- // truncated to silence. Keep the no-thinking default lean so
476
- // fast models like kimi-k2.6 respond instantly.
477
- maxTokens: config.thinking?.enabled ? 8_192 : 4_096,
788
+ // truncated to silence. The non-thinking budget must be large
789
+ // enough for a multi-file fs.writeMany payload — a truncated
790
+ // tool-call JSON fails to parse and used to leak a broken
791
+ // ```tool block to the screen with no files written.
792
+ maxTokens: config.thinking?.enabled ? 16_384 : 8_192,
478
793
  signal: options.signal,
479
794
  thinking: config.thinking,
480
795
  }, (token) => {
@@ -526,12 +841,10 @@ export async function runAgentLoop(prompt, options = {}) {
526
841
  process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
527
842
  process.stdout.write(chalk.yellow(" ⚠ model produced only thinking — nudging it to take action\n"));
528
843
  messages.push({ role: "assistant", content: completion.text });
529
- messages.push({
530
- role: "user",
531
- content: "You only produced internal reasoning with no visible answer or tool call. " +
532
- "You MUST either call a tool using the ```tool format or provide your final answer. " +
533
- "Do NOT just think — take action NOW.",
534
- });
844
+ messages.push(recoveryUserMessage("You only produced internal reasoning with no visible answer or tool call. " +
845
+ "You MUST either call a tool using the ```tool format or provide your final answer. " +
846
+ "If images are attached, inspect them directly for visual details (text, colors, layout, spacing, style) instead of using OCR unless explicitly needed. " +
847
+ "Do NOT just think take action NOW."));
535
848
  continue;
536
849
  }
537
850
  // Exhausted retries — fall through to the normal empty-answer path
@@ -541,10 +854,42 @@ export async function runAgentLoop(prompt, options = {}) {
541
854
  // Reset the counter on any successful visible output.
542
855
  emptyVisibleRetries = 0;
543
856
  }
544
- const call = parseToolCall(assistantText.visible, {
857
+ let call = parseToolCall(assistantText.visible, {
545
858
  strict: getConfig().parserStrict,
546
859
  });
860
+ // Recovery: the model meant to call a tool but emitted a bare JSON object
861
+ // with no ```tool fence — either a complete {name,args} the strict
862
+ // matchers missed (recover it directly), or just an args object like
863
+ // {"path":"file.pdf"} with the wrapper dropped (nudge a retry below so
864
+ // the requested action runs instead of the JSON leaking as the answer).
865
+ let bareArgsOnly = false;
866
+ let recoveredFromBareJson = false;
547
867
  if (!call) {
868
+ const bare = recognizeBareToolJson(assistantText.visible);
869
+ if (bare?.call) {
870
+ call = bare.call;
871
+ recoveredFromBareJson = true;
872
+ process.stdout.write(chalk.dim(" ℹ recovered an unfenced tool call from bare JSON\n"));
873
+ }
874
+ else if (bare?.argsOnly) {
875
+ bareArgsOnly = true;
876
+ }
877
+ }
878
+ if (!call) {
879
+ if (bareArgsOnly) {
880
+ bareToolJsonRetries += 1;
881
+ if (bareToolJsonRetries <= 3) {
882
+ process.stdout.write(chalk.yellow(" ⚠ tool call missing its name/fence — asking the model to re-emit a proper ```tool block\n"));
883
+ messages.push({ role: "assistant", content: assistantText.visible });
884
+ messages.push(recoveryUserMessage("Your previous message was a bare JSON args object with no tool name and no ```tool fence, so NOTHING ran. " +
885
+ "Reply with ONLY a fenced ```tool block of the form " +
886
+ '`{"name": "<tool>", "args": { ... }}`. For example, to read a PDF:\n' +
887
+ '```tool\n{"name":"pdf.read","args":{"path":"/abs/file.pdf"}}\n```\n' +
888
+ "Choose the correct tool name for the task and include those args."));
889
+ continue;
890
+ }
891
+ // Exhausted retries — fall through to the normal answer path.
892
+ }
548
893
  // Detect the case where the model emitted sentinel-style tool-call
549
894
  // markers but the body was malformed or truncated. Printing those
550
895
  // raw tokens looks like a crash to the user — instead, ask the
@@ -552,15 +897,33 @@ export async function runAgentLoop(prompt, options = {}) {
552
897
  if (/<\|tool_call(?:s_section)?_begin\|>|<\|tool_call_argument_begin\|>/i.test(assistantText.visible)) {
553
898
  process.stdout.write(chalk.yellow(" ⚠ tool call was malformed or cut off — asking the model to retry in JSON form\n"));
554
899
  messages.push({ role: "assistant", content: assistantText.visible });
555
- messages.push({
556
- role: "user",
557
- content: "Your previous tool call was malformed or truncated. " +
558
- "Reply with ONLY a fenced ```tool block containing valid JSON " +
559
- 'of the form `{"name": "<tool>", "args": { ... }}`. ' +
560
- "Do not use <|tool_call_begin|> markers.",
561
- });
900
+ messages.push(recoveryUserMessage("Your previous tool call was malformed or truncated. " +
901
+ "Reply with ONLY a fenced ```tool block containing valid JSON " +
902
+ 'of the form `{"name": "<tool>", "args": { ... }}`. ' +
903
+ "Do not use <|tool_call_begin|> markers."));
562
904
  continue;
563
905
  }
906
+ // Detect a tool call that opened but was cut off by the token limit
907
+ // (most common with a large multi-file fs.writeMany). Retrying with a
908
+ // nudge to split the work is far better than rendering broken JSON as
909
+ // a final answer and leaving the project half-created.
910
+ if (looksLikeTruncatedToolCall(assistantText.visible)) {
911
+ truncatedToolRetries += 1;
912
+ if (truncatedToolRetries <= 3) {
913
+ process.stdout.write(chalk.yellow(" ⚠ tool call was cut off (output too long) — asking the model to retry in smaller pieces\n"));
914
+ messages.push({ role: "assistant", content: assistantText.visible });
915
+ messages.push({
916
+ role: "user",
917
+ content: "Your previous tool call was cut off before it finished — the JSON was incomplete, so NOTHING ran. " +
918
+ "Retry now with a COMPLETE, valid ```tool block. " +
919
+ "If it was a large fs.writeMany, split it into SMALLER batches (3-5 files per call, and keep each file's content concise) " +
920
+ "so the whole JSON fits in one response. Do NOT claim any file was written until a tool call actually succeeds.",
921
+ });
922
+ continue;
923
+ }
924
+ // Exhausted retries — fall through so we don't loop forever, but the
925
+ // user at least sees the (broken) output and the stop notice.
926
+ }
564
927
  // Normal final-answer path: strip any stray sentinel tokens that
565
928
  // somehow leaked into prose so the answer renders cleanly.
566
929
  const cleaned = stripSentinelTokens(assistantText.visible);
@@ -612,8 +975,12 @@ export async function runAgentLoop(prompt, options = {}) {
612
975
  if (loopCheck.reason) {
613
976
  process.stdout.write(chalk.dim(` ℹ ${loopCheck.reason}\n`));
614
977
  }
615
- // Print only non-thinking text before the tool call.
616
- const beforeTool = textBeforeToolCall(assistantText.visible);
978
+ // Print only non-thinking text before the tool call. When the call was
979
+ // recovered from a bare JSON object (the whole message WAS the call),
980
+ // there is no prose to show — skip it so we don't echo the raw JSON.
981
+ const beforeTool = recoveredFromBareJson
982
+ ? ""
983
+ : textBeforeToolCall(assistantText.visible);
617
984
  if (beforeTool) {
618
985
  process.stdout.write(renderMarkdown(beforeTool) + "\n");
619
986
  }
@@ -621,6 +988,25 @@ export async function runAgentLoop(prompt, options = {}) {
621
988
  process.stdout.write(`${renderThinkingSummary(assistantText.thinkContent)}\n`);
622
989
  }
623
990
  messages.push({ role: "assistant", content: assistantText.visible });
991
+ // ── Plan / task tools (session-scoped, handled inline) ─────────────
992
+ // These don't go through the generic registry because they need the
993
+ // session id and mutate the live plan that the user can view (Ctrl+P).
994
+ if (call.name === "plan.create" || call.name === "task.update") {
995
+ const planResult = await handlePlanTool(call, session, {
996
+ loopGuard,
997
+ step,
998
+ });
999
+ if (planResult.handled) {
1000
+ productiveSteps += 1;
1001
+ loopGuard.recordAttempt(step, call.name, call.args, planResult.ok, 0);
1002
+ process.stdout.write(planResult.display);
1003
+ messages.push({
1004
+ role: "tool",
1005
+ content: `Tool ${call.name} result (ok=${planResult.ok}):\n${planResult.modelNote}`,
1006
+ });
1007
+ continue;
1008
+ }
1009
+ }
624
1010
  const scope = await loadScope();
625
1011
  const decision = classifyToolCall(call, { scope });
626
1012
  await auditLog("tool.classified", {
@@ -809,7 +1195,9 @@ export async function runAgentLoop(prompt, options = {}) {
809
1195
  ? String(call.args.command ?? "").split(/\s+/)[0]
810
1196
  : call.name === "net.scan"
811
1197
  ? "nmap"
812
- : undefined;
1198
+ : call.name === "image.ocr"
1199
+ ? "tesseract"
1200
+ : undefined;
813
1201
  if (cmdName) {
814
1202
  process.stdout.write(chalk.yellow(` ⚠ ${cmdName} not found — asking model to install and retry\n`));
815
1203
  messages.push({