npm - omnius - Versions diffs - 1.0.214 → 1.0.216 - Mend

omnius 1.0.214 → 1.0.216

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/index.js +200 -104
package/npm-shrinkwrap.json +2 -2
package/package.json +1 -1
package/prompts/agentic/system-large.md +2 -2
package/prompts/agentic/system-medium.md +4 -4
package/prompts/agentic/system-small.md +2 -2

package/dist/index.js CHANGED Viewed

@@ -24413,7 +24413,7 @@ var EXCLUDED, MAX_ENTRIES, ListDirectoryTool;
 var init_list_directory = __esm({
   "packages/execution/dist/tools/list-directory.js"() {
     "use strict";
-    EXCLUDED = /* @__PURE__ */ new Set(["node_modules", ".git"]);
+    EXCLUDED = /* @__PURE__ */ new Set(["node_modules", ".git", ".omnius"]);
     MAX_ENTRIES = 100;
     ListDirectoryTool = class {
       name = "list_directory";
@@ -289853,6 +289853,53 @@ function getTodoSessionId() {
     return envSession;
   return "default";
 }
+function normalizeIncomingTodos(args) {
+  const repairNotes = [];
+  const record = args;
+  if (Array.isArray(args)) {
+    repairNotes.push("coerced top-level array into {todos:[...]}");
+    return { todos: args, repairNotes, error: "" };
+  }
+  const direct = record["todos"];
+  if (Array.isArray(direct)) {
+    return { todos: direct, repairNotes, error: "" };
+  }
+  if (direct && typeof direct === "object") {
+    const nested = direct;
+    for (const key of ["todos", "items", "tasks", "checklist"]) {
+      if (Array.isArray(nested[key])) {
+        repairNotes.push(`coerced todos.${key} into todos array`);
+        return { todos: nested[key], repairNotes, error: "" };
+      }
+    }
+    if (typeof nested["content"] === "string") {
+      repairNotes.push("wrapped single todo object in todos array");
+      return { todos: [nested], repairNotes, error: "" };
+    }
+  }
+  for (const key of ["items", "tasks", "checklist", "todo_items"]) {
+    if (Array.isArray(record[key])) {
+      repairNotes.push(`coerced ${key} into todos array`);
+      return { todos: record[key], repairNotes, error: "" };
+    }
+  }
+  const single = record["todo"] ?? record["task"];
+  if (single && typeof single === "object" && !Array.isArray(single)) {
+    const obj = single;
+    if (typeof obj["content"] === "string") {
+      repairNotes.push("coerced single todo/task object into todos array");
+      return { todos: [obj], repairNotes, error: "" };
+    }
+  }
+  if (typeof single === "string" && single.trim()) {
+    repairNotes.push("coerced single todo/task string into todos array");
+    return { todos: [single.trim()], repairNotes, error: "" };
+  }
+  return {
+    repairNotes,
+    error: 'todos must be an array. Correct shape: todo_write({"todos":[{"content":"Inspect files","status":"in_progress"},{"content":"Make changes","status":"pending"}]})'
+  };
+}
 var _currentSessionId, TodoWriteTool, TodoReadTool;
 var init_todo_write = __esm({
   "packages/execution/dist/tools/todo-write.js"() {
@@ -289861,7 +289908,27 @@ var init_todo_write = __esm({
     _currentSessionId = "";
     TodoWriteTool = class {
       name = "todo_write";
-      description = "Update the session task checklist. To be used proactively and often to track progress and pending tasks. Make sure that at least one task is in_progress at all times. \n\n## When to use\n1. Complex multi-step tasks — when a task requires 3 or more distinct steps or actions\n2. When the user provides multiple tasks (numbered or comma-separated)\n3. After receiving new instructions — capture user requirements as todos immediately\n4. When you start a task — mark it in_progress BEFORE beginning work. Only ONE in_progress at a time\n5. After completing a task — mark it completed and add follow-up tasks you discovered\n\n## When NOT to use\n- Single, straightforward tasks (a trivial edit, a one-line fix)\n- Conversational or informational questions\n- Tasks completable in <3 trivial steps\n\n## Task states\n- pending: not started\n- in_progress: currently working on (exactly ONE at a time)\n- completed: fully done (tests pass, code works, goal met)\n- blocked: stuck on a dependency (include blocker text)\n\nMark tasks complete IMMEDIATELY after finishing — don't batch. Never mark completed if tests are failing or implementation is partial. The user watches this list in the chat UI in real time.";
+      description = `Update the session task checklist. To be used proactively and often to track progress and pending tasks. Make sure that at least one task is in_progress at all times.
+## When to use
+1. Complex multi-step tasks — when a task requires 3 or more distinct steps or actions
+2. When the user provides multiple tasks (numbered or comma-separated)
+3. After receiving new instructions — capture user requirements as todos immediately
+4. When you start a task — mark it in_progress BEFORE beginning work. Only ONE in_progress at a time
+5. After completing a task — mark it completed and add follow-up tasks you discovered
+## When NOT to use
+- Single, straightforward tasks (a trivial edit, a one-line fix)
+- Conversational or informational questions
+- Tasks completable in <3 trivial steps
+## Task states
+- pending: not started
+- in_progress: currently working on (exactly ONE at a time)
+- completed: fully done (tests pass, code works, goal met)
+- blocked: stuck on a dependency (include blocker text)
+Mark tasks complete IMMEDIATELY after finishing — don't batch. Never mark completed if tests are failing or implementation is partial. The user watches this list in the chat UI in real time. Canonical call shape: todo_write({"todos":[{"content":"Inspect files","status":"in_progress"},{"content":"Make changes","status":"pending"},{"content":"Verify results","status":"pending"}]})`;
       parameters = {
         type: "object",
         required: ["todos"],
@@ -289902,48 +289969,62 @@ var init_todo_write = __esm({
       async execute(args) {
         const start2 = performance.now();
         try {
-          const incomingRaw = args["todos"];
-          if (!Array.isArray(incomingRaw)) {
+          const normalized = normalizeIncomingTodos(args);
+          if (!normalized.todos) {
             return {
               success: false,
               output: "",
-              error: "todos must be an array",
+              error: normalized.error,
               durationMs: performance.now() - start2
             };
           }
           const incoming = [];
-          for (const raw of incomingRaw) {
+          const repairNotes = [...normalized.repairNotes];
+          for (let index = 0; index < normalized.todos.length; index++) {
+            const raw = normalized.todos[index];
             if (!raw || typeof raw !== "object") {
+              if (typeof raw === "string" && raw.trim()) {
+                incoming.push({
+                  content: raw.trim(),
+                  status: index === 0 ? "in_progress" : "pending"
+                });
+                repairNotes.push("coerced string todo item into {content,status}");
+                continue;
+              }
               return {
                 success: false,
                 output: "",
-                error: "each todo must be an object with content+status",
+                error: 'each todo must be an object with content+status. Correct shape: {"todos":[{"content":"...","status":"in_progress"}]}',
                 durationMs: performance.now() - start2
               };
             }
             const entry = raw;
             const content = entry["content"];
             const status = entry["status"];
-            if (typeof content !== "string" || typeof status !== "string") {
+            if (typeof content !== "string") {
               return {
                 success: false,
                 output: "",
-                error: "todo must have string content and string status",
+                error: 'todo must have string content. Correct shape: {"todos":[{"content":"...","status":"in_progress"}]}',
                 durationMs: performance.now() - start2
               };
             }
-            if (!["pending", "in_progress", "completed", "blocked"].includes(status)) {
+            const resolvedStatus = typeof status === "string" ? status : index === 0 ? "in_progress" : "pending";
+            if (typeof status !== "string") {
+              repairNotes.push("defaulted missing todo status to in_progress/pending");
+            }
+            if (!["pending", "in_progress", "completed", "blocked"].includes(resolvedStatus)) {
               return {
                 success: false,
                 output: "",
-                error: `invalid status: ${status}`,
+                error: `invalid status: ${resolvedStatus}`,
                 durationMs: performance.now() - start2
               };
             }
             incoming.push({
               id: typeof entry["id"] === "string" ? entry["id"] : void 0,
               content,
-              status,
+              status: resolvedStatus,
               parentId: typeof entry["parentId"] === "string" ? entry["parentId"] : void 0,
               blocker: typeof entry["blocker"] === "string" ? entry["blocker"] : void 0,
               // REG-37: verification-aware planning
@@ -289986,6 +290067,16 @@ var init_todo_write = __esm({
             newTodos: result.newTodos,
             verificationNudgeNeeded
           };
+          if (repairNotes.length > 0) {
+            payload["inputRepair"] = Array.from(new Set(repairNotes));
+            payload["canonicalShape"] = {
+              todos: [
+                { content: "Inspect files", status: "in_progress" },
+                { content: "Make changes", status: "pending" },
+                { content: "Verify results", status: "pending" }
+              ]
+            };
+          }
           if (verificationNudgeNeeded) {
             payload["nudge"] = "You just closed 3+ todos without scheduling a verification step. Add a 'Verify the changes work' item and spawn a verification agent before declaring task_complete.";
           }
@@ -564459,10 +564550,12 @@ ${_staleSamples.join("\n")}` : ``,
           const turnTier = this.options.modelTier ?? "large";
           if (turn === 0 && !this.options.disableTodoPlanningNudges && (turnTier === "small" || turnTier === "medium")) {
             const goal = this._taskState.goal || "";
-            const wordCount2 = goal.split(/\s+/).length;
-            const hasMultipleActions = /\band\b.*\band\b|then.*then|also.*also/i.test(goal);
-            const hasMultipleFiles = /files?.*files?|\.ts.*\.ts|create.*write|modify.*create/i.test(goal);
-            const isComplex = wordCount2 > 40 || hasMultipleActions || hasMultipleFiles;
+            const substantiveGoal = goal.replace(/\b(?:then\s+)?call\s+task_complete\b[^.?!;]*/gi, "").replace(/\b(?:observe|report|summarize|finish|complete)\b[^.?!;]*/gi, "");
+            const wordCount2 = substantiveGoal.split(/\s+/).filter(Boolean).length;
+            const hasMultipleActions = /\band\b.*\band\b|then.*then|also.*also/i.test(substantiveGoal);
+            const hasMultipleFiles = /files?.*files?|\.ts.*\.ts|create.*write|modify.*create/i.test(substantiveGoal);
+            const explicitSingleTool = /\b(exactly once|single tool|one tool|one tool call)\b/i.test(goal) || /\b(call|use)\s+(?:list_directory|file_read|grep_search|find_files|shell|web_search|web_fetch)\b/i.test(goal) && !/\b(edit|write|modify|create|fix|implement|patch|test|build|install|refactor)\b/i.test(substantiveGoal);
+            const isComplex = !explicitSingleTool && (wordCount2 > 40 || hasMultipleActions || hasMultipleFiles);
             if (isComplex) {
               messages2.push({
                 role: "user",
@@ -564471,6 +564564,7 @@ ${_staleSamples.join("\n")}` : ``,
 MANDATORY FIRST ACTION: Call todo_write NOW with the complete plan.
 Each todo item is { content: "what to do", status: "pending" | "in_progress" | "completed" | "blocked" }.
 Mark item 1 as in_progress, the rest as pending.
+Only count substantive work phases. Do NOT count observing a tool result, reporting findings, or calling task_complete as todo phases.
 Example: todo_write({todos: [{content: "read source files", status: "in_progress"}, {content: "make changes", status: "pending"}, {content: "run tests", status: "pending"}]})
 After EACH phase finishes, call todo_write AGAIN with item N marked completed and item N+1 marked in_progress.
@@ -564566,7 +564660,7 @@ ${top.map((t2) => `- ${t2.name}: ${t2.desc}`).join("\n")}`);
             const isReadTask = /\bread\b|\bshow\b|\btell me\b|\bwhat is\b/i.test(taskGoal);
             const hints = [];
             if (isSimpleTask) {
-              hints.push("This is a simple task — if it needs only ONE tool call, skip todo_write and call the tool directly. If it needs 2+ steps, use todo_write to plan.");
+              hints.push("This is a simple task — if it needs only ONE substantive tool call, skip todo_write and call the tool directly, then task_complete. Do not count reporting, observing output, or task_complete as planning steps. If it needs 2+ substantive work steps, use todo_write to plan.");
             }
             if (isSearchTask) {
               hints.push("SEARCH STRATEGY: Use grep_search to find what you need FIRST, THEN file_read only the specific file and lines. Do NOT read entire files hoping to find something.");
@@ -578422,6 +578516,70 @@ var init_generative_progress = __esm({
   }
 });
+// packages/cli/src/tui/tool-adapter.ts
+function mapExecutionToolResult(result) {
+  return {
+    success: result.success,
+    output: result.output,
+    error: result.error,
+    llmContent: result.llmContent,
+    mutated: result.mutated,
+    mutatedFiles: result.mutatedFiles,
+    diff: result.diff,
+    dryRun: result.dryRun,
+    noop: result.noop,
+    partial: result.partial,
+    beforeHash: result.beforeHash,
+    afterHash: result.afterHash
+  };
+}
+function adaptExecutionTool(tool, options2 = {}) {
+  const progressTool = tool;
+  if (generationKindForToolName(tool.name) && typeof progressTool.setProgressCallback === "function") {
+    progressTool.setProgressCallback((event) => {
+      options2.onProgress?.(tool.name, event);
+    });
+  }
+  const adapted = {
+    name: tool.name,
+    aliases: tool.aliases,
+    description: tool.description,
+    parameters: tool.parameters,
+    inputSchema: tool.inputSchema,
+    maxResultSizeChars: tool.maxResultSizeChars,
+    async execute(args) {
+      const invoke = () => tool.execute(args);
+      const result = options2.execute ? await options2.execute(tool, args, invoke) : await invoke();
+      return mapExecutionToolResult(result);
+    }
+  };
+  if (typeof tool.prompt === "function") {
+    adapted.prompt = (context2) => tool.prompt(context2);
+  }
+  if (typeof tool.executeStream === "function") {
+    adapted.executeStream = async function* (args) {
+      const result = yield* tool.executeStream(args);
+      return mapExecutionToolResult(result);
+    };
+  }
+  if (typeof tool.validateInput === "function") {
+    adapted.validateInput = (args, context2) => tool.validateInput(args, context2);
+  }
+  if (typeof tool.isConcurrencySafe === "function") {
+    adapted.isConcurrencySafe = (args) => tool.isConcurrencySafe(args);
+  }
+  if (typeof tool.isReadOnly === "function") {
+    adapted.isReadOnly = (args) => tool.isReadOnly(args);
+  }
+  return adapted;
+}
+var init_tool_adapter = __esm({
+  "packages/cli/src/tui/tool-adapter.ts"() {
+    "use strict";
+    init_generative_progress();
+  }
+});
 // packages/cli/src/tui/runtime-verification.ts
 import { execFileSync as execFileSync6 } from "node:child_process";
 import { existsSync as existsSync92, readFileSync as readFileSync74, readdirSync as readdirSync29 } from "node:fs";
@@ -592024,15 +592182,7 @@ var init_p2p = __esm({
 import { EventEmitter as EventEmitter11 } from "node:events";
 import crypto13 from "node:crypto";
 function adaptTool(tool) {
-  return {
-    name: tool.name,
-    description: tool.description,
-    parameters: tool.parameters,
-    async execute(args) {
-      const result = await tool.execute(args);
-      return { success: result.success, output: result.output, error: result.error };
-    }
-  };
+  return adaptExecutionTool(tool);
 }
 function getActivityFeed() {
   if (!_globalFeed) _globalFeed = new ActivityFeed();
@@ -592047,6 +592197,7 @@ var init_call_agent = __esm({
     "use strict";
     init_dist8();
     init_dist6();
+    init_tool_adapter();
     ActivityFeed = class {
       entries = [];
       maxEntries = 100;
@@ -632130,15 +632281,7 @@ function computeSparsity(entries) {
   return Math.max(0, Math.min(1, 1 - avgOverlap));
 }
 function adaptTool2(tool) {
-  return {
-    name: tool.name,
-    description: tool.description,
-    parameters: tool.parameters,
-    async execute(args) {
-      const result = await tool.execute(args);
-      return { success: result.success, output: result.output, error: result.error };
-    }
-  };
+  return adaptExecutionTool(tool);
 }
 var SNREngine;
 var init_snr_engine = __esm({
@@ -632148,6 +632291,7 @@ var init_snr_engine = __esm({
     init_dist6();
     init_project_context();
     init_render();
+    init_tool_adapter();
     SNREngine = class {
       constructor(config, repoRoot) {
         this.config = config;
@@ -632608,15 +632752,7 @@ ${sections.join("\n\n")}`;
   }
 }
 function adaptTool3(tool) {
-  return {
-    name: tool.name,
-    description: tool.description,
-    parameters: tool.parameters,
-    async execute(args) {
-      const result = await tool.execute(args);
-      return { success: result.success, output: result.output, error: result.error };
-    }
-  };
+  return adaptExecutionTool(tool);
 }
 function buildDreamPrompt(mode, stage, cycleNum, totalCycles, previousFindings, dreamsDir) {
   const modeDesc = mode === "lucid" ? "LUCID DREAM MODE: You have full implementation capability. After ideation, you will implement, test, and evaluate changes." : mode === "deep" ? "DEEP DREAM MODE: Explore deeply with multiple expansion/contraction cycles. All proposals go in .omnius/dreams/." : "DREAM MODE: Creative exploration only. All output must be written to .omnius/dreams/ directory using file_write.";
@@ -632758,6 +632894,7 @@ var init_dream_engine = __esm({
     init_setup();
     init_render();
     init_promptLoader3();
+    init_tool_adapter();
     _dreamWriteContent = null;
     SWARM_ROLE_CONFIG = {
       researcher: { maxTurns: 25, temperature: 0.4 },
@@ -634546,15 +634683,7 @@ Reflect on what went well and what could improve.`;
   }
 }
 function adaptTool4(tool) {
-  return {
-    name: tool.name,
-    description: tool.description,
-    parameters: tool.parameters,
-    async execute(args) {
-      const result = await tool.execute(args);
-      return { success: result.success, output: result.output, error: result.error };
-    }
-  };
+  return adaptExecutionTool(tool);
 }
 function renderDMNCycleStart(cycleNum, deliberation = false) {
   process.stdout.write(`
@@ -634622,6 +634751,7 @@ var init_dmn_engine = __esm({
     init_project_context();
     init_render();
     init_promptLoader3();
+    init_tool_adapter();
     DMNEngine = class {
       constructor(config, repoRoot) {
         this.config = config;
@@ -642252,25 +642382,17 @@ function normalizeTelegramCallbackQuery(update2) {
   };
 }
 function adaptTool5(tool, todoSessionId, progress) {
-  const progressTool = tool;
-  if (generationKindForToolName(tool.name) && typeof progressTool.setProgressCallback === "function") {
-    progressTool.setProgressCallback((event) => {
-      progress?.onProgress(tool.name, event);
-    });
-  }
-  return {
-    name: tool.name,
-    description: tool.description,
-    parameters: tool.parameters,
-    async execute(args) {
+  return adaptExecutionTool(tool, {
+    onProgress: (toolName, event) => progress?.onProgress(toolName, event),
+    execute: async (_tool, args, invoke) => {
       const previousTodoSession = todoSessionId ? getTodoSessionId() : "";
       if (todoSessionId && (tool.name === "todo_write" || tool.name === "todo_read")) {
         setTodoSessionId(todoSessionId);
       }
       try {
-        const result = await tool.execute(args);
+        const result = await invoke();
         progress?.complete(tool.name, result);
-        return { success: result.success, output: result.output, error: result.error, llmContent: result.llmContent };
+        return result;
       } catch (err) {
         progress?.complete(tool.name, {
           success: false,
@@ -642284,7 +642406,7 @@ function adaptTool5(tool, todoSessionId, progress) {
         }
       }
     }
-  };
+  });
 }
 function telegramBotAccessSettingsFromApi(settings) {
   return {
@@ -642449,6 +642571,7 @@ var init_telegram_bridge = __esm({
     init_voice_soul();
     init_telegram_creative_tools();
     init_generative_progress();
+    init_tool_adapter();
     init_omnius_directory();
     init_stimulation();
     init_pid_controller();
@@ -681209,42 +681332,9 @@ function getVersion4() {
   return "0.0.0";
 }
 function adaptTool6(tool) {
-  const progressTool = tool;
-  if (generationKindForToolName(tool.name) && typeof progressTool.setProgressCallback === "function") {
-    progressTool.setProgressCallback((event) => {
-      _generativeProgressSink?.(tool.name, event);
-    });
-  }
-  return {
-    name: tool.name,
-    aliases: tool.aliases,
-    description: tool.description,
-    parameters: tool.parameters,
-    inputSchema: tool.inputSchema,
-    prompt: tool.prompt,
-    executeStream: tool.executeStream,
-    validateInput: tool.validateInput,
-    isConcurrencySafe: tool.isConcurrencySafe,
-    isReadOnly: tool.isReadOnly,
-    maxResultSizeChars: tool.maxResultSizeChars,
-    async execute(args) {
-      const result = await tool.execute(args);
-      return {
-        success: result.success,
-        output: result.output,
-        error: result.error,
-        llmContent: result.llmContent,
-        mutated: result.mutated,
-        mutatedFiles: result.mutatedFiles,
-        diff: result.diff,
-        dryRun: result.dryRun,
-        noop: result.noop,
-        partial: result.partial,
-        beforeHash: result.beforeHash,
-        afterHash: result.afterHash
-      };
-    }
-  };
+  return adaptExecutionTool(tool, {
+    onProgress: (toolName, event) => _generativeProgressSink?.(toolName, event)
+  });
 }
 function createTuiReminderOptions(allowActionDelivery = true) {
   const sessionId = process.env["OMNIUS_SESSION_ID"] || "terminal";
@@ -690440,6 +690530,7 @@ var init_interactive = __esm({
     init_dist8();
     init_dist6();
     init_generative_progress();
+    init_tool_adapter();
     init_runtime_verification();
     init_dist();
     init_listen();
@@ -691679,6 +691770,7 @@ function parseCliArgs(argv) {
       local: { type: "boolean", short: "l" },
       port: { type: "string" },
       suite: { type: "string" },
+      live: { type: "boolean" },
       json: { type: "boolean", short: "j" },
       background: { type: "boolean" },
       help: { type: "boolean", short: "h" },
@@ -691738,6 +691830,7 @@ function parseCliArgs(argv) {
         break;
       case "eval":
         result.evalSuite = typeof values.suite === "string" ? values.suite : void 0;
+        result.evalLive = values.live === true;
         break;
       default:
         break;
@@ -691776,6 +691869,7 @@ Flags:
       --max-retries <n>      Max retries per model request
       --timeout-ms <ms>      Overall task timeout
       --suite <name>         Eval suite: basic (default) or full
+      --live                 Run eval against configured backend instead of FakeBackend
       --port <n>             Server port (serve command, vLLM only, default: 8000)
   -h, --help                 Show this help
   -V, --version              Show version
@@ -691801,6 +691895,7 @@ Examples:
   omnius serve
   omnius serve --backend vllm --port 9000
   omnius eval --suite full --verbose
+  omnius eval --suite basic --live --backend ollama --model qwen3.5:9b
   omnius config set model qwen3.5:122b
 `.trim();
   process.stdout.write(text + "\n");
@@ -691936,7 +692031,8 @@ async function main() {
           {
             suite: parsed.evalSuite,
             repoPath: parsed.repoPath,
-            verbose: parsed.verbose
+            verbose: parsed.verbose,
+            live: parsed.evalLive
           },
           config
         );

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.214",
+  "version": "1.0.216",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.214",
+      "version": "1.0.216",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.214",
+  "version": "1.0.216",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",

package/prompts/agentic/system-large.md CHANGED Viewed

@@ -51,7 +51,7 @@ If you anticipate a large result before calling a tool, prefer narrow flags firs
 - list_directory: List files in a directory with types and sizes
 - web_search: Search the web for documentation or solutions
 - web_fetch: Fetch a web page and extract text content (for docs, MDN, w3schools.com, etc.)
-- todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique. **Worked example — emit todos in this exact shape:** `todo_write({"todos":[{"id":"p1","content":"Implement cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/cache.test"]},{"id":"p2","content":"Make build pass","status":"pending","verifyCommand":"<your build command>"}]})`. Substitute placeholder strings with commands native to YOUR stack.
+- todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ substantive work phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. Do NOT count observing a tool result, reporting findings, or task_complete as phases. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ real work phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique. **Worked example — emit todos in this exact shape:** `todo_write({"todos":[{"id":"p1","content":"Implement cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/cache.test"]},{"id":"p2","content":"Make build pass","status":"pending","verifyCommand":"<your build command>"}]})`. Substitute placeholder strings with commands native to YOUR stack.
 ## Web Tool Selection
@@ -182,7 +182,7 @@ When you discover image files (png, jpg, gif, svg, webp, bmp) during codebase ex
 ## Workflow
-0. **PLAN AT THE TOP** — for any task with 3+ logical phases, your VERY FIRST tool call must be `todo_write` with a complete checklist (each item: `{content, status}`). Mark item 1 as `in_progress`, the rest as `pending`. The user watches this checklist update live in the chat UI as you work, so they always know what step you're on. After each phase, call todo_write again to mark the finished item `completed` and the next one `in_progress`.
+0. **PLAN AT THE TOP** — for any task with 3+ substantive work phases, your VERY FIRST tool call must be `todo_write` with a complete checklist (each item: `{content, status}`). Mark item 1 as `in_progress`, the rest as `pending`. Do not count observing output, reporting findings, or task_complete as phases. The user watches this checklist update live in the chat UI as you work, so they always know what step you're on. After each phase, call todo_write again to mark the finished item `completed` and the next one `in_progress`.
 1. EXPLORE: Use find_files and grep_search to locate relevant code. Read specific files.
 2. PLAN: Determine what changes are needed based on the code you've read.
 3. IMPLEMENT: Make changes using file_edit (preferred) or file_write for new files.

package/prompts/agentic/system-medium.md CHANGED Viewed

@@ -11,7 +11,7 @@ You operate in two modes based on what the user needs:
 **TASK MODE** — coding tasks, file operations, technical directives:
 - Call tools iteratively until complete. NEVER write code blocks as text — only tool calls execute.
 - If you need to read a file, call file_read. If you need to run a command, call shell.
-- **MANDATORY: For ANY task that will take 3 or more tool calls, your VERY FIRST tool call MUST be `todo_write` declaring the complete plan.** Items have `{content, status}` where status is one of pending|in_progress|completed|blocked. Mark item 1 in_progress, the rest pending. Then re-call todo_write after each phase finishes to mark item N completed and N+1 in_progress. The user watches this checklist update live in the chat UI — without it they can't see your plan or track your progress.
+- **MANDATORY: For ANY task that will take 3 or more substantive work tool calls, your VERY FIRST tool call MUST be `todo_write` declaring the complete plan.** Items have `{content, status}` where status is one of pending|in_progress|completed|blocked. Mark item 1 in_progress, the rest pending. Then re-call todo_write after each phase finishes to mark item N completed and N+1 in_progress. Do NOT count observing tool output, reporting findings, or task_complete as work phases. For one-tool tasks, call the tool directly and then task_complete. The user watches this checklist update live in the chat UI — without it they can't see your plan or track your progress.
 ## Instruction Hierarchy
@@ -41,7 +41,7 @@ Tool results over ~100KB are NOT truncated. The orchestrator saves the full payl
 - list_directory: List files in a directory
 - web_search: Search the web
 - web_fetch: Fetch a web page's text
-- todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical steps, start by calling todo_write to declare your plan, then re-call todo_write as each step transitions (mark item N "completed" + N+1 "in_progress"). The user sees this list update live in the UI — it is your primary planning surface for long-horizon work. Use it whenever the task naturally has 3+ phases (build/refactor/test/ship, scrape/parse/store/report, plan/draft/edit/publish, etc.).
+- todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ substantive work steps, start by calling todo_write to declare your plan, then re-call todo_write as each step transitions (mark item N "completed" + N+1 "in_progress"). The user sees this list update live in the UI — it is your primary planning surface for long-horizon work. Use it whenever the task naturally has 3+ real work phases (build/refactor/test/ship, scrape/parse/store/report, plan/draft/edit/publish, etc.). Skip it for a single tool action followed only by reporting and task_complete.
   Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria:
@@ -105,8 +105,8 @@ Launch ALL sub_agent calls in ONE response. This saves your context window for o
 ## Workflow
-For tasks requiring 3+ tool calls — plan before acting:
-1. LIST all steps needed before your first tool call. **For 3+ step tasks, your FIRST tool call must be `todo_write` declaring the full plan with item 1 set to status:"in_progress" and the rest "pending".** Then call todo_write again as each step finishes to mark items "completed" and the next one "in_progress". The user watches this list update live in the chat UI.
+For tasks requiring 3+ tool calls of substantive work — plan before acting:
+1. LIST all real work steps needed before your first tool call. **For 3+ substantive-step tasks, your FIRST tool call must be `todo_write` declaring the full plan with item 1 set to status:"in_progress" and the rest "pending".** Do not count reporting, observing output, or task_complete as steps. Then call todo_write again as each step finishes to mark items "completed" and the next one "in_progress". The user watches this list update live in the chat UI.
 2. If task mentions 3+ independent modules/files: delegate each to a sub_agent (saves context)
 3. EXPLORE: Use find_files, grep_search, file_explore to understand the codebase
    - For large files (200+ lines): use file_explore(strategy='overview') then search/chunk — NEVER read entire file

package/prompts/agentic/system-small.md CHANGED Viewed

@@ -34,7 +34,7 @@ File edits: Use file_write/file_edit/file_patch/batch_edit for project files, no
 Tool choice: Use file/search/code-graph tools for repository discovery, web_fetch/web_download/browser_action for web work, and repl_exec for multi-step data processing. Use shell when the command itself is the verifier or work product: tests, builds, package managers, git, system operations, and small native scripts. Do not hide diagnostics inside opaque shell blobs or `|| true`. Use background_run for long commands and poll with task_status/task_output.
-todo_write: visible task checklist for the user. For ANY task with 2+ steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip only for single-tool questions like "read this file" or "run this command". Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique. **Example shape:** `{"id":"p1","content":"Implement cache","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts"]}`. Substitute placeholders with commands native to YOUR stack.
+todo_write: visible task checklist for the user. Use it for substantive multi-step work, not ceremony. For tasks with 2+ substantive work steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip single-tool questions like "read this file", "list this directory", or "run this command", even if you will report findings and call task_complete afterward. Do NOT count observing a tool result, reporting findings, or task_complete as todo steps. Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique. **Example shape:** `{"id":"p1","content":"Implement cache","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts"]}`. Substitute placeholders with commands native to YOUR stack.
 Web: web_search finds URLs, web_fetch reads them. For JS pages use web_crawl, for clicking/login use browser_action.
@@ -100,7 +100,7 @@ Creating new files — WRITE FIRST, refine later:
 - After writing: fill in each method, test after each one.
 - A bad first draft you can fix is better than no draft at all.
-Complex tasks (5+ steps) — DECOMPOSE before acting:
+Complex tasks (5+ substantive work steps) — DECOMPOSE before acting:
 1. Call todo_write with the checklist. Mark item 1 "in_progress".
 2. Execute ONE STEP AT A TIME. After each, update todo_write status.
 3. After each file edit, VERIFY: file_read or shell test.