omnius 1.0.215 → 1.0.216

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -24413,7 +24413,7 @@ var EXCLUDED, MAX_ENTRIES, ListDirectoryTool;
24413
24413
  var init_list_directory = __esm({
24414
24414
  "packages/execution/dist/tools/list-directory.js"() {
24415
24415
  "use strict";
24416
- EXCLUDED = /* @__PURE__ */ new Set(["node_modules", ".git"]);
24416
+ EXCLUDED = /* @__PURE__ */ new Set(["node_modules", ".git", ".omnius"]);
24417
24417
  MAX_ENTRIES = 100;
24418
24418
  ListDirectoryTool = class {
24419
24419
  name = "list_directory";
@@ -289853,6 +289853,53 @@ function getTodoSessionId() {
289853
289853
  return envSession;
289854
289854
  return "default";
289855
289855
  }
289856
/**
 * Coerce the loosely-shaped arguments a model may pass to `todo_write` into
 * a plain array of todo items.
 *
 * Shapes are tried in this order; the first match wins:
 *   1. `args` itself is an array
 *   2. `args.todos` is an array (canonical shape, no repair note)
 *   3. `args.todos` is an object holding an array under
 *      `todos` / `items` / `tasks` / `checklist`
 *   4. `args.todos` is a single object with a string `content`
 *   5. `args.items` / `args.tasks` / `args.checklist` / `args.todo_items` is an array
 *   6. `args.todo ?? args.task` is an object with a string `content`
 *   7. `args.todo ?? args.task` is a non-blank string (trimmed)
 *
 * Items inside the returned array are NOT validated here.
 *
 * @param {unknown} args Raw tool-call arguments.
 * @returns {{todos?: unknown[], repairNotes: string[], error?: string}}
 *   On success: `todos` plus `error: ""`, with `repairNotes` describing any
 *   coercion that was applied. On failure: no `todos` key and an `error`
 *   string showing the canonical call shape.
 */
function normalizeIncomingTodos(args) {
  const repairNotes = [];
  const shapeError = 'todos must be an array. Correct shape: todo_write({"todos":[{"content":"Inspect files","status":"in_progress"},{"content":"Make changes","status":"pending"}]})';

  // Build a success result, recording the repair note when one applies.
  const accept = (todos, note) => {
    if (note !== undefined) {
      repairNotes.push(note);
    }
    return { todos, repairNotes, error: "" };
  };
  const reject = () => ({ repairNotes, error: shapeError });

  if (Array.isArray(args)) {
    return accept(args, "coerced top-level array into {todos:[...]}");
  }

  // Property access on null/undefined would throw a TypeError; report the
  // canonical-shape error instead so the caller gets actionable feedback.
  if (args === null || args === undefined) {
    return reject();
  }

  const direct = args["todos"];
  if (Array.isArray(direct)) {
    return accept(direct);
  }

  if (direct && typeof direct === "object") {
    // `todos` is an object: look for a wrapped list first, then a lone item.
    for (const key of ["todos", "items", "tasks", "checklist"]) {
      if (Array.isArray(direct[key])) {
        return accept(direct[key], `coerced todos.${key} into todos array`);
      }
    }
    if (typeof direct["content"] === "string") {
      return accept([direct], "wrapped single todo object in todos array");
    }
  }

  // Common aliases for the list at the top level.
  for (const key of ["items", "tasks", "checklist", "todo_items"]) {
    if (Array.isArray(args[key])) {
      return accept(args[key], `coerced ${key} into todos array`);
    }
  }

  // A single item supplied under `todo` (preferred) or `task`.
  const single = args["todo"] ?? args["task"];
  if (single && typeof single === "object" && !Array.isArray(single)) {
    if (typeof single["content"] === "string") {
      return accept([single], "coerced single todo/task object into todos array");
    }
  }
  if (typeof single === "string" && single.trim()) {
    return accept([single.trim()], "coerced single todo/task string into todos array");
  }

  return reject();
}
289856
289903
  var _currentSessionId, TodoWriteTool, TodoReadTool;
289857
289904
  var init_todo_write = __esm({
289858
289905
  "packages/execution/dist/tools/todo-write.js"() {
@@ -289861,7 +289908,27 @@ var init_todo_write = __esm({
289861
289908
  _currentSessionId = "";
289862
289909
  TodoWriteTool = class {
289863
289910
  name = "todo_write";
289864
- description = "Update the session task checklist. To be used proactively and often to track progress and pending tasks. Make sure that at least one task is in_progress at all times. \n\n## When to use\n1. Complex multi-step tasks — when a task requires 3 or more distinct steps or actions\n2. When the user provides multiple tasks (numbered or comma-separated)\n3. After receiving new instructions — capture user requirements as todos immediately\n4. When you start a task — mark it in_progress BEFORE beginning work. Only ONE in_progress at a time\n5. After completing a task — mark it completed and add follow-up tasks you discovered\n\n## When NOT to use\n- Single, straightforward tasks (a trivial edit, a one-line fix)\n- Conversational or informational questions\n- Tasks completable in <3 trivial steps\n\n## Task states\n- pending: not started\n- in_progress: currently working on (exactly ONE at a time)\n- completed: fully done (tests pass, code works, goal met)\n- blocked: stuck on a dependency (include blocker text)\n\nMark tasks complete IMMEDIATELY after finishing — don't batch. Never mark completed if tests are failing or implementation is partial. The user watches this list in the chat UI in real time.";
289911
+ description = `Update the session task checklist. To be used proactively and often to track progress and pending tasks. Make sure that at least one task is in_progress at all times.
289912
+
289913
+ ## When to use
289914
+ 1. Complex multi-step tasks — when a task requires 3 or more distinct steps or actions
289915
+ 2. When the user provides multiple tasks (numbered or comma-separated)
289916
+ 3. After receiving new instructions — capture user requirements as todos immediately
289917
+ 4. When you start a task — mark it in_progress BEFORE beginning work. Only ONE in_progress at a time
289918
+ 5. After completing a task — mark it completed and add follow-up tasks you discovered
289919
+
289920
+ ## When NOT to use
289921
+ - Single, straightforward tasks (a trivial edit, a one-line fix)
289922
+ - Conversational or informational questions
289923
+ - Tasks completable in <3 trivial steps
289924
+
289925
+ ## Task states
289926
+ - pending: not started
289927
+ - in_progress: currently working on (exactly ONE at a time)
289928
+ - completed: fully done (tests pass, code works, goal met)
289929
+ - blocked: stuck on a dependency (include blocker text)
289930
+
289931
+ Mark tasks complete IMMEDIATELY after finishing — don't batch. Never mark completed if tests are failing or implementation is partial. The user watches this list in the chat UI in real time. Canonical call shape: todo_write({"todos":[{"content":"Inspect files","status":"in_progress"},{"content":"Make changes","status":"pending"},{"content":"Verify results","status":"pending"}]})`;
289865
289932
  parameters = {
289866
289933
  type: "object",
289867
289934
  required: ["todos"],
@@ -289902,48 +289969,62 @@ var init_todo_write = __esm({
289902
289969
  async execute(args) {
289903
289970
  const start2 = performance.now();
289904
289971
  try {
289905
- const incomingRaw = args["todos"];
289906
- if (!Array.isArray(incomingRaw)) {
289972
+ const normalized = normalizeIncomingTodos(args);
289973
+ if (!normalized.todos) {
289907
289974
  return {
289908
289975
  success: false,
289909
289976
  output: "",
289910
- error: "todos must be an array",
289977
+ error: normalized.error,
289911
289978
  durationMs: performance.now() - start2
289912
289979
  };
289913
289980
  }
289914
289981
  const incoming = [];
289915
- for (const raw of incomingRaw) {
289982
+ const repairNotes = [...normalized.repairNotes];
289983
+ for (let index = 0; index < normalized.todos.length; index++) {
289984
+ const raw = normalized.todos[index];
289916
289985
  if (!raw || typeof raw !== "object") {
289986
+ if (typeof raw === "string" && raw.trim()) {
289987
+ incoming.push({
289988
+ content: raw.trim(),
289989
+ status: index === 0 ? "in_progress" : "pending"
289990
+ });
289991
+ repairNotes.push("coerced string todo item into {content,status}");
289992
+ continue;
289993
+ }
289917
289994
  return {
289918
289995
  success: false,
289919
289996
  output: "",
289920
- error: "each todo must be an object with content+status",
289997
+ error: 'each todo must be an object with content+status. Correct shape: {"todos":[{"content":"...","status":"in_progress"}]}',
289921
289998
  durationMs: performance.now() - start2
289922
289999
  };
289923
290000
  }
289924
290001
  const entry = raw;
289925
290002
  const content = entry["content"];
289926
290003
  const status = entry["status"];
289927
- if (typeof content !== "string" || typeof status !== "string") {
290004
+ if (typeof content !== "string") {
289928
290005
  return {
289929
290006
  success: false,
289930
290007
  output: "",
289931
- error: "todo must have string content and string status",
290008
+ error: 'todo must have string content. Correct shape: {"todos":[{"content":"...","status":"in_progress"}]}',
289932
290009
  durationMs: performance.now() - start2
289933
290010
  };
289934
290011
  }
289935
- if (!["pending", "in_progress", "completed", "blocked"].includes(status)) {
290012
+ const resolvedStatus = typeof status === "string" ? status : index === 0 ? "in_progress" : "pending";
290013
+ if (typeof status !== "string") {
290014
+ repairNotes.push("defaulted missing todo status to in_progress/pending");
290015
+ }
290016
+ if (!["pending", "in_progress", "completed", "blocked"].includes(resolvedStatus)) {
289936
290017
  return {
289937
290018
  success: false,
289938
290019
  output: "",
289939
- error: `invalid status: ${status}`,
290020
+ error: `invalid status: ${resolvedStatus}`,
289940
290021
  durationMs: performance.now() - start2
289941
290022
  };
289942
290023
  }
289943
290024
  incoming.push({
289944
290025
  id: typeof entry["id"] === "string" ? entry["id"] : void 0,
289945
290026
  content,
289946
- status,
290027
+ status: resolvedStatus,
289947
290028
  parentId: typeof entry["parentId"] === "string" ? entry["parentId"] : void 0,
289948
290029
  blocker: typeof entry["blocker"] === "string" ? entry["blocker"] : void 0,
289949
290030
  // REG-37: verification-aware planning
@@ -289986,6 +290067,16 @@ var init_todo_write = __esm({
289986
290067
  newTodos: result.newTodos,
289987
290068
  verificationNudgeNeeded
289988
290069
  };
290070
+ if (repairNotes.length > 0) {
290071
+ payload["inputRepair"] = Array.from(new Set(repairNotes));
290072
+ payload["canonicalShape"] = {
290073
+ todos: [
290074
+ { content: "Inspect files", status: "in_progress" },
290075
+ { content: "Make changes", status: "pending" },
290076
+ { content: "Verify results", status: "pending" }
290077
+ ]
290078
+ };
290079
+ }
289989
290080
  if (verificationNudgeNeeded) {
289990
290081
  payload["nudge"] = "You just closed 3+ todos without scheduling a verification step. Add a 'Verify the changes work' item and spawn a verification agent before declaring task_complete.";
289991
290082
  }
@@ -564459,10 +564550,12 @@ ${_staleSamples.join("\n")}` : ``,
564459
564550
  const turnTier = this.options.modelTier ?? "large";
564460
564551
  if (turn === 0 && !this.options.disableTodoPlanningNudges && (turnTier === "small" || turnTier === "medium")) {
564461
564552
  const goal = this._taskState.goal || "";
564462
- const wordCount2 = goal.split(/\s+/).length;
564463
- const hasMultipleActions = /\band\b.*\band\b|then.*then|also.*also/i.test(goal);
564464
- const hasMultipleFiles = /files?.*files?|\.ts.*\.ts|create.*write|modify.*create/i.test(goal);
564465
- const isComplex = wordCount2 > 40 || hasMultipleActions || hasMultipleFiles;
564553
+ const substantiveGoal = goal.replace(/\b(?:then\s+)?call\s+task_complete\b[^.?!;]*/gi, "").replace(/\b(?:observe|report|summarize|finish|complete)\b[^.?!;]*/gi, "");
564554
+ const wordCount2 = substantiveGoal.split(/\s+/).filter(Boolean).length;
564555
+ const hasMultipleActions = /\band\b.*\band\b|then.*then|also.*also/i.test(substantiveGoal);
564556
+ const hasMultipleFiles = /files?.*files?|\.ts.*\.ts|create.*write|modify.*create/i.test(substantiveGoal);
564557
+ const explicitSingleTool = /\b(exactly once|single tool|one tool|one tool call)\b/i.test(goal) || /\b(call|use)\s+(?:list_directory|file_read|grep_search|find_files|shell|web_search|web_fetch)\b/i.test(goal) && !/\b(edit|write|modify|create|fix|implement|patch|test|build|install|refactor)\b/i.test(substantiveGoal);
564558
+ const isComplex = !explicitSingleTool && (wordCount2 > 40 || hasMultipleActions || hasMultipleFiles);
564466
564559
  if (isComplex) {
564467
564560
  messages2.push({
564468
564561
  role: "user",
@@ -564471,6 +564564,7 @@ ${_staleSamples.join("\n")}` : ``,
564471
564564
  MANDATORY FIRST ACTION: Call todo_write NOW with the complete plan.
564472
564565
  Each todo item is { content: "what to do", status: "pending" | "in_progress" | "completed" | "blocked" }.
564473
564566
  Mark item 1 as in_progress, the rest as pending.
564567
+ Only count substantive work phases. Do NOT count observing a tool result, reporting findings, or calling task_complete as todo phases.
564474
564568
  Example: todo_write({todos: [{content: "read source files", status: "in_progress"}, {content: "make changes", status: "pending"}, {content: "run tests", status: "pending"}]})
564475
564569
 
564476
564570
  After EACH phase finishes, call todo_write AGAIN with item N marked completed and item N+1 marked in_progress.
@@ -564566,7 +564660,7 @@ ${top.map((t2) => `- ${t2.name}: ${t2.desc}`).join("\n")}`);
564566
564660
  const isReadTask = /\bread\b|\bshow\b|\btell me\b|\bwhat is\b/i.test(taskGoal);
564567
564661
  const hints = [];
564568
564662
  if (isSimpleTask) {
564569
- hints.push("This is a simple task — if it needs only ONE tool call, skip todo_write and call the tool directly. If it needs 2+ steps, use todo_write to plan.");
564663
+ hints.push("This is a simple task — if it needs only ONE substantive tool call, skip todo_write and call the tool directly, then task_complete. Do not count reporting, observing output, or task_complete as planning steps. If it needs 2+ substantive work steps, use todo_write to plan.");
564570
564664
  }
564571
564665
  if (isSearchTask) {
564572
564666
  hints.push("SEARCH STRATEGY: Use grep_search to find what you need FIRST, THEN file_read only the specific file and lines. Do NOT read entire files hoping to find something.");
@@ -691676,6 +691770,7 @@ function parseCliArgs(argv) {
691676
691770
  local: { type: "boolean", short: "l" },
691677
691771
  port: { type: "string" },
691678
691772
  suite: { type: "string" },
691773
+ live: { type: "boolean" },
691679
691774
  json: { type: "boolean", short: "j" },
691680
691775
  background: { type: "boolean" },
691681
691776
  help: { type: "boolean", short: "h" },
@@ -691735,6 +691830,7 @@ function parseCliArgs(argv) {
691735
691830
  break;
691736
691831
  case "eval":
691737
691832
  result.evalSuite = typeof values.suite === "string" ? values.suite : void 0;
691833
+ result.evalLive = values.live === true;
691738
691834
  break;
691739
691835
  default:
691740
691836
  break;
@@ -691773,6 +691869,7 @@ Flags:
691773
691869
  --max-retries <n> Max retries per model request
691774
691870
  --timeout-ms <ms> Overall task timeout
691775
691871
  --suite <name> Eval suite: basic (default) or full
691872
+ --live Run eval against configured backend instead of FakeBackend
691776
691873
  --port <n> Server port (serve command, vLLM only, default: 8000)
691777
691874
  -h, --help Show this help
691778
691875
  -V, --version Show version
@@ -691798,6 +691895,7 @@ Examples:
691798
691895
  omnius serve
691799
691896
  omnius serve --backend vllm --port 9000
691800
691897
  omnius eval --suite full --verbose
691898
+ omnius eval --suite basic --live --backend ollama --model qwen3.5:9b
691801
691899
  omnius config set model qwen3.5:122b
691802
691900
  `.trim();
691803
691901
  process.stdout.write(text + "\n");
@@ -691933,7 +692031,8 @@ async function main() {
691933
692031
  {
691934
692032
  suite: parsed.evalSuite,
691935
692033
  repoPath: parsed.repoPath,
691936
- verbose: parsed.verbose
692034
+ verbose: parsed.verbose,
692035
+ live: parsed.evalLive
691937
692036
  },
691938
692037
  config
691939
692038
  );
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.215",
3
+ "version": "1.0.216",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.215",
9
+ "version": "1.0.216",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.215",
3
+ "version": "1.0.216",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -51,7 +51,7 @@ If you anticipate a large result before calling a tool, prefer narrow flags firs
51
51
  - list_directory: List files in a directory with types and sizes
52
52
  - web_search: Search the web for documentation or solutions
53
53
  - web_fetch: Fetch a web page and extract text content (for docs, MDN, w3schools.com, etc.)
54
- - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique. **Worked example — emit todos in this exact shape:** `todo_write({"todos":[{"id":"p1","content":"Implement cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/cache.test"]},{"id":"p2","content":"Make build pass","status":"pending","verifyCommand":"<your build command>"}]})`. Substitute placeholder strings with commands native to YOUR stack.
54
+ - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ substantive work phases, your FIRST tool call must be todo_write declaring the entire plan as an array of items with status pending|in_progress|completed|blocked. After each phase completes, call todo_write again with item N marked completed and item N+1 marked in_progress. Do NOT count observing a tool result, reporting findings, or task_complete as phases. The user watches this checklist update live in the chat UI — it is your primary planning surface for long-horizon work and the user can see at a glance whether you are making progress or stuck. Use todo_write for any task naturally containing 3+ real work phases (build/test/ship, scrape/parse/store, plan/draft/edit, explore/refactor/verify, etc.). Do NOT use it for trivial single-step questions. Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria: `verifyCommand` (a shell command that PROVES the todo is complete — typecheck/test/build invocations etc.) and `declaredArtifacts` (a list of file paths this todo will produce). The orchestrator auto-checks both at completion-claim time; missing/unverified completions are rejected with a specific gap critique. **Worked example — emit todos in this exact shape:** `todo_write({"todos":[{"id":"p1","content":"Implement cache module","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts","tests/cache.test"]},{"id":"p2","content":"Make build pass","status":"pending","verifyCommand":"<your build command>"}]})`. Substitute placeholder strings with commands native to YOUR stack.
55
55
 
56
56
  ## Web Tool Selection
57
57
 
@@ -182,7 +182,7 @@ When you discover image files (png, jpg, gif, svg, webp, bmp) during codebase ex
182
182
 
183
183
  ## Workflow
184
184
 
185
- 0. **PLAN AT THE TOP** — for any task with 3+ logical phases, your VERY FIRST tool call must be `todo_write` with a complete checklist (each item: `{content, status}`). Mark item 1 as `in_progress`, the rest as `pending`. The user watches this checklist update live in the chat UI as you work, so they always know what step you're on. After each phase, call todo_write again to mark the finished item `completed` and the next one `in_progress`.
185
+ 0. **PLAN AT THE TOP** — for any task with 3+ substantive work phases, your VERY FIRST tool call must be `todo_write` with a complete checklist (each item: `{content, status}`). Mark item 1 as `in_progress`, the rest as `pending`. Do not count observing output, reporting findings, or task_complete as phases. The user watches this checklist update live in the chat UI as you work, so they always know what step you're on. After each phase, call todo_write again to mark the finished item `completed` and the next one `in_progress`.
186
186
  1. EXPLORE: Use find_files and grep_search to locate relevant code. Read specific files.
187
187
  2. PLAN: Determine what changes are needed based on the code you've read.
188
188
  3. IMPLEMENT: Make changes using file_edit (preferred) or file_write for new files.
@@ -11,7 +11,7 @@ You operate in two modes based on what the user needs:
11
11
  **TASK MODE** — coding tasks, file operations, technical directives:
12
12
  - Call tools iteratively until complete. NEVER write code blocks as text — only tool calls execute.
13
13
  - If you need to read a file, call file_read. If you need to run a command, call shell.
14
- - **MANDATORY: For ANY task that will take 3 or more tool calls, your VERY FIRST tool call MUST be `todo_write` declaring the complete plan.** Items have `{content, status}` where status is one of pending|in_progress|completed|blocked. Mark item 1 in_progress, the rest pending. Then re-call todo_write after each phase finishes to mark item N completed and N+1 in_progress. The user watches this checklist update live in the chat UI — without it they can't see your plan or track your progress.
14
+ - **MANDATORY: For ANY task that will take 3 or more substantive work tool calls, your VERY FIRST tool call MUST be `todo_write` declaring the complete plan.** Items have `{content, status}` where status is one of pending|in_progress|completed|blocked. Mark item 1 in_progress, the rest pending. Then re-call todo_write after each phase finishes to mark item N completed and N+1 in_progress. Do NOT count observing tool output, reporting findings, or task_complete as work phases. For one-tool tasks, call the tool directly and then task_complete. The user watches this checklist update live in the chat UI — without it they can't see your plan or track your progress.
15
15
 
16
16
  ## Instruction Hierarchy
17
17
 
@@ -41,7 +41,7 @@ Tool results over ~100KB are NOT truncated. The orchestrator saves the full payl
41
41
  - list_directory: List files in a directory
42
42
  - web_search: Search the web
43
43
  - web_fetch: Fetch a web page's text
44
- - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ logical steps, start by calling todo_write to declare your plan, then re-call todo_write as each step transitions (mark item N "completed" + N+1 "in_progress"). The user sees this list update live in the UI — it is your primary planning surface for long-horizon work. Use it whenever the task naturally has 3+ phases (build/refactor/test/ship, scrape/parse/store/report, plan/draft/edit/publish, etc.).
44
+ - todo_write / todo_read: Visible task checklist for the user. For ANY multi-step task with 3+ substantive work steps, start by calling todo_write to declare your plan, then re-call todo_write as each step transitions (mark item N "completed" + N+1 "in_progress"). The user sees this list update live in the UI — it is your primary planning surface for long-horizon work. Use it whenever the task naturally has 3+ real work phases (build/refactor/test/ship, scrape/parse/store/report, plan/draft/edit/publish, etc.). Skip it for a single tool action followed only by reporting and task_complete.
45
45
 
46
46
  Each todo accepts two OPTIONAL fields you should USE whenever the todo has objective completion criteria:
47
47
 
@@ -105,8 +105,8 @@ Launch ALL sub_agent calls in ONE response. This saves your context window for o
105
105
 
106
106
  ## Workflow
107
107
 
108
- For tasks requiring 3+ tool calls — plan before acting:
109
- 1. LIST all steps needed before your first tool call. **For 3+ step tasks, your FIRST tool call must be `todo_write` declaring the full plan with item 1 set to status:"in_progress" and the rest "pending".** Then call todo_write again as each step finishes to mark items "completed" and the next one "in_progress". The user watches this list update live in the chat UI.
108
+ For tasks requiring 3+ substantive work tool calls — plan before acting:
109
+ 1. LIST all real work steps needed before your first tool call. **For 3+ substantive-step tasks, your FIRST tool call must be `todo_write` declaring the full plan with item 1 set to status:"in_progress" and the rest "pending".** Do not count reporting, observing output, or task_complete as steps. Then call todo_write again as each step finishes to mark items "completed" and the next one "in_progress". The user watches this list update live in the chat UI.
110
110
  2. If task mentions 3+ independent modules/files: delegate each to a sub_agent (saves context)
111
111
  3. EXPLORE: Use find_files, grep_search, file_explore to understand the codebase
112
112
  - For large files (200+ lines): use file_explore(strategy='overview') then search/chunk — NEVER read entire file
@@ -34,7 +34,7 @@ File edits: Use file_write/file_edit/file_patch/batch_edit for project files, no
34
34
 
35
35
  Tool choice: Use file/search/code-graph tools for repository discovery, web_fetch/web_download/browser_action for web work, and repl_exec for multi-step data processing. Use shell when the command itself is the verifier or work product: tests, builds, package managers, git, system operations, and small native scripts. Do not hide diagnostics inside opaque shell blobs or `|| true`. Use background_run for long commands and poll with task_status/task_output.
36
36
 
37
- todo_write: visible task checklist for the user. For ANY task with 2+ steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip only for single-tool questions like "read this file" or "run this command". Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique. **Example shape:** `{"id":"p1","content":"Implement cache","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts"]}`. Substitute placeholders with commands native to YOUR stack.
37
+ todo_write: visible task checklist for the user. Use it for substantive multi-step work, not ceremony. For tasks with 2+ substantive work steps, call todo_write to declare your plan (each item: `{content, status}`, statuses: pending|in_progress|completed|blocked). Update status as you complete each step. Skip single-tool questions like "read this file", "list this directory", or "run this command", even if you will report findings and call task_complete afterward. Do NOT count observing a tool result, reporting findings, or task_complete as todo steps. Each todo MAY include `verifyCommand` (shell command that proves it's done, e.g. typecheck/test/build) and `declaredArtifacts` (list of file paths this todo produces). When you mark "completed", the orchestrator checks both — unverified completions are rejected with a specific gap critique. **Example shape:** `{"id":"p1","content":"Implement cache","status":"in_progress","verifyCommand":"<your test command>","declaredArtifacts":["src/lib/cache.ts"]}`. Substitute placeholders with commands native to YOUR stack.
38
38
 
39
39
  Web: web_search finds URLs, web_fetch reads them. For JS pages use web_crawl, for clicking/login use browser_action.
40
40
 
@@ -100,7 +100,7 @@ Creating new files — WRITE FIRST, refine later:
100
100
  - After writing: fill in each method, test after each one.
101
101
  - A bad first draft you can fix is better than no draft at all.
102
102
 
103
- Complex tasks (5+ steps) — DECOMPOSE before acting:
103
+ Complex tasks (5+ substantive work steps) — DECOMPOSE before acting:
104
104
  1. Call todo_write with the checklist. Mark item 1 "in_progress".
105
105
  2. Execute ONE STEP AT A TIME. After each, update todo_write status.
106
106
  3. After each file edit, VERIFY: file_read or shell test.