npm - polymath-agent - Versions diffs - 0.3.1 → 0.5.0 - Mend

polymath-agent 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/cli.js CHANGED Viewed

@@ -599,11 +599,14 @@ var TASK_SPECS = {
   command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
   review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
   reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
+  // The verify gate inspects files / runs tests — it MUST have tools.
+  verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
   explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
   summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
   chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
 };
 var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
+// Closed set of goal categories. extractPlan checks the planner's goalType
+// against this list and substitutes "other" when it is not a member.
+var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"];
 // src/planner/planner.ts
 var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
   summarize - condense long content
   chat      - a simple conversational reply
+Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
 Return ONLY minified JSON of the form:
-{"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
+{"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
 Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
+/**
+ * Guess a goal's category from keywords in its text.
+ * Rules are tried top to bottom and the first match wins, so "fix the test"
+ * is classified as "bugfix" rather than "test".
+ * @param {string} goal - the user's request text
+ * @returns {string} "bugfix", "refactor", "test", "docs", "chore", "feature" or "other"
+ */
+function classifyGoalType(goal) {
+  const text = goal.toLowerCase();
+  const rules = [
+    ["bugfix", /\b(fix|bug|broken|error|crash|regression|fails?)\b/],
+    // No trailing \b here: prefixes such as "simplif" also match "simplify" / "simplified".
+    ["refactor", /\b(refactor|rename|clean ?up|restructure|extract|simplif)/],
+    ["test", /\b(test|spec|coverage|unit test|e2e)\b/],
+    ["docs", /\b(docs?|readme|comment|documentation)\b/],
+    ["chore", /\b(bump|upgrade|dependency|deps|config|chore|lint|format)\b/],
+    ["feature", /\b(add|create|implement|build|feature|support|new)\b/]
+  ];
+  const hit = rules.find(([, pattern]) => pattern.test(text));
+  return hit ? hit[0] : "other";
+}
 function heuristicPlan(goal) {
   const steps = [
     { id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
     { id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
     { id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
   ];
-  return { goal, steps };
+  return {
+    goal,
+    steps,
+    goalType: classifyGoalType(goal),
+    criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
+  };
 }
 async function planRequest(goal, client2, planModel, onUsage) {
   const result = await client2.complete(
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
   onUsage?.(result);
   const parsed = extractPlan(result.content);
   if (!parsed) return heuristicPlan(goal);
-  return { goal, steps: parsed };
+  return { goal, ...parsed };
 }
 function extractPlan(text) {
   const json = extractJson(text);
@@ -663,7 +683,10 @@ function extractPlan(text) {
       estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
       estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
     }));
-    return steps.length ? steps : null;
+    if (!steps.length) return null;
+    const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
+    const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
+    return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
   } catch {
     return null;
   }
@@ -705,10 +728,29 @@ function extractJson(text) {
 }
 // src/router/policy.ts
+// Escalation rungs, indexed from 0. The first rung has no tierFloor; rungForTier
+// treats such a rung as the "cheap" tier. applyRung copies objective and tierFloor
+// onto a policy and, when liftCostCap is true, clears its per-call cost cap.
+// NOTE(review): maxTokens / maxIters are presumably per-attempt budgets read by
+// the agent loop — confirm against the consumer.
+var ESCALATION_LADDER = [
+  { objective: "value", maxTokens: 2e3, maxIters: 6, liftCostCap: false, label: "value \xB7 cheapest-capable" },
+  { tierFloor: "standard", objective: "value", maxTokens: 4e3, maxIters: 8, liftCostCap: true, label: "standard+ \xB7 more tokens" },
+  { tierFloor: "frontier", objective: "quality", maxTokens: 8e3, maxIters: 10, liftCostCap: true, label: "frontier \xB7 strongest" }
+];
+/**
+ * Find the index of the ladder rung that corresponds to a tier name.
+ * A rung without a tierFloor stands for the "cheap" tier.
+ * @param {string} tier - tier name, e.g. "cheap", "standard", "frontier"
+ * @returns {number} rung index, or -1 when no rung matches
+ */
+function rungForTier(tier) {
+  for (let i = 0; i < ESCALATION_LADDER.length; i++) {
+    const floor = ESCALATION_LADDER[i].tierFloor;
+    if (floor === tier) return i;
+    if (!floor && tier === "cheap") return i;
+  }
+  return -1;
+}
+/**
+ * Build a new routing policy from `base` with a ladder rung applied.
+ * The rung's objective and tierFloor replace the base values; when the rung
+ * lifts the cost cap, maxCostPerCallUsd becomes undefined. `base` is not mutated.
+ * @param {object} base - the policy to derive from
+ * @param {object} rung - an ESCALATION_LADDER entry
+ * @returns {object} the derived policy
+ */
+function applyRung(base, rung) {
+  const { objective, tierFloor, liftCostCap } = rung;
+  const maxCostPerCallUsd = liftCostCap ? void 0 : base.maxCostPerCallUsd;
+  return { ...base, objective, tierFloor, maxCostPerCallUsd };
+}
 var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 };
 function tierAtLeast(tier, min) {
   return TIER_RANK[tier] >= TIER_RANK[min];
 }
+// Numeric rank of a tier name as stored in TIER_RANK (cheap 0, standard 1,
+// frontier 2). Yields undefined for a name that is not a TIER_RANK key.
+function tierRank(tier) {
+  return TIER_RANK[tier];
+}
 function blendedPrice(m) {
   return (m.pricing.promptUsdPerMTok * 3 + m.pricing.completionUsdPerMTok) / 4;
 }
@@ -755,6 +797,7 @@ var TASK_SKILL = {
   command: "speed",
   review: "reasoning",
   reason: "reasoning",
+  verify: "reasoning",
   explain: "general",
   summarize: "speed",
   chat: "speed"
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
   edit: 1.4,
   review: 1.5,
   reason: 1.5,
+  verify: 1.4,
   plan: 1.2
 };
 var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
 function candidatesFor(taskType, models, policy, est) {
   const spec = TASK_SPECS[taskType];
   const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
+  const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
   return models.filter((m) => {
     if (m.id === "openrouter/auto") return false;
-    const covers = tierAtLeast(m.tier, spec.minTier) || taskStrength(m, taskType) >= strengthFloor;
+    const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
     if (!covers) return false;
     if (spec.needsTools && !m.capabilities.tools) return false;
     if (policy.maxCostPerCallUsd != null && est) {
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
   const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
   return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
 }
+/**
+ * Route a task like `route`, but never give up while any usable model exists.
+ * When `route` finds nothing, fall back to the strongest model for the task,
+ * preferring tool-capable models even when the task does not require tools.
+ * @param {string} taskType - key of TASK_SPECS
+ * @param {Array<object>} models - candidate models
+ * @param {object} policy - routing policy passed through to `route`
+ * @param {{promptTokens: number, completionTokens: number}} [est] - token estimate used for cost projection
+ * @returns {object|null} `{ model, reason, estCostUsd }`, or null when no model is usable
+ */
+function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
+  const routed = route(taskType, models, policy, est);
+  if (routed) return routed;
+  const { needsTools } = TASK_SPECS[taskType];
+  const usable = models.filter((m) => {
+    if (m.id === "openrouter/auto") return false;
+    return !needsTools || m.capabilities.tools;
+  });
+  if (usable.length === 0) return null;
+  const strongestFirst = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
+  const toolCapable = usable.filter((m) => m.capabilities.tools);
+  const pool = toolCapable.length > 0 ? toolCapable : [...usable];
+  pool.sort(strongestFirst);
+  const best = pool[0];
+  return {
+    model: best,
+    reason: `best available for ${TASK_SKILL[taskType]} (fallback)`,
+    estCostUsd: projectCost(best, est)
+  };
+}
 // src/recommend/recommend.ts
 var OBJECTIVES = [
@@ -1075,6 +1133,27 @@ function getDb() {
     );
     CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
+    -- One row per verify-and-escalate attempt within a session. Powers the
+    -- "optimal starting model per goal type" statistical learning.
+    CREATE TABLE IF NOT EXISTS attempts (
+      id INTEGER PRIMARY KEY AUTOINCREMENT,
+      session_id TEXT NOT NULL,
+      attempt_no INTEGER NOT NULL,
+      goal_type TEXT NOT NULL,
+      tier_floor TEXT,
+      objective TEXT NOT NULL,
+      prompt_tokens INTEGER NOT NULL,
+      completion_tokens INTEGER NOT NULL,
+      cost_usd REAL NOT NULL,
+      criteria_total INTEGER NOT NULL,
+      criteria_met INTEGER NOT NULL,
+      passed INTEGER NOT NULL,
+      duration_ms INTEGER NOT NULL,
+      synced INTEGER NOT NULL DEFAULT 0
+    );
+    CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
+    CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
     -- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
     -- This is what syncs to the cloud by default (raw logs stay local).
     CREATE TABLE IF NOT EXISTS insights (
@@ -1096,6 +1175,15 @@ function getDb() {
   if (!cols.some((c2) => c2.name === "command")) {
     db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
   }
+  const conn = db;
+  const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
+  const addSession = (name, decl) => {
+    if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
+  };
+  addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
+  addSession("start_tier", "TEXT");
+  addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
+  addSession("final_passed", "INTEGER");
   return db;
 }
 function recordUsage(e) {
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
 }
 function startSession(s) {
   getDb().prepare(
-    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps)
-       VALUES (?, ?, ?, ?, ?, ?, ?)`
-  ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps);
+    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
+  ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps, s.goalType, s.startTier ?? null);
 }
 function finishSession(id, u) {
   getDb().prepare(
     `UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
-         prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
+         prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
   ).run(
     u.plannedSteps,
     u.completedSteps,
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
     u.completionTokens,
     u.costUsd,
     u.durationMs,
+    u.attempts ?? 1,
+    u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
     id
   );
 }
+/**
+ * Insert one row into the `attempts` table for a single verify-and-escalate attempt.
+ * The boolean `a.passed` is stored as 1 or 0.
+ * @param {object} a - attempt fields: sessionId, attemptNo, goalType, tierFloor, objective,
+ *   promptTokens, completionTokens, costUsd, criteriaTotal, criteriaMet, passed, durationMs
+ */
+function recordAttempt(a) {
+  const insert = getDb().prepare(
+    `INSERT INTO attempts
+        (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
+         cost_usd, criteria_total, criteria_met, passed, duration_ms)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
+  );
+  const passedFlag = a.passed ? 1 : 0;
+  // Values are bound in the same order as the column list above.
+  const values = [
+    a.sessionId,
+    a.attemptNo,
+    a.goalType,
+    a.tierFloor,
+    a.objective,
+    a.promptTokens,
+    a.completionTokens,
+    a.costUsd,
+    a.criteriaTotal,
+    a.criteriaMet,
+    passedFlag,
+    a.durationMs
+  ];
+  insert.run(...values);
+}
+/**
+ * Aggregate verified sessions by goal type and starting tier.
+ * Only sessions whose final_passed is set are counted; a NULL start_tier is
+ * reported as "cheap". Rows come back ordered by goal type, then by ascending
+ * average total tokens.
+ * @returns {Array<{goalType: string, startTier: string, sessions: number,
+ *   passRate: number, avgTotalTokens: number, avgAttempts: number}>}
+ */
+function goalTierStats() {
+  const query = getDb().prepare(
+    `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
+              COUNT(*) AS sessions,
+              AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
+              AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
+              AVG(attempts) AS avgAttempts
+       FROM sessions
+       WHERE final_passed IS NOT NULL
+       GROUP BY goal_type, startTier
+       ORDER BY goal_type, avgTotalTokens ASC`
+  );
+  const stats = [];
+  for (const row of query.all()) {
+    stats.push({
+      goalType: String(row.goalType),
+      startTier: String(row.startTier),
+      sessions: Number(row.sessions),
+      // NULL aggregates are coerced to 0.
+      passRate: Number(row.passRate ?? 0),
+      avgTotalTokens: Number(row.avgTotalTokens ?? 0),
+      avgAttempts: Number(row.avgAttempts ?? 0)
+    });
+  }
+  return stats;
+}
+/**
+ * Pick the learned starting tier for a goal type.
+ * Among tiers with at least `minSessions` verified sessions and a pass rate of
+ * 60% or more, return the one with the lowest average total tokens.
+ * @param {string} goalType - goal category to look up
+ * @param {number} [minSessions=3] - minimum sessions required per tier
+ * @returns {string|null} tier name, or null when no tier has enough evidence
+ */
+function optimalStartTier(goalType, minSessions = 3) {
+  const eligible = [];
+  for (const stat of goalTierStats()) {
+    const sameGoal = stat.goalType === goalType;
+    if (sameGoal && stat.sessions >= minSessions && stat.passRate >= 0.6) eligible.push(stat);
+  }
+  if (eligible.length === 0) return null;
+  eligible.sort((a, b) => a.avgTotalTokens - b.avgTotalTokens);
+  return eligible[0].startTier;
+}
 function setUserScore(sessionId, score) {
   getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`).run(score, sessionId);
 }
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
     }
     out.push("");
   }
+  const tierStats = goalTierStats();
+  if (tierStats.length) {
+    out.push(c.bold("Optimal starting model per goal type") + c.dim("  (pass rate vs total tokens to reach the goal)"));
+    out.push(
+      table(
+        ["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
+        tierStats.map((s) => [
+          s.goalType,
+          tierColor(s.startTier),
+          String(s.sessions),
+          `${Math.round(s.passRate * 100)}%`,
+          tokens(Math.round(s.avgTotalTokens)),
+          s.avgAttempts.toFixed(1)
+        ])
+      )
+    );
+    const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
+    const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
+    if (learned.length) {
+      out.push(
+        c.green(
+          "\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
+        )
+      );
+    } else {
+      out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
+    }
+    out.push("");
+  }
   if (byCommand.length) {
     out.push(c.bold("Usage by command"));
     out.push(
@@ -2012,6 +2180,274 @@ function logCompletion(result, taskType, sessionId, command = "run") {
   return entry;
 }
+// src/setup/commands.ts
+import { execSync as execSync2 } from "node:child_process";
+// src/util/prompt.ts
+import readline2 from "node:readline";
+// True only when both stdin and stdout are TTYs; confirm() and select() fall
+// back to their defaults without prompting when this is false.
+function interactive() {
+  return process.stdin.isTTY === true && process.stdout.isTTY === true;
+}
+/**
+ * Prompt on stdout and read one line from stdin.
+ * @param {string} question - prompt text
+ * @returns {Promise<string>} the answer with surrounding whitespace trimmed
+ */
+function ask2(question) {
+  return new Promise((resolve2) => {
+    const prompt = readline2.createInterface({ input: process.stdin, output: process.stdout });
+    const onAnswer = (answer) => {
+      // Close the interface before resolving.
+      prompt.close();
+      resolve2(answer.trim());
+    };
+    prompt.question(question, onAnswer);
+  });
+}
+/**
+ * Ask a yes/no question.
+ * Returns `def` without prompting when the session is not interactive, and
+ * also when the user just presses Enter. Any answer starting with "y"
+ * (case-insensitive) counts as yes; everything else is no.
+ * @param {string} question - prompt text
+ * @param {boolean} [def=true] - default answer
+ * @returns {Promise<boolean>}
+ */
+async function confirm(question, def = true) {
+  if (!interactive()) return def;
+  const hint = def ? "[Y/n]" : "[y/N]";
+  const raw = await ask2(`${question} ${hint} `);
+  const answer = raw.toLowerCase();
+  return answer === "" ? def : answer.startsWith("y");
+}
+/**
+ * Let the user pick one item from a numbered list.
+ * Returns the first item without prompting when the session is not
+ * interactive or there is at most one item; an invalid or empty answer also
+ * selects the first item.
+ * @param {string} question - heading printed above the list
+ * @param {Array} items - choices
+ * @param {function} render2 - maps an item to its display text
+ * @returns {Promise<*>} the chosen item
+ */
+async function select(question, items, render2) {
+  if (!interactive() || items.length <= 1) return items[0];
+  console.log(question);
+  items.forEach((it, i) => {
+    console.log(`  ${i + 1}) ${render2(it)}`);
+  });
+  const answer = await ask2(`Choose [1-${items.length}] (default 1): `);
+  const choice = parseInt(answer, 10);
+  const inRange = Number.isInteger(choice) && choice >= 1 && choice <= items.length;
+  return inRange ? items[choice - 1] : items[0];
+}
+// src/setup/localllm.ts
+import { execSync, spawn } from "node:child_process";
+import os2 from "node:os";
+/**
+ * Suggest local models sized for this machine's total RAM.
+ * The 3B model is always offered; the 7B coder needs at least 13 GB and the
+ * 14B coder at least 30 GB (rounded). Order is 7B, 3B, 14B.
+ * @returns {Array<{id: string, label: string, sizeGb: number, note: string}>}
+ */
+function suggestModels() {
+  const ramGb = Math.round(os2.totalmem() / 1024 ** 3);
+  const catalog = [
+    { minRamGb: 13, model: { id: "qwen2.5-coder:7b", label: "Qwen2.5 Coder 7B", sizeGb: 4.7, note: "best coding pick for ~16GB" } },
+    { minRamGb: 0, model: { id: "llama3.2:3b", label: "Llama 3.2 3B", sizeGb: 2, note: "fast, light; great for cheap tasks" } },
+    { minRamGb: 30, model: { id: "qwen2.5-coder:14b", label: "Qwen2.5 Coder 14B", sizeGb: 9, note: "stronger coding for 32GB+" } }
+  ];
+  return catalog.filter((entry) => ramGb >= entry.minRamGb).map((entry) => entry.model);
+}
+// Total system memory in GiB (bytes / 1024^3), rounded to the nearest integer.
+function totalRamGb() {
+  return Math.round(os2.totalmem() / 1024 ** 3);
+}
+/**
+ * Report whether a command can be found by the shell.
+ * Uses `where` on Windows and `command -v` elsewhere; a non-zero exit or any
+ * other failure of the lookup counts as "not found".
+ * @param {string} cmd - command name; interpolated into a shell string, so pass trusted names only
+ * @returns {boolean}
+ */
+function which(cmd) {
+  const lookup = process.platform === "win32" ? "where" : "command -v";
+  try {
+    execSync(`${lookup} ${cmd}`, { stdio: "ignore" });
+  } catch {
+    return false;
+  }
+  return true;
+}
+// True when which() can find an `ollama` executable.
+function ollamaInstalled() {
+  return which("ollama");
+}
+// Trimmed output of `ollama --version`, or null when the command cannot be
+// run or exits with an error.
+function ollamaVersion() {
+  try {
+    return execSync("ollama --version", { encoding: "utf8" }).trim();
+  } catch {
+    return null;
+  }
+}
+/**
+ * Probe whether an Ollama server answers at `baseUrl`.
+ * A trailing `/v1` (OpenAI-compatible base URL) is stripped before calling
+ * the native `/api/version` endpoint.
+ * @param {string} [baseUrl="http://localhost:11434"] - server base URL
+ * @param {number} [timeoutMs=3000] - abort the probe after this many milliseconds
+ * @returns {Promise<boolean>} true on an OK response; false on any error or timeout
+ */
+async function ollamaServerUp(baseUrl = "http://localhost:11434", timeoutMs = 3e3) {
+  try {
+    // Without a timeout, a port that accepts the connection but never replies
+    // would stall setup and every poll in ensureServer.
+    const res = await fetch(`${baseUrl.replace(/\/v1\/?$/, "")}/api/version`, {
+      signal: AbortSignal.timeout(timeoutMs)
+    });
+    return res.ok;
+  } catch {
+    return false;
+  }
+}
+/**
+ * List the model names the Ollama server reports via `/api/tags`.
+ * A trailing `/v1` on `baseUrl` is stripped first.
+ * @param {string} [baseUrl="http://localhost:11434"] - server base URL
+ * @param {number} [timeoutMs=5000] - abort the request after this many milliseconds
+ * @returns {Promise<string[]>} model names; empty on a non-OK response, error or timeout
+ */
+async function installedModels(baseUrl = "http://localhost:11434", timeoutMs = 5e3) {
+  try {
+    // Bounded wait so an unresponsive server cannot hang setup or update.
+    const res = await fetch(`${baseUrl.replace(/\/v1\/?$/, "")}/api/tags`, {
+      signal: AbortSignal.timeout(timeoutMs)
+    });
+    if (!res.ok) return [];
+    const json = await res.json();
+    // Keep only string names so callers can safely compare against model ids.
+    return (json.models ?? []).map((m) => m.name).filter((name) => typeof name === "string");
+  } catch {
+    return [];
+  }
+}
+/**
+ * Run a command with inherited stdio and report whether it succeeded.
+ * @param {string} cmd - executable to spawn
+ * @param {string[]} args - arguments
+ * @returns {Promise<boolean>} true when the process exits with code 0; false
+ *   on a non-zero exit or when the process cannot be spawned
+ */
+function run(cmd, args) {
+  return new Promise((resolve2) => {
+    const proc = spawn(cmd, args, { stdio: "inherit" });
+    const onError = () => resolve2(false);
+    const onExit = (code) => resolve2(code === 0);
+    proc.on("error", onError);
+    proc.on("exit", onExit);
+  });
+}
+/**
+ * Decide how Ollama can be installed on the current platform.
+ * macOS uses Homebrew and Windows uses winget when those tools are found;
+ * Linux always uses the official install script. Otherwise only manual
+ * instructions are returned.
+ * @returns {{canAuto: boolean, command?: {cmd: string, args: string[]}, manual: string}}
+ */
+function ollamaInstallPlan() {
+  switch (process.platform) {
+    case "darwin":
+      return which("brew")
+        ? { canAuto: true, command: { cmd: "brew", args: ["install", "ollama"] }, manual: "brew install ollama" }
+        : { canAuto: false, manual: "Install Homebrew (https://brew.sh) then `brew install ollama`, or download https://ollama.com/download" };
+    case "linux":
+      return { canAuto: true, command: { cmd: "sh", args: ["-c", "curl -fsSL https://ollama.com/install.sh | sh"] }, manual: "curl -fsSL https://ollama.com/install.sh | sh" };
+    case "win32":
+      return which("winget")
+        ? { canAuto: true, command: { cmd: "winget", args: ["install", "-e", "--id", "Ollama.Ollama"] }, manual: "winget install Ollama.Ollama" }
+        : { canAuto: false, manual: "Download the installer from https://ollama.com/download" };
+    default:
+      return { canAuto: false, manual: "See https://ollama.com/download" };
+  }
+}
+/**
+ * Make sure an Ollama server is reachable, starting one if necessary.
+ * On macOS with Homebrew the brew service is started; elsewhere a detached
+ * `ollama serve` is spawned. The server is then polled up to 10 times,
+ * 500 ms apart.
+ * @param {string} [baseUrl="http://localhost:11434"] - server base URL
+ * @returns {Promise<boolean>} true once the server answers, false if it never does
+ */
+async function ensureServer(baseUrl = "http://localhost:11434") {
+  if (await ollamaServerUp(baseUrl)) return true;
+  if (process.platform === "darwin" && which("brew")) {
+    await run("brew", ["services", "start", "ollama"]);
+  } else {
+    try {
+      const child = spawn("ollama", ["serve"], { stdio: "ignore", detached: true });
+      // Spawn failures (ENOENT, EACCES, ...) are emitted asynchronously as an
+      // "error" event, which the surrounding try/catch cannot see. Without a
+      // listener the event is thrown as an uncaught exception and kills the
+      // CLI. Swallow it here; the polling below reports the failure.
+      child.on("error", () => {});
+      child.unref();
+    } catch {
+      // Best effort: a synchronous spawn failure is also surfaced by the polling below.
+    }
+  }
+  for (let i = 0; i < 10; i++) {
+    if (await ollamaServerUp(baseUrl)) return true;
+    await delay(500);
+  }
+  return false;
+}
+// Promise that resolves (with no value) after `ms` milliseconds.
+function delay(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+// src/setup/commands.ts
+/**
+ * Interactive first-run setup: optionally install a local LLM, then offer to
+ * connect an OpenRouter API key if none is configured.
+ * @param {object} opts - CLI options; reads `local` (boolean or undefined) and
+ *   `yes` here, and is passed on to setupLocal
+ * @returns {Promise<void>}
+ */
+async function runSetup(opts) {
+  console.log(c.bold("\n\u{1F527} Polymath setup\n"));
+  const config = loadConfig();
+  // Ask only when --local / --no-local was not given explicitly.
+  const wantLocal = opts.local === void 0 ? await confirm(
+    `Install a local LLM (Ollama) for $0, offline, no-API-key runs? (RAM detected: ${totalRamGb()}GB)`,
+    true
+  ) : opts.local;
+  if (!wantLocal) {
+    config.local.enabled = false;
+    saveConfig(config);
+    console.log(c.dim("Skipping local LLM. (You can run `poly setup --local` later.)"));
+  } else {
+    await setupLocal(opts, config);
+  }
+  // Re-read the config, since setupLocal may have saved changes.
+  if (!resolveApiKey(loadConfig())) {
+    let wantKey = false;
+    // With --yes the key prompt is skipped; otherwise it defaults to yes only when local was declined.
+    if (!opts.yes) {
+      wantKey = await confirm("Connect an OpenRouter API key for cloud models (300+ models)?", !wantLocal);
+    }
+    if (wantKey) {
+      await runLogin();
+    } else if (!wantLocal) {
+      console.log(c.yellow("No models configured yet \u2014 run `poly login` or `poly setup --local`."));
+    }
+  }
+  console.log(c.green("\n\u2713 Setup complete.") + c.dim('  Try: poly recommend "add a dark-mode toggle"  \xB7  poly run -w "..."'));
+}
+/**
+ * Install Ollama if needed, start its server, pick and download a model, and
+ * enable the local provider in the saved config.
+ * Returns early (leaving the config untouched) when Ollama is still missing
+ * after the install step or when the model pull fails.
+ * @param {object} opts - CLI options; reads `yes` (skip prompts) and `model` (explicit model id)
+ * @param {object} config - loaded config; reads `local.baseUrl`, sets `local.enabled`
+ * @returns {Promise<void>}
+ */
+async function setupLocal(opts, config) {
+  if (!ollamaInstalled()) {
+    const plan = ollamaInstallPlan();
+    console.log(c.cyan("Local LLM runtime: Ollama is not installed."));
+    if (plan.canAuto && plan.command) {
+      // --yes skips the confirmation prompt.
+      const go = opts.yes || await confirm(`Install Ollama via \`${plan.command.cmd} ${plan.command.args.join(" ")}\`?`, true);
+      if (go) {
+        const ok = await run(plan.command.cmd, plan.command.args);
+        if (!ok) console.log(c.yellow("Auto-install failed. Manual: " + plan.manual));
+      } else {
+        console.log(c.dim("Manual install: " + plan.manual));
+      }
+    } else {
+      console.log(c.yellow("Install manually: " + plan.manual));
+    }
+  } else {
+    console.log(c.green("\u2713 Ollama present ") + c.dim(ollamaVersion() ?? ""));
+  }
+  // Re-check: the install may have been declined, failed, or not updated PATH.
+  if (!ollamaInstalled()) {
+    console.log(c.yellow("Ollama still not on PATH \u2014 re-run `poly setup --local` after installing."));
+    return;
+  }
+  process.stdout.write("Starting Ollama server\u2026 ");
+  const up = await ensureServer(config.local.baseUrl);
+  console.log(up ? c.green("ok") : c.yellow("could not confirm (start it with `ollama serve`)"));
+  // installedModels returns [] when the server cannot be reached.
+  const have = await installedModels(config.local.baseUrl);
+  let modelId = opts.model;
+  if (!modelId) {
+    const suggestions = suggestModels().filter((s) => !have.includes(s.id));
+    if (have.length && !suggestions.length) {
+      // Every suggested model is already installed: reuse the first installed one.
+      modelId = have[0];
+      console.log(c.dim(`Using already-installed model ${modelId}.`));
+    } else {
+      // With --yes take the first suggestion; otherwise let the user choose from the full list.
+      const pick = opts.yes ? suggestModels()[0] : await select(
+        "Pick a model to download:",
+        suggestModels(),
+        (s) => `${s.label}  (~${s.sizeGb}GB) \u2014 ${s.note}${have.includes(s.id) ? " [installed]" : ""}`
+      );
+      modelId = pick.id;
+    }
+  }
+  if (!have.includes(modelId)) {
+    console.log(c.cyan(`Downloading ${modelId}\u2026`));
+    const ok = await run("ollama", ["pull", modelId]);
+    if (!ok) {
+      console.log(c.yellow(`Could not pull ${modelId}. Run \`ollama pull ${modelId}\` manually.`));
+      return;
+    }
+  }
+  config.local.enabled = true;
+  saveConfig(config);
+  console.log(c.green(`\u2713 Local LLM ready: ${modelId} \u2192 local/${modelId} ($0). `) + c.dim("Enabled in config."));
+}
+/**
+ * Compare two dotted version strings on their first three numeric components.
+ * A leading "v" is ignored, and missing or non-numeric components count as 0.
+ * @param {string} a - first version
+ * @param {string} b - second version
+ * @returns {number} negative when a < b, positive when a > b, 0 when equal
+ */
+function cmp(a, b) {
+  const parts = (version) => version.replace(/^v/, "").split(".").map((piece) => parseInt(piece, 10) || 0);
+  const left = parts(a);
+  const right = parts(b);
+  for (const i of [0, 1, 2]) {
+    const delta = (left[i] ?? 0) - (right[i] ?? 0);
+    if (delta !== 0) return delta;
+  }
+  return 0;
+}
+/**
+ * Update (or, with `opts.check`, only report on) the CLI itself, the Ollama
+ * runtime, and the installed local models.
+ * When none of `opts.self`, `opts.ollama`, `opts.models` is set, all three
+ * sections run.
+ * @param {string} currentVersion - version of the running CLI
+ * @param {object} opts - flags: self, ollama, models, check
+ * @returns {Promise<void>}
+ */
+async function runUpdate(currentVersion, opts) {
+  const all = !opts.self && !opts.ollama && !opts.models;
+  console.log(c.bold("\n\u2B06\uFE0F  Polymath update") + (opts.check ? c.dim("  (check only)") : "") + "\n");
+  if (all || opts.self) {
+    let latest = "";
+    try {
+      // stderr is discarded so npm noise does not reach the terminal.
+      latest = execSync2("npm view polymath-agent version", { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] }).trim();
+    } catch {
+      latest = "";
+    }
+    if (!latest) {
+      console.log(c.dim("CLI: could not reach npm registry."));
+    } else if (cmp(latest, currentVersion) > 0) {
+      console.log(c.yellow(`CLI: ${currentVersion} \u2192 ${latest} available.`));
+      if (!opts.check) {
+        const ok = await run("npm", ["install", "-g", `polymath-agent@${latest}`]);
+        console.log(ok ? c.green(`\u2713 Updated to ${latest}.`) : c.red("npm update failed (try: sudo npm i -g polymath-agent@latest)."));
+      } else {
+        console.log(c.dim("  Run `poly update` to install."));
+      }
+    } else {
+      console.log(c.green(`\u2713 CLI is up to date (${currentVersion}).`));
+    }
+  }
+  if (all || opts.ollama) {
+    if (!ollamaInstalled()) {
+      console.log(c.dim("Ollama: not installed (run `poly setup --local`)."));
+    } else if (opts.check) {
+      console.log(c.dim(`Ollama: ${ollamaVersion() ?? "present"} (update with \`poly update --ollama\`).`));
+    } else if (process.platform === "darwin") {
+      console.log(c.cyan("Updating Ollama\u2026"));
+      // run() resolves false when brew is missing or the upgrade fails.
+      await run("brew", ["upgrade", "ollama"]).then((ok) => !ok && console.log(c.dim("  (brew upgrade skipped/failed)")));
+    } else if (process.platform === "linux") {
+      // Re-running the install script; its result is not checked here.
+      await run("sh", ["-c", "curl -fsSL https://ollama.com/install.sh | sh"]);
+    } else {
+      console.log(c.dim("Ollama: update via your installer (winget upgrade Ollama.Ollama)."));
+    }
+  }
+  if (all || opts.models) {
+    const config = loadConfig();
+    const models = await installedModels(config.local.baseUrl);
+    if (!models.length) {
+      console.log(c.dim("Models: none installed."));
+    } else if (opts.check) {
+      console.log(c.dim(`Models: ${models.join(", ")} (re-pull to update).`));
+    } else {
+      // Pull sequentially; each pull inherits stdio.
+      for (const m of models) {
+        console.log(c.cyan(`Updating ${m}\u2026`));
+        await run("ollama", ["pull", m]);
+      }
+    }
+  }
+  console.log("");
+}
 // src/tui/App.tsx
 import { useState, useEffect, useCallback } from "react";
 import { Box, Text, useApp, useInput } from "ink";
@@ -2021,7 +2457,7 @@ import Spinner from "ink-spinner";
 // src/agent/tools.ts
 import fs4 from "node:fs";
 import path2 from "node:path";
-import { execSync } from "node:child_process";
+import { execSync as execSync3 } from "node:child_process";
 var TOOL_SCHEMAS = [
   {
     type: "function",
@@ -2086,6 +2522,28 @@ var TOOL_SCHEMAS = [
     }
   }
 ];
+// Names of every tool declared in TOOL_SCHEMAS; parseTextToolCall rejects any other name.
+var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name));
+// Tool subset offered to the verify stage. It includes run_command, which
+// executeTool still refuses unless ctx.allowCommands is set.
+// NOTE(review): run_command can execute arbitrary shell commands, so this set
+// is read-only by convention, not by enforcement.
+var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter(
+  (t) => ["read_file", "list_dir", "run_command"].includes(t.function.name)
+);
+/**
+ * Recover a tool call that a model wrote as JSON text instead of using
+ * native tool calling.
+ * The tool name is read from `name`, `tool` or `function.name`, and the
+ * arguments from `arguments`, `parameters` or `function.arguments`.
+ * @param {string} content - assistant message text
+ * @returns {object|null} a tool-call object with `id`, `type: "function"` and
+ *   `function: { name, arguments }` (arguments as a JSON string), or null when
+ *   the text holds no JSON naming a known tool
+ */
+function parseTextToolCall(content) {
+  const json = content ? extractJson(content) : null;
+  if (!json) return null;
+  try {
+    const obj = JSON.parse(json);
+    const name = obj?.name ?? obj?.tool ?? obj?.function?.name;
+    const known = typeof name === "string" && KNOWN_TOOLS.has(name);
+    if (!known) return null;
+    const rawArgs = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {};
+    // Arguments are always handed on as a JSON string.
+    const serialized = typeof rawArgs === "string" ? rawArgs : JSON.stringify(rawArgs);
+    return {
+      id: `textcall_${name}`,
+      type: "function",
+      function: { name, arguments: serialized }
+    };
+  } catch {
+    return null;
+  }
+}
 var MAX_OUTPUT = 8e3;
 function clip(s) {
   return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
@@ -2142,7 +2600,7 @@ function executeTool(name, argsJson, ctx) {
       }
       case "run_command": {
         if (!ctx.allowCommands) return { result: "Denied: run_command is disabled." };
-        const out = execSync(String(args.command), {
+        const out = execSync3(String(args.command), {
           cwd: ctx.cwd,
           encoding: "utf8",
           env: scrubbedEnv(),
@@ -2167,50 +2625,124 @@ ${stderr}`)) };
   }
 }
-// src/agent/loop.ts
-var MAX_ITERS_PER_STEP = 6;
-var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name));
-function parseTextToolCall(content) {
-  if (!content) return null;
-  const json = extractJson(content);
+// src/agent/verify.ts
+// Upper bound on model round-trips in one verifyGoal call.
+var VERIFY_MAX_ITERS = 8;
+// System prompt for the verify stage. The JSON shape it requests
+// ({"results":[...],"feedback":"..."}) is the shape parseVerdict reads.
+var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
+Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
+When done, reply with ONLY this JSON (no prose, no code fence):
+{"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`;
+/**
+ * Ask a model to check the workspace against the acceptance criteria.
+ * Runs up to VERIFY_MAX_ITERS round-trips: each turn either yields a parsable
+ * verdict (returned at once), requests tool calls (executed with writes
+ * disabled and their output fed back), or gets a nudge to reply with the
+ * verdict JSON. If no verdict is produced, fallbackVerdict(criteria) is returned.
+ * @param {string} goal - the user's goal
+ * @param {string[]} criteria - acceptance criteria to judge
+ * @param {object} deps - reads `client`, `model`, `cwd`, `allowCommands`
+ * @param {object} [ev] - optional callbacks: onUsage, onToolCall, onToolResult
+ * @returns {Promise<object>} the verdict
+ */
+async function verifyGoal(goal, criteria, deps, ev = {}) {
+  // Writes are always disabled for verification; commands follow the caller's setting.
+  const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
+  const useTools = deps.model.capabilities.tools;
+  const messages = [
+    { role: "system", content: VERIFY_SYSTEM },
+    {
+      role: "user",
+      content: `Goal: ${goal}
+Acceptance criteria:
+` + criteria.map((c2, i) => `${i + 1}. ${c2}`).join("\n") + `
+Inspect the workspace, then return the verdict JSON.`
+    }
+  ];
+  let verdict = null;
+  for (let iter = 0; iter < VERIFY_MAX_ITERS; iter++) {
+    const gen = deps.client.stream(
+      { model: deps.model.id, messages, tools: useTools ? READONLY_TOOL_SCHEMAS : void 0, temperature: 0, maxTokens: 1500 },
+      deps.model.pricing
+    );
+    // Drain the stream; only the generator's final return value is used.
+    let next = await gen.next();
+    while (!next.done) next = await gen.next();
+    const result = next.value;
+    ev.onUsage?.(result);
+    // Prefer native tool calls; otherwise try to recover a single call written as JSON text.
+    const calls = result.toolCalls.length ? result.toolCalls : useTools && parseTextToolCall(result.content) ? [parseTextToolCall(result.content)] : [];
+    // A parsable verdict ends the loop even if tool calls came with it.
+    const parsed = parseVerdict(result.content, criteria);
+    if (parsed) {
+      verdict = parsed;
+      break;
+    }
+    if (calls.length) {
+      // Native calls: record the assistant turn once, then one "tool" message per call.
+      if (result.toolCalls.length) messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
+      for (const tc of calls) {
+        ev.onToolCall?.(tc.function.name, tc.function.arguments);
+        const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
+        ev.onToolResult?.(tc.function.name, outcome.result);
+        if (result.toolCalls.length) {
+          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
+        } else {
+          // Text-recovered call: replay it as a plain assistant/user exchange.
+          messages.push({ role: "assistant", content: result.content });
+          messages.push({ role: "user", content: `Tool ${tc.function.name} returned:
+${outcome.result}
+Continue, then return the verdict JSON.` });
+        }
+      }
+      continue;
+    }
+    // Neither a verdict nor a tool call: ask again for the JSON.
+    messages.push({ role: "assistant", content: result.content });
+    messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
+  }
+  return verdict ?? fallbackVerdict(criteria);
+}
+function parseVerdict(text, criteria) {
+  const json = extractJson(text);
   if (!json) return null;
   try {
     const obj = JSON.parse(json);
-    const name = obj?.name ?? obj?.tool ?? obj?.function?.name;
-    if (typeof name !== "string" || !KNOWN_TOOLS.has(name)) return null;
-    const args = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {};
+    if (!Array.isArray(obj.results)) return null;
+    const results = obj.results.map((r) => ({
+      criterion: String(r.criterion ?? ""),
+      met: r.met === true || String(r.met).toLowerCase() === "true",
+      reason: String(r.reason ?? "").slice(0, 300)
+    }));
+    if (!results.length) return null;
+    const unmet = results.filter((r) => !r.met);
     return {
-      id: `textcall_${name}`,
-      type: "function",
-      function: { name, arguments: typeof args === "string" ? args : JSON.stringify(args) }
+      total: results.length,
+      metCount: results.length - unmet.length,
+      allMet: unmet.length === 0,
+      results,
+      unmet,
+      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
     };
   } catch {
     return null;
   }
 }
+/**
+ * Build the verdict used when the verifier never produced one: every
+ * criterion is marked unmet. `results` and `unmet` are the same array.
+ * @param {string[]} criteria - acceptance criteria
+ * @returns {{total: number, metCount: number, allMet: boolean, results: object[], unmet: object[], feedback: string}}
+ */
+function fallbackVerdict(criteria) {
+  const reason = "verifier produced no verdict";
+  const results = criteria.map((criterion) => ({ criterion, met: false, reason }));
+  return {
+    total: results.length,
+    metCount: 0,
+    allMet: false,
+    results,
+    unmet: results,
+    feedback: "Verification inconclusive; re-attempt with a stronger model."
+  };
+}
+// src/agent/loop.ts
+/**
+ * Format a date as YYYY-MM-DD in the local time zone.
+ * Month and day are zero-padded to two digits; the year is not padded.
+ * @param {Date} [d] - date to format; defaults to now
+ * @returns {string}
+ */
+function localDate2(d = /* @__PURE__ */ new Date()) {
+  const twoDigits = (n) => String(n).padStart(2, "0");
+  // getMonth() is zero-based.
+  return [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())].join("-");
+}
 async function runAgent(goal, deps, emit) {
-  const { client: client2, models, policy, sessionId, cwd } = deps;
-  let totalCostUsd = 0;
-  let totalTokens = 0;
-  let totalPromptTokens = 0;
-  let totalCompletionTokens = 0;
-  let calls = 0;
+  const { client: client2, models, cwd } = deps;
+  const verifyOn = deps.verify ?? true;
+  const maxAttempts = deps.maxAttempts ?? 3;
+  const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
   const sessionStart = Date.now();
-  let completedSteps = 0;
-  let failedSteps = 0;
-  const planRoute = route("plan", models, policy);
+  const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
+  const logUsage = (r, taskType) => {
+    const entry = logCompletion(r, taskType, deps.sessionId);
+    emit({ type: "usage", entry });
+    acc.cost += entry.costUsd;
+    acc.tokens += entry.totalTokens;
+    acc.prompt += entry.promptTokens;
+    acc.completion += entry.completionTokens;
+    acc.calls++;
+    return entry;
+  };
+  const planRoute = route("plan", models, deps.policy);
   let plan;
   if (planRoute) {
     try {
-      plan = await planRequest(goal, client2, planRoute.model, (result) => {
-        const entry = logCompletion(result, "plan", sessionId);
-        emit({ type: "usage", entry });
-        totalCostUsd += entry.costUsd;
-        totalTokens += entry.totalTokens;
-        totalPromptTokens += entry.promptTokens;
-        totalCompletionTokens += entry.completionTokens;
-        calls++;
-      });
+      plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
     } catch {
       plan = heuristicPlan(goal);
     }
@@ -2218,164 +2750,276 @@ async function runAgent(goal, deps, emit) {
     plan = heuristicPlan(goal);
   }
   emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
+  let startRung = 0;
+  let learned = false;
+  if (verifyOn) {
+    const tier = optimalStartTier(plan.goalType);
+    if (tier) {
+      const r = rungForTier(tier);
+      if (r > 0) {
+        startRung = r;
+        learned = true;
+      }
+    }
+  }
+  const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
+  emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
   startSession({
-    id: sessionId,
+    id: deps.sessionId,
     ts: sessionStart,
     date: localDate2(),
     goal,
     command: "run",
-    objective: policy.objective,
-    plannedSteps: plan.steps.length
+    objective: deps.policy.objective,
+    plannedSteps: plan.steps.length,
+    goalType: plan.goalType,
+    startTier
   });
-  const toolCtx = {
-    cwd,
-    allowWrite: deps.allowWrite,
-    allowCommands: deps.allowCommands
-  };
+  let rung = startRung;
+  let attemptNo = 0;
+  let verdict = null;
+  let completedSteps = 0;
+  let failedSteps = 0;
   const priorSummaries = [];
-  for (const step of plan.steps) {
-    const r = route(step.type, models, policy, {
-      promptTokens: step.estPromptTokens,
-      completionTokens: step.estCompletionTokens
-    });
-    if (!r) {
-      failedSteps++;
-      emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
-      continue;
-    }
-    const model = r.model;
-    emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
-    const useTools = model.capabilities.tools;
-    const messages = [
-      { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
-      { role: "user", content: step.description }
-    ];
-    const stepStart = Date.now();
-    let stepPrompt = 0;
-    let stepCompletion = 0;
-    let stepCost = 0;
-    let stepToolCalls = 0;
-    let iterations = 0;
-    let finishedBy = "max-iters";
-    let summary = "";
-    try {
-      for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
-        iterations = iter + 1;
-        const gen = client2.stream(
-          {
-            model: model.id,
-            messages,
-            tools: useTools ? TOOL_SCHEMAS : void 0,
-            temperature: 0.2,
-            maxTokens: 2e3
-          },
-          model.pricing
-        );
-        let next = await gen.next();
-        while (!next.done) {
-          emit({ type: "text", delta: next.value });
-          next = await gen.next();
-        }
-        const result = next.value;
-        const entry = logCompletion(result, step.type, sessionId);
-        emit({ type: "usage", entry });
-        totalCostUsd += entry.costUsd;
-        totalTokens += entry.totalTokens;
-        totalPromptTokens += entry.promptTokens;
-        totalCompletionTokens += entry.completionTokens;
-        stepPrompt += entry.promptTokens;
-        stepCompletion += entry.completionTokens;
-        stepCost += entry.costUsd;
-        calls++;
-        if (result.toolCalls.length && useTools) {
-          messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
-          let finished = false;
-          for (const tc of result.toolCalls) {
-            stepToolCalls++;
-            emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
-            const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
-            emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
-            messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
-            if (outcome.finishSummary != null) {
-              summary = outcome.finishSummary;
-              finished = true;
-            }
-          }
-          if (finished) {
-            finishedBy = "finish-tool";
-            break;
-          }
-          continue;
-        }
-        const textCall = useTools ? parseTextToolCall(result.content) : null;
-        if (textCall) {
-          stepToolCalls++;
-          emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
-          const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
-          emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
-          if (outcome.finishSummary != null) {
-            summary = outcome.finishSummary;
-            finishedBy = "finish-tool";
-            break;
-          }
-          messages.push({ role: "assistant", content: result.content });
-          messages.push({
-            role: "user",
-            content: `Tool ${textCall.function.name} returned:
-${outcome.result}
-Continue with this step. When the objective is met, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
-          });
-          continue;
-        }
-        summary = result.content || summary;
-        if (summary) finishedBy = "text";
-        break;
+  while (attemptNo < maxAttempts) {
+    const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
+    const rungPolicy = applyRung(deps.policy, rungDef);
+    const attemptStart = Date.now();
+    const before = { ...acc };
+    if (attemptNo === 0) {
+      for (const step of plan.steps) {
+        const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
+        if (res.success) completedSteps++;
+        else failedSteps++;
       }
-    } catch (err) {
-      finishedBy = "error";
-      emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
+    } else {
+      await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
     }
-    const success = finishedBy === "finish-tool" || finishedBy === "text";
-    if (success) completedSteps++;
-    else failedSteps++;
-    recordStepRun({
-      sessionId,
-      stepNo: step.id,
-      taskType: step.type,
-      skill: TASK_SKILL[step.type],
-      model: model.id,
-      provider: model.provider,
-      iterations,
-      toolCalls: stepToolCalls,
-      promptTokens: stepPrompt,
-      completionTokens: stepCompletion,
-      costUsd: stepCost,
-      finishedBy,
-      success,
-      durationMs: Date.now() - stepStart
+    if (!verifyOn) {
+      attemptNo++;
+      break;
+    }
+    const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
+    const verifier = routeOrBest("verify", models, verifyPolicy);
+    if (!verifier) {
+      emit({ type: "error", message: "No model available to verify." });
+      attemptNo++;
+      break;
+    }
+    emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
+    verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
+      onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
+      onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
+      onUsage: (r) => logUsage(r, "review")
+    });
+    emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
+    recordAttempt({
+      sessionId: deps.sessionId,
+      attemptNo: attemptNo + 1,
+      goalType: plan.goalType,
+      tierFloor: rungDef.tierFloor ?? null,
+      objective: rungDef.objective,
+      promptTokens: acc.prompt - before.prompt,
+      completionTokens: acc.completion - before.completion,
+      costUsd: acc.cost - before.cost,
+      criteriaTotal: verdict.total,
+      criteriaMet: verdict.metCount,
+      passed: verdict.allMet,
+      durationMs: Date.now() - attemptStart
     });
-    if (!summary) summary = "(no summary)";
-    priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
-    emit({ type: "step-end", step, summary });
+    attemptNo++;
+    if (verdict.allMet) break;
+    if (attemptNo < maxAttempts) {
+      const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
+      rung = next;
+      emit({
+        type: "escalate",
+        toRung: ESCALATION_LADDER[next].label,
+        reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
+      });
+    }
   }
-  finishSession(sessionId, {
+  const passed = verifyOn ? verdict ? verdict.allMet : false : null;
+  finishSession(deps.sessionId, {
     plannedSteps: plan.steps.length,
     completedSteps,
     failedSteps,
-    autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
-    promptTokens: totalPromptTokens,
-    completionTokens: totalCompletionTokens,
-    costUsd: totalCostUsd,
-    durationMs: Date.now() - sessionStart
+    autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
+    promptTokens: acc.prompt,
+    completionTokens: acc.completion,
+    costUsd: acc.cost,
+    durationMs: Date.now() - sessionStart,
+    attempts: attemptNo,
+    finalPassed: passed
   });
-  emit({ type: "done", totalCostUsd, totalTokens, calls });
-  return { totalCostUsd, totalTokens, calls };
+  emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
+  return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
 }
-function localDate2(d = /* @__PURE__ */ new Date()) {
-  const y = d.getFullYear();
-  const m = String(d.getMonth() + 1).padStart(2, "0");
-  const day = String(d.getDate()).padStart(2, "0");
-  return `${y}-${m}-${day}`;
+async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) { // Runs one planned step: routes it to a model, drives the tool loop, records the run; resolves to { summary, success }.
+  const r = routeOrBest(step.type, deps.models, policy, { // choose a model for this step type, passing the step's token estimates
+    promptTokens: step.estPromptTokens,
+    completionTokens: step.estCompletionTokens
+  });
+  if (!r) { // routing produced nothing: report it and fail the step without calling any model
+    emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
+    return { summary: "(no model)", success: false };
+  }
+  const model = r.model;
+  emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
+  const messages = [
+    { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools) }, // last argument is the model's tool-capability flag
+    { role: "user", content: step.description }
+  ];
+  const loop = await runToolLoop(model, messages, step.type, rungDef, deps, toolCtx, emit, logUsage); // usage is logged under the step's own task type
+  recordStepRun({ // record the per-step metrics returned by the tool loop
+    sessionId: deps.sessionId,
+    stepNo: step.id,
+    taskType: step.type,
+    skill: TASK_SKILL[step.type],
+    model: model.id,
+    provider: model.provider,
+    iterations: loop.iterations,
+    toolCalls: loop.toolCalls,
+    promptTokens: loop.prompt,
+    completionTokens: loop.completion,
+    costUsd: loop.cost,
+    finishedBy: loop.finishedBy,
+    success: loop.success,
+    durationMs: loop.durationMs
+  });
+  const summary = loop.summary || "(no summary)"; // placeholder when the loop produced no summary text
+  priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`); // mutates the caller's array so later steps receive this outcome as context
+  emit({ type: "step-end", step, summary });
+  return { summary, success: loop.success };
+}
+async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) { // Fix pass: asks an "edit"-routed model to resolve the unmet criteria listed in `verdict`; resolves to { summary, success }. `plan` is not read in this body.
+  const r = routeOrBest("edit", deps.models, policy);
+  if (!r) return { summary: "(no model)", success: false }; // NOTE(review): unlike runStep, no "error" event is emitted when routing fails - confirm this is intended
+  const model = r.model;
+  const fixStep = { // synthetic step object, used for the step-start/step-end events and for recordStepRun
+    id: 100, // NOTE(review): fixed id, presumably chosen to stay clear of planned step ids - confirm
+    type: "edit",
+    description: "Fix the unmet acceptance criteria",
+    estPromptTokens: 9e3, // NOTE(review): these estimates are not passed to the routeOrBest call above
+    estCompletionTokens: 1500
+  };
+  emit({ type: "step-start", step: fixStep, model, estCostUsd: r.estCostUsd });
+  const unmet = verdict.unmet.map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`).join("\n"); // numbered list, one "criterion - reason" line per unmet item
+  const messages = [
+    {
+      role: "system",
+      content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
+Overall goal: ${goal}
+You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
+If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
+    },
+    { role: "user", content: `Unmet criteria:
+${unmet}
+Verifier feedback: ${verdict.feedback}` }
+  ];
+  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage); // usage is logged under the "edit" task type
+  recordStepRun({ // recorded like a normal step, under the synthetic step id
+    sessionId: deps.sessionId,
+    stepNo: fixStep.id,
+    taskType: "edit",
+    skill: TASK_SKILL.edit,
+    model: model.id,
+    provider: model.provider,
+    iterations: loop.iterations,
+    toolCalls: loop.toolCalls,
+    promptTokens: loop.prompt,
+    completionTokens: loop.completion,
+    costUsd: loop.cost,
+    finishedBy: loop.finishedBy,
+    success: loop.success,
+    durationMs: loop.durationMs
+  });
+  emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" }); // the event gets a placeholder when the summary is empty
+  return { summary: loop.summary, success: loop.success }; // the returned summary is the raw loop value and may be an empty string
+}
+async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) { // Drives the model/tool conversation for one task, making at most rungDef.maxIters model calls; returns the summary, success flag, finish reason and usage totals. Appends to `messages` in place.
+  const useTools = model.capabilities.tools;
+  const start = Date.now();
+  let prompt = 0, completion = 0, cost = 0, toolCalls = 0, iterations = 0; // totals for this loop only, returned to the caller
+  let summary = "";
+  let finishedBy = "max-iters"; // outcome reported if the loop ends without a finish call or a non-empty text reply
+  try {
+    for (let iter = 0; iter < rungDef.maxIters; iter++) {
+      iterations = iter + 1;
+      const gen = deps.client.stream( // tool schemas are only sent when the model supports tools
+        { model: model.id, messages, tools: useTools ? TOOL_SCHEMAS : void 0, temperature: 0.2, maxTokens: rungDef.maxTokens },
+        model.pricing
+      );
+      let next = await gen.next();
+      while (!next.done) { // forward each yielded chunk as a "text" event
+        emit({ type: "text", delta: next.value });
+        next = await gen.next();
+      }
+      const result = next.value; // the value delivered with done=true is the final completion result
+      const entry = logUsage(result, taskTypeForLog);
+      prompt += entry.promptTokens;
+      completion += entry.completionTokens;
+      cost += entry.costUsd;
+      if (result.toolCalls.length && useTools) { // native tool-call path
+        messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
+        let finished = false;
+        for (const tc of result.toolCalls) { // every call in the batch is executed, even after one of them signals finish
+          toolCalls++;
+          emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
+          const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
+          emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
+          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
+          if (outcome.finishSummary != null) { // a non-null finishSummary is treated as the finish signal
+            summary = outcome.finishSummary;
+            finished = true;
+          }
+        }
+        if (finished) {
+          finishedBy = "finish-tool";
+          break;
+        }
+        continue; // tool results are now in `messages`; ask the model again
+      }
+      const textCall = useTools ? parseTextToolCall(result.content) : null; // fallback path: presumably a tool call written as text in the reply - confirm against parseTextToolCall
+      if (textCall) {
+        toolCalls++;
+        emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
+        const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
+        emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
+        if (outcome.finishSummary != null) {
+          summary = outcome.finishSummary;
+          finishedBy = "finish-tool";
+          break;
+        }
+        messages.push({ role: "assistant", content: result.content });
+        messages.push({ // the tool output goes back as a plain user message on this path
+          role: "user",
+          content: `Tool ${textCall.function.name} returned:
+${outcome.result}
+Continue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
+        });
+        continue;
+      }
+      summary = result.content || summary; // no tool call: the reply text becomes the summary
+      if (summary) finishedBy = "text"; // an empty reply leaves finishedBy as "max-iters", which counts as failure
+      break;
+    }
+  } catch (err) { // anything thrown inside the loop is reported as an event; usage gathered so far is still returned
+    finishedBy = "error";
+    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
+  }
+  return {
+    summary,
+    success: finishedBy === "finish-tool" || finishedBy === "text", // "max-iters" and "error" are failures
+    finishedBy,
+    iterations,
+    toolCalls,
+    prompt,
+    completion,
+    cost,
+    durationMs: Date.now() - start
+  };
 }
 function stepSystemPrompt(goal, step, priorSummaries, useTools) {
   const context = priorSummaries.length ? `
@@ -2389,7 +3033,7 @@ Return a concise result for this step. Do not ask the user questions.`;
   return `You are the "${step.type}" stage of an autonomous coding agent.
 Overall goal: ${goal}
 Your current step: ${step.description}${context}${toolNote}
-Be efficient \u2014 you were selected as the cheapest capable model for this step.`;
+Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
 }
 // src/tui/App.tsx
@@ -2405,6 +3049,8 @@ function App(props) {
   const [tok, setTok] = useState(0);
   const [calls, setCalls] = useState(0);
   const [rated, setRated] = useState(null);
+  const [passed, setPassed] = useState(null);
+  const [attempts, setAttempts] = useState(0);
   const push = useCallback((text, color) => {
     setLog((l) => [...l, { key: l.length, text, color }]);
   }, []);
@@ -2422,7 +3068,9 @@ function App(props) {
       sessionId: props.sessionId,
       cwd: props.cwd,
       allowWrite: props.allowWrite,
-      allowCommands: props.allowCommands
+      allowCommands: props.allowCommands,
+      verify: props.verify,
+      maxAttempts: props.maxAttempts
     };
     let textBuf = "";
     const flush = () => {
@@ -2434,6 +3082,24 @@ function App(props) {
         case "plan":
           push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
           break;
+        case "criteria":
+          push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
+          e.criteria.forEach((cr, i) => push(`   ${i + 1}. ${cr}`, "gray"));
+          break;
+        case "verify-start":
+          flush();
+          push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
+          break;
+        case "verdict":
+          flush();
+          push(
+            `${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
+            e.allMet ? "green" : "red"
+          );
+          break;
+        case "escalate":
+          push(`\u23EB Escalate \u2192 ${e.toRung}  (${e.reason})`, "magenta");
+          break;
         case "step-start":
           flush();
           push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id}  ~${usd(e.estCostUsd)}`, "yellow");
@@ -2463,6 +3129,8 @@ function App(props) {
           break;
         case "done":
           flush();
+          setPassed(e.passed);
+          setAttempts(e.attempts);
           break;
       }
     };
@@ -2522,10 +3190,14 @@ function App(props) {
         " working\u2026"
       ] }),
       phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsxs(Text, { color: "green", children: [
-          "\u2713 Done \xB7 ",
+        /* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
+          passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
+          " \xB7 ",
+          attempts,
+          " attempt(s) \xB7 ",
           calls,
-          " calls \xB7 ",
+          " calls \xB7",
+          " ",
           tokens(tok),
           " tokens \xB7 ",
           usd(cost)
@@ -2609,8 +3281,9 @@ function truncate2(s, n) {
 }
 // src/index.ts
+var VERSION = "0.5.0";
 var program = new Command();
-program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.3.1");
+program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version(VERSION);
 function client(config) {
   return new OpenRouterClient({
     apiKey: resolveApiKey(config),
@@ -2681,10 +3354,18 @@ async function loadCatalog(config, refresh = false) {
   }
   return models;
 }
+program.command("setup").description("First-run setup: optionally install a local LLM (Ollama) and connect models").option("--local", "install a local LLM (Ollama) \u2014 skips the prompt").option("--no-local", "skip the local LLM \u2014 skips the prompt").option("-m, --model <id>", "local model to pull (e.g. qwen2.5-coder:7b)").option("-y, --yes", "accept defaults / auto-install without prompts", false).action(async (opts) => { // registers the `setup` subcommand
+  const argv = process.argv;
+  const local = argv.includes("--local") ? true : argv.includes("--no-local") ? false : void 0; // tri-state: true, false, or undefined when neither flag was typed. NOTE(review): read from raw argv rather than opts.local, presumably to detect "flag absent" - confirm
+  await runSetup({ local, model: opts.model, yes: !!opts.yes });
+});
+program.command("update").description("Update Polymath, the Ollama runtime, and local models").option("--check", "report available updates without installing", false).option("--self", "only the Polymath CLI", false).option("--ollama", "only the Ollama runtime", false).option("--models", "only the local models", false).action(async (opts) => { // registers the `update` subcommand
+  await runUpdate(VERSION, { check: !!opts.check, self: !!opts.self, ollama: !!opts.ollama, models: !!opts.models }); // passes the current VERSION; each flag is coerced to a strict boolean
+});
 program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
   await runLogin();
 });
-program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
+program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
   const startedAt = Date.now();
   const config = loadConfig();
   if (!config.local.enabled || resolveApiKey(config)) {
@@ -2709,6 +3390,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
       allowWrite: !!opts.write,
       allowCommands: !!opts.commands,
       objectiveLabel: policy.objective,
+      verify: opts.verify !== false,
+      maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
       initialGoal: goal
     })
   );
@@ -2888,3 +3571,6 @@ program.parseAsync().catch((err) => {
   console.error(c.red(err?.message ?? String(err)));
   process.exit(1);
 });
+export {
+  VERSION
+};