npm - polymath-agent - Versions diffs - 0.3.0 → 0.4.0 - Mend

polymath-agent 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -100,6 +100,37 @@ poly usage                                # cost by date + model
 After each `poly run`, rate the result 0–9 (one keypress) — your goal-achievement
 rating joins the auto score (completed/planned steps) to power `poly analyze`.
+### Outcome-driven loop (verify → escalate → repeat)
+`poly run` doesn't stop at "code written" — it measures the result and keeps going
+until the goal is actually met:
+```
+command → plan + acceptance criteria → code (cheapest model)
+        → VERIFY result against criteria (inspects files, runs tests)
+        → if unmet: ESCALATE (higher tier, more tokens, cost cap lifted) → fix → re-verify
+        → repeat until all criteria pass (or --max-attempts)
+```
+The cheapest model gets first crack; only the criteria it *fails* trigger a pricier
+model — so you pay for frontier capability exactly when (and only when) it's needed.
+```bash
+poly run -w -x "add an add(a,b) to calc.js and make the tests pass"
+poly run --no-verify "..."        # single pass, no verify/escalate
+poly run --max-attempts 5 "..."   # try harder before giving up
+```
+After each run you'll see `✓ goal met · 2 attempts` (or `⚠ goal not fully met`).
+### Statistical model optimization (learned starting tier)
+Every attempt is recorded with its goal type, starting tier, tokens, and pass/fail.
+`poly analyze` then shows, per goal type, **which starting model reaches the goal
+with the fewest total tokens** — and once there's enough evidence (≥3 verified
+sessions), `poly run` **auto-starts at that tier**, skipping cheap attempts for goal
+types that historically need a stronger model from the start.
 ### The efficiency playbook (learned routing)
 Everything is captured locally (SQLite). `poly analyze` distills it into a **playbook**

package/dist/cli.js CHANGED Viewed

@@ -599,11 +599,14 @@ var TASK_SPECS = {
   command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
   review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
   reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
+  // The verify gate inspects files / runs tests — it MUST have tools.
+  verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
   explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
   summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
   chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
 };
 var ALL_TASK_TYPES = Object.keys(TASK_SPECS); // every task-type key declared in TASK_SPECS
+var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"]; // the accepted goalType labels; "other" is the catch-all
 // src/planner/planner.ts
 var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
   summarize - condense long content
   chat      - a simple conversational reply
+Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
 Return ONLY minified JSON of the form:
-{"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
+{"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
 Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
+function classifyGoalType(goal) { // Keyword heuristic: maps a free-text goal to a goal-type label; rules are tried top to bottom and the first match wins.
+  const g = goal.toLowerCase(); // lowercase once so the patterns below need no /i flag
+  if (/\b(fix|bug|broken|error|crash|regression|fails?)\b/.test(g)) return "bugfix";
+  if (/\b(refactor|rename|clean ?up|restructure|extract|simplif)/.test(g)) return "refactor"; // no trailing \b here, so the "simplif" stem also matches longer words
+  if (/\b(test|spec|coverage|unit test|e2e)\b/.test(g)) return "test";
+  if (/\b(docs?|readme|comment|documentation)\b/.test(g)) return "docs";
+  if (/\b(bump|upgrade|dependency|deps|config|chore|lint|format)\b/.test(g)) return "chore";
+  if (/\b(add|create|implement|build|feature|support|new)\b/.test(g)) return "feature"; // checked last, so e.g. "add a test" is classified as "test"
+  return "other"; // no keyword matched
+}
 function heuristicPlan(goal) {
   const steps = [
     { id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
     { id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
     { id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
   ];
-  return { goal, steps };
+  return {
+    goal,
+    steps,
+    goalType: classifyGoalType(goal),
+    criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
+  };
 }
 async function planRequest(goal, client2, planModel, onUsage) {
   const result = await client2.complete(
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
   onUsage?.(result);
   const parsed = extractPlan(result.content);
   if (!parsed) return heuristicPlan(goal);
-  return { goal, steps: parsed };
+  return { goal, ...parsed };
 }
 function extractPlan(text) {
   const json = extractJson(text);
@@ -663,7 +683,10 @@ function extractPlan(text) {
       estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
       estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
     }));
-    return steps.length ? steps : null;
+    if (!steps.length) return null;
+    const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
+    const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
+    return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
   } catch {
     return null;
   }
@@ -705,10 +728,29 @@ function extractJson(text) {
 }
 // src/router/policy.ts
+var ESCALATION_LADDER = [ // Ordered rungs, index 0 first; maxTokens and maxIters grow from one rung to the next.
+  { objective: "value", maxTokens: 2e3, maxIters: 6, liftCostCap: false, label: "value \xB7 cheapest-capable" }, // rung 0: no tierFloor, cost cap kept
+  { tierFloor: "standard", objective: "value", maxTokens: 4e3, maxIters: 8, liftCostCap: true, label: "standard+ \xB7 more tokens" }, // rung 1: floor raised, cost cap lifted
+  { tierFloor: "frontier", objective: "quality", maxTokens: 8e3, maxIters: 10, liftCostCap: true, label: "frontier \xB7 strongest" } // rung 2: last rung, objective switches to "quality"
+];
+function rungForTier(tier) { // Index of the first ladder rung whose tierFloor equals `tier`; "cheap" maps to the rung that has no tierFloor; -1 when nothing matches.
+  return ESCALATION_LADDER.findIndex((r) => r.tierFloor === tier || !r.tierFloor && tier === "cheap");
+}
+function applyRung(base, rung) { // Returns a new policy object: a shallow copy of `base` with the rung's objective, tierFloor and cost-cap rule applied; `base` itself is not modified.
+  return {
+    ...base,
+    objective: rung.objective,
+    tierFloor: rung.tierFloor, // undefined for a rung without a floor, which overwrites any tierFloor carried by `base`
+    maxCostPerCallUsd: rung.liftCostCap ? void 0 : base.maxCostPerCallUsd // liftCostCap replaces the per-call cap with undefined; otherwise the base cap is kept
+  };
+}
 var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 }; // numeric order of the tiers; a higher number is a higher tier
 function tierAtLeast(tier, min) { // True when `tier` ranks at or above `min` in TIER_RANK.
   return TIER_RANK[tier] >= TIER_RANK[min]; // a name missing from TIER_RANK looks up as undefined, which makes this comparison false
 }
+function tierRank(tier) { // Numeric rank of `tier` from TIER_RANK; undefined for a name that is not a key of TIER_RANK.
+  return TIER_RANK[tier];
+}
 function blendedPrice(m) { // Single blended price for model `m`: prompt price weighted 3, completion price weighted 1, divided by 4.
   return (m.pricing.promptUsdPerMTok * 3 + m.pricing.completionUsdPerMTok) / 4;
 }
@@ -755,6 +797,7 @@ var TASK_SKILL = {
   command: "speed",
   review: "reasoning",
   reason: "reasoning",
+  verify: "reasoning",
   explain: "general",
   summarize: "speed",
   chat: "speed"
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
   edit: 1.4,
   review: 1.5,
   reason: 1.5,
+  verify: 1.4,
   plan: 1.2
 };
 var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
 function candidatesFor(taskType, models, policy, est) {
   const spec = TASK_SPECS[taskType];
   const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
+  const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
   return models.filter((m) => {
     if (m.id === "openrouter/auto") return false;
-    const covers = tierAtLeast(m.tier, spec.minTier) || taskStrength(m, taskType) >= strengthFloor;
+    const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
     if (!covers) return false;
     if (spec.needsTools && !m.capabilities.tools) return false;
     if (policy.maxCostPerCallUsd != null && est) {
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
   const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
   return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
 }
+function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) { // Tries route(); if that yields nothing, falls back to the strongest usable model. Returns null only when no model is usable.
+  const r = route(taskType, models, policy, est);
+  if (r) return r; // normal routing found a model
+  const spec = TASK_SPECS[taskType];
+  const usable = models.filter( // fallback pool: `policy` is not consulted from here on
+    (m) => m.id !== "openrouter/auto" && (!spec.needsTools || m.capabilities.tools)
+  );
+  if (!usable.length) return null;
+  const byStrength = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType); // comparator: strongest first
+  const withTools = usable.filter((m) => m.capabilities.tools).sort(byStrength); // tool-capable models are preferred even when the task does not require tools
+  const best = (withTools.length ? withTools : [...usable].sort(byStrength))[0]; // sort a copy so `usable` keeps its order
+  return { model: best, reason: `best available for ${TASK_SKILL[taskType]} (fallback)`, estCostUsd: projectCost(best, est) };
+}
 // src/recommend/recommend.ts
 var OBJECTIVES = [
@@ -1075,6 +1133,27 @@ function getDb() {
     );
     CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
+    -- One row per verify-and-escalate attempt within a session. Powers the
+    -- "optimal starting model per goal type" statistical learning.
+    CREATE TABLE IF NOT EXISTS attempts (
+      id INTEGER PRIMARY KEY AUTOINCREMENT,
+      session_id TEXT NOT NULL,
+      attempt_no INTEGER NOT NULL,
+      goal_type TEXT NOT NULL,
+      tier_floor TEXT,
+      objective TEXT NOT NULL,
+      prompt_tokens INTEGER NOT NULL,
+      completion_tokens INTEGER NOT NULL,
+      cost_usd REAL NOT NULL,
+      criteria_total INTEGER NOT NULL,
+      criteria_met INTEGER NOT NULL,
+      passed INTEGER NOT NULL,
+      duration_ms INTEGER NOT NULL,
+      synced INTEGER NOT NULL DEFAULT 0
+    );
+    CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
+    CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
     -- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
     -- This is what syncs to the cloud by default (raw logs stay local).
     CREATE TABLE IF NOT EXISTS insights (
@@ -1096,6 +1175,15 @@ function getDb() {
   if (!cols.some((c2) => c2.name === "command")) {
     db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
   }
+  const conn = db;
+  const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
+  const addSession = (name, decl) => {
+    if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
+  };
+  addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
+  addSession("start_tier", "TEXT");
+  addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
+  addSession("final_passed", "INTEGER");
   return db;
 }
 function recordUsage(e) {
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
 }
 function startSession(s) {
   getDb().prepare(
-    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps)
-       VALUES (?, ?, ?, ?, ?, ?, ?)`
-  ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps);
+    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
+  ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps, s.goalType, s.startTier ?? null);
 }
 function finishSession(id, u) {
   getDb().prepare(
     `UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
-         prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
+         prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
   ).run(
     u.plannedSteps,
     u.completedSteps,
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
     u.completionTokens,
     u.costUsd,
     u.durationMs,
+    u.attempts ?? 1,
+    u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
     id
   );
 }
+function recordAttempt(a) { // Inserts one row into the `attempts` table from the fields of `a`.
+  getDb().prepare(
+    `INSERT INTO attempts
+        (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
+         cost_usd, criteria_total, criteria_met, passed, duration_ms)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
+  ).run( // positional arguments follow the column order of the INSERT above
+    a.sessionId,
+    a.attemptNo,
+    a.goalType,
+    a.tierFloor,
+    a.objective,
+    a.promptTokens,
+    a.completionTokens,
+    a.costUsd,
+    a.criteriaTotal,
+    a.criteriaMet,
+    a.passed ? 1 : 0, // boolean stored as integer 1/0
+    a.durationMs
+  );
+}
+function goalTierStats() { // Aggregates the `sessions` table per (goal type, start tier): session count, pass rate, average total tokens, average attempts.
+  const rows = getDb().prepare(
+    `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
+              COUNT(*) AS sessions,
+              AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
+              AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
+              AVG(attempts) AS avgAttempts
+       FROM sessions
+       WHERE final_passed IS NOT NULL
+       GROUP BY goal_type, startTier
+       ORDER BY goal_type, avgTotalTokens ASC`
+  ).all(); // only sessions with a recorded final_passed are counted; a NULL start_tier is reported as 'cheap'
+  return rows.map((r) => ({ // coerce each column to a plain string/number
+    goalType: String(r.goalType),
+    startTier: String(r.startTier),
+    sessions: Number(r.sessions),
+    passRate: Number(r.passRate ?? 0), // null/undefined aggregates become 0
+    avgTotalTokens: Number(r.avgTotalTokens ?? 0),
+    avgAttempts: Number(r.avgAttempts ?? 0)
+  }));
+}
+function optimalStartTier(goalType, minSessions = 3) { // Start tier with the lowest average total tokens for `goalType`, or null when no tier has enough evidence.
+  const stats = goalTierStats().filter(
+    (s) => s.goalType === goalType && s.sessions >= minSessions && s.passRate >= 0.6 // evidence gate: enough sessions and a pass rate of at least 0.6
+  );
+  if (!stats.length) return null;
+  return stats.sort((a, b) => a.avgTotalTokens - b.avgTotalTokens)[0].startTier; // in-place sort is safe: filter() produced a fresh array
+}
 function setUserScore(sessionId, score) { // Writes `score` into the user_score column of the session row whose id is `sessionId`.
   getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`).run(score, sessionId);
 }
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
     }
     out.push("");
   }
+  const tierStats = goalTierStats();
+  if (tierStats.length) {
+    out.push(c.bold("Optimal starting model per goal type") + c.dim("  (pass rate vs total tokens to reach the goal)"));
+    out.push(
+      table(
+        ["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
+        tierStats.map((s) => [
+          s.goalType,
+          tierColor(s.startTier),
+          String(s.sessions),
+          `${Math.round(s.passRate * 100)}%`,
+          tokens(Math.round(s.avgTotalTokens)),
+          s.avgAttempts.toFixed(1)
+        ])
+      )
+    );
+    const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
+    const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
+    if (learned.length) {
+      out.push(
+        c.green(
+          "\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
+        )
+      );
+    } else {
+      out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
+    }
+    out.push("");
+  }
   if (byCommand.length) {
     out.push(c.bold("Usage by command"));
     out.push(
@@ -2086,6 +2254,28 @@ var TOOL_SCHEMAS = [
     }
   }
 ];
+var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name)); // names of every tool declared in TOOL_SCHEMAS
+var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter( // subset of tool schemas offered to the verify stage
+  (t) => ["read_file", "list_dir", "run_command"].includes(t.function.name) // NOTE(review): run_command is part of this "read-only" subset; whether a command can modify the workspace depends on the tool implementation, which is not shown here - confirm
+);
+function parseTextToolCall(content) { // Recovers a tool call that a model wrote as JSON text in its reply; returns a tool-call object, or null when no known tool is named or parsing fails.
+  if (!content) return null;
+  const json = extractJson(content);
+  if (!json) return null;
+  try {
+    const obj = JSON.parse(json);
+    const name = obj?.name ?? obj?.tool ?? obj?.function?.name; // accept the tool name under `name`, `tool`, or `function.name`
+    if (typeof name !== "string" || !KNOWN_TOOLS.has(name)) return null; // only tools declared in TOOL_SCHEMAS are accepted
+    const args = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {}; // same leniency for the arguments key; defaults to an empty object
+    return {
+      id: `textcall_${name}`, // id is derived from the tool name only, so two text calls to the same tool share an id
+      type: "function",
+      function: { name, arguments: typeof args === "string" ? args : JSON.stringify(args) } // arguments are always handed on as a string
+    };
+  } catch { // JSON.parse failure (or any other throw above) is reported as "no tool call"
+    return null;
+  }
+}
 var MAX_OUTPUT = 8e3;
 function clip(s) {
   return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
@@ -2167,31 +2357,124 @@ ${stderr}`)) };
   }
 }
+// src/agent/verify.ts
+var VERIFY_MAX_ITERS = 8; // upper bound on model round-trips inside one verifyGoal call
+var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
+Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
+When done, reply with ONLY this JSON (no prose, no code fence):
+{"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`; // system prompt for the verify stage; the JSON shape requested here is what parseVerdict reads (results[].criterion/met/reason, feedback)
+async function verifyGoal(goal, criteria, deps, ev = {}) { // Asks deps.model to judge each criterion, running any tool calls it requests, for at most VERIFY_MAX_ITERS rounds; resolves to the parsed verdict, or fallbackVerdict(criteria) if none was produced.
+  const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands }; // allowWrite is always false here, independent of the caller's settings
+  const useTools = deps.model.capabilities.tools;
+  const messages = [
+    { role: "system", content: VERIFY_SYSTEM },
+    {
+      role: "user",
+      content: `Goal: ${goal}
+Acceptance criteria:
+` + criteria.map((c2, i) => `${i + 1}. ${c2}`).join("\n") + `
+Inspect the workspace, then return the verdict JSON.`
+    }
+  ];
+  let verdict = null; // stays null if no round yields a parseable verdict
+  for (let iter = 0; iter < VERIFY_MAX_ITERS; iter++) {
+    const gen = deps.client.stream(
+      { model: deps.model.id, messages, tools: useTools ? READONLY_TOOL_SCHEMAS : void 0, temperature: 0, maxTokens: 1500 },
+      deps.model.pricing
+    );
+    let next = await gen.next();
+    while (!next.done) next = await gen.next(); // drain the stream without forwarding chunks; the value delivered with done is the result
+    const result = next.value;
+    ev.onUsage?.(result);
+    const calls = result.toolCalls.length ? result.toolCalls : useTools && parseTextToolCall(result.content) ? [parseTextToolCall(result.content)] : []; // native tool calls if any; otherwise at most one call recovered from the text (only for tool-capable models)
+    const parsed = parseVerdict(result.content, criteria);
+    if (parsed) { // a parseable verdict ends the loop, even if tool calls came with it
+      verdict = parsed;
+      break;
+    }
+    if (calls.length) {
+      if (result.toolCalls.length) messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls }); // native calls: record the assistant turn once, before the tool results
+      for (const tc of calls) {
+        ev.onToolCall?.(tc.function.name, tc.function.arguments);
+        const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
+        ev.onToolResult?.(tc.function.name, outcome.result);
+        if (result.toolCalls.length) { // native call: answer with a "tool" message tied to the call id
+          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
+        } else { // text-recovered call: replay it as a plain assistant turn followed by a user turn carrying the output
+          messages.push({ role: "assistant", content: result.content });
+          messages.push({ role: "user", content: `Tool ${tc.function.name} returned:
+${outcome.result}
+Continue, then return the verdict JSON.` });
+        }
+      }
+      continue; // next round lets the model see the tool output
+    }
+    messages.push({ role: "assistant", content: result.content }); // neither a verdict nor a tool call: keep the reply and ask again
+    messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
+  }
+  return verdict ?? fallbackVerdict(criteria);
+}
+function parseVerdict(text, criteria) { // Parses the verifier's JSON reply into a verdict object; null when no JSON is found, `results` is missing or empty, or parsing throws. NOTE(review): `criteria` is never read in this body, so total/allMet reflect only the entries the model returned - confirm a partial result list should be able to yield allMet=true.
+  const json = extractJson(text);
+  if (!json) return null;
+  try {
+    const obj = JSON.parse(json);
+    if (!Array.isArray(obj.results)) return null;
+    const results = obj.results.map((r) => ({
+      criterion: String(r.criterion ?? ""),
+      met: r.met === true || String(r.met).toLowerCase() === "true", // accepts boolean true or the string "true" in any letter case
+      reason: String(r.reason ?? "").slice(0, 300) // reason is capped at 300 characters
+    }));
+    if (!results.length) return null;
+    const unmet = results.filter((r) => !r.met);
+    return {
+      total: results.length,
+      metCount: results.length - unmet.length,
+      allMet: unmet.length === 0,
+      results,
+      unmet,
+      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ") // empty feedback falls back to the unmet reasons joined with "; "
+    };
+  } catch { // malformed JSON or a non-object entry in results ends up here
+    return null;
+  }
+}
+function fallbackVerdict(criteria) { // Builds a verdict in which every criterion is marked unmet, with a fixed reason and feedback text.
+  const results = criteria.map((c2) => ({ criterion: c2, met: false, reason: "verifier produced no verdict" }));
+  return { total: results.length, metCount: 0, allMet: false, results, unmet: results, feedback: "Verification inconclusive; re-attempt with a stronger model." }; // `unmet` is the same array object as `results`
+}
 // src/agent/loop.ts
-var MAX_ITERS_PER_STEP = 6;
+function localDate2(d = /* @__PURE__ */ new Date()) { // Formats `d` as YYYY-MM-DD using the local-time getters; defaults to the current date.
+  const y = d.getFullYear();
+  const m = String(d.getMonth() + 1).padStart(2, "0"); // getMonth() is zero-based, hence +1
+  const day = String(d.getDate()).padStart(2, "0");
+  return `${y}-${m}-${day}`;
+}
 async function runAgent(goal, deps, emit) {
-  const { client: client2, models, policy, sessionId, cwd } = deps;
-  let totalCostUsd = 0;
-  let totalTokens = 0;
-  let totalPromptTokens = 0;
-  let totalCompletionTokens = 0;
-  let calls = 0;
+  const { client: client2, models, cwd } = deps;
+  const verifyOn = deps.verify ?? true;
+  const maxAttempts = deps.maxAttempts ?? 3;
+  const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
   const sessionStart = Date.now();
-  let completedSteps = 0;
-  let failedSteps = 0;
-  const planRoute = route("plan", models, policy);
+  const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
+  const logUsage = (r, taskType) => {
+    const entry = logCompletion(r, taskType, deps.sessionId);
+    emit({ type: "usage", entry });
+    acc.cost += entry.costUsd;
+    acc.tokens += entry.totalTokens;
+    acc.prompt += entry.promptTokens;
+    acc.completion += entry.completionTokens;
+    acc.calls++;
+    return entry;
+  };
+  const planRoute = route("plan", models, deps.policy);
   let plan;
   if (planRoute) {
     try {
-      plan = await planRequest(goal, client2, planRoute.model, (result) => {
-        const entry = logCompletion(result, "plan", sessionId);
-        emit({ type: "usage", entry });
-        totalCostUsd += entry.costUsd;
-        totalTokens += entry.totalTokens;
-        totalPromptTokens += entry.promptTokens;
-        totalCompletionTokens += entry.completionTokens;
-        calls++;
-      });
+      plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
     } catch {
       plan = heuristicPlan(goal);
     }
@@ -2199,144 +2482,276 @@ async function runAgent(goal, deps, emit) {
     plan = heuristicPlan(goal);
   }
   emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
+  let startRung = 0;
+  let learned = false;
+  if (verifyOn) {
+    const tier = optimalStartTier(plan.goalType);
+    if (tier) {
+      const r = rungForTier(tier);
+      if (r > 0) {
+        startRung = r;
+        learned = true;
+      }
+    }
+  }
+  const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
+  emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
   startSession({
-    id: sessionId,
+    id: deps.sessionId,
     ts: sessionStart,
     date: localDate2(),
     goal,
     command: "run",
-    objective: policy.objective,
-    plannedSteps: plan.steps.length
+    objective: deps.policy.objective,
+    plannedSteps: plan.steps.length,
+    goalType: plan.goalType,
+    startTier
   });
-  const toolCtx = {
-    cwd,
-    allowWrite: deps.allowWrite,
-    allowCommands: deps.allowCommands
-  };
+  let rung = startRung;
+  let attemptNo = 0;
+  let verdict = null;
+  let completedSteps = 0;
+  let failedSteps = 0;
   const priorSummaries = [];
-  for (const step of plan.steps) {
-    const r = route(step.type, models, policy, {
-      promptTokens: step.estPromptTokens,
-      completionTokens: step.estCompletionTokens
-    });
-    if (!r) {
-      failedSteps++;
-      emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
-      continue;
-    }
-    const model = r.model;
-    emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
-    const useTools = model.capabilities.tools;
-    const messages = [
-      { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
-      { role: "user", content: step.description }
-    ];
-    const stepStart = Date.now();
-    let stepPrompt = 0;
-    let stepCompletion = 0;
-    let stepCost = 0;
-    let stepToolCalls = 0;
-    let iterations = 0;
-    let finishedBy = "max-iters";
-    let summary = "";
-    try {
-      for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
-        iterations = iter + 1;
-        const gen = client2.stream(
-          {
-            model: model.id,
-            messages,
-            tools: useTools ? TOOL_SCHEMAS : void 0,
-            temperature: 0.2,
-            maxTokens: 2e3
-          },
-          model.pricing
-        );
-        let next = await gen.next();
-        while (!next.done) {
-          emit({ type: "text", delta: next.value });
-          next = await gen.next();
-        }
-        const result = next.value;
-        const entry = logCompletion(result, step.type, sessionId);
-        emit({ type: "usage", entry });
-        totalCostUsd += entry.costUsd;
-        totalTokens += entry.totalTokens;
-        totalPromptTokens += entry.promptTokens;
-        totalCompletionTokens += entry.completionTokens;
-        stepPrompt += entry.promptTokens;
-        stepCompletion += entry.completionTokens;
-        stepCost += entry.costUsd;
-        calls++;
-        if (result.toolCalls.length && useTools) {
-          messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
-          let finished = false;
-          for (const tc of result.toolCalls) {
-            stepToolCalls++;
-            emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
-            const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
-            emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
-            messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
-            if (outcome.finishSummary != null) {
-              summary = outcome.finishSummary;
-              finished = true;
-            }
-          }
-          if (finished) {
-            finishedBy = "finish-tool";
-            break;
-          }
-          continue;
-        }
-        summary = result.content || summary;
-        if (summary) finishedBy = "text";
-        break;
+  while (attemptNo < maxAttempts) {
+    const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
+    const rungPolicy = applyRung(deps.policy, rungDef);
+    const attemptStart = Date.now();
+    const before = { ...acc };
+    if (attemptNo === 0) {
+      for (const step of plan.steps) {
+        const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
+        if (res.success) completedSteps++;
+        else failedSteps++;
       }
-    } catch (err) {
-      finishedBy = "error";
-      emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
+    } else {
+      await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
     }
-    const success = finishedBy === "finish-tool" || finishedBy === "text";
-    if (success) completedSteps++;
-    else failedSteps++;
-    recordStepRun({
-      sessionId,
-      stepNo: step.id,
-      taskType: step.type,
-      skill: TASK_SKILL[step.type],
-      model: model.id,
-      provider: model.provider,
-      iterations,
-      toolCalls: stepToolCalls,
-      promptTokens: stepPrompt,
-      completionTokens: stepCompletion,
-      costUsd: stepCost,
-      finishedBy,
-      success,
-      durationMs: Date.now() - stepStart
+    if (!verifyOn) {
+      attemptNo++;
+      break;
+    }
+    const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
+    const verifier = routeOrBest("verify", models, verifyPolicy);
+    if (!verifier) {
+      emit({ type: "error", message: "No model available to verify." });
+      attemptNo++;
+      break;
+    }
+    emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
+    verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
+      onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
+      onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
+      onUsage: (r) => logUsage(r, "review")
+    });
+    emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
+    recordAttempt({
+      sessionId: deps.sessionId,
+      attemptNo: attemptNo + 1,
+      goalType: plan.goalType,
+      tierFloor: rungDef.tierFloor ?? null,
+      objective: rungDef.objective,
+      promptTokens: acc.prompt - before.prompt,
+      completionTokens: acc.completion - before.completion,
+      costUsd: acc.cost - before.cost,
+      criteriaTotal: verdict.total,
+      criteriaMet: verdict.metCount,
+      passed: verdict.allMet,
+      durationMs: Date.now() - attemptStart
     });
-    if (!summary) summary = "(no summary)";
-    priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
-    emit({ type: "step-end", step, summary });
+    attemptNo++;
+    if (verdict.allMet) break;
+    if (attemptNo < maxAttempts) {
+      const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
+      rung = next;
+      emit({
+        type: "escalate",
+        toRung: ESCALATION_LADDER[next].label,
+        reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
+      });
+    }
   }
-  finishSession(sessionId, {
+  const passed = verifyOn ? verdict ? verdict.allMet : false : null;
+  finishSession(deps.sessionId, {
     plannedSteps: plan.steps.length,
     completedSteps,
     failedSteps,
-    autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
-    promptTokens: totalPromptTokens,
-    completionTokens: totalCompletionTokens,
-    costUsd: totalCostUsd,
-    durationMs: Date.now() - sessionStart
+    autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
+    promptTokens: acc.prompt,
+    completionTokens: acc.completion,
+    costUsd: acc.cost,
+    durationMs: Date.now() - sessionStart,
+    attempts: attemptNo,
+    finalPassed: passed
   });
-  emit({ type: "done", totalCostUsd, totalTokens, calls });
-  return { totalCostUsd, totalTokens, calls };
+  emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
+  return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
 }
-function localDate2(d = /* @__PURE__ */ new Date()) {
-  const y = d.getFullYear();
-  const m = String(d.getMonth() + 1).padStart(2, "0");
-  const day = String(d.getDate()).padStart(2, "0");
-  return `${y}-${m}-${day}`;
+/**
+ * Runs a single planned step end to end.
+ *
+ * Asks `routeOrBest` for a model for the step's task type (passing the step's
+ * estimated token counts), drives `runToolLoop` with a system + user message
+ * pair, hands the loop's counters to `recordStepRun`, and appends a one-line
+ * recap to `priorSummaries` (the caller's array is mutated in place).
+ *
+ * @param {object} step - Plan step; reads `id`, `type`, `description`, `estPromptTokens`, `estCompletionTokens`.
+ * @param {object} policy - Forwarded unchanged to `routeOrBest`.
+ * @param {object} rungDef - Forwarded unchanged to `runToolLoop`.
+ * @param {object} deps - Reads `models` and `sessionId`; also forwarded to `runToolLoop`.
+ * @param {object} toolCtx - Forwarded unchanged to `runToolLoop`.
+ * @param {string[]} priorSummaries - Recaps of earlier steps; one entry is pushed here.
+ * @param {Function} emit - Event sink; receives `error`, `step-start` and `step-end` events.
+ * @param {Function} logUsage - Forwarded unchanged to `runToolLoop`.
+ * @param {string} goal - Overall goal text, embedded in the system prompt.
+ * @returns {Promise<{summary: string, success: boolean}>} `success` is false when no model was routed.
+ */
+async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
+  const estimate = {
+    promptTokens: step.estPromptTokens,
+    completionTokens: step.estCompletionTokens
+  };
+  const route = routeOrBest(step.type, deps.models, policy, estimate);
+  if (!route) {
+    // Nothing can take this step: report it and let the caller count the failure.
+    emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
+    return { summary: "(no model)", success: false };
+  }
+  const model = route.model;
+  emit({ type: "step-start", step, model, estCostUsd: route.estCostUsd });
+  const systemPrompt = stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools);
+  const messages = [
+    { role: "system", content: systemPrompt },
+    { role: "user", content: step.description }
+  ];
+  const run = await runToolLoop(model, messages, step.type, rungDef, deps, toolCtx, emit, logUsage);
+  recordStepRun({
+    sessionId: deps.sessionId,
+    stepNo: step.id,
+    taskType: step.type,
+    skill: TASK_SKILL[step.type],
+    model: model.id,
+    provider: model.provider,
+    iterations: run.iterations,
+    toolCalls: run.toolCalls,
+    promptTokens: run.prompt,
+    completionTokens: run.completion,
+    costUsd: run.cost,
+    finishedBy: run.finishedBy,
+    success: run.success,
+    durationMs: run.durationMs
+  });
+  // An empty loop summary is replaced by a placeholder so later prompts never see a blank recap.
+  let summary = run.summary;
+  if (!summary) {
+    summary = "(no summary)";
+  }
+  priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
+  emit({ type: "step-end", step, summary });
+  return { summary, success: run.success };
+}
+/**
+ * Runs one "fix" pass after verification reported unmet acceptance criteria.
+ *
+ * Routes an "edit" task through `routeOrBest`, builds a prompt that lists every
+ * unmet criterion with the verifier's reason, drives `runToolLoop`, and hands
+ * the loop's counters to `recordStepRun` under the synthetic step id 100.
+ *
+ * @param {string} goal - Overall goal text, embedded in the system prompt.
+ * @param {object} plan - Not read by this function; kept for signature compatibility.
+ * @param {object} verdict - Reads `unmet` (items with `criterion` and `reason`) and `feedback`.
+ * @param {object} policy - Forwarded unchanged to `routeOrBest`.
+ * @param {object} rungDef - Forwarded unchanged to `runToolLoop`.
+ * @param {object} deps - Reads `models` and `sessionId`; also forwarded to `runToolLoop`.
+ * @param {object} toolCtx - Forwarded unchanged to `runToolLoop`.
+ * @param {Function} emit - Event sink; receives `error`, `step-start` and `step-end` events.
+ * @param {Function} logUsage - Forwarded unchanged to `runToolLoop`.
+ * @returns {Promise<{summary: string, success: boolean}>} `success` is false when no model was routed.
+ */
+async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
+  const r = routeOrBest("edit", deps.models, policy);
+  if (!r) {
+    // Report the routing failure like runStep does; otherwise the fix pass
+    // disappears without any event explaining why nothing was attempted.
+    emit({ type: "error", message: "No capable model for the fix pass (edit)." });
+    return { summary: "(no model)", success: false };
+  }
+  const model = r.model;
+  // NOTE(review): id 100 is a fixed sentinel for the fix pass; presumably plans
+  // never reach 100 steps - confirm against the planner.
+  const fixStep = {
+    id: 100,
+    type: "edit",
+    description: "Fix the unmet acceptance criteria",
+    estPromptTokens: 9e3,
+    estCompletionTokens: 1500
+  };
+  emit({ type: "step-start", step: fixStep, model, estCostUsd: r.estCostUsd });
+  const unmet = verdict.unmet.map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`).join("\n");
+  // Keep the literal text "undefined" out of the prompt when the verdict has no feedback.
+  const feedback = verdict.feedback ?? "(none)";
+  const messages = [
+    {
+      role: "system",
+      content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
+Overall goal: ${goal}
+You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
+If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
+    },
+    { role: "user", content: `Unmet criteria:
+${unmet}
+Verifier feedback: ${feedback}` }
+  ];
+  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
+  recordStepRun({
+    sessionId: deps.sessionId,
+    stepNo: fixStep.id,
+    taskType: "edit",
+    skill: TASK_SKILL.edit,
+    model: model.id,
+    provider: model.provider,
+    iterations: loop.iterations,
+    toolCalls: loop.toolCalls,
+    promptTokens: loop.prompt,
+    completionTokens: loop.completion,
+    costUsd: loop.cost,
+    finishedBy: loop.finishedBy,
+    success: loop.success,
+    durationMs: loop.durationMs
+  });
+  emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" });
+  return { summary: loop.summary, success: loop.success };
+}
+/**
+ * Drives one model through a bounded request / tool-call loop.
+ *
+ * Each iteration streams a completion from `deps.client.stream`, forwards text
+ * deltas as `text` events, adds the usage entry returned by `logUsage` to the
+ * running totals, and then takes one of three paths:
+ *   - native tool calls: every call is executed and answered with a `tool` message;
+ *   - a tool call parsed from plain text by `parseTextToolCall`: executed, and its
+ *     result fed back as a `user` message;
+ *   - anything else: the text is taken as the final summary.
+ * The loop ends on a tool outcome carrying `finishSummary`, on a plain-text
+ * answer, after `rungDef.maxIters` iterations, or when something throws.
+ * `messages` is extended in place.
+ *
+ * @param {object} model - Reads `id`, `pricing` and `capabilities.tools`.
+ * @param {object[]} messages - Conversation so far; mutated.
+ * @param {string} taskTypeForLog - Passed to `logUsage` and used in the error message.
+ * @param {object} rungDef - Reads `maxIters` and `maxTokens`.
+ * @param {object} deps - Reads `client.stream`.
+ * @param {object} toolCtx - Forwarded unchanged to `executeTool`.
+ * @param {Function} emit - Event sink; receives `text`, `tool-call`, `tool-result` and `error` events.
+ * @param {Function} logUsage - Called once per completion; its return supplies `promptTokens`, `completionTokens`, `costUsd`.
+ * @returns {Promise<object>} `summary`, `success`, `finishedBy` ("finish-tool" | "text" | "max-iters" | "error"),
+ *   `iterations`, `toolCalls`, `prompt`, `completion`, `cost`, `durationMs`.
+ */
+async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
+  const startedAt = Date.now();
+  const nativeTools = model.capabilities.tools;
+  const usage = { prompt: 0, completion: 0, cost: 0 };
+  let toolCalls = 0;
+  let iterations = 0;
+  let summary = "";
+  let finishedBy = "max-iters";
+  // Executes one tool call (native or text-parsed) and mirrors it on the event stream.
+  const invoke = (call) => {
+    toolCalls += 1;
+    emit({ type: "tool-call", name: call.function.name, args: call.function.arguments });
+    const outcome = executeTool(call.function.name, call.function.arguments, toolCtx);
+    emit({ type: "tool-result", name: call.function.name, result: outcome.result });
+    return outcome;
+  };
+  try {
+    while (iterations < rungDef.maxIters) {
+      iterations += 1;
+      const request = {
+        model: model.id,
+        messages,
+        tools: nativeTools ? TOOL_SCHEMAS : void 0,
+        temperature: 0.2,
+        maxTokens: rungDef.maxTokens
+      };
+      const stream = deps.client.stream(request, model.pricing);
+      // Yielded values are text deltas; the generator's return value is the final result.
+      let chunk = await stream.next();
+      for (; !chunk.done; chunk = await stream.next()) {
+        emit({ type: "text", delta: chunk.value });
+      }
+      const result = chunk.value;
+      const logged = logUsage(result, taskTypeForLog);
+      usage.prompt += logged.promptTokens;
+      usage.completion += logged.completionTokens;
+      usage.cost += logged.costUsd;
+      if (result.toolCalls.length && nativeTools) {
+        messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
+        let sawFinish = false;
+        // Every call in the batch is executed and answered, even after one of them finishes.
+        for (const call of result.toolCalls) {
+          const outcome = invoke(call);
+          messages.push({ role: "tool", tool_call_id: call.id, name: call.function.name, content: outcome.result });
+          if (outcome.finishSummary != null) {
+            summary = outcome.finishSummary;
+            sawFinish = true;
+          }
+        }
+        if (!sawFinish) {
+          continue;
+        }
+        finishedBy = "finish-tool";
+        break;
+      }
+      const textCall = nativeTools ? parseTextToolCall(result.content) : null;
+      if (!textCall) {
+        // Plain answer: keep any earlier summary when the model returned nothing.
+        summary = result.content || summary;
+        if (summary) {
+          finishedBy = "text";
+        }
+        break;
+      }
+      const outcome = invoke(textCall);
+      if (outcome.finishSummary != null) {
+        summary = outcome.finishSummary;
+        finishedBy = "finish-tool";
+        break;
+      }
+      messages.push({ role: "assistant", content: result.content });
+      messages.push({
+        role: "user",
+        content: `Tool ${textCall.function.name} returned:
+${outcome.result}
+Continue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
+      });
+    }
+  } catch (err) {
+    // Failures are reported as an event; totals gathered so far are still returned.
+    finishedBy = "error";
+    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
+  }
+  const success = finishedBy === "finish-tool" || finishedBy === "text";
+  return {
+    summary,
+    success,
+    finishedBy,
+    iterations,
+    toolCalls,
+    prompt: usage.prompt,
+    completion: usage.completion,
+    cost: usage.cost,
+    durationMs: Date.now() - startedAt
+  };
 }
 function stepSystemPrompt(goal, step, priorSummaries, useTools) {
   const context = priorSummaries.length ? `
@@ -2344,12 +2759,13 @@ function stepSystemPrompt(goal, step, priorSummaries, useTools) {
 What previous steps accomplished:
 ${priorSummaries.join("\n")}` : "";
   const toolNote = useTools ? `
-You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met.` : `
+You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met.
+If you cannot call tools natively, reply with ONLY one JSON object per turn, no prose: {"name":"<tool>","arguments":{...}}` : `
 Return a concise result for this step. Do not ask the user questions.`;
   return `You are the "${step.type}" stage of an autonomous coding agent.
 Overall goal: ${goal}
 Your current step: ${step.description}${context}${toolNote}
-Be efficient \u2014 you were selected as the cheapest capable model for this step.`;
+Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
 }
 // src/tui/App.tsx
@@ -2365,6 +2781,8 @@ function App(props) {
   const [tok, setTok] = useState(0);
   const [calls, setCalls] = useState(0);
   const [rated, setRated] = useState(null);
+  const [passed, setPassed] = useState(null);
+  const [attempts, setAttempts] = useState(0);
   const push = useCallback((text, color) => {
     setLog((l) => [...l, { key: l.length, text, color }]);
   }, []);
@@ -2382,7 +2800,9 @@ function App(props) {
       sessionId: props.sessionId,
       cwd: props.cwd,
       allowWrite: props.allowWrite,
-      allowCommands: props.allowCommands
+      allowCommands: props.allowCommands,
+      verify: props.verify,
+      maxAttempts: props.maxAttempts
     };
     let textBuf = "";
     const flush = () => {
@@ -2394,6 +2814,24 @@ function App(props) {
         case "plan":
           push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
           break;
+        case "criteria":
+          push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
+          e.criteria.forEach((cr, i) => push(`   ${i + 1}. ${cr}`, "gray"));
+          break;
+        case "verify-start":
+          flush();
+          push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
+          break;
+        case "verdict":
+          flush();
+          push(
+            `${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
+            e.allMet ? "green" : "red"
+          );
+          break;
+        case "escalate":
+          push(`\u23EB Escalate \u2192 ${e.toRung}  (${e.reason})`, "magenta");
+          break;
         case "step-start":
           flush();
           push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id}  ~${usd(e.estCostUsd)}`, "yellow");
@@ -2423,6 +2861,8 @@ function App(props) {
           break;
         case "done":
           flush();
+          setPassed(e.passed);
+          setAttempts(e.attempts);
           break;
       }
     };
@@ -2482,10 +2922,14 @@ function App(props) {
         " working\u2026"
       ] }),
       phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
-        /* @__PURE__ */ jsxs(Text, { color: "green", children: [
-          "\u2713 Done \xB7 ",
+        /* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
+          passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
+          " \xB7 ",
+          attempts,
+          " attempt(s) \xB7 ",
           calls,
-          " calls \xB7 ",
+          " calls \xB7",
+          " ",
           tokens(tok),
           " tokens \xB7 ",
           usd(cost)
@@ -2570,7 +3014,7 @@ function truncate2(s, n) {
 // src/index.ts
 var program = new Command();
-program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.3.0");
+program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.4.0");
 function client(config) {
   return new OpenRouterClient({
     apiKey: resolveApiKey(config),
@@ -2644,7 +3088,7 @@ async function loadCatalog(config, refresh = false) {
 program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
   await runLogin();
 });
-program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
+program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
   const startedAt = Date.now();
   const config = loadConfig();
   if (!config.local.enabled || resolveApiKey(config)) {
@@ -2669,6 +3113,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
       allowWrite: !!opts.write,
       allowCommands: !!opts.commands,
       objectiveLabel: policy.objective,
+      verify: opts.verify !== false,
+      maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
       initialGoal: goal
     })
   );

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "polymath-agent",
-  "version": "0.3.0",
+  "version": "0.4.0",
   "description": "Polymath — a cost-optimized, multi-model TUI coding agent. Decomposes work into typed tasks, routes each task to the cheapest capable model via OpenRouter, and logs real usage/cost by date + model.",
   "type": "module",
   "bin": {