polymath-agent 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +52 -1
  2. package/dist/cli.js +876 -190
  3. package/package.json +3 -2
package/dist/cli.js CHANGED
@@ -599,11 +599,14 @@ var TASK_SPECS = {
599
599
  command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
600
600
  review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
601
601
  reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
602
+ // The verify gate inspects files / runs tests — it MUST have tools.
603
+ verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
602
604
  explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
603
605
  summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
604
606
  chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
605
607
  };
606
608
  var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
609
// Every goal type a plan may be classified as; "other" is the catch-all.
var ALL_GOAL_TYPES = [
  "feature",
  "bugfix",
  "refactor",
  "test",
  "docs",
  "chore",
  "other"
];
607
610
 
608
611
  // src/planner/planner.ts
609
612
  var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
619
622
  summarize - condense long content
620
623
  chat - a simple conversational reply
621
624
 
625
+ Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
626
+
622
627
  Return ONLY minified JSON of the form:
623
- {"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
628
+ {"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
624
629
  Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
630
/**
 * Heuristically classify a free-text goal into a goal type.
 *
 * Rules are tried in priority order and the first match wins, so a goal that
 * mentions both a fix and a test is classified as "bugfix".
 *
 * @param {string} goal - The user's request text; nullish input is treated as empty.
 * @returns {string} One of "bugfix", "refactor", "test", "docs", "chore",
 *   "feature", or "other" when no rule matches.
 */
function classifyGoalType(goal) {
  const g = String(goal ?? "").toLowerCase();
  // Keyword patterns accept plural / inflected forms ("tests", "fixes",
  // "comments") so that e.g. "add tests" is not swallowed by the later
  // "feature" rule via the word "add".
  const rules = [
    ["bugfix", /\b(fix(es|ed|ing)?|bugs?|broken|errors?|crash(es|ed|ing)?|regressions?|fail(s|ed|ing|ures?)?)\b/],
    // No trailing \b: "simplif" is a deliberate prefix (simplify, simplification).
    ["refactor", /\b(refactor|rename|clean ?up|restructure|extract|simplif)/],
    ["test", /\b(tests?|testing|specs?|coverage|e2e)\b/],
    ["docs", /\b(docs?|readme|comments?|documentation)\b/],
    ["chore", /\b(bump|upgrade|dependency|dependencies|deps|config|chore|lint|format)\b/],
    ["feature", /\b(add|create|implement|build|feature|support|new)\b/]
  ];
  for (const [type, pattern] of rules) {
    if (pattern.test(g)) return type;
  }
  return "other";
}
625
640
  function heuristicPlan(goal) {
626
641
  const steps = [
627
642
  { id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
630
645
  { id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
631
646
  { id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
632
647
  ];
633
- return { goal, steps };
648
+ return {
649
+ goal,
650
+ steps,
651
+ goalType: classifyGoalType(goal),
652
+ criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
653
+ };
634
654
  }
635
655
  async function planRequest(goal, client2, planModel, onUsage) {
636
656
  const result = await client2.complete(
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
648
668
  onUsage?.(result);
649
669
  const parsed = extractPlan(result.content);
650
670
  if (!parsed) return heuristicPlan(goal);
651
- return { goal, steps: parsed };
671
+ return { goal, ...parsed };
652
672
  }
653
673
  function extractPlan(text) {
654
674
  const json = extractJson(text);
@@ -663,7 +683,10 @@ function extractPlan(text) {
663
683
  estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
664
684
  estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
665
685
  }));
666
- return steps.length ? steps : null;
686
+ if (!steps.length) return null;
687
+ const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
688
+ const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
689
+ return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
667
690
  } catch {
668
691
  return null;
669
692
  }
@@ -705,10 +728,29 @@ function extractJson(text) {
705
728
  }
706
729
 
707
730
  // src/router/policy.ts
731
// Escalation rungs in the order they are tried. Each rung carries a routing
// objective, token / iteration budgets, whether the per-call cost cap is
// lifted, a display label, and (except for the first rung) a tier floor.
var ESCALATION_LADDER = [
  {
    objective: "value",
    maxTokens: 2e3,
    maxIters: 6,
    liftCostCap: false,
    label: "value \xB7 cheapest-capable"
  },
  {
    tierFloor: "standard",
    objective: "value",
    maxTokens: 4e3,
    maxIters: 8,
    liftCostCap: true,
    label: "standard+ \xB7 more tokens"
  },
  {
    tierFloor: "frontier",
    objective: "quality",
    maxTokens: 8e3,
    maxIters: 10,
    liftCostCap: true,
    label: "frontier \xB7 strongest"
  }
];
736
/**
 * Find the index of the escalation rung that corresponds to a tier.
 *
 * A rung matches when its tierFloor equals the tier; a rung without a
 * tierFloor additionally matches the "cheap" tier.
 *
 * @param {string} tier - Tier name to look up.
 * @returns {number} Index into ESCALATION_LADDER, or -1 when no rung matches.
 */
function rungForTier(tier) {
  for (let i = 0; i < ESCALATION_LADDER.length; i++) {
    const floor = ESCALATION_LADDER[i].tierFloor;
    if (floor === tier) return i;
    if (!floor && tier === "cheap") return i;
  }
  return -1;
}
739
/**
 * Derive a routing policy for one escalation rung.
 *
 * Copies the base policy, then overrides objective and tierFloor with the
 * rung's values. When the rung lifts the cost cap, maxCostPerCallUsd is set
 * to undefined; otherwise the base value is kept.
 *
 * @param {object} base - Base policy; not mutated.
 * @param {object} rung - Escalation rung (objective, tierFloor, liftCostCap).
 * @returns {object} A new policy object.
 */
function applyRung(base, rung) {
  const { objective, tierFloor, liftCostCap } = rung;
  const maxCostPerCallUsd = liftCostCap ? undefined : base.maxCostPerCallUsd;
  return Object.assign({}, base, { objective, tierFloor, maxCostPerCallUsd });
}
708
747
  var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 };
709
748
  function tierAtLeast(tier, min) {
710
749
  return TIER_RANK[tier] >= TIER_RANK[min];
711
750
  }
751
/**
 * Look up the numeric rank of a tier.
 *
 * @param {string} tier - Tier name (a key of TIER_RANK).
 * @returns {number|undefined} The rank, or undefined for an unknown tier.
 */
function tierRank(tier) {
  const rank = TIER_RANK[tier];
  return rank;
}
712
754
  function blendedPrice(m) {
713
755
  return (m.pricing.promptUsdPerMTok * 3 + m.pricing.completionUsdPerMTok) / 4;
714
756
  }
@@ -755,6 +797,7 @@ var TASK_SKILL = {
755
797
  command: "speed",
756
798
  review: "reasoning",
757
799
  reason: "reasoning",
800
+ verify: "reasoning",
758
801
  explain: "general",
759
802
  summarize: "speed",
760
803
  chat: "speed"
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
781
824
  edit: 1.4,
782
825
  review: 1.5,
783
826
  reason: 1.5,
827
+ verify: 1.4,
784
828
  plan: 1.2
785
829
  };
786
830
  var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
798
842
  function candidatesFor(taskType, models, policy, est) {
799
843
  const spec = TASK_SPECS[taskType];
800
844
  const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
845
+ const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
801
846
  return models.filter((m) => {
802
847
  if (m.id === "openrouter/auto") return false;
803
- const covers = tierAtLeast(m.tier, spec.minTier) || taskStrength(m, taskType) >= strengthFloor;
848
+ const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
804
849
  if (!covers) return false;
805
850
  if (spec.needsTools && !m.capabilities.tools) return false;
806
851
  if (policy.maxCostPerCallUsd != null && est) {
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
846
891
  const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
847
892
  return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
848
893
  }
894
/**
 * Route a task, falling back to the strongest usable model when the normal
 * router finds no candidate.
 *
 * The fallback ignores the policy: it drops "openrouter/auto" and, when the
 * task needs tools, models without tool support. Among what is left it
 * prefers tool-capable models and picks the one with the highest task strength.
 *
 * @param {string} taskType - Key of TASK_SPECS.
 * @param {Array<object>} models - Available models.
 * @param {object} policy - Routing policy passed through to route().
 * @param {object} [est] - Token estimate used for cost projection.
 * @returns {object|null} { model, reason, estCostUsd }, or null when no model is usable.
 */
function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
  const routed = route(taskType, models, policy, est);
  if (routed) return routed;

  const spec = TASK_SPECS[taskType];
  const usable = models.filter((m) => {
    if (m.id === "openrouter/auto") return false;
    return !spec.needsTools || m.capabilities.tools;
  });
  if (!usable.length) return null;

  const strongestFirst = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
  const toolCapable = usable.filter((m) => m.capabilities.tools);
  const pool = toolCapable.length ? toolCapable : [...usable];
  pool.sort(strongestFirst);
  const best = pool[0];

  return {
    model: best,
    reason: `best available for ${TASK_SKILL[taskType]} (fallback)`,
    estCostUsd: projectCost(best, est)
  };
}
849
907
 
850
908
  // src/recommend/recommend.ts
851
909
  var OBJECTIVES = [
@@ -1075,6 +1133,27 @@ function getDb() {
1075
1133
  );
1076
1134
  CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
1077
1135
 
1136
+ -- One row per verify-and-escalate attempt within a session. Powers the
1137
+ -- "optimal starting model per goal type" statistical learning.
1138
+ CREATE TABLE IF NOT EXISTS attempts (
1139
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
1140
+ session_id TEXT NOT NULL,
1141
+ attempt_no INTEGER NOT NULL,
1142
+ goal_type TEXT NOT NULL,
1143
+ tier_floor TEXT,
1144
+ objective TEXT NOT NULL,
1145
+ prompt_tokens INTEGER NOT NULL,
1146
+ completion_tokens INTEGER NOT NULL,
1147
+ cost_usd REAL NOT NULL,
1148
+ criteria_total INTEGER NOT NULL,
1149
+ criteria_met INTEGER NOT NULL,
1150
+ passed INTEGER NOT NULL,
1151
+ duration_ms INTEGER NOT NULL,
1152
+ synced INTEGER NOT NULL DEFAULT 0
1153
+ );
1154
+ CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
1155
+ CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
1156
+
1078
1157
  -- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
1079
1158
  -- This is what syncs to the cloud by default (raw logs stay local).
1080
1159
  CREATE TABLE IF NOT EXISTS insights (
@@ -1096,6 +1175,15 @@ function getDb() {
1096
1175
  if (!cols.some((c2) => c2.name === "command")) {
1097
1176
  db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
1098
1177
  }
1178
+ const conn = db;
1179
+ const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
1180
+ const addSession = (name, decl) => {
1181
+ if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
1182
+ };
1183
+ addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
1184
+ addSession("start_tier", "TEXT");
1185
+ addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
1186
+ addSession("final_passed", "INTEGER");
1099
1187
  return db;
1100
1188
  }
1101
1189
  function recordUsage(e) {
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
1188
1276
  }
1189
1277
  function startSession(s) {
1190
1278
  getDb().prepare(
1191
- `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps)
1192
- VALUES (?, ?, ?, ?, ?, ?, ?)`
1193
- ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps);
1279
+ `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
1280
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
1281
+ ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps, s.goalType, s.startTier ?? null);
1194
1282
  }
1195
1283
  function finishSession(id, u) {
1196
1284
  getDb().prepare(
1197
1285
  `UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
1198
- prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
1286
+ prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
1199
1287
  ).run(
1200
1288
  u.plannedSteps,
1201
1289
  u.completedSteps,
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
1205
1293
  u.completionTokens,
1206
1294
  u.costUsd,
1207
1295
  u.durationMs,
1296
+ u.attempts ?? 1,
1297
+ u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
1208
1298
  id
1209
1299
  );
1210
1300
  }
1301
/**
 * Insert one row into the `attempts` table for a single attempt of a session.
 *
 * @param {object} a - Attempt record.
 * @param {string} a.sessionId - Owning session id.
 * @param {number} a.attemptNo - Attempt number within the session.
 * @param {string} a.goalType - Goal type of the session.
 * @param {string} [a.tierFloor] - Tier floor used for the attempt; stored as
 *   NULL when absent.
 * @param {string} a.objective - Routing objective used.
 * @param {number} a.promptTokens - Prompt tokens spent.
 * @param {number} a.completionTokens - Completion tokens spent.
 * @param {number} a.costUsd - Cost of the attempt in USD.
 * @param {number} a.criteriaTotal - Number of acceptance criteria.
 * @param {number} a.criteriaMet - Number of criteria met.
 * @param {boolean} a.passed - Whether the attempt passed; stored as 1/0.
 * @param {number} a.durationMs - Duration in milliseconds.
 */
function recordAttempt(a) {
  getDb().prepare(
    `INSERT INTO attempts
 (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
 cost_usd, criteria_total, criteria_met, passed, duration_ms)
 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(
    a.sessionId,
    a.attemptNo,
    a.goalType,
    // tier_floor is nullable and a rung may have no tierFloor; normalise
    // undefined to null (as startSession does for start_tier) so the value is
    // always bindable.
    a.tierFloor ?? null,
    a.objective,
    a.promptTokens,
    a.completionTokens,
    a.costUsd,
    a.criteriaTotal,
    a.criteriaMet,
    a.passed ? 1 : 0,
    a.durationMs
  );
}
1322
/**
 * Aggregate verified sessions (final_passed is not NULL) by goal type and
 * starting tier. A NULL start_tier is reported as "cheap".
 *
 * @returns {Array<object>} Rows of { goalType, startTier, sessions, passRate,
 *   avgTotalTokens, avgAttempts }, ordered by goal type and then by ascending
 *   average total tokens.
 */
function goalTierStats() {
  const sql = `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
 COUNT(*) AS sessions,
 AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
 AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
 AVG(attempts) AS avgAttempts
 FROM sessions
 WHERE final_passed IS NOT NULL
 GROUP BY goal_type, startTier
 ORDER BY goal_type, avgTotalTokens ASC`;
  const rows = getDb().prepare(sql).all();
  const stats = [];
  for (const row of rows) {
    // Coerce every column so callers always get plain strings / numbers.
    stats.push({
      goalType: String(row.goalType),
      startTier: String(row.startTier),
      sessions: Number(row.sessions),
      passRate: Number(row.passRate ?? 0),
      avgTotalTokens: Number(row.avgTotalTokens ?? 0),
      avgAttempts: Number(row.avgAttempts ?? 0)
    });
  }
  return stats;
}
1343
/**
 * Pick the learned starting tier for a goal type.
 *
 * Only tiers with at least `minSessions` verified sessions and a pass rate of
 * at least 0.6 are eligible; among those, the tier with the lowest average
 * total tokens wins (the first such tier on ties).
 *
 * @param {string} goalType - Goal type to look up.
 * @param {number} [minSessions=3] - Minimum sessions required as evidence.
 * @returns {string|null} The starting tier, or null when nothing is eligible.
 */
function optimalStartTier(goalType, minSessions = 3) {
  let best = null;
  for (const s of goalTierStats()) {
    const eligible = s.goalType === goalType && s.sessions >= minSessions && s.passRate >= 0.6;
    if (!eligible) continue;
    if (best === null || s.avgTotalTokens < best.avgTotalTokens) best = s;
  }
  return best === null ? null : best.startTier;
}
1211
1350
  function setUserScore(sessionId, score) {
1212
1351
  getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`).run(score, sessionId);
1213
1352
  }
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
1651
1790
  }
1652
1791
  out.push("");
1653
1792
  }
1793
+ const tierStats = goalTierStats();
1794
+ if (tierStats.length) {
1795
+ out.push(c.bold("Optimal starting model per goal type") + c.dim(" (pass rate vs total tokens to reach the goal)"));
1796
+ out.push(
1797
+ table(
1798
+ ["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
1799
+ tierStats.map((s) => [
1800
+ s.goalType,
1801
+ tierColor(s.startTier),
1802
+ String(s.sessions),
1803
+ `${Math.round(s.passRate * 100)}%`,
1804
+ tokens(Math.round(s.avgTotalTokens)),
1805
+ s.avgAttempts.toFixed(1)
1806
+ ])
1807
+ )
1808
+ );
1809
+ const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
1810
+ const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
1811
+ if (learned.length) {
1812
+ out.push(
1813
+ c.green(
1814
+ "\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
1815
+ )
1816
+ );
1817
+ } else {
1818
+ out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
1819
+ }
1820
+ out.push("");
1821
+ }
1654
1822
  if (byCommand.length) {
1655
1823
  out.push(c.bold("Usage by command"));
1656
1824
  out.push(
@@ -2012,6 +2180,274 @@ function logCompletion(result, taskType, sessionId, command = "run") {
2012
2180
  return entry;
2013
2181
  }
2014
2182
 
2183
+ // src/setup/commands.ts
2184
+ import { execSync as execSync2 } from "node:child_process";
2185
+
2186
+ // src/util/prompt.ts
2187
+ import readline2 from "node:readline";
2188
/**
 * Report whether the process is attached to a terminal on both ends.
 *
 * @returns {boolean} True only when stdin and stdout are both TTYs.
 */
function interactive() {
  if (process.stdin.isTTY !== true) return false;
  return process.stdout.isTTY === true;
}
2191
/**
 * Prompt on stdout and read one line from stdin.
 *
 * @param {string} question - Prompt text.
 * @returns {Promise<string>} The answer with surrounding whitespace trimmed.
 */
function ask2(question) {
  return new Promise((resolve2) => {
    const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
    const onAnswer = (answer) => {
      // Close the interface first so stdin is released before resolving.
      rl.close();
      resolve2(answer.trim());
    };
    rl.question(question, onAnswer);
  });
}
2200
/**
 * Ask a yes/no question.
 *
 * In a non-interactive process no prompt is shown and the default is
 * returned. An empty answer also yields the default; otherwise any answer
 * starting with "y" (case-insensitive) counts as yes.
 *
 * @param {string} question - Question text (a [Y/n] or [y/N] hint is appended).
 * @param {boolean} [def=true] - Default answer.
 * @returns {Promise<boolean>} The user's choice.
 */
async function confirm(question, def = true) {
  if (!interactive()) return def;
  const hint = def ? "[Y/n]" : "[y/N]";
  const reply = await ask2(`${question} ${hint} `);
  const answer = reply.toLowerCase();
  return answer ? answer.startsWith("y") : def;
}
2207
/**
 * Let the user pick one item from a numbered list.
 *
 * Returns the first item without prompting when the process is not
 * interactive or there is at most one item. An invalid or out-of-range answer
 * also falls back to the first item.
 *
 * @param {string} question - Heading printed above the list.
 * @param {Array<*>} items - Choices.
 * @param {function(*): string} render2 - Renders one item as a list line.
 * @returns {Promise<*>} The chosen item (undefined when items is empty).
 */
async function select(question, items, render2) {
  if (!interactive() || items.length <= 1) return items[0];
  console.log(question);
  for (const [i, it] of items.entries()) {
    console.log(` ${i + 1}) ${render2(it)}`);
  }
  const a = await ask2(`Choose [1-${items.length}] (default 1): `);
  const n = parseInt(a, 10);
  const inRange = Number.isInteger(n) && n >= 1 && n <= items.length;
  return inRange ? items[n - 1] : items[0];
}
2215
+
2216
+ // src/setup/localllm.ts
2217
+ import { execSync, spawn } from "node:child_process";
2218
+ import os2 from "node:os";
2219
/**
 * Suggest local models to download based on the machine's total RAM.
 *
 * The 3B model is always offered; the 7B coder needs >= 13GB and the 14B
 * coder >= 30GB (rounded). Order is: 7B, 3B, 14B.
 *
 * @returns {Array<object>} Suggestions of { id, label, sizeGb, note }.
 */
function suggestModels() {
  const ramGb = Math.round(os2.totalmem() / 1024 ** 3);
  const catalog = [
    { minRamGb: 13, model: { id: "qwen2.5-coder:7b", label: "Qwen2.5 Coder 7B", sizeGb: 4.7, note: "best coding pick for ~16GB" } },
    { minRamGb: null, model: { id: "llama3.2:3b", label: "Llama 3.2 3B", sizeGb: 2, note: "fast, light; great for cheap tasks" } },
    { minRamGb: 30, model: { id: "qwen2.5-coder:14b", label: "Qwen2.5 Coder 14B", sizeGb: 9, note: "stronger coding for 32GB+" } }
  ];
  return catalog
    .filter((entry) => entry.minRamGb === null || ramGb >= entry.minRamGb)
    .map((entry) => entry.model);
}
2227
/**
 * Total system memory in GiB, rounded to the nearest integer.
 *
 * @returns {number} Rounded RAM size.
 */
function totalRamGb() {
  const bytesPerGb = 1024 ** 3;
  return Math.round(os2.totalmem() / bytesPerGb);
}
2230
/**
 * Check whether a command is resolvable on PATH.
 *
 * Uses `where` on Windows and the shell builtin `command -v` elsewhere; any
 * failure of the probe is reported as "not found".
 *
 * @param {string} cmd - Command name; interpolated into a shell command, so
 *   callers must pass trusted names only.
 * @returns {boolean} True when the probe exits successfully.
 */
function which(cmd) {
  try {
    const probe = process.platform === "win32" ? `where ${cmd}` : `command -v ${cmd}`;
    execSync(probe, { stdio: "ignore" });
  } catch {
    return false;
  }
  return true;
}
2238
/**
 * Report whether the `ollama` executable is on PATH.
 *
 * @returns {boolean} True when `ollama` can be resolved.
 */
function ollamaInstalled() {
  const found = which("ollama");
  return found;
}
2241
/**
 * Read the output of `ollama --version`.
 *
 * @returns {string|null} Trimmed command output, or null when the command fails.
 */
function ollamaVersion() {
  let raw;
  try {
    raw = execSync("ollama --version", { encoding: "utf8" });
  } catch {
    return null;
  }
  return raw.trim();
}
2248
/**
 * Probe whether an Ollama server answers at the given base URL.
 *
 * A trailing "/v1" (with optional slash) is stripped before requesting
 * `/api/version`. Any error is reported as "not up".
 *
 * @param {string} [baseUrl="http://localhost:11434"] - Server base URL.
 * @returns {Promise<boolean>} True when the request succeeds with an OK status.
 */
async function ollamaServerUp(baseUrl = "http://localhost:11434") {
  try {
    const root = baseUrl.replace(/\/v1\/?$/, "");
    const { ok } = await fetch(`${root}/api/version`);
    return ok;
  } catch {
    return false;
  }
}
2256
/**
 * List the model names reported by the Ollama server's `/api/tags` endpoint.
 *
 * A trailing "/v1" (with optional slash) is stripped from the base URL first.
 *
 * @param {string} [baseUrl="http://localhost:11434"] - Server base URL.
 * @returns {Promise<string[]>} Model names; empty on a non-OK response or any error.
 */
async function installedModels(baseUrl = "http://localhost:11434") {
  try {
    const root = baseUrl.replace(/\/v1\/?$/, "");
    const res = await fetch(`${root}/api/tags`);
    if (!res.ok) return [];
    const { models } = await res.json();
    return (models ?? []).map((m) => m.name);
  } catch {
    return [];
  }
}
2266
/**
 * Run a command with inherited stdio and report whether it succeeded.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments.
 * @returns {Promise<boolean>} True when the process exits with code 0; false
 *   on a non-zero exit or when the process cannot be spawned.
 */
function run(cmd, args) {
  return new Promise((resolve2) => {
    spawn(cmd, args, { stdio: "inherit" })
      .on("error", () => resolve2(false))
      .on("exit", (code) => resolve2(code === 0));
  });
}
2273
/**
 * Work out how Ollama can be installed on the current platform.
 *
 * @returns {object} { canAuto, command?, manual } where `command` is
 *   { cmd, args } for an automatic install and `manual` is the instruction
 *   shown to the user.
 */
function ollamaInstallPlan() {
  switch (process.platform) {
    case "darwin": {
      if (!which("brew")) {
        return { canAuto: false, manual: "Install Homebrew (https://brew.sh) then `brew install ollama`, or download https://ollama.com/download" };
      }
      return { canAuto: true, command: { cmd: "brew", args: ["install", "ollama"] }, manual: "brew install ollama" };
    }
    case "linux":
      return {
        canAuto: true,
        command: { cmd: "sh", args: ["-c", "curl -fsSL https://ollama.com/install.sh | sh"] },
        manual: "curl -fsSL https://ollama.com/install.sh | sh"
      };
    case "win32": {
      if (!which("winget")) {
        return { canAuto: false, manual: "Download the installer from https://ollama.com/download" };
      }
      return { canAuto: true, command: { cmd: "winget", args: ["install", "-e", "--id", "Ollama.Ollama"] }, manual: "winget install Ollama.Ollama" };
    }
    default:
      return { canAuto: false, manual: "See https://ollama.com/download" };
  }
}
2288
/**
 * Make sure an Ollama server is reachable, starting one if necessary.
 *
 * If the server is not already up, it is started via `brew services` on
 * macOS with Homebrew, or as a detached `ollama serve` process otherwise.
 * The server is then polled up to 10 times, 500 ms apart.
 *
 * @param {string} [baseUrl="http://localhost:11434"] - Server base URL.
 * @returns {Promise<boolean>} True once the server responds; false if it
 *   never came up within the polling window.
 */
async function ensureServer(baseUrl = "http://localhost:11434") {
  if (await ollamaServerUp(baseUrl)) return true;
  if (process.platform === "darwin" && which("brew")) {
    await run("brew", ["services", "start", "ollama"]);
  } else {
    try {
      const child = spawn("ollama", ["serve"], { stdio: "ignore", detached: true });
      // Launch failures (e.g. ENOENT) are delivered asynchronously as an
      // 'error' event, which try/catch cannot see. Without a listener the
      // event would surface as an uncaught exception and kill the CLI, so
      // swallow it here; the polling loop below reports the outcome.
      child.on("error", () => {});
      child.unref();
    } catch {
      // Best-effort start: synchronous spawn failures are also non-fatal.
    }
  }
  for (let i = 0; i < 10; i++) {
    if (await ollamaServerUp(baseUrl)) return true;
    await delay(500);
  }
  return false;
}
2305
/**
 * Resolve after a delay.
 *
 * @param {number} ms - Milliseconds to wait.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function delay(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
2308
+
2309
+ // src/setup/commands.ts
2310
/**
 * Interactive first-run setup: optionally install a local LLM, then
 * optionally connect an OpenRouter API key.
 *
 * @param {object} opts - CLI options.
 * @param {boolean} [opts.local] - Force (true) or skip (false) local setup;
 *   when undefined the user is asked.
 * @param {boolean} [opts.yes] - Non-interactive mode; never prompts for an API key.
 * @returns {Promise<void>}
 */
async function runSetup(opts) {
  console.log(c.bold("\n\u{1F527} Polymath setup\n"));
  const config = loadConfig();

  const wantLocal = opts.local !== void 0
    ? opts.local
    : await confirm(
      `Install a local LLM (Ollama) for $0, offline, no-API-key runs? (RAM detected: ${totalRamGb()}GB)`,
      true
    );

  if (!wantLocal) {
    config.local.enabled = false;
    saveConfig(config);
    console.log(c.dim("Skipping local LLM. (You can run `poly setup --local` later.)"));
  } else {
    await setupLocal(opts, config);
  }

  // Reload: the steps above may have rewritten the config on disk.
  const freshConfig = loadConfig();
  if (!resolveApiKey(freshConfig)) {
    let wantKey = false;
    if (!opts.yes) {
      wantKey = await confirm("Connect an OpenRouter API key for cloud models (300+ models)?", !wantLocal);
    }
    if (wantKey) {
      await runLogin();
    } else if (!wantLocal) {
      console.log(c.yellow("No models configured yet \u2014 run `poly login` or `poly setup --local`."));
    }
  }

  console.log(c.green("\n\u2713 Setup complete.") + c.dim(' Try: poly recommend "add a dark-mode toggle" \xB7 poly run -w "..."'));
}
2335
/**
 * Install (if needed) and configure a local Ollama runtime and model.
 *
 * Steps: ensure Ollama is installed, start the server, choose a model
 * (explicit option, an already-installed model, or a suggestion), pull it if
 * it is missing, then enable local models in the config. Returns early,
 * without enabling, when Ollama is still missing or the pull fails.
 *
 * @param {object} opts - CLI options (`yes` skips prompts, `model` forces a model id).
 * @param {object} config - Loaded config; `config.local.enabled` is set and
 *   the config saved on success.
 * @returns {Promise<void>}
 */
async function setupLocal(opts, config) {
  if (ollamaInstalled()) {
    console.log(c.green("\u2713 Ollama present ") + c.dim(ollamaVersion() ?? ""));
  } else {
    const plan = ollamaInstallPlan();
    console.log(c.cyan("Local LLM runtime: Ollama is not installed."));
    if (!(plan.canAuto && plan.command)) {
      console.log(c.yellow("Install manually: " + plan.manual));
    } else {
      const go = opts.yes || await confirm(`Install Ollama via \`${plan.command.cmd} ${plan.command.args.join(" ")}\`?`, true);
      if (!go) {
        console.log(c.dim("Manual install: " + plan.manual));
      } else {
        const installed = await run(plan.command.cmd, plan.command.args);
        if (!installed) console.log(c.yellow("Auto-install failed. Manual: " + plan.manual));
      }
    }
  }

  // Re-check: the install above may have failed or been declined.
  if (!ollamaInstalled()) {
    console.log(c.yellow("Ollama still not on PATH \u2014 re-run `poly setup --local` after installing."));
    return;
  }

  process.stdout.write("Starting Ollama server\u2026 ");
  const up = await ensureServer(config.local.baseUrl);
  console.log(up ? c.green("ok") : c.yellow("could not confirm (start it with `ollama serve`)"));

  const have = await installedModels(config.local.baseUrl);
  let modelId = opts.model;
  if (!modelId) {
    const notInstalled = suggestModels().filter((s) => !have.includes(s.id));
    const everySuggestionInstalled = have.length && !notInstalled.length;
    if (everySuggestionInstalled) {
      modelId = have[0];
      console.log(c.dim(`Using already-installed model ${modelId}.`));
    } else {
      let pick;
      if (opts.yes) {
        pick = suggestModels()[0];
      } else {
        pick = await select(
          "Pick a model to download:",
          suggestModels(),
          (s) => `${s.label} (~${s.sizeGb}GB) \u2014 ${s.note}${have.includes(s.id) ? " [installed]" : ""}`
        );
      }
      modelId = pick.id;
    }
  }

  if (!have.includes(modelId)) {
    console.log(c.cyan(`Downloading ${modelId}\u2026`));
    const pulled = await run("ollama", ["pull", modelId]);
    if (!pulled) {
      console.log(c.yellow(`Could not pull ${modelId}. Run \`ollama pull ${modelId}\` manually.`));
      return;
    }
  }

  config.local.enabled = true;
  saveConfig(config);
  console.log(c.green(`\u2713 Local LLM ready: ${modelId} \u2192 local/${modelId} ($0). `) + c.dim("Enabled in config."));
}
2388
/**
 * Compare two dotted version strings on their first three numeric components.
 *
 * A leading "v" is ignored; missing or non-numeric components count as 0, and
 * anything after the leading digits of a component (e.g. "-beta") is ignored.
 *
 * @param {string} a - First version.
 * @param {string} b - Second version.
 * @returns {number} Positive when a > b, negative when a < b, 0 when equal.
 */
function cmp(a, b) {
  const parse = (version) => version.replace(/^v/, "").split(".").map((n) => parseInt(n, 10) || 0);
  const left = parse(a);
  const right = parse(b);
  for (const i of [0, 1, 2]) {
    const x = left[i] ?? 0;
    const y = right[i] ?? 0;
    if (x !== y) return x - y;
  }
  return 0;
}
2396
/**
 * Check for, and unless `opts.check` is set apply, updates to the CLI,
 * Ollama, and the installed local models.
 *
 * With none of `opts.self`, `opts.ollama`, `opts.models` set, all three
 * sections run; otherwise only the selected ones do.
 *
 * @param {string} currentVersion - Version of the running CLI.
 * @param {object} opts - Flags: self, ollama, models, check.
 * @returns {Promise<void>}
 */
async function runUpdate(currentVersion, opts) {
  const all = !opts.self && !opts.ollama && !opts.models;

  // Compare against the npm registry and install a newer CLI if there is one.
  const updateCli = async () => {
    let latest = "";
    try {
      latest = execSync2("npm view polymath-agent version", { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] }).trim();
    } catch {
      latest = "";
    }
    if (!latest) {
      console.log(c.dim("CLI: could not reach npm registry."));
      return;
    }
    if (!(cmp(latest, currentVersion) > 0)) {
      console.log(c.green(`\u2713 CLI is up to date (${currentVersion}).`));
      return;
    }
    console.log(c.yellow(`CLI: ${currentVersion} \u2192 ${latest} available.`));
    if (opts.check) {
      console.log(c.dim(" Run `poly update` to install."));
      return;
    }
    const ok = await run("npm", ["install", "-g", `polymath-agent@${latest}`]);
    console.log(ok ? c.green(`\u2713 Updated to ${latest}.`) : c.red("npm update failed (try: sudo npm i -g polymath-agent@latest)."));
  };

  // Upgrade the Ollama runtime with the platform's installer.
  const updateOllama = async () => {
    if (!ollamaInstalled()) {
      console.log(c.dim("Ollama: not installed (run `poly setup --local`)."));
      return;
    }
    if (opts.check) {
      console.log(c.dim(`Ollama: ${ollamaVersion() ?? "present"} (update with \`poly update --ollama\`).`));
      return;
    }
    switch (process.platform) {
      case "darwin": {
        console.log(c.cyan("Updating Ollama\u2026"));
        const ok = await run("brew", ["upgrade", "ollama"]);
        if (!ok) console.log(c.dim(" (brew upgrade skipped/failed)"));
        break;
      }
      case "linux":
        await run("sh", ["-c", "curl -fsSL https://ollama.com/install.sh | sh"]);
        break;
      default:
        console.log(c.dim("Ollama: update via your installer (winget upgrade Ollama.Ollama)."));
    }
  };

  // Re-pull every installed model, one at a time.
  const updateModels = async () => {
    const config = loadConfig();
    const models = await installedModels(config.local.baseUrl);
    if (!models.length) {
      console.log(c.dim("Models: none installed."));
      return;
    }
    if (opts.check) {
      console.log(c.dim(`Models: ${models.join(", ")} (re-pull to update).`));
      return;
    }
    for (const m of models) {
      console.log(c.cyan(`Updating ${m}\u2026`));
      await run("ollama", ["pull", m]);
    }
  };

  console.log(c.bold("\n\u2B06\uFE0F Polymath update") + (opts.check ? c.dim(" (check only)") : "") + "\n");
  if (all || opts.self) await updateCli();
  if (all || opts.ollama) await updateOllama();
  if (all || opts.models) await updateModels();
  console.log("");
}
2450
+
2015
2451
  // src/tui/App.tsx
2016
2452
  import { useState, useEffect, useCallback } from "react";
2017
2453
  import { Box, Text, useApp, useInput } from "ink";
@@ -2021,7 +2457,7 @@ import Spinner from "ink-spinner";
2021
2457
  // src/agent/tools.ts
2022
2458
  import fs4 from "node:fs";
2023
2459
  import path2 from "node:path";
2024
- import { execSync } from "node:child_process";
2460
+ import { execSync as execSync3 } from "node:child_process";
2025
2461
  var TOOL_SCHEMAS = [
2026
2462
  {
2027
2463
  type: "function",
@@ -2086,6 +2522,28 @@ var TOOL_SCHEMAS = [
2086
2522
  }
2087
2523
  }
2088
2524
  ];
2525
// Names of every tool declared in TOOL_SCHEMAS.
var KNOWN_TOOLS = new Set();
for (const schema of TOOL_SCHEMAS) KNOWN_TOOLS.add(schema.function.name);

// Subset of tool schemas kept for read-only use: read_file, list_dir, run_command.
// NOTE(review): run_command is part of this "read-only" set; whether a command
// may modify the workspace is not decided here — confirm it is gated elsewhere.
var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter(({ function: fn }) => {
  return fn.name === "read_file" || fn.name === "list_dir" || fn.name === "run_command";
});
2529
/**
 * Recover a tool call that a model wrote as plain JSON text instead of using
 * structured tool calling.
 *
 * The tool name is read from `name`, `tool`, or `function.name`, and the
 * arguments from `arguments`, `parameters`, or `function.arguments`
 * (defaulting to an empty object). Only names in KNOWN_TOOLS are accepted.
 *
 * @param {string} content - Raw model output.
 * @returns {object|null} A tool-call object { id, type: "function",
 *   function: { name, arguments } } with `arguments` as a JSON string, or
 *   null when the content holds no recognisable call.
 */
function parseTextToolCall(content) {
  if (!content) return null;
  const json = extractJson(content);
  if (!json) return null;
  try {
    const obj = JSON.parse(json);
    const name = obj?.name ?? obj?.tool ?? obj?.function?.name;
    const recognised = typeof name === "string" && KNOWN_TOOLS.has(name);
    if (!recognised) return null;
    const args = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {};
    // Arguments travel as a JSON string; keep a string as-is, serialise anything else.
    const serialised = typeof args === "string" ? args : JSON.stringify(args);
    return {
      id: `textcall_${name}`,
      type: "function",
      function: { name, arguments: serialised }
    };
  } catch {
    return null;
  }
}
2089
2547
  var MAX_OUTPUT = 8e3;
2090
2548
  function clip(s) {
2091
2549
  return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
@@ -2142,7 +2600,7 @@ function executeTool(name, argsJson, ctx) {
2142
2600
  }
2143
2601
  case "run_command": {
2144
2602
  if (!ctx.allowCommands) return { result: "Denied: run_command is disabled." };
2145
- const out = execSync(String(args.command), {
2603
+ const out = execSync3(String(args.command), {
2146
2604
  cwd: ctx.cwd,
2147
2605
  encoding: "utf8",
2148
2606
  env: scrubbedEnv(),
@@ -2167,50 +2625,124 @@ ${stderr}`)) };
2167
2625
  }
2168
2626
  }
2169
2627
 
2170
- // src/agent/loop.ts
2171
- var MAX_ITERS_PER_STEP = 6;
2172
- var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name));
2173
- function parseTextToolCall(content) {
2174
- if (!content) return null;
2175
- const json = extractJson(content);
2628
+ // src/agent/verify.ts
2629
+ var VERIFY_MAX_ITERS = 8;
2630
+ var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
2631
+ Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
2632
+ When done, reply with ONLY this JSON (no prose, no code fence):
2633
+ {"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`;
2634
+ async function verifyGoal(goal, criteria, deps, ev = {}) {
2635
+ const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
2636
+ const useTools = deps.model.capabilities.tools;
2637
+ const messages = [
2638
+ { role: "system", content: VERIFY_SYSTEM },
2639
+ {
2640
+ role: "user",
2641
+ content: `Goal: ${goal}
2642
+
2643
+ Acceptance criteria:
2644
+ ` + criteria.map((c2, i) => `${i + 1}. ${c2}`).join("\n") + `
2645
+
2646
+ Inspect the workspace, then return the verdict JSON.`
2647
+ }
2648
+ ];
2649
+ let verdict = null;
2650
+ for (let iter = 0; iter < VERIFY_MAX_ITERS; iter++) {
2651
+ const gen = deps.client.stream(
2652
+ { model: deps.model.id, messages, tools: useTools ? READONLY_TOOL_SCHEMAS : void 0, temperature: 0, maxTokens: 1500 },
2653
+ deps.model.pricing
2654
+ );
2655
+ let next = await gen.next();
2656
+ while (!next.done) next = await gen.next();
2657
+ const result = next.value;
2658
+ ev.onUsage?.(result);
2659
+ const calls = result.toolCalls.length ? result.toolCalls : useTools && parseTextToolCall(result.content) ? [parseTextToolCall(result.content)] : [];
2660
+ const parsed = parseVerdict(result.content, criteria);
2661
+ if (parsed) {
2662
+ verdict = parsed;
2663
+ break;
2664
+ }
2665
+ if (calls.length) {
2666
+ if (result.toolCalls.length) messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2667
+ for (const tc of calls) {
2668
+ ev.onToolCall?.(tc.function.name, tc.function.arguments);
2669
+ const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2670
+ ev.onToolResult?.(tc.function.name, outcome.result);
2671
+ if (result.toolCalls.length) {
2672
+ messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2673
+ } else {
2674
+ messages.push({ role: "assistant", content: result.content });
2675
+ messages.push({ role: "user", content: `Tool ${tc.function.name} returned:
2676
+ ${outcome.result}
2677
+ Continue, then return the verdict JSON.` });
2678
+ }
2679
+ }
2680
+ continue;
2681
+ }
2682
+ messages.push({ role: "assistant", content: result.content });
2683
+ messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
2684
+ }
2685
+ return verdict ?? fallbackVerdict(criteria);
2686
+ }
2687
+ function parseVerdict(text, criteria) {
2688
+ const json = extractJson(text);
2176
2689
  if (!json) return null;
2177
2690
  try {
2178
2691
  const obj = JSON.parse(json);
2179
- const name = obj?.name ?? obj?.tool ?? obj?.function?.name;
2180
- if (typeof name !== "string" || !KNOWN_TOOLS.has(name)) return null;
2181
- const args = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {};
2692
+ if (!Array.isArray(obj.results)) return null;
2693
+ const results = obj.results.map((r) => ({
2694
+ criterion: String(r.criterion ?? ""),
2695
+ met: r.met === true || String(r.met).toLowerCase() === "true",
2696
+ reason: String(r.reason ?? "").slice(0, 300)
2697
+ }));
2698
+ if (!results.length) return null;
2699
+ const unmet = results.filter((r) => !r.met);
2182
2700
  return {
2183
- id: `textcall_${name}`,
2184
- type: "function",
2185
- function: { name, arguments: typeof args === "string" ? args : JSON.stringify(args) }
2701
+ total: results.length,
2702
+ metCount: results.length - unmet.length,
2703
+ allMet: unmet.length === 0,
2704
+ results,
2705
+ unmet,
2706
+ feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
2186
2707
  };
2187
2708
  } catch {
2188
2709
  return null;
2189
2710
  }
2190
2711
  }
2712
+ function fallbackVerdict(criteria) {
2713
+ const results = criteria.map((c2) => ({ criterion: c2, met: false, reason: "verifier produced no verdict" }));
2714
+ return { total: results.length, metCount: 0, allMet: false, results, unmet: results, feedback: "Verification inconclusive; re-attempt with a stronger model." };
2715
+ }
2716
+
2717
+ // src/agent/loop.ts
2718
+ function localDate2(d = /* @__PURE__ */ new Date()) {
2719
+ const y = d.getFullYear();
2720
+ const m = String(d.getMonth() + 1).padStart(2, "0");
2721
+ const day = String(d.getDate()).padStart(2, "0");
2722
+ return `${y}-${m}-${day}`;
2723
+ }
2191
2724
  async function runAgent(goal, deps, emit) {
2192
- const { client: client2, models, policy, sessionId, cwd } = deps;
2193
- let totalCostUsd = 0;
2194
- let totalTokens = 0;
2195
- let totalPromptTokens = 0;
2196
- let totalCompletionTokens = 0;
2197
- let calls = 0;
2725
+ const { client: client2, models, cwd } = deps;
2726
+ const verifyOn = deps.verify ?? true;
2727
+ const maxAttempts = deps.maxAttempts ?? 3;
2728
+ const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
2198
2729
  const sessionStart = Date.now();
2199
- let completedSteps = 0;
2200
- let failedSteps = 0;
2201
- const planRoute = route("plan", models, policy);
2730
+ const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
2731
+ const logUsage = (r, taskType) => {
2732
+ const entry = logCompletion(r, taskType, deps.sessionId);
2733
+ emit({ type: "usage", entry });
2734
+ acc.cost += entry.costUsd;
2735
+ acc.tokens += entry.totalTokens;
2736
+ acc.prompt += entry.promptTokens;
2737
+ acc.completion += entry.completionTokens;
2738
+ acc.calls++;
2739
+ return entry;
2740
+ };
2741
+ const planRoute = route("plan", models, deps.policy);
2202
2742
  let plan;
2203
2743
  if (planRoute) {
2204
2744
  try {
2205
- plan = await planRequest(goal, client2, planRoute.model, (result) => {
2206
- const entry = logCompletion(result, "plan", sessionId);
2207
- emit({ type: "usage", entry });
2208
- totalCostUsd += entry.costUsd;
2209
- totalTokens += entry.totalTokens;
2210
- totalPromptTokens += entry.promptTokens;
2211
- totalCompletionTokens += entry.completionTokens;
2212
- calls++;
2213
- });
2745
+ plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
2214
2746
  } catch {
2215
2747
  plan = heuristicPlan(goal);
2216
2748
  }
@@ -2218,164 +2750,276 @@ async function runAgent(goal, deps, emit) {
2218
2750
  plan = heuristicPlan(goal);
2219
2751
  }
2220
2752
  emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
2753
+ let startRung = 0;
2754
+ let learned = false;
2755
+ if (verifyOn) {
2756
+ const tier = optimalStartTier(plan.goalType);
2757
+ if (tier) {
2758
+ const r = rungForTier(tier);
2759
+ if (r > 0) {
2760
+ startRung = r;
2761
+ learned = true;
2762
+ }
2763
+ }
2764
+ }
2765
+ const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
2766
+ emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
2221
2767
  startSession({
2222
- id: sessionId,
2768
+ id: deps.sessionId,
2223
2769
  ts: sessionStart,
2224
2770
  date: localDate2(),
2225
2771
  goal,
2226
2772
  command: "run",
2227
- objective: policy.objective,
2228
- plannedSteps: plan.steps.length
2773
+ objective: deps.policy.objective,
2774
+ plannedSteps: plan.steps.length,
2775
+ goalType: plan.goalType,
2776
+ startTier
2229
2777
  });
2230
- const toolCtx = {
2231
- cwd,
2232
- allowWrite: deps.allowWrite,
2233
- allowCommands: deps.allowCommands
2234
- };
2778
+ let rung = startRung;
2779
+ let attemptNo = 0;
2780
+ let verdict = null;
2781
+ let completedSteps = 0;
2782
+ let failedSteps = 0;
2235
2783
  const priorSummaries = [];
2236
- for (const step of plan.steps) {
2237
- const r = route(step.type, models, policy, {
2238
- promptTokens: step.estPromptTokens,
2239
- completionTokens: step.estCompletionTokens
2240
- });
2241
- if (!r) {
2242
- failedSteps++;
2243
- emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
2244
- continue;
2245
- }
2246
- const model = r.model;
2247
- emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
2248
- const useTools = model.capabilities.tools;
2249
- const messages = [
2250
- { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
2251
- { role: "user", content: step.description }
2252
- ];
2253
- const stepStart = Date.now();
2254
- let stepPrompt = 0;
2255
- let stepCompletion = 0;
2256
- let stepCost = 0;
2257
- let stepToolCalls = 0;
2258
- let iterations = 0;
2259
- let finishedBy = "max-iters";
2260
- let summary = "";
2261
- try {
2262
- for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
2263
- iterations = iter + 1;
2264
- const gen = client2.stream(
2265
- {
2266
- model: model.id,
2267
- messages,
2268
- tools: useTools ? TOOL_SCHEMAS : void 0,
2269
- temperature: 0.2,
2270
- maxTokens: 2e3
2271
- },
2272
- model.pricing
2273
- );
2274
- let next = await gen.next();
2275
- while (!next.done) {
2276
- emit({ type: "text", delta: next.value });
2277
- next = await gen.next();
2278
- }
2279
- const result = next.value;
2280
- const entry = logCompletion(result, step.type, sessionId);
2281
- emit({ type: "usage", entry });
2282
- totalCostUsd += entry.costUsd;
2283
- totalTokens += entry.totalTokens;
2284
- totalPromptTokens += entry.promptTokens;
2285
- totalCompletionTokens += entry.completionTokens;
2286
- stepPrompt += entry.promptTokens;
2287
- stepCompletion += entry.completionTokens;
2288
- stepCost += entry.costUsd;
2289
- calls++;
2290
- if (result.toolCalls.length && useTools) {
2291
- messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2292
- let finished = false;
2293
- for (const tc of result.toolCalls) {
2294
- stepToolCalls++;
2295
- emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
2296
- const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2297
- emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
2298
- messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2299
- if (outcome.finishSummary != null) {
2300
- summary = outcome.finishSummary;
2301
- finished = true;
2302
- }
2303
- }
2304
- if (finished) {
2305
- finishedBy = "finish-tool";
2306
- break;
2307
- }
2308
- continue;
2309
- }
2310
- const textCall = useTools ? parseTextToolCall(result.content) : null;
2311
- if (textCall) {
2312
- stepToolCalls++;
2313
- emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
2314
- const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
2315
- emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
2316
- if (outcome.finishSummary != null) {
2317
- summary = outcome.finishSummary;
2318
- finishedBy = "finish-tool";
2319
- break;
2320
- }
2321
- messages.push({ role: "assistant", content: result.content });
2322
- messages.push({
2323
- role: "user",
2324
- content: `Tool ${textCall.function.name} returned:
2325
- ${outcome.result}
2326
- Continue with this step. When the objective is met, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
2327
- });
2328
- continue;
2329
- }
2330
- summary = result.content || summary;
2331
- if (summary) finishedBy = "text";
2332
- break;
2784
+ while (attemptNo < maxAttempts) {
2785
+ const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
2786
+ const rungPolicy = applyRung(deps.policy, rungDef);
2787
+ const attemptStart = Date.now();
2788
+ const before = { ...acc };
2789
+ if (attemptNo === 0) {
2790
+ for (const step of plan.steps) {
2791
+ const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
2792
+ if (res.success) completedSteps++;
2793
+ else failedSteps++;
2333
2794
  }
2334
- } catch (err) {
2335
- finishedBy = "error";
2336
- emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
2795
+ } else {
2796
+ await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
2337
2797
  }
2338
- const success = finishedBy === "finish-tool" || finishedBy === "text";
2339
- if (success) completedSteps++;
2340
- else failedSteps++;
2341
- recordStepRun({
2342
- sessionId,
2343
- stepNo: step.id,
2344
- taskType: step.type,
2345
- skill: TASK_SKILL[step.type],
2346
- model: model.id,
2347
- provider: model.provider,
2348
- iterations,
2349
- toolCalls: stepToolCalls,
2350
- promptTokens: stepPrompt,
2351
- completionTokens: stepCompletion,
2352
- costUsd: stepCost,
2353
- finishedBy,
2354
- success,
2355
- durationMs: Date.now() - stepStart
2798
+ if (!verifyOn) {
2799
+ attemptNo++;
2800
+ break;
2801
+ }
2802
+ const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
2803
+ const verifier = routeOrBest("verify", models, verifyPolicy);
2804
+ if (!verifier) {
2805
+ emit({ type: "error", message: "No model available to verify." });
2806
+ attemptNo++;
2807
+ break;
2808
+ }
2809
+ emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
2810
+ verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
2811
+ onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
2812
+ onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
2813
+ onUsage: (r) => logUsage(r, "review")
2814
+ });
2815
+ emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
2816
+ recordAttempt({
2817
+ sessionId: deps.sessionId,
2818
+ attemptNo: attemptNo + 1,
2819
+ goalType: plan.goalType,
2820
+ tierFloor: rungDef.tierFloor ?? null,
2821
+ objective: rungDef.objective,
2822
+ promptTokens: acc.prompt - before.prompt,
2823
+ completionTokens: acc.completion - before.completion,
2824
+ costUsd: acc.cost - before.cost,
2825
+ criteriaTotal: verdict.total,
2826
+ criteriaMet: verdict.metCount,
2827
+ passed: verdict.allMet,
2828
+ durationMs: Date.now() - attemptStart
2356
2829
  });
2357
- if (!summary) summary = "(no summary)";
2358
- priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
2359
- emit({ type: "step-end", step, summary });
2830
+ attemptNo++;
2831
+ if (verdict.allMet) break;
2832
+ if (attemptNo < maxAttempts) {
2833
+ const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
2834
+ rung = next;
2835
+ emit({
2836
+ type: "escalate",
2837
+ toRung: ESCALATION_LADDER[next].label,
2838
+ reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
2839
+ });
2840
+ }
2360
2841
  }
2361
- finishSession(sessionId, {
2842
+ const passed = verifyOn ? verdict ? verdict.allMet : false : null;
2843
+ finishSession(deps.sessionId, {
2362
2844
  plannedSteps: plan.steps.length,
2363
2845
  completedSteps,
2364
2846
  failedSteps,
2365
- autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
2366
- promptTokens: totalPromptTokens,
2367
- completionTokens: totalCompletionTokens,
2368
- costUsd: totalCostUsd,
2369
- durationMs: Date.now() - sessionStart
2847
+ autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
2848
+ promptTokens: acc.prompt,
2849
+ completionTokens: acc.completion,
2850
+ costUsd: acc.cost,
2851
+ durationMs: Date.now() - sessionStart,
2852
+ attempts: attemptNo,
2853
+ finalPassed: passed
2370
2854
  });
2371
- emit({ type: "done", totalCostUsd, totalTokens, calls });
2372
- return { totalCostUsd, totalTokens, calls };
2855
+ emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
2856
+ return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
2373
2857
  }
2374
- function localDate2(d = /* @__PURE__ */ new Date()) {
2375
- const y = d.getFullYear();
2376
- const m = String(d.getMonth() + 1).padStart(2, "0");
2377
- const day = String(d.getDate()).padStart(2, "0");
2378
- return `${y}-${m}-${day}`;
2858
+ async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
2859
+ const r = routeOrBest(step.type, deps.models, policy, {
2860
+ promptTokens: step.estPromptTokens,
2861
+ completionTokens: step.estCompletionTokens
2862
+ });
2863
+ if (!r) {
2864
+ emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
2865
+ return { summary: "(no model)", success: false };
2866
+ }
2867
+ const model = r.model;
2868
+ emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
2869
+ const messages = [
2870
+ { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools) },
2871
+ { role: "user", content: step.description }
2872
+ ];
2873
+ const loop = await runToolLoop(model, messages, step.type, rungDef, deps, toolCtx, emit, logUsage);
2874
+ recordStepRun({
2875
+ sessionId: deps.sessionId,
2876
+ stepNo: step.id,
2877
+ taskType: step.type,
2878
+ skill: TASK_SKILL[step.type],
2879
+ model: model.id,
2880
+ provider: model.provider,
2881
+ iterations: loop.iterations,
2882
+ toolCalls: loop.toolCalls,
2883
+ promptTokens: loop.prompt,
2884
+ completionTokens: loop.completion,
2885
+ costUsd: loop.cost,
2886
+ finishedBy: loop.finishedBy,
2887
+ success: loop.success,
2888
+ durationMs: loop.durationMs
2889
+ });
2890
+ const summary = loop.summary || "(no summary)";
2891
+ priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
2892
+ emit({ type: "step-end", step, summary });
2893
+ return { summary, success: loop.success };
2894
+ }
2895
+ async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
2896
+ const r = routeOrBest("edit", deps.models, policy);
2897
+ if (!r) return { summary: "(no model)", success: false };
2898
+ const model = r.model;
2899
+ const fixStep = {
2900
+ id: 100,
2901
+ type: "edit",
2902
+ description: "Fix the unmet acceptance criteria",
2903
+ estPromptTokens: 9e3,
2904
+ estCompletionTokens: 1500
2905
+ };
2906
+ emit({ type: "step-start", step: fixStep, model, estCostUsd: r.estCostUsd });
2907
+ const unmet = verdict.unmet.map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`).join("\n");
2908
+ const messages = [
2909
+ {
2910
+ role: "system",
2911
+ content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
2912
+ Overall goal: ${goal}
2913
+ You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
2914
+ If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
2915
+ },
2916
+ { role: "user", content: `Unmet criteria:
2917
+ ${unmet}
2918
+
2919
+ Verifier feedback: ${verdict.feedback}` }
2920
+ ];
2921
+ const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
2922
+ recordStepRun({
2923
+ sessionId: deps.sessionId,
2924
+ stepNo: fixStep.id,
2925
+ taskType: "edit",
2926
+ skill: TASK_SKILL.edit,
2927
+ model: model.id,
2928
+ provider: model.provider,
2929
+ iterations: loop.iterations,
2930
+ toolCalls: loop.toolCalls,
2931
+ promptTokens: loop.prompt,
2932
+ completionTokens: loop.completion,
2933
+ costUsd: loop.cost,
2934
+ finishedBy: loop.finishedBy,
2935
+ success: loop.success,
2936
+ durationMs: loop.durationMs
2937
+ });
2938
+ emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" });
2939
+ return { summary: loop.summary, success: loop.success };
2940
+ }
2941
+ async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
2942
+ const useTools = model.capabilities.tools;
2943
+ const start = Date.now();
2944
+ let prompt = 0, completion = 0, cost = 0, toolCalls = 0, iterations = 0;
2945
+ let summary = "";
2946
+ let finishedBy = "max-iters";
2947
+ try {
2948
+ for (let iter = 0; iter < rungDef.maxIters; iter++) {
2949
+ iterations = iter + 1;
2950
+ const gen = deps.client.stream(
2951
+ { model: model.id, messages, tools: useTools ? TOOL_SCHEMAS : void 0, temperature: 0.2, maxTokens: rungDef.maxTokens },
2952
+ model.pricing
2953
+ );
2954
+ let next = await gen.next();
2955
+ while (!next.done) {
2956
+ emit({ type: "text", delta: next.value });
2957
+ next = await gen.next();
2958
+ }
2959
+ const result = next.value;
2960
+ const entry = logUsage(result, taskTypeForLog);
2961
+ prompt += entry.promptTokens;
2962
+ completion += entry.completionTokens;
2963
+ cost += entry.costUsd;
2964
+ if (result.toolCalls.length && useTools) {
2965
+ messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2966
+ let finished = false;
2967
+ for (const tc of result.toolCalls) {
2968
+ toolCalls++;
2969
+ emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
2970
+ const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2971
+ emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
2972
+ messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2973
+ if (outcome.finishSummary != null) {
2974
+ summary = outcome.finishSummary;
2975
+ finished = true;
2976
+ }
2977
+ }
2978
+ if (finished) {
2979
+ finishedBy = "finish-tool";
2980
+ break;
2981
+ }
2982
+ continue;
2983
+ }
2984
+ const textCall = useTools ? parseTextToolCall(result.content) : null;
2985
+ if (textCall) {
2986
+ toolCalls++;
2987
+ emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
2988
+ const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
2989
+ emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
2990
+ if (outcome.finishSummary != null) {
2991
+ summary = outcome.finishSummary;
2992
+ finishedBy = "finish-tool";
2993
+ break;
2994
+ }
2995
+ messages.push({ role: "assistant", content: result.content });
2996
+ messages.push({
2997
+ role: "user",
2998
+ content: `Tool ${textCall.function.name} returned:
2999
+ ${outcome.result}
3000
+ Continue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
3001
+ });
3002
+ continue;
3003
+ }
3004
+ summary = result.content || summary;
3005
+ if (summary) finishedBy = "text";
3006
+ break;
3007
+ }
3008
+ } catch (err) {
3009
+ finishedBy = "error";
3010
+ emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
3011
+ }
3012
+ return {
3013
+ summary,
3014
+ success: finishedBy === "finish-tool" || finishedBy === "text",
3015
+ finishedBy,
3016
+ iterations,
3017
+ toolCalls,
3018
+ prompt,
3019
+ completion,
3020
+ cost,
3021
+ durationMs: Date.now() - start
3022
+ };
2379
3023
  }
2380
3024
  function stepSystemPrompt(goal, step, priorSummaries, useTools) {
2381
3025
  const context = priorSummaries.length ? `
@@ -2389,7 +3033,7 @@ Return a concise result for this step. Do not ask the user questions.`;
2389
3033
  return `You are the "${step.type}" stage of an autonomous coding agent.
2390
3034
  Overall goal: ${goal}
2391
3035
  Your current step: ${step.description}${context}${toolNote}
2392
- Be efficient \u2014 you were selected as the cheapest capable model for this step.`;
3036
+ Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
2393
3037
  }
2394
3038
 
2395
3039
  // src/tui/App.tsx
@@ -2405,6 +3049,8 @@ function App(props) {
2405
3049
  const [tok, setTok] = useState(0);
2406
3050
  const [calls, setCalls] = useState(0);
2407
3051
  const [rated, setRated] = useState(null);
3052
+ const [passed, setPassed] = useState(null);
3053
+ const [attempts, setAttempts] = useState(0);
2408
3054
  const push = useCallback((text, color) => {
2409
3055
  setLog((l) => [...l, { key: l.length, text, color }]);
2410
3056
  }, []);
@@ -2422,7 +3068,9 @@ function App(props) {
2422
3068
  sessionId: props.sessionId,
2423
3069
  cwd: props.cwd,
2424
3070
  allowWrite: props.allowWrite,
2425
- allowCommands: props.allowCommands
3071
+ allowCommands: props.allowCommands,
3072
+ verify: props.verify,
3073
+ maxAttempts: props.maxAttempts
2426
3074
  };
2427
3075
  let textBuf = "";
2428
3076
  const flush = () => {
@@ -2434,6 +3082,24 @@ function App(props) {
2434
3082
  case "plan":
2435
3083
  push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
2436
3084
  break;
3085
+ case "criteria":
3086
+ push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
3087
+ e.criteria.forEach((cr, i) => push(` ${i + 1}. ${cr}`, "gray"));
3088
+ break;
3089
+ case "verify-start":
3090
+ flush();
3091
+ push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
3092
+ break;
3093
+ case "verdict":
3094
+ flush();
3095
+ push(
3096
+ `${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
3097
+ e.allMet ? "green" : "red"
3098
+ );
3099
+ break;
3100
+ case "escalate":
3101
+ push(`\u23EB Escalate \u2192 ${e.toRung} (${e.reason})`, "magenta");
3102
+ break;
2437
3103
  case "step-start":
2438
3104
  flush();
2439
3105
  push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id} ~${usd(e.estCostUsd)}`, "yellow");
@@ -2463,6 +3129,8 @@ function App(props) {
2463
3129
  break;
2464
3130
  case "done":
2465
3131
  flush();
3132
+ setPassed(e.passed);
3133
+ setAttempts(e.attempts);
2466
3134
  break;
2467
3135
  }
2468
3136
  };
@@ -2522,10 +3190,14 @@ function App(props) {
2522
3190
  " working\u2026"
2523
3191
  ] }),
2524
3192
  phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
2525
- /* @__PURE__ */ jsxs(Text, { color: "green", children: [
2526
- "\u2713 Done \xB7 ",
3193
+ /* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
3194
+ passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
3195
+ " \xB7 ",
3196
+ attempts,
3197
+ " attempt(s) \xB7 ",
2527
3198
  calls,
2528
- " calls \xB7 ",
3199
+ " calls \xB7",
3200
+ " ",
2529
3201
  tokens(tok),
2530
3202
  " tokens \xB7 ",
2531
3203
  usd(cost)
@@ -2609,8 +3281,9 @@ function truncate2(s, n) {
2609
3281
  }
2610
3282
 
2611
3283
  // src/index.ts
3284
+ var VERSION = "0.5.0";
2612
3285
  var program = new Command();
2613
- program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.3.1");
3286
+ program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version(VERSION);
2614
3287
  function client(config) {
2615
3288
  return new OpenRouterClient({
2616
3289
  apiKey: resolveApiKey(config),
@@ -2681,10 +3354,18 @@ async function loadCatalog(config, refresh = false) {
2681
3354
  }
2682
3355
  return models;
2683
3356
  }
3357
+ program.command("setup").description("First-run setup: optionally install a local LLM (Ollama) and connect models").option("--local", "install a local LLM (Ollama) \u2014 skips the prompt").option("--no-local", "skip the local LLM \u2014 skips the prompt").option("-m, --model <id>", "local model to pull (e.g. qwen2.5-coder:7b)").option("-y, --yes", "accept defaults / auto-install without prompts", false).action(async (opts) => {
3358
+ const argv = process.argv;
3359
+ const local = argv.includes("--local") ? true : argv.includes("--no-local") ? false : void 0;
3360
+ await runSetup({ local, model: opts.model, yes: !!opts.yes });
3361
+ });
3362
+ program.command("update").description("Update Polymath, the Ollama runtime, and local models").option("--check", "report available updates without installing", false).option("--self", "only the Polymath CLI", false).option("--ollama", "only the Ollama runtime", false).option("--models", "only the local models", false).action(async (opts) => {
3363
+ await runUpdate(VERSION, { check: !!opts.check, self: !!opts.self, ollama: !!opts.ollama, models: !!opts.models });
3364
+ });
2684
3365
  program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
2685
3366
  await runLogin();
2686
3367
  });
2687
- program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
3368
+ program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
2688
3369
  const startedAt = Date.now();
2689
3370
  const config = loadConfig();
2690
3371
  if (!config.local.enabled || resolveApiKey(config)) {
@@ -2709,6 +3390,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
2709
3390
  allowWrite: !!opts.write,
2710
3391
  allowCommands: !!opts.commands,
2711
3392
  objectiveLabel: policy.objective,
3393
+ verify: opts.verify !== false,
3394
+ maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
2712
3395
  initialGoal: goal
2713
3396
  })
2714
3397
  );
@@ -2888,3 +3571,6 @@ program.parseAsync().catch((err) => {
2888
3571
  console.error(c.red(err?.message ?? String(err)));
2889
3572
  process.exit(1);
2890
3573
  });
3574
+ export {
3575
+ VERSION
3576
+ };