polymath-agent 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +31 -0
  2. package/dist/cli.js +604 -158
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -100,6 +100,37 @@ poly usage # cost by date + model
100
100
  After each `poly run`, rate the result 0–9 (one keypress) — your goal-achievement
101
101
  rating joins the auto score (completed/planned steps) to power `poly analyze`.
102
102
 
103
+ ### Outcome-driven loop (verify → escalate → repeat)
104
+
105
+ `poly run` doesn't stop at "code written" — it measures the result and keeps going
106
+ until the goal is actually met:
107
+
108
+ ```
109
+ command → plan + acceptance criteria → code (cheapest model)
110
+ → VERIFY result against criteria (inspects files, runs tests)
111
+ → if unmet: ESCALATE (higher tier, more tokens, cost cap lifted) → fix → re-verify
112
+ → repeat until all criteria pass (or --max-attempts)
113
+ ```
114
+
115
+ The cheapest model gets first crack; only the criteria it *fails* trigger a pricier
116
+ model — so you pay for frontier capability exactly when (and only when) it's needed.
117
+
118
+ ```bash
119
+ poly run -w -x "add an add(a,b) to calc.js and make the tests pass"
120
+ poly run --no-verify "..." # single pass, no verify/escalate
121
+ poly run --max-attempts 5 "..." # try harder before giving up
122
+ ```
123
+
124
+ After each run you'll see `✓ goal met · 2 attempts` (or `⚠ goal not fully met`).
125
+
126
+ ### Statistical model optimization (learned starting tier)
127
+
128
+ Every attempt is recorded with its goal type, starting tier, tokens, and pass/fail.
129
+ `poly analyze` then shows, per goal type, **which starting model reaches the goal
130
+ with the fewest total tokens** — and once there's enough evidence (≥3 verified
131
+ sessions), `poly run` **auto-starts at that tier**, skipping cheap attempts for goal
132
+ types that historically need a stronger model from the start.
133
+
103
134
  ### The efficiency playbook (learned routing)
104
135
 
105
136
  Everything is captured locally (SQLite). `poly analyze` distills it into a **playbook**
package/dist/cli.js CHANGED
@@ -599,11 +599,14 @@ var TASK_SPECS = {
599
599
  command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
600
600
  review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
601
601
  reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
602
+ // The verify gate inspects files / runs tests — it MUST have tools.
603
+ verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
602
604
  explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
603
605
  summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
604
606
  chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
605
607
  };
606
608
  var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
609
+ var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"];
607
610
 
608
611
  // src/planner/planner.ts
609
612
  var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
619
622
  summarize - condense long content
620
623
  chat - a simple conversational reply
621
624
 
625
+ Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
626
+
622
627
  Return ONLY minified JSON of the form:
623
- {"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
628
+ {"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
624
629
  Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
630
/**
 * Heuristically classify a free-text request into one of the goal types
 * ("bugfix", "refactor", "test", "docs", "chore", "feature", "other").
 *
 * Rules are checked in order and the first match wins, so a request that
 * mentions both a fix and a test is reported as "bugfix".
 *
 * Common inflections (plurals, -ed/-ing on the bugfix verbs) are matched too,
 * so "write unit tests" or "resolve the bugs" no longer fall through to
 * "other". A missing or non-string goal is coerced rather than throwing.
 *
 * @param {string} goal - The user's request text.
 * @returns {string} The matched goal type, or "other" when nothing matches.
 */
function classifyGoalType(goal) {
  const text = String(goal ?? "").toLowerCase();
  const rules = [
    ["bugfix", /\b(fix(?:es|ed|ing)?|bugs?|broken|errors?|crash(?:es|ed|ing)?|regressions?|fail(?:s|ed|ing|ures?)?)\b/],
    // No trailing \b on purpose: "simplif" must match simplify/simplified/etc.
    ["refactor", /\b(refactor|rename|clean ?up|restructure|extract|simplif)/],
    ["test", /\b(tests?|specs?|coverage|unit tests?|e2e)\b/],
    ["docs", /\b(docs?|readme|comments?|documentation)\b/],
    ["chore", /\b(bump|upgrade|dependency|dependencies|deps|config|chore|lint|format)\b/],
    ["feature", /\b(add|create|implement|build|features?|support|new)\b/]
  ];
  for (const [goalType, pattern] of rules) {
    if (pattern.test(text)) return goalType;
  }
  return "other";
}
625
640
  function heuristicPlan(goal) {
626
641
  const steps = [
627
642
  { id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
630
645
  { id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
631
646
  { id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
632
647
  ];
633
- return { goal, steps };
648
+ return {
649
+ goal,
650
+ steps,
651
+ goalType: classifyGoalType(goal),
652
+ criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
653
+ };
634
654
  }
635
655
  async function planRequest(goal, client2, planModel, onUsage) {
636
656
  const result = await client2.complete(
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
648
668
  onUsage?.(result);
649
669
  const parsed = extractPlan(result.content);
650
670
  if (!parsed) return heuristicPlan(goal);
651
- return { goal, steps: parsed };
671
+ return { goal, ...parsed };
652
672
  }
653
673
  function extractPlan(text) {
654
674
  const json = extractJson(text);
@@ -663,7 +683,10 @@ function extractPlan(text) {
663
683
  estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
664
684
  estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
665
685
  }));
666
- return steps.length ? steps : null;
686
+ if (!steps.length) return null;
687
+ const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
688
+ const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
689
+ return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
667
690
  } catch {
668
691
  return null;
669
692
  }
@@ -705,10 +728,29 @@ function extractJson(text) {
705
728
  }
706
729
 
707
730
  // src/router/policy.ts
731
// Escalation rungs, ordered from cheapest to strongest. The first rung has no
// tierFloor; later rungs raise the floor and set liftCostCap, which applyRung
// uses to clear the per-call cost cap.
var ESCALATION_LADDER = [
  {
    objective: "value",
    maxTokens: 2e3,
    maxIters: 6,
    liftCostCap: false,
    label: "value \xB7 cheapest-capable"
  },
  {
    tierFloor: "standard",
    objective: "value",
    maxTokens: 4e3,
    maxIters: 8,
    liftCostCap: true,
    label: "standard+ \xB7 more tokens"
  },
  {
    tierFloor: "frontier",
    objective: "quality",
    maxTokens: 8e3,
    maxIters: 10,
    liftCostCap: true,
    label: "frontier \xB7 strongest"
  }
];

/**
 * Find the ladder rung that corresponds to a tier name.
 *
 * A rung matches when its tierFloor equals `tier`; a rung without a tierFloor
 * matches the "cheap" tier.
 *
 * @param {string} tier - Tier name to look up.
 * @returns {number} Index into ESCALATION_LADDER, or -1 when no rung matches.
 */
function rungForTier(tier) {
  for (let index = 0; index < ESCALATION_LADDER.length; index++) {
    const floor = ESCALATION_LADDER[index].tierFloor;
    if (floor === tier) return index;
    if (!floor && tier === "cheap") return index;
  }
  return -1;
}
739
/**
 * Derive the routing policy for one escalation rung.
 *
 * The result is a shallow copy of `base` with the rung's objective and
 * tierFloor applied. When the rung sets liftCostCap, the per-call cost cap is
 * cleared; otherwise the base cap is carried over. `base` is not mutated.
 *
 * @param {object} base - The policy to start from.
 * @param {object} rung - An ESCALATION_LADDER entry.
 * @returns {object} A new policy object for this rung.
 */
function applyRung(base, rung) {
  const policy = Object.assign({}, base);
  policy.objective = rung.objective;
  policy.tierFloor = rung.tierFloor;
  if (rung.liftCostCap) {
    policy.maxCostPerCallUsd = void 0;
  } else {
    policy.maxCostPerCallUsd = base.maxCostPerCallUsd;
  }
  return policy;
}
708
747
  var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 };
709
748
  function tierAtLeast(tier, min) {
710
749
  return TIER_RANK[tier] >= TIER_RANK[min];
711
750
  }
751
/**
 * Look up the numeric rank of a tier in TIER_RANK.
 *
 * @param {string} tier - Tier name.
 * @returns {number|undefined} The rank, or undefined for a name TIER_RANK does not contain.
 */
function tierRank(tier) {
  const rank = TIER_RANK[tier];
  return rank;
}
712
754
  function blendedPrice(m) {
713
755
  return (m.pricing.promptUsdPerMTok * 3 + m.pricing.completionUsdPerMTok) / 4;
714
756
  }
@@ -755,6 +797,7 @@ var TASK_SKILL = {
755
797
  command: "speed",
756
798
  review: "reasoning",
757
799
  reason: "reasoning",
800
+ verify: "reasoning",
758
801
  explain: "general",
759
802
  summarize: "speed",
760
803
  chat: "speed"
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
781
824
  edit: 1.4,
782
825
  review: 1.5,
783
826
  reason: 1.5,
827
+ verify: 1.4,
784
828
  plan: 1.2
785
829
  };
786
830
  var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
798
842
  function candidatesFor(taskType, models, policy, est) {
799
843
  const spec = TASK_SPECS[taskType];
800
844
  const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
845
+ const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
801
846
  return models.filter((m) => {
802
847
  if (m.id === "openrouter/auto") return false;
803
- const covers = tierAtLeast(m.tier, spec.minTier) || taskStrength(m, taskType) >= strengthFloor;
848
+ const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
804
849
  if (!covers) return false;
805
850
  if (spec.needsTools && !m.capabilities.tools) return false;
806
851
  if (policy.maxCostPerCallUsd != null && est) {
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
846
891
  const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
847
892
  return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
848
893
  }
894
/**
 * Route a task like `route`, but fall back to the strongest usable model when
 * `route` returns nothing.
 *
 * The fallback ignores the policy: it drops "openrouter/auto" and, when the
 * task needs tools, models without tool support. Tool-capable models are
 * preferred over the rest, and the highest taskStrength wins.
 *
 * @param {string} taskType - Key into TASK_SPECS / TASK_SKILL.
 * @param {Array<object>} models - Candidate models.
 * @param {object} policy - Routing policy, passed through to `route`.
 * @param {{promptTokens: number, completionTokens: number}} [est] - Token estimate used for cost projection.
 * @returns {{model: object, reason: string, estCostUsd: number}|null} The routing decision, or null when no model is usable.
 */
function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
  const routed = route(taskType, models, policy, est);
  if (routed) return routed;

  const spec = TASK_SPECS[taskType];
  const pool = [];
  for (const candidate of models) {
    if (candidate.id === "openrouter/auto") continue;
    if (spec.needsTools && !candidate.capabilities.tools) continue;
    pool.push(candidate);
  }
  if (pool.length === 0) return null;

  const strongestFirst = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
  const toolCapable = pool.filter((candidate) => candidate.capabilities.tools).sort(strongestFirst);
  const ranked = toolCapable.length > 0 ? toolCapable : pool.slice().sort(strongestFirst);
  const best = ranked[0];
  return {
    model: best,
    reason: `best available for ${TASK_SKILL[taskType]} (fallback)`,
    estCostUsd: projectCost(best, est)
  };
}
849
907
 
850
908
  // src/recommend/recommend.ts
851
909
  var OBJECTIVES = [
@@ -1075,6 +1133,27 @@ function getDb() {
1075
1133
  );
1076
1134
  CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
1077
1135
 
1136
+ -- One row per verify-and-escalate attempt within a session. Powers the
1137
+ -- "optimal starting model per goal type" statistical learning.
1138
+ CREATE TABLE IF NOT EXISTS attempts (
1139
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
1140
+ session_id TEXT NOT NULL,
1141
+ attempt_no INTEGER NOT NULL,
1142
+ goal_type TEXT NOT NULL,
1143
+ tier_floor TEXT,
1144
+ objective TEXT NOT NULL,
1145
+ prompt_tokens INTEGER NOT NULL,
1146
+ completion_tokens INTEGER NOT NULL,
1147
+ cost_usd REAL NOT NULL,
1148
+ criteria_total INTEGER NOT NULL,
1149
+ criteria_met INTEGER NOT NULL,
1150
+ passed INTEGER NOT NULL,
1151
+ duration_ms INTEGER NOT NULL,
1152
+ synced INTEGER NOT NULL DEFAULT 0
1153
+ );
1154
+ CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
1155
+ CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
1156
+
1078
1157
  -- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
1079
1158
  -- This is what syncs to the cloud by default (raw logs stay local).
1080
1159
  CREATE TABLE IF NOT EXISTS insights (
@@ -1096,6 +1175,15 @@ function getDb() {
1096
1175
  if (!cols.some((c2) => c2.name === "command")) {
1097
1176
  db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
1098
1177
  }
1178
+ const conn = db;
1179
+ const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
1180
+ const addSession = (name, decl) => {
1181
+ if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
1182
+ };
1183
+ addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
1184
+ addSession("start_tier", "TEXT");
1185
+ addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
1186
+ addSession("final_passed", "INTEGER");
1099
1187
  return db;
1100
1188
  }
1101
1189
  function recordUsage(e) {
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
1188
1276
  }
1189
1277
  function startSession(s) {
1190
1278
  getDb().prepare(
1191
- `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps)
1192
- VALUES (?, ?, ?, ?, ?, ?, ?)`
1193
- ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps);
1279
+ `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
1280
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
1281
+ ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps, s.goalType, s.startTier ?? null);
1194
1282
  }
1195
1283
  function finishSession(id, u) {
1196
1284
  getDb().prepare(
1197
1285
  `UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
1198
- prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
1286
+ prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
1199
1287
  ).run(
1200
1288
  u.plannedSteps,
1201
1289
  u.completedSteps,
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
1205
1293
  u.completionTokens,
1206
1294
  u.costUsd,
1207
1295
  u.durationMs,
1296
+ u.attempts ?? 1,
1297
+ u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
1208
1298
  id
1209
1299
  );
1210
1300
  }
1301
/**
 * Insert one row into the `attempts` table for a verify-and-escalate attempt.
 *
 * @param {object} a - Attempt record: sessionId, attemptNo, goalType,
 *   tierFloor, objective, promptTokens, completionTokens, costUsd,
 *   criteriaTotal, criteriaMet, passed, durationMs.
 *   `passed` is stored as 1 when truthy and 0 otherwise.
 * @returns {void}
 */
function recordAttempt(a) {
  const insert = getDb().prepare(
    `INSERT INTO attempts
       (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
        cost_usd, criteria_total, criteria_met, passed, duration_ms)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  // Order must match the column list above.
  const values = [
    a.sessionId,
    a.attemptNo,
    a.goalType,
    a.tierFloor,
    a.objective,
    a.promptTokens,
    a.completionTokens,
    a.costUsd,
    a.criteriaTotal,
    a.criteriaMet,
    a.passed ? 1 : 0,
    a.durationMs
  ];
  insert.run(...values);
}
1322
/**
 * Aggregate verified sessions per (goal type, starting tier).
 *
 * Only sessions with a non-null final_passed are counted; a null start_tier is
 * reported as "cheap". Rows come back ordered by goal type, then by ascending
 * average total tokens.
 *
 * @returns {Array<{goalType: string, startTier: string, sessions: number,
 *   passRate: number, avgTotalTokens: number, avgAttempts: number}>}
 */
function goalTierStats() {
  const query = getDb().prepare(
    `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
            COUNT(*) AS sessions,
            AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
            AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
            AVG(attempts) AS avgAttempts
       FROM sessions
      WHERE final_passed IS NOT NULL
      GROUP BY goal_type, startTier
      ORDER BY goal_type, avgTotalTokens ASC`
  );
  const stats = [];
  for (const row of query.all()) {
    stats.push({
      goalType: String(row.goalType),
      startTier: String(row.startTier),
      sessions: Number(row.sessions),
      // Nullish aggregates are normalised to 0.
      passRate: Number(row.passRate ?? 0),
      avgTotalTokens: Number(row.avgTotalTokens ?? 0),
      avgAttempts: Number(row.avgAttempts ?? 0)
    });
  }
  return stats;
}
1343
/**
 * Pick the learned starting tier for a goal type.
 *
 * A (goal type, tier) group is eligible when it has at least `minSessions`
 * sessions and a pass rate of 0.6 or more. Among eligible groups, the one with
 * the lowest average total tokens wins.
 *
 * @param {string} goalType - Goal type to look up.
 * @param {number} [minSessions=3] - Minimum sessions required as evidence.
 * @returns {string|null} The starting tier, or null when no group is eligible.
 */
function optimalStartTier(goalType, minSessions = 3) {
  const eligible = [];
  for (const entry of goalTierStats()) {
    if (entry.goalType !== goalType) continue;
    if (!(entry.sessions >= minSessions)) continue;
    if (!(entry.passRate >= 0.6)) continue;
    eligible.push(entry);
  }
  if (eligible.length === 0) return null;
  eligible.sort((left, right) => left.avgTotalTokens - right.avgTotalTokens);
  const [cheapest] = eligible;
  return cheapest.startTier;
}
1211
1350
  function setUserScore(sessionId, score) {
1212
1351
  getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`).run(score, sessionId);
1213
1352
  }
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
1651
1790
  }
1652
1791
  out.push("");
1653
1792
  }
1793
+ const tierStats = goalTierStats();
1794
+ if (tierStats.length) {
1795
+ out.push(c.bold("Optimal starting model per goal type") + c.dim(" (pass rate vs total tokens to reach the goal)"));
1796
+ out.push(
1797
+ table(
1798
+ ["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
1799
+ tierStats.map((s) => [
1800
+ s.goalType,
1801
+ tierColor(s.startTier),
1802
+ String(s.sessions),
1803
+ `${Math.round(s.passRate * 100)}%`,
1804
+ tokens(Math.round(s.avgTotalTokens)),
1805
+ s.avgAttempts.toFixed(1)
1806
+ ])
1807
+ )
1808
+ );
1809
+ const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
1810
+ const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
1811
+ if (learned.length) {
1812
+ out.push(
1813
+ c.green(
1814
+ "\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
1815
+ )
1816
+ );
1817
+ } else {
1818
+ out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
1819
+ }
1820
+ out.push("");
1821
+ }
1654
1822
  if (byCommand.length) {
1655
1823
  out.push(c.bold("Usage by command"));
1656
1824
  out.push(
@@ -2086,6 +2254,28 @@ var TOOL_SCHEMAS = [
2086
2254
  }
2087
2255
  }
2088
2256
  ];
2257
// Names of every tool declared in TOOL_SCHEMAS; parseTextToolCall uses this to
// reject tool names it does not recognise.
var KNOWN_TOOLS = new Set();
for (const schema of TOOL_SCHEMAS) {
  KNOWN_TOOLS.add(schema.function.name);
}
// The tool subset offered to the verify stage: read_file, list_dir and
// run_command. Note that run_command is included in this "read-only" set.
var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter(({ function: fn }) => {
  return fn.name === "read_file" || fn.name === "list_dir" || fn.name === "run_command";
});
2261
/**
 * Recover a tool call that a model wrote as JSON in its text reply instead of
 * using native tool calling.
 *
 * The tool name is read from `name`, `tool` or `function.name`, and the
 * arguments from `arguments`, `parameters` or `function.arguments`. Names not
 * in KNOWN_TOOLS are rejected.
 *
 * @param {string} content - The model's text reply.
 * @returns {{id: string, type: string, function: {name: string, arguments: string}}|null}
 *   A tool-call object with string-encoded arguments, or null when the text
 *   holds no recognisable call.
 */
function parseTextToolCall(content) {
  if (!content) return null;
  const raw = extractJson(content);
  if (!raw) return null;
  try {
    const payload = JSON.parse(raw);
    const name = payload?.name ?? payload?.tool ?? payload?.function?.name;
    const recognised = typeof name === "string" && KNOWN_TOOLS.has(name);
    if (!recognised) return null;
    const rawArgs = payload.arguments ?? payload.parameters ?? payload.function?.arguments ?? {};
    // Arguments are always carried as a JSON string.
    const encodedArgs = typeof rawArgs === "string" ? rawArgs : JSON.stringify(rawArgs);
    return {
      id: `textcall_${name}`,
      type: "function",
      function: { name, arguments: encodedArgs }
    };
  } catch {
    // Malformed JSON means there is no tool call to recover.
    return null;
  }
}
2089
2279
  var MAX_OUTPUT = 8e3;
2090
2280
  function clip(s) {
2091
2281
  return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
@@ -2167,31 +2357,124 @@ ${stderr}`)) };
2167
2357
  }
2168
2358
  }
2169
2359
 
2360
+ // src/agent/verify.ts
2361
+ var VERIFY_MAX_ITERS = 8;
2362
+ var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
2363
+ Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
2364
+ When done, reply with ONLY this JSON (no prose, no code fence):
2365
+ {"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`;
2366
/**
 * Run the verify stage: ask a model to judge each acceptance criterion,
 * letting it call tools for up to VERIFY_MAX_ITERS rounds.
 *
 * Each round drains one streamed completion, then:
 *  - returns at once if the reply parses as a verdict;
 *  - otherwise executes any tool calls (native, or recovered from text when
 *    the model supports tools) and feeds the results back;
 *  - otherwise asks the model again for the verdict JSON only.
 * Tools run with allowWrite forced to false.
 *
 * @param {string} goal - The user's goal.
 * @param {string[]} criteria - Acceptance criteria to judge.
 * @param {object} deps - { client, model, cwd, allowCommands }.
 * @param {object} [ev] - Optional callbacks: onUsage(result),
 *   onToolCall(name, args), onToolResult(name, result).
 * @returns {Promise<object>} The parsed verdict, or fallbackVerdict(criteria)
 *   when no round produced one.
 */
async function verifyGoal(goal, criteria, deps, ev = {}) {
  const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
  const useTools = deps.model.capabilities.tools;
  const numbered = criteria.map((text, idx) => `${idx + 1}. ${text}`).join("\n");
  const messages = [
    { role: "system", content: VERIFY_SYSTEM },
    {
      role: "user",
      content: `Goal: ${goal}\n\nAcceptance criteria:\n${numbered}\n\nInspect the workspace, then return the verdict JSON.`
    }
  ];

  for (let round = 0; round < VERIFY_MAX_ITERS; round++) {
    const stream = deps.client.stream(
      {
        model: deps.model.id,
        messages,
        tools: useTools ? READONLY_TOOL_SCHEMAS : void 0,
        temperature: 0,
        maxTokens: 1500
      },
      deps.model.pricing
    );
    // Streamed deltas are discarded; only the generator's return value is used.
    let chunk = await stream.next();
    while (!chunk.done) {
      chunk = await stream.next();
    }
    const reply = chunk.value;
    ev.onUsage?.(reply);

    const native = reply.toolCalls.length > 0;
    let calls;
    if (native) {
      calls = reply.toolCalls;
    } else {
      const textCall = useTools ? parseTextToolCall(reply.content) : null;
      calls = textCall ? [textCall] : [];
    }

    // A verdict takes priority over any tool calls in the same reply.
    const parsed = parseVerdict(reply.content, criteria);
    if (parsed) return parsed;

    if (calls.length === 0) {
      messages.push({ role: "assistant", content: reply.content });
      messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
      continue;
    }

    if (native) {
      messages.push({ role: "assistant", content: reply.content, tool_calls: reply.toolCalls });
    }
    for (const call of calls) {
      const toolName = call.function.name;
      const toolArgs = call.function.arguments;
      ev.onToolCall?.(toolName, toolArgs);
      const outcome = executeTool(toolName, toolArgs, toolCtx);
      ev.onToolResult?.(toolName, outcome.result);
      if (native) {
        messages.push({ role: "tool", tool_call_id: call.id, name: toolName, content: outcome.result });
      } else {
        // Text-recovered calls have no tool_call_id, so the result goes back as a user turn.
        messages.push({ role: "assistant", content: reply.content });
        messages.push({
          role: "user",
          content: `Tool ${toolName} returned:\n${outcome.result}\nContinue, then return the verdict JSON.`
        });
      }
    }
  }

  return fallbackVerdict(criteria);
}
2419
/**
 * Parse the verifier's JSON verdict and reconcile it with the acceptance
 * criteria that were actually requested.
 *
 * If the model reports on fewer criteria than were requested, the unreported
 * ones are added as unmet, so a partial verdict can never yield `allMet: true`.
 * Reported criteria are matched to requested ones by case-insensitive text;
 * when the model paraphrased them, the padding count is still correct even if
 * the labels are approximate.
 *
 * @param {string} text - The model's reply.
 * @param {string[]} [criteria] - The acceptance criteria that were requested.
 *   When omitted, the verdict is taken as reported.
 * @returns {{total: number, metCount: number, allMet: boolean, results: object[],
 *   unmet: object[], feedback: string}|null} The verdict, or null when the
 *   reply holds no usable `results` array.
 */
function parseVerdict(text, criteria) {
  const json = extractJson(text);
  if (!json) return null;
  try {
    const obj = JSON.parse(json);
    if (!Array.isArray(obj.results)) return null;
    const results = obj.results.map((r) => ({
      criterion: String(r.criterion ?? ""),
      // Accept a real boolean or the string "true" in any letter case.
      met: r.met === true || String(r.met).toLowerCase() === "true",
      reason: String(r.reason ?? "").slice(0, 300)
    }));
    if (!results.length) return null;

    const expected = Array.isArray(criteria) ? criteria.map((c2) => String(c2)) : [];
    const deficit = expected.length - results.length;
    if (deficit > 0) {
      const normalize = (s) => s.trim().toLowerCase();
      // Count reported criteria so duplicated criteria are matched one-for-one.
      const reported = new Map();
      for (const r of results) {
        const key = normalize(r.criterion);
        reported.set(key, (reported.get(key) ?? 0) + 1);
      }
      const unreported = [];
      for (const criterion of expected) {
        const key = normalize(criterion);
        const remaining = reported.get(key) ?? 0;
        if (remaining > 0) reported.set(key, remaining - 1);
        else unreported.push(criterion);
      }
      for (const criterion of unreported.slice(0, deficit)) {
        results.push({ criterion, met: false, reason: "verifier did not report on this criterion" });
      }
    }

    const unmet = results.filter((r) => !r.met);
    return {
      total: results.length,
      metCount: results.length - unmet.length,
      allMet: unmet.length === 0,
      results,
      unmet,
      // Fall back to the unmet reasons when the model gave no feedback.
      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
    };
  } catch {
    // Malformed JSON or malformed result entries: treat as "no verdict".
    return null;
  }
}
2444
/**
 * Build the verdict used when the verifier never produced one: every criterion
 * is marked unmet.
 *
 * @param {string[]} criteria - The acceptance criteria.
 * @returns {{total: number, metCount: number, allMet: boolean, results: object[],
 *   unmet: object[], feedback: string}} A verdict whose `results` and `unmet`
 *   are the same array.
 */
function fallbackVerdict(criteria) {
  const unresolved = criteria.map(function toUnmet(criterion) {
    return { criterion, met: false, reason: "verifier produced no verdict" };
  });
  const verdict = {
    total: unresolved.length,
    metCount: 0,
    allMet: false,
    results: unresolved,
    unmet: unresolved,
    feedback: "Verification inconclusive; re-attempt with a stronger model."
  };
  return verdict;
}
2448
+
2170
2449
  // src/agent/loop.ts
2171
- var MAX_ITERS_PER_STEP = 6;
2450
/**
 * Format a date as YYYY-MM-DD using the local time zone.
 *
 * @param {Date} [d] - Date to format; defaults to the current time.
 * @returns {string} Year, zero-padded month and zero-padded day joined by "-".
 */
function localDate2(d = /* @__PURE__ */ new Date()) {
  const twoDigits = (value) => String(value).padStart(2, "0");
  // getMonth() is zero-based, so add 1 for the calendar month.
  const parts = [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())];
  return parts.join("-");
}
2172
2456
  async function runAgent(goal, deps, emit) {
2173
- const { client: client2, models, policy, sessionId, cwd } = deps;
2174
- let totalCostUsd = 0;
2175
- let totalTokens = 0;
2176
- let totalPromptTokens = 0;
2177
- let totalCompletionTokens = 0;
2178
- let calls = 0;
2457
+ const { client: client2, models, cwd } = deps;
2458
+ const verifyOn = deps.verify ?? true;
2459
+ const maxAttempts = deps.maxAttempts ?? 3;
2460
+ const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
2179
2461
  const sessionStart = Date.now();
2180
- let completedSteps = 0;
2181
- let failedSteps = 0;
2182
- const planRoute = route("plan", models, policy);
2462
+ const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
2463
+ const logUsage = (r, taskType) => {
2464
+ const entry = logCompletion(r, taskType, deps.sessionId);
2465
+ emit({ type: "usage", entry });
2466
+ acc.cost += entry.costUsd;
2467
+ acc.tokens += entry.totalTokens;
2468
+ acc.prompt += entry.promptTokens;
2469
+ acc.completion += entry.completionTokens;
2470
+ acc.calls++;
2471
+ return entry;
2472
+ };
2473
+ const planRoute = route("plan", models, deps.policy);
2183
2474
  let plan;
2184
2475
  if (planRoute) {
2185
2476
  try {
2186
- plan = await planRequest(goal, client2, planRoute.model, (result) => {
2187
- const entry = logCompletion(result, "plan", sessionId);
2188
- emit({ type: "usage", entry });
2189
- totalCostUsd += entry.costUsd;
2190
- totalTokens += entry.totalTokens;
2191
- totalPromptTokens += entry.promptTokens;
2192
- totalCompletionTokens += entry.completionTokens;
2193
- calls++;
2194
- });
2477
+ plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
2195
2478
  } catch {
2196
2479
  plan = heuristicPlan(goal);
2197
2480
  }
@@ -2199,144 +2482,276 @@ async function runAgent(goal, deps, emit) {
2199
2482
  plan = heuristicPlan(goal);
2200
2483
  }
2201
2484
  emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
2485
+ let startRung = 0;
2486
+ let learned = false;
2487
+ if (verifyOn) {
2488
+ const tier = optimalStartTier(plan.goalType);
2489
+ if (tier) {
2490
+ const r = rungForTier(tier);
2491
+ if (r > 0) {
2492
+ startRung = r;
2493
+ learned = true;
2494
+ }
2495
+ }
2496
+ }
2497
+ const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
2498
+ emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
2202
2499
  startSession({
2203
- id: sessionId,
2500
+ id: deps.sessionId,
2204
2501
  ts: sessionStart,
2205
2502
  date: localDate2(),
2206
2503
  goal,
2207
2504
  command: "run",
2208
- objective: policy.objective,
2209
- plannedSteps: plan.steps.length
2505
+ objective: deps.policy.objective,
2506
+ plannedSteps: plan.steps.length,
2507
+ goalType: plan.goalType,
2508
+ startTier
2210
2509
  });
2211
- const toolCtx = {
2212
- cwd,
2213
- allowWrite: deps.allowWrite,
2214
- allowCommands: deps.allowCommands
2215
- };
2510
+ let rung = startRung;
2511
+ let attemptNo = 0;
2512
+ let verdict = null;
2513
+ let completedSteps = 0;
2514
+ let failedSteps = 0;
2216
2515
  const priorSummaries = [];
2217
- for (const step of plan.steps) {
2218
- const r = route(step.type, models, policy, {
2219
- promptTokens: step.estPromptTokens,
2220
- completionTokens: step.estCompletionTokens
2221
- });
2222
- if (!r) {
2223
- failedSteps++;
2224
- emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
2225
- continue;
2226
- }
2227
- const model = r.model;
2228
- emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
2229
- const useTools = model.capabilities.tools;
2230
- const messages = [
2231
- { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
2232
- { role: "user", content: step.description }
2233
- ];
2234
- const stepStart = Date.now();
2235
- let stepPrompt = 0;
2236
- let stepCompletion = 0;
2237
- let stepCost = 0;
2238
- let stepToolCalls = 0;
2239
- let iterations = 0;
2240
- let finishedBy = "max-iters";
2241
- let summary = "";
2242
- try {
2243
- for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
2244
- iterations = iter + 1;
2245
- const gen = client2.stream(
2246
- {
2247
- model: model.id,
2248
- messages,
2249
- tools: useTools ? TOOL_SCHEMAS : void 0,
2250
- temperature: 0.2,
2251
- maxTokens: 2e3
2252
- },
2253
- model.pricing
2254
- );
2255
- let next = await gen.next();
2256
- while (!next.done) {
2257
- emit({ type: "text", delta: next.value });
2258
- next = await gen.next();
2259
- }
2260
- const result = next.value;
2261
- const entry = logCompletion(result, step.type, sessionId);
2262
- emit({ type: "usage", entry });
2263
- totalCostUsd += entry.costUsd;
2264
- totalTokens += entry.totalTokens;
2265
- totalPromptTokens += entry.promptTokens;
2266
- totalCompletionTokens += entry.completionTokens;
2267
- stepPrompt += entry.promptTokens;
2268
- stepCompletion += entry.completionTokens;
2269
- stepCost += entry.costUsd;
2270
- calls++;
2271
- if (result.toolCalls.length && useTools) {
2272
- messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2273
- let finished = false;
2274
- for (const tc of result.toolCalls) {
2275
- stepToolCalls++;
2276
- emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
2277
- const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2278
- emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
2279
- messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2280
- if (outcome.finishSummary != null) {
2281
- summary = outcome.finishSummary;
2282
- finished = true;
2283
- }
2284
- }
2285
- if (finished) {
2286
- finishedBy = "finish-tool";
2287
- break;
2288
- }
2289
- continue;
2290
- }
2291
- summary = result.content || summary;
2292
- if (summary) finishedBy = "text";
2293
- break;
2516
+ while (attemptNo < maxAttempts) {
2517
+ const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
2518
+ const rungPolicy = applyRung(deps.policy, rungDef);
2519
+ const attemptStart = Date.now();
2520
+ const before = { ...acc };
2521
+ if (attemptNo === 0) {
2522
+ for (const step of plan.steps) {
2523
+ const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
2524
+ if (res.success) completedSteps++;
2525
+ else failedSteps++;
2294
2526
  }
2295
- } catch (err) {
2296
- finishedBy = "error";
2297
- emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
2527
+ } else {
2528
+ await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
2298
2529
  }
2299
- const success = finishedBy === "finish-tool" || finishedBy === "text";
2300
- if (success) completedSteps++;
2301
- else failedSteps++;
2302
- recordStepRun({
2303
- sessionId,
2304
- stepNo: step.id,
2305
- taskType: step.type,
2306
- skill: TASK_SKILL[step.type],
2307
- model: model.id,
2308
- provider: model.provider,
2309
- iterations,
2310
- toolCalls: stepToolCalls,
2311
- promptTokens: stepPrompt,
2312
- completionTokens: stepCompletion,
2313
- costUsd: stepCost,
2314
- finishedBy,
2315
- success,
2316
- durationMs: Date.now() - stepStart
2530
+ if (!verifyOn) {
2531
+ attemptNo++;
2532
+ break;
2533
+ }
2534
+ const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
2535
+ const verifier = routeOrBest("verify", models, verifyPolicy);
2536
+ if (!verifier) {
2537
+ emit({ type: "error", message: "No model available to verify." });
2538
+ attemptNo++;
2539
+ break;
2540
+ }
2541
+ emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
2542
+ verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
2543
+ onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
2544
+ onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
2545
+ onUsage: (r) => logUsage(r, "review")
2546
+ });
2547
+ emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
2548
+ recordAttempt({
2549
+ sessionId: deps.sessionId,
2550
+ attemptNo: attemptNo + 1,
2551
+ goalType: plan.goalType,
2552
+ tierFloor: rungDef.tierFloor ?? null,
2553
+ objective: rungDef.objective,
2554
+ promptTokens: acc.prompt - before.prompt,
2555
+ completionTokens: acc.completion - before.completion,
2556
+ costUsd: acc.cost - before.cost,
2557
+ criteriaTotal: verdict.total,
2558
+ criteriaMet: verdict.metCount,
2559
+ passed: verdict.allMet,
2560
+ durationMs: Date.now() - attemptStart
2317
2561
  });
2318
- if (!summary) summary = "(no summary)";
2319
- priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
2320
- emit({ type: "step-end", step, summary });
2562
+ attemptNo++;
2563
+ if (verdict.allMet) break;
2564
+ if (attemptNo < maxAttempts) {
2565
+ const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
2566
+ rung = next;
2567
+ emit({
2568
+ type: "escalate",
2569
+ toRung: ESCALATION_LADDER[next].label,
2570
+ reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
2571
+ });
2572
+ }
2321
2573
  }
2322
- finishSession(sessionId, {
2574
+ const passed = verifyOn ? verdict ? verdict.allMet : false : null;
2575
+ finishSession(deps.sessionId, {
2323
2576
  plannedSteps: plan.steps.length,
2324
2577
  completedSteps,
2325
2578
  failedSteps,
2326
- autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
2327
- promptTokens: totalPromptTokens,
2328
- completionTokens: totalCompletionTokens,
2329
- costUsd: totalCostUsd,
2330
- durationMs: Date.now() - sessionStart
2579
+ autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
2580
+ promptTokens: acc.prompt,
2581
+ completionTokens: acc.completion,
2582
+ costUsd: acc.cost,
2583
+ durationMs: Date.now() - sessionStart,
2584
+ attempts: attemptNo,
2585
+ finalPassed: passed
2331
2586
  });
2332
- emit({ type: "done", totalCostUsd, totalTokens, calls });
2333
- return { totalCostUsd, totalTokens, calls };
2587
+ emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
2588
+ return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
2334
2589
  }
2335
- function localDate2(d = /* @__PURE__ */ new Date()) {
2336
- const y = d.getFullYear();
2337
- const m = String(d.getMonth() + 1).padStart(2, "0");
2338
- const day = String(d.getDate()).padStart(2, "0");
2339
- return `${y}-${m}-${day}`;
2590
/**
 * Run one planned step from routing through to bookkeeping.
 *
 * The step is routed with `routeOrBest`, driven through `runToolLoop`, persisted
 * with `recordStepRun`, and finally recapped onto `priorSummaries` (which is
 * mutated in place so the caller's list grows with each finished step).
 *
 * @param {object} step - Plan step; reads `id`, `type`, `description`,
 *   `estPromptTokens` and `estCompletionTokens`.
 * @param {object} policy - Routing policy handed to `routeOrBest`.
 * @param {object} rungDef - Escalation rung settings, forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; `models` and `sessionId` are read here.
 * @param {object} toolCtx - Tool execution context, forwarded to `runToolLoop`.
 * @param {string[]} priorSummaries - Recaps of earlier steps; this step's recap is appended.
 * @param {Function} emit - Event sink; receives `error`, `step-start` and `step-end` events.
 * @param {Function} logUsage - Usage logger, forwarded to `runToolLoop`.
 * @param {string} goal - Overall goal text used to build the system prompt.
 * @returns {Promise<{summary: string, success: boolean}>} The step recap and whether the loop succeeded.
 */
async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
  const tokenEstimates = {
    promptTokens: step.estPromptTokens,
    completionTokens: step.estCompletionTokens
  };
  const routed = routeOrBest(step.type, deps.models, policy, tokenEstimates);

  // Nothing can run this step: report it and bail out before any bookkeeping.
  if (!routed) {
    emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
    return { summary: "(no model)", success: false };
  }

  const { model } = routed;
  emit({ type: "step-start", step, model, estCostUsd: routed.estCostUsd });

  const systemMessage = {
    role: "system",
    content: stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools)
  };
  const userMessage = { role: "user", content: step.description };
  const outcome = await runToolLoop(
    model,
    [systemMessage, userMessage],
    step.type,
    rungDef,
    deps,
    toolCtx,
    emit,
    logUsage
  );

  const { iterations, toolCalls, finishedBy, success, durationMs } = outcome;
  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: step.id,
    taskType: step.type,
    skill: TASK_SKILL[step.type],
    model: model.id,
    provider: model.provider,
    iterations,
    toolCalls,
    promptTokens: outcome.prompt,
    completionTokens: outcome.completion,
    costUsd: outcome.cost,
    finishedBy,
    success,
    durationMs
  });

  // An empty summary is replaced by a placeholder so the recap line is never blank.
  const summary = outcome.summary ? outcome.summary : "(no summary)";
  priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
  emit({ type: "step-end", step, summary });
  return { summary, success };
}
2627
/**
 * Run the FIX pass of the verify → escalate loop: hand the criteria the verifier
 * reported as unmet to an "edit"-capable model and let it repair them via the tool loop.
 *
 * Changes relative to the previous implementation:
 *  - emits an `error` event when no model can be routed (previously silent, unlike
 *    `runStep`, so the UI showed nothing for a skipped fix pass);
 *  - a missing `verdict.feedback` / `verdict.unmet` no longer leaks the literal text
 *    "undefined" into the prompt or throws;
 *  - the returned summary now matches the one emitted in `step-end`.
 *
 * NOTE(review): the system prompt advertises tools unconditionally, while
 * `runToolLoop` only parses textual tool calls when `model.capabilities.tools` is
 * truthy — confirm whether tool-less models can be routed here.
 *
 * @param {string} goal - Overall goal text, embedded in the system prompt.
 * @param {object} plan - Current plan; accepted for signature parity, not read here.
 * @param {object} verdict - Verifier result; reads `unmet` (items with `criterion`
 *   and `reason`) and `feedback`.
 * @param {object} policy - Routing policy handed to `routeOrBest`.
 * @param {object} rungDef - Escalation rung settings, forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; `models` and `sessionId` are read here.
 * @param {object} toolCtx - Tool execution context, forwarded to `runToolLoop`.
 * @param {Function} emit - Event sink; receives `error`, `step-start` and `step-end` events.
 * @param {Function} logUsage - Usage logger, forwarded to `runToolLoop`.
 * @returns {Promise<{summary: string, success: boolean}>} Fix recap and whether the loop succeeded.
 */
async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
  const routed = routeOrBest("edit", deps.models, policy);
  if (!routed) {
    // Surface the skipped fix pass instead of failing silently.
    emit({ type: "error", message: "No capable model for the fix pass (edit)." });
    return { summary: "(no model)", success: false };
  }
  const { model } = routed;

  // Synthetic step used only for events and the step-run record; every fix pass
  // shares this fixed id.
  const fixStep = {
    id: 100,
    type: "edit",
    description: "Fix the unmet acceptance criteria",
    estPromptTokens: 9e3,
    estCompletionTokens: 1500
  };
  emit({ type: "step-start", step: fixStep, model, estCostUsd: routed.estCostUsd });

  const unmet = (verdict.unmet ?? [])
    .map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`)
    .join("\n");
  const feedback = verdict.feedback ?? "(none)";

  const messages = [
    {
      role: "system",
      content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
Overall goal: ${goal}
You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
    },
    {
      role: "user",
      content: `Unmet criteria:
${unmet}

Verifier feedback: ${feedback}`
    }
  ];

  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: fixStep.id,
    taskType: "edit",
    skill: TASK_SKILL.edit,
    model: model.id,
    provider: model.provider,
    iterations: loop.iterations,
    toolCalls: loop.toolCalls,
    promptTokens: loop.prompt,
    completionTokens: loop.completion,
    costUsd: loop.cost,
    finishedBy: loop.finishedBy,
    success: loop.success,
    durationMs: loop.durationMs
  });

  // One fallback for both the event and the return value keeps them in agreement.
  const summary = loop.summary || "(fix pass)";
  emit({ type: "step-end", step: fixStep, summary });
  return { summary, success: loop.success };
}
2673
/**
 * Drive one model conversation until it finishes, runs out of iterations, or fails.
 *
 * Each iteration streams a completion (text deltas are forwarded as `text` events),
 * logs its usage, and then takes exactly one of three paths:
 *  1. native tool calls (only when the model supports tools): execute each call,
 *     append the results to `messages`, and loop again unless `finish` was called;
 *  2. a textual tool call recognised by `parseTextToolCall` (only when the model
 *     supports tools): execute it and feed the result back as a user message;
 *  3. plain text: treat the content as the final summary and stop.
 *
 * `messages` is mutated in place. Errors thrown anywhere inside the loop are caught,
 * reported through an `error` event, and reflected as `finishedBy: "error"`.
 *
 * Fix: a result without a `toolCalls` array (plain-text reply) is now treated as
 * "no tool calls" instead of throwing a TypeError that was then reported as a
 * failed run.
 *
 * Note: when the final reply has no content and no earlier summary exists,
 * `finishedBy` stays at its initial "max-iters" value even though the loop stopped
 * early; this is unchanged from the original behaviour.
 *
 * @param {object} model - Routed model; reads `id`, `pricing` and `capabilities.tools`.
 * @param {object[]} messages - Conversation so far; assistant/tool/user turns are appended.
 * @param {string} taskTypeForLog - Label passed to `logUsage` and used in error messages.
 * @param {object} rungDef - Reads `maxIters` (iteration cap) and `maxTokens` (per-call cap).
 * @param {object} deps - Reads `client.stream(request, pricing)`, expected to return an
 *   async generator that yields text deltas and returns the final result.
 * @param {object} toolCtx - Context handed to `executeTool`.
 * @param {Function} emit - Event sink (`text`, `tool-call`, `tool-result`, `error`).
 * @param {Function} logUsage - Called with each result; its return value supplies
 *   `promptTokens`, `completionTokens` and `costUsd`.
 * @returns {Promise<{summary: string, success: boolean, finishedBy: string,
 *   iterations: number, toolCalls: number, prompt: number, completion: number,
 *   cost: number, durationMs: number}>}
 */
async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
  const useTools = model.capabilities.tools;
  const start = Date.now();
  let prompt = 0;
  let completion = 0;
  let cost = 0;
  let toolCalls = 0;
  let iterations = 0;
  let summary = "";
  let finishedBy = "max-iters";
  try {
    for (let iter = 0; iter < rungDef.maxIters; iter++) {
      iterations = iter + 1;
      const gen = deps.client.stream(
        { model: model.id, messages, tools: useTools ? TOOL_SCHEMAS : void 0, temperature: 0.2, maxTokens: rungDef.maxTokens },
        model.pricing
      );
      // Drain the stream manually: yielded values are text deltas, the generator's
      // return value is the final result object.
      let next = await gen.next();
      while (!next.done) {
        emit({ type: "text", delta: next.value });
        next = await gen.next();
      }
      const result = next.value;
      const entry = logUsage(result, taskTypeForLog);
      prompt += entry.promptTokens;
      completion += entry.completionTokens;
      cost += entry.costUsd;

      // Path 1: native tool calls. A missing array means "none".
      const nativeCalls = result.toolCalls ?? [];
      if (nativeCalls.length && useTools) {
        messages.push({ role: "assistant", content: result.content, tool_calls: nativeCalls });
        let finished = false;
        // Every call in the turn is executed and answered, even after `finish`,
        // so each tool_call id gets a matching tool message.
        for (const tc of nativeCalls) {
          toolCalls++;
          emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
          const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
          emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
          if (outcome.finishSummary != null) {
            summary = outcome.finishSummary;
            finished = true;
          }
        }
        if (finished) {
          finishedBy = "finish-tool";
          break;
        }
        continue;
      }

      // Path 2: the model wrote the tool call as JSON text instead of calling natively.
      const textCall = useTools ? parseTextToolCall(result.content) : null;
      if (textCall) {
        toolCalls++;
        emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
        const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
        emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
        if (outcome.finishSummary != null) {
          summary = outcome.finishSummary;
          finishedBy = "finish-tool";
          break;
        }
        messages.push({ role: "assistant", content: result.content });
        messages.push({
          role: "user",
          content: `Tool ${textCall.function.name} returned:
${outcome.result}
Continue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
        });
        continue;
      }

      // Path 3: plain text ends the loop; it only counts as a "text" finish when
      // there is something to report.
      summary = result.content || summary;
      if (summary) finishedBy = "text";
      break;
    }
  } catch (err) {
    finishedBy = "error";
    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
  }
  return {
    summary,
    success: finishedBy === "finish-tool" || finishedBy === "text",
    finishedBy,
    iterations,
    toolCalls,
    prompt,
    completion,
    cost,
    durationMs: Date.now() - start
  };
}
2341
2756
  function stepSystemPrompt(goal, step, priorSummaries, useTools) {
2342
2757
  const context = priorSummaries.length ? `
@@ -2344,12 +2759,13 @@ function stepSystemPrompt(goal, step, priorSummaries, useTools) {
2344
2759
  What previous steps accomplished:
2345
2760
  ${priorSummaries.join("\n")}` : "";
2346
2761
  const toolNote = useTools ? `
2347
- You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met.` : `
2762
+ You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met.
2763
+ If you cannot call tools natively, reply with ONLY one JSON object per turn, no prose: {"name":"<tool>","arguments":{...}}` : `
2348
2764
  Return a concise result for this step. Do not ask the user questions.`;
2349
2765
  return `You are the "${step.type}" stage of an autonomous coding agent.
2350
2766
  Overall goal: ${goal}
2351
2767
  Your current step: ${step.description}${context}${toolNote}
2352
- Be efficient \u2014 you were selected as the cheapest capable model for this step.`;
2768
+ Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
2353
2769
  }
2354
2770
 
2355
2771
  // src/tui/App.tsx
@@ -2365,6 +2781,8 @@ function App(props) {
2365
2781
  const [tok, setTok] = useState(0);
2366
2782
  const [calls, setCalls] = useState(0);
2367
2783
  const [rated, setRated] = useState(null);
2784
+ const [passed, setPassed] = useState(null);
2785
+ const [attempts, setAttempts] = useState(0);
2368
2786
  const push = useCallback((text, color) => {
2369
2787
  setLog((l) => [...l, { key: l.length, text, color }]);
2370
2788
  }, []);
@@ -2382,7 +2800,9 @@ function App(props) {
2382
2800
  sessionId: props.sessionId,
2383
2801
  cwd: props.cwd,
2384
2802
  allowWrite: props.allowWrite,
2385
- allowCommands: props.allowCommands
2803
+ allowCommands: props.allowCommands,
2804
+ verify: props.verify,
2805
+ maxAttempts: props.maxAttempts
2386
2806
  };
2387
2807
  let textBuf = "";
2388
2808
  const flush = () => {
@@ -2394,6 +2814,24 @@ function App(props) {
2394
2814
  case "plan":
2395
2815
  push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
2396
2816
  break;
2817
+ case "criteria":
2818
+ push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
2819
+ e.criteria.forEach((cr, i) => push(` ${i + 1}. ${cr}`, "gray"));
2820
+ break;
2821
+ case "verify-start":
2822
+ flush();
2823
+ push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
2824
+ break;
2825
+ case "verdict":
2826
+ flush();
2827
+ push(
2828
+ `${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
2829
+ e.allMet ? "green" : "red"
2830
+ );
2831
+ break;
2832
+ case "escalate":
2833
+ push(`\u23EB Escalate \u2192 ${e.toRung} (${e.reason})`, "magenta");
2834
+ break;
2397
2835
  case "step-start":
2398
2836
  flush();
2399
2837
  push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id} ~${usd(e.estCostUsd)}`, "yellow");
@@ -2423,6 +2861,8 @@ function App(props) {
2423
2861
  break;
2424
2862
  case "done":
2425
2863
  flush();
2864
+ setPassed(e.passed);
2865
+ setAttempts(e.attempts);
2426
2866
  break;
2427
2867
  }
2428
2868
  };
@@ -2482,10 +2922,14 @@ function App(props) {
2482
2922
  " working\u2026"
2483
2923
  ] }),
2484
2924
  phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
2485
- /* @__PURE__ */ jsxs(Text, { color: "green", children: [
2486
- "\u2713 Done \xB7 ",
2925
+ /* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
2926
+ passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
2927
+ " \xB7 ",
2928
+ attempts,
2929
+ " attempt(s) \xB7 ",
2487
2930
  calls,
2488
- " calls \xB7 ",
2931
+ " calls \xB7",
2932
+ " ",
2489
2933
  tokens(tok),
2490
2934
  " tokens \xB7 ",
2491
2935
  usd(cost)
@@ -2570,7 +3014,7 @@ function truncate2(s, n) {
2570
3014
 
2571
3015
  // src/index.ts
2572
3016
  var program = new Command();
2573
- program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.3.0");
3017
+ program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.4.0");
2574
3018
  function client(config) {
2575
3019
  return new OpenRouterClient({
2576
3020
  apiKey: resolveApiKey(config),
@@ -2644,7 +3088,7 @@ async function loadCatalog(config, refresh = false) {
2644
3088
  program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
2645
3089
  await runLogin();
2646
3090
  });
2647
- program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
3091
+ program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
2648
3092
  const startedAt = Date.now();
2649
3093
  const config = loadConfig();
2650
3094
  if (!config.local.enabled || resolveApiKey(config)) {
@@ -2669,6 +3113,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
2669
3113
  allowWrite: !!opts.write,
2670
3114
  allowCommands: !!opts.commands,
2671
3115
  objectiveLabel: policy.objective,
3116
+ verify: opts.verify !== false,
3117
+ maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
2672
3118
  initialGoal: goal
2673
3119
  })
2674
3120
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polymath-agent",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Polymath — a cost-optimized, multi-model TUI coding agent. Decomposes work into typed tasks, routes each task to the cheapest capable model via OpenRouter, and logs real usage/cost by date + model.",
5
5
  "type": "module",
6
6
  "bin": {