polymath-agent 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +31 -0
  2. package/dist/cli.js +594 -188
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -100,6 +100,37 @@ poly usage # cost by date + model
100
100
  After each `poly run`, rate the result 0–9 (one keypress) — your goal-achievement
101
101
  rating joins the auto score (completed/planned steps) to power `poly analyze`.
102
102
 
103
+ ### Outcome-driven loop (verify → escalate → repeat)
104
+
105
+ `poly run` doesn't stop at "code written" — it measures the result and keeps going
106
+ until the goal is actually met:
107
+
108
+ ```
109
+ command → plan + acceptance criteria → code (cheapest model)
110
+ → VERIFY result against criteria (inspects files, runs tests)
111
+ → if unmet: ESCALATE (higher tier, more tokens, cost cap lifted) → fix → re-verify
112
+ → repeat until all criteria pass (or --max-attempts)
113
+ ```
114
+
115
+ The cheapest model gets first crack; only the criteria it *fails* trigger a pricier
116
+ model — so you pay for frontier capability exactly when (and only when) it's needed.
117
+
118
+ ```bash
119
+ poly run -w -x "add an add(a,b) to calc.js and make the tests pass"
120
+ poly run --no-verify "..." # single pass, no verify/escalate
121
+ poly run --max-attempts 5 "..." # try harder before giving up
122
+ ```
123
+
124
+ After each run you'll see `✓ goal met · 2 attempts` (or `⚠ goal not fully met`).
125
+
126
+ ### Statistical model optimization (learned starting tier)
127
+
128
+ Every attempt is recorded with its goal type, starting tier, tokens, and pass/fail.
129
+ `poly analyze` then shows, per goal type, **which starting model reaches the goal
130
+ with the fewest total tokens** — and once there's enough evidence (≥3 verified
131
+ sessions with a pass rate of at least 60%), `poly run` **auto-starts at that tier**, skipping cheap attempts for goal
132
+ types that historically need a stronger model from the start.
133
+
103
134
  ### The efficiency playbook (learned routing)
104
135
 
105
136
  Everything is captured locally (SQLite). `poly analyze` distills it into a **playbook**
package/dist/cli.js CHANGED
@@ -599,11 +599,14 @@ var TASK_SPECS = {
599
599
  command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
600
600
  review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
601
601
  reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
602
+ // The verify gate inspects files / runs tests — it MUST have tools.
603
+ verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
602
604
  explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
603
605
  summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
604
606
  chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
605
607
  };
606
608
  var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
609
+ var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"];
607
610
 
608
611
  // src/planner/planner.ts
609
612
  var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
619
622
  summarize - condense long content
620
623
  chat - a simple conversational reply
621
624
 
625
+ Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
626
+
622
627
  Return ONLY minified JSON of the form:
623
- {"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
628
+ {"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
624
629
  Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
630
// Heuristic goal-type classifier based on keywords in the request text.
//
// @param goal  free-text request; non-string / nullish values are coerced so the
//              function degrades to "other" instead of throwing.
// @returns     one of "bugfix", "refactor", "test", "docs", "chore", "feature", "other".
//
// Rules are evaluated top to bottom and the first match wins, so e.g. a request that
// mentions both "fix" and "test" is a bugfix. Plural / inflected keyword forms
// ("tests", "bugs", "fixes", "comments", "dependencies") are accepted so that common
// phrasings such as "add tests for X" are not misfiled as a feature.
function classifyGoalType(goal) {
  const g = String(goal ?? "").toLowerCase();
  if (/\b(fix(es|ed|ing)?|bugs?|broken|errors?|crash(es|ed|ing)?|regressions?|fails?)\b/.test(g)) return "bugfix";
  // No trailing \b here on purpose: "simplif" must match "simplify"/"simplified".
  if (/\b(refactor|rename|clean ?up|restructure|extract|simplif)/.test(g)) return "refactor";
  if (/\b(tests?|specs?|coverage|e2e)\b/.test(g)) return "test";
  if (/\b(docs?|readme|comments?|documentation)\b/.test(g)) return "docs";
  if (/\b(bump|upgrade|dependenc(y|ies)|deps|config|chore|lint|format)\b/.test(g)) return "chore";
  if (/\b(add|create|implement|build|feature|support|new)\b/.test(g)) return "feature";
  return "other";
}
625
640
  function heuristicPlan(goal) {
626
641
  const steps = [
627
642
  { id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
630
645
  { id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
631
646
  { id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
632
647
  ];
633
- return { goal, steps };
648
+ return {
649
+ goal,
650
+ steps,
651
+ goalType: classifyGoalType(goal),
652
+ criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
653
+ };
634
654
  }
635
655
  async function planRequest(goal, client2, planModel, onUsage) {
636
656
  const result = await client2.complete(
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
648
668
  onUsage?.(result);
649
669
  const parsed = extractPlan(result.content);
650
670
  if (!parsed) return heuristicPlan(goal);
651
- return { goal, steps: parsed };
671
+ return { goal, ...parsed };
652
672
  }
653
673
  function extractPlan(text) {
654
674
  const json = extractJson(text);
@@ -663,7 +683,10 @@ function extractPlan(text) {
663
683
  estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
664
684
  estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
665
685
  }));
666
- return steps.length ? steps : null;
686
+ if (!steps.length) return null;
687
+ const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
688
+ const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
689
+ return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
667
690
  } catch {
668
691
  return null;
669
692
  }
@@ -705,10 +728,29 @@ function extractJson(text) {
705
728
  }
706
729
 
707
730
  // src/router/policy.ts
731
// Escalation rungs, weakest first. The first rung has no tierFloor; the second and
// third raise the floor to "standard" and "frontier" and set liftCostCap, which
// applyRung turns into an unset maxCostPerCallUsd.
// NOTE(review): maxTokens / maxIters are not read by applyRung; presumably the step
// runner consumes them from the rung object directly — confirm.
+ var ESCALATION_LADDER = [
732
+ { objective: "value", maxTokens: 2e3, maxIters: 6, liftCostCap: false, label: "value \xB7 cheapest-capable" },
733
+ { tierFloor: "standard", objective: "value", maxTokens: 4e3, maxIters: 8, liftCostCap: true, label: "standard+ \xB7 more tokens" },
734
+ { tierFloor: "frontier", objective: "quality", maxTokens: 8e3, maxIters: 10, liftCostCap: true, label: "frontier \xB7 strongest" }
735
+ ];
736
// Returns the index in ESCALATION_LADDER of the rung for `tier`.
// A rung without a tierFloor stands for "cheap". Returns -1 (findIndex) when no
// rung matches, e.g. for an unknown tier name.
+ function rungForTier(tier) {
737
+ return ESCALATION_LADDER.findIndex((r) => r.tierFloor === tier || !r.tierFloor && tier === "cheap");
738
+ }
739
// Builds a routing policy for one escalation rung without mutating `base`.
// Copies `base`, then overrides objective and tierFloor from the rung. tierFloor is
// overwritten even when the rung has none, so any tierFloor on `base` is dropped.
// When the rung sets liftCostCap the per-call cost cap is cleared (undefined);
// otherwise the cap from `base` is kept.
+ function applyRung(base, rung) {
740
+ return {
741
+ ...base,
742
+ objective: rung.objective,
743
+ tierFloor: rung.tierFloor,
744
+ maxCostPerCallUsd: rung.liftCostCap ? void 0 : base.maxCostPerCallUsd
745
+ };
746
+ }
708
747
  var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 };
709
748
  function tierAtLeast(tier, min) {
710
749
  return TIER_RANK[tier] >= TIER_RANK[min];
711
750
  }
751
// Numeric rank of a tier name as defined by TIER_RANK (cheap 0, standard 1, frontier 2).
// Returns undefined for a name that is not a key of TIER_RANK.
+ function tierRank(tier) {
752
+ return TIER_RANK[tier];
753
+ }
712
754
  function blendedPrice(m) {
713
755
  return (m.pricing.promptUsdPerMTok * 3 + m.pricing.completionUsdPerMTok) / 4;
714
756
  }
@@ -755,6 +797,7 @@ var TASK_SKILL = {
755
797
  command: "speed",
756
798
  review: "reasoning",
757
799
  reason: "reasoning",
800
+ verify: "reasoning",
758
801
  explain: "general",
759
802
  summarize: "speed",
760
803
  chat: "speed"
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
781
824
  edit: 1.4,
782
825
  review: 1.5,
783
826
  reason: 1.5,
827
+ verify: 1.4,
784
828
  plan: 1.2
785
829
  };
786
830
  var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
798
842
  function candidatesFor(taskType, models, policy, est) {
799
843
  const spec = TASK_SPECS[taskType];
800
844
  const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
845
+ const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
801
846
  return models.filter((m) => {
802
847
  if (m.id === "openrouter/auto") return false;
803
- const covers = tierAtLeast(m.tier, spec.minTier) || taskStrength(m, taskType) >= strengthFloor;
848
+ const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
804
849
  if (!covers) return false;
805
850
  if (spec.needsTools && !m.capabilities.tools) return false;
806
851
  if (policy.maxCostPerCallUsd != null && est) {
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
846
891
  const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
847
892
  return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
848
893
  }
894
// Like route(), but never gives up while any usable model exists.
// First defers to route(); if that returns nothing, falls back to the model with the
// highest taskStrength for `taskType`. The fallback filter only excludes
// "openrouter/auto" and, when the task needs tools, models without tool support —
// it applies no tier or cost check of its own.
// Tool-capable models are preferred in the fallback even when the task does not need tools.
// Returns { model, reason, estCostUsd } or null when no model is usable.
+ function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
895
+ const r = route(taskType, models, policy, est);
896
+ if (r) return r;
897
+ const spec = TASK_SPECS[taskType];
898
+ const usable = models.filter(
899
+ (m) => m.id !== "openrouter/auto" && (!spec.needsTools || m.capabilities.tools)
900
+ );
901
+ if (!usable.length) return null;
902
+ const byStrength = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
903
+ const withTools = usable.filter((m) => m.capabilities.tools).sort(byStrength);
904
+ const best = (withTools.length ? withTools : [...usable].sort(byStrength))[0];
905
+ return { model: best, reason: `best available for ${TASK_SKILL[taskType]} (fallback)`, estCostUsd: projectCost(best, est) };
906
+ }
849
907
 
850
908
  // src/recommend/recommend.ts
851
909
  var OBJECTIVES = [
@@ -1075,6 +1133,27 @@ function getDb() {
1075
1133
  );
1076
1134
  CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
1077
1135
 
1136
+ -- One row per verify-and-escalate attempt within a session. Powers the
1137
+ -- "optimal starting model per goal type" statistical learning.
1138
+ CREATE TABLE IF NOT EXISTS attempts (
1139
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
1140
+ session_id TEXT NOT NULL,
1141
+ attempt_no INTEGER NOT NULL,
1142
+ goal_type TEXT NOT NULL,
1143
+ tier_floor TEXT,
1144
+ objective TEXT NOT NULL,
1145
+ prompt_tokens INTEGER NOT NULL,
1146
+ completion_tokens INTEGER NOT NULL,
1147
+ cost_usd REAL NOT NULL,
1148
+ criteria_total INTEGER NOT NULL,
1149
+ criteria_met INTEGER NOT NULL,
1150
+ passed INTEGER NOT NULL,
1151
+ duration_ms INTEGER NOT NULL,
1152
+ synced INTEGER NOT NULL DEFAULT 0
1153
+ );
1154
+ CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
1155
+ CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
1156
+
1078
1157
  -- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
1079
1158
  -- This is what syncs to the cloud by default (raw logs stay local).
1080
1159
  CREATE TABLE IF NOT EXISTS insights (
@@ -1096,6 +1175,15 @@ function getDb() {
1096
1175
  if (!cols.some((c2) => c2.name === "command")) {
1097
1176
  db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
1098
1177
  }
1178
+ const conn = db;
1179
+ const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
1180
+ const addSession = (name, decl) => {
1181
+ if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
1182
+ };
1183
+ addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
1184
+ addSession("start_tier", "TEXT");
1185
+ addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
1186
+ addSession("final_passed", "INTEGER");
1099
1187
  return db;
1100
1188
  }
1101
1189
  function recordUsage(e) {
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
1188
1276
  }
1189
1277
  // Inserts (or replaces) the sessions row for a run. The new version also stores the
  // goal type and the starting tier; a missing startTier is stored as NULL.
  // NOTE(review): s.goalType is bound as-is, while the goal_type column is added as
  // NOT NULL DEFAULT 'other' — a caller that omits goalType may fail the insert; confirm
  // every caller supplies it.
  function startSession(s) {
1190
1278
  getDb().prepare(
1191
- `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps)
1192
- VALUES (?, ?, ?, ?, ?, ?, ?)`
1193
- ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps);
1279
+ `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
1280
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
1281
+ ).run(s.id, s.ts, s.date, s.goal, s.command, s.objective, s.plannedSteps, s.goalType, s.startTier ?? null);
1194
1282
  }
1195
1283
  function finishSession(id, u) {
1196
1284
  getDb().prepare(
1197
1285
  `UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
1198
- prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
1286
+ prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
1199
1287
  ).run(
1200
1288
  u.plannedSteps,
1201
1289
  u.completedSteps,
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
1205
1293
  u.completionTokens,
1206
1294
  u.costUsd,
1207
1295
  u.durationMs,
1296
+ u.attempts ?? 1,
1297
+ u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
1208
1298
  id
1209
1299
  );
1210
1300
  }
1301
// Appends one row to the attempts table for a single verify-and-escalate attempt.
// `a` carries the session id, 1-based attempt number, goal type, the rung's tier floor
// and objective, the token/cost/duration figures for the attempt, and the verdict counts.
// `a.passed` is stored as 1/0; all other fields are bound unchanged.
+ function recordAttempt(a) {
1302
+ getDb().prepare(
1303
+ `INSERT INTO attempts
1304
+ (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
1305
+ cost_usd, criteria_total, criteria_met, passed, duration_ms)
1306
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
1307
+ ).run(
1308
+ a.sessionId,
1309
+ a.attemptNo,
1310
+ a.goalType,
1311
+ a.tierFloor,
1312
+ a.objective,
1313
+ a.promptTokens,
1314
+ a.completionTokens,
1315
+ a.costUsd,
1316
+ a.criteriaTotal,
1317
+ a.criteriaMet,
1318
+ a.passed ? 1 : 0,
1319
+ a.durationMs
1320
+ );
1321
+ }
1322
// Aggregates verified sessions per (goal type, starting tier).
// Only sessions with a recorded verdict (final_passed IS NOT NULL) are counted; a NULL
// start_tier is reported as 'cheap'. For each group it returns the session count, the
// share of sessions that finally passed, the average prompt+completion tokens, and the
// average number of attempts, ordered by goal type and then by ascending average tokens.
// Values are coerced to plain strings/numbers, with 0 substituted for NULL aggregates.
+ function goalTierStats() {
1323
+ const rows = getDb().prepare(
1324
+ `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
1325
+ COUNT(*) AS sessions,
1326
+ AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
1327
+ AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
1328
+ AVG(attempts) AS avgAttempts
1329
+ FROM sessions
1330
+ WHERE final_passed IS NOT NULL
1331
+ GROUP BY goal_type, startTier
1332
+ ORDER BY goal_type, avgTotalTokens ASC`
1333
+ ).all();
1334
+ return rows.map((r) => ({
1335
+ goalType: String(r.goalType),
1336
+ startTier: String(r.startTier),
1337
+ sessions: Number(r.sessions),
1338
+ passRate: Number(r.passRate ?? 0),
1339
+ avgTotalTokens: Number(r.avgTotalTokens ?? 0),
1340
+ avgAttempts: Number(r.avgAttempts ?? 0)
1341
+ }));
1342
+ }
1343
// Picks the learned starting tier for `goalType`, or null when evidence is insufficient.
// A (goal type, start tier) group qualifies only with at least `minSessions` verified
// sessions (default 3) AND a pass rate of at least 0.6. Among qualifying groups the one
// with the lowest average total tokens wins.
+ function optimalStartTier(goalType, minSessions = 3) {
1344
+ const stats = goalTierStats().filter(
1345
+ (s) => s.goalType === goalType && s.sessions >= minSessions && s.passRate >= 0.6
1346
+ );
1347
+ if (!stats.length) return null;
1348
+ return stats.sort((a, b) => a.avgTotalTokens - b.avgTotalTokens)[0].startTier;
1349
+ }
1211
1350
  function setUserScore(sessionId, score) {
1212
1351
  getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`).run(score, sessionId);
1213
1352
  }
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
1651
1790
  }
1652
1791
  out.push("");
1653
1792
  }
1793
+ const tierStats = goalTierStats();
1794
+ if (tierStats.length) {
1795
+ out.push(c.bold("Optimal starting model per goal type") + c.dim(" (pass rate vs total tokens to reach the goal)"));
1796
+ out.push(
1797
+ table(
1798
+ ["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
1799
+ tierStats.map((s) => [
1800
+ s.goalType,
1801
+ tierColor(s.startTier),
1802
+ String(s.sessions),
1803
+ `${Math.round(s.passRate * 100)}%`,
1804
+ tokens(Math.round(s.avgTotalTokens)),
1805
+ s.avgAttempts.toFixed(1)
1806
+ ])
1807
+ )
1808
+ );
1809
+ const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
1810
+ const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
1811
+ if (learned.length) {
1812
+ out.push(
1813
+ c.green(
1814
+ "\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
1815
+ )
1816
+ );
1817
+ } else {
1818
+ out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
1819
+ }
1820
+ out.push("");
1821
+ }
1654
1822
  if (byCommand.length) {
1655
1823
  out.push(c.bold("Usage by command"));
1656
1824
  out.push(
@@ -2086,6 +2254,28 @@ var TOOL_SCHEMAS = [
2086
2254
  }
2087
2255
  }
2088
2256
  ];
2257
// KNOWN_TOOLS: names of every tool declared in TOOL_SCHEMAS; used to validate tool
// calls recovered from plain text.
// READONLY_TOOL_SCHEMAS: the subset of schemas offered to the verify stage.
// NOTE(review): "run_command" is included, and a shell command is not inherently
// read-only — presumably the tool context (allowCommands / allowWrite) is what actually
// constrains it; confirm.
+ var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name));
2258
+ var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter(
2259
+ (t) => ["read_file", "list_dir", "run_command"].includes(t.function.name)
2260
+ );
2261
// Recovers a tool call that a model wrote as JSON text instead of a native tool call.
// Accepts the tool name under `name`, `tool`, or `function.name`, and the arguments
// under `arguments`, `parameters`, or `function.arguments` (default {}).
// Returns a tool-call-shaped object whose arguments are a JSON string, or null when
// the content is empty, holds no JSON, fails to parse, or names a tool not in KNOWN_TOOLS.
// The id is derived from the tool name only, so it is not unique across calls.
+ function parseTextToolCall(content) {
2262
+ if (!content) return null;
2263
+ const json = extractJson(content);
2264
+ if (!json) return null;
2265
+ try {
2266
+ const obj = JSON.parse(json);
2267
+ const name = obj?.name ?? obj?.tool ?? obj?.function?.name;
2268
+ if (typeof name !== "string" || !KNOWN_TOOLS.has(name)) return null;
2269
+ const args = obj.arguments ?? obj.parameters ?? obj.function?.arguments ?? {};
2270
+ return {
2271
+ id: `textcall_${name}`,
2272
+ type: "function",
2273
+ function: { name, arguments: typeof args === "string" ? args : JSON.stringify(args) }
2274
+ };
2275
+ } catch {
2276
+ return null;
2277
+ }
2278
+ }
2089
2279
  var MAX_OUTPUT = 8e3;
2090
2280
  function clip(s) {
2091
2281
  return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
@@ -2167,50 +2357,124 @@ ${stderr}`)) };
2167
2357
  }
2168
2358
  }
2169
2359
 
2170
- // src/agent/loop.ts
2171
- var MAX_ITERS_PER_STEP = 6;
2172
- var KNOWN_TOOLS = new Set(TOOL_SCHEMAS.map((t) => t.function.name));
2173
- function parseTextToolCall(content) {
2174
- if (!content) return null;
2175
- const json = extractJson(content);
2360
+ // src/agent/verify.ts
2361
+ var VERIFY_MAX_ITERS = 8;
2362
+ var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
2363
+ Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
2364
+ When done, reply with ONLY this JSON (no prose, no code fence):
2365
+ {"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`;
2366
// Runs the verify stage: asks `deps.model` to inspect the workspace and judge each
// acceptance criterion, looping for at most VERIFY_MAX_ITERS model calls.
//
// goal      the user's request text
// criteria  list of acceptance-criterion strings, numbered into the prompt
// deps      { client, model, cwd, allowCommands }; tools run with allowWrite forced to false
// ev        optional callbacks: onUsage(result), onToolCall(name, args), onToolResult(name, result)
//
// Returns the parsed verdict, or fallbackVerdict(criteria) when the loop ends without one.
+ async function verifyGoal(goal, criteria, deps, ev = {}) {
2367
+ const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
2368
+ const useTools = deps.model.capabilities.tools;
2369
+ const messages = [
2370
+ { role: "system", content: VERIFY_SYSTEM },
2371
+ {
2372
+ role: "user",
2373
+ content: `Goal: ${goal}
2374
+
2375
+ Acceptance criteria:
2376
+ ` + criteria.map((c2, i) => `${i + 1}. ${c2}`).join("\n") + `
2377
+
2378
+ Inspect the workspace, then return the verdict JSON.`
2379
+ }
2380
+ ];
2381
+ let verdict = null;
2382
+ for (let iter = 0; iter < VERIFY_MAX_ITERS; iter++) {
2383
+ const gen = deps.client.stream(
2384
+ { model: deps.model.id, messages, tools: useTools ? READONLY_TOOL_SCHEMAS : void 0, temperature: 0, maxTokens: 1500 },
2385
+ deps.model.pricing
2386
+ );
2387
// Drain the stream without emitting deltas; only the generator's final value is used.
+ let next = await gen.next();
2388
+ while (!next.done) next = await gen.next();
2389
+ const result = next.value;
2390
+ ev.onUsage?.(result);
2391
// Prefer native tool calls; otherwise try to recover one tool call written as JSON text.
+ const calls = result.toolCalls.length ? result.toolCalls : useTools && parseTextToolCall(result.content) ? [parseTextToolCall(result.content)] : [];
2392
// A parseable verdict ends the loop before any pending tool calls are executed.
+ const parsed = parseVerdict(result.content, criteria);
2393
+ if (parsed) {
2394
+ verdict = parsed;
2395
+ break;
2396
+ }
2397
+ if (calls.length) {
2398
+ if (result.toolCalls.length) messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2399
+ for (const tc of calls) {
2400
+ ev.onToolCall?.(tc.function.name, tc.function.arguments);
2401
+ const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2402
+ ev.onToolResult?.(tc.function.name, outcome.result);
2403
+ if (result.toolCalls.length) {
2404
+ messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2405
+ } else {
2406
+ messages.push({ role: "assistant", content: result.content });
2407
+ messages.push({ role: "user", content: `Tool ${tc.function.name} returned:
2408
+ ${outcome.result}
2409
+ Continue, then return the verdict JSON.` });
2410
+ }
2411
+ }
2412
+ continue;
2413
+ }
2414
// Neither a verdict nor a tool call: ask the model again for the JSON only.
+ messages.push({ role: "assistant", content: result.content });
2415
+ messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
2416
+ }
2417
+ return verdict ?? fallbackVerdict(criteria);
2418
+ }
2419
// Parses the verifier's reply into a verdict object.
//
// @param text      raw model output expected to contain the verdict JSON
// @param criteria  the acceptance criteria that were sent to the verifier
// @returns         { total, metCount, allMet, results, unmet, feedback }, or null when
//                  the text holds no JSON, the JSON is invalid, or `results` is missing/empty.
//
// `met` is true only for boolean true or the string "true" (any case). Reasons are
// capped at 300 characters and feedback at 1000; when feedback is empty it falls back
// to the unmet reasons joined by "; ".
function parseVerdict(text, criteria) {
  const json = extractJson(text);
  if (!json) return null;
  try {
    const obj = JSON.parse(json);
    if (!Array.isArray(obj.results)) return null;
    const results = obj.results.map((r) => ({
      criterion: String(r.criterion ?? ""),
      met: r.met === true || String(r.met).toLowerCase() === "true",
      reason: String(r.reason ?? "").slice(0, 300)
    }));
    if (!results.length) return null;
    // A verifier that reports on fewer criteria than it was given must not produce
    // "all met": every criterion it skipped is added as unmet. Replies that already
    // cover at least as many entries as there are criteria are left untouched.
    const expected = Array.isArray(criteria) ? criteria : [];
    if (results.length < expected.length) {
      const normalize = (s) => String(s).trim().toLowerCase();
      const reported = new Set(results.map((r) => normalize(r.criterion)));
      const missing = expected.filter((c2) => !reported.has(normalize(c2))).slice(0, expected.length - results.length);
      for (const c2 of missing) {
        results.push({ criterion: String(c2), met: false, reason: "verifier returned no result for this criterion" });
      }
    }
    const unmet = results.filter((r) => !r.met);
    return {
      total: results.length,
      metCount: results.length - unmet.length,
      allMet: unmet.length === 0,
      results,
      unmet,
      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
    };
  } catch {
    return null;
  }
}
2444
// Verdict used when the verifier never produced parseable JSON: every criterion is
// marked unmet with a fixed reason, so allMet is always false and metCount is 0.
+ function fallbackVerdict(criteria) {
2445
+ const results = criteria.map((c2) => ({ criterion: c2, met: false, reason: "verifier produced no verdict" }));
2446
+ return { total: results.length, metCount: 0, allMet: false, results, unmet: results, feedback: "Verification inconclusive; re-attempt with a stronger model." };
2447
+ }
2448
+
2449
+ // src/agent/loop.ts
2450
// Formats a Date as YYYY-MM-DD using its local-time getters (not UTC).
// Defaults to the current time; month and day are zero-padded to two digits.
+ function localDate2(d = /* @__PURE__ */ new Date()) {
2451
+ const y = d.getFullYear();
2452
+ const m = String(d.getMonth() + 1).padStart(2, "0");
2453
+ const day = String(d.getDate()).padStart(2, "0");
2454
+ return `${y}-${m}-${day}`;
2455
+ }
2191
2456
  async function runAgent(goal, deps, emit) {
2192
- const { client: client2, models, policy, sessionId, cwd } = deps;
2193
- let totalCostUsd = 0;
2194
- let totalTokens = 0;
2195
- let totalPromptTokens = 0;
2196
- let totalCompletionTokens = 0;
2197
- let calls = 0;
2457
+ const { client: client2, models, cwd } = deps;
2458
+ const verifyOn = deps.verify ?? true;
2459
+ const maxAttempts = deps.maxAttempts ?? 3;
2460
+ const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
2198
2461
  const sessionStart = Date.now();
2199
- let completedSteps = 0;
2200
- let failedSteps = 0;
2201
- const planRoute = route("plan", models, policy);
2462
+ const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
2463
+ const logUsage = (r, taskType) => {
2464
+ const entry = logCompletion(r, taskType, deps.sessionId);
2465
+ emit({ type: "usage", entry });
2466
+ acc.cost += entry.costUsd;
2467
+ acc.tokens += entry.totalTokens;
2468
+ acc.prompt += entry.promptTokens;
2469
+ acc.completion += entry.completionTokens;
2470
+ acc.calls++;
2471
+ return entry;
2472
+ };
2473
+ const planRoute = route("plan", models, deps.policy);
2202
2474
  let plan;
2203
2475
  if (planRoute) {
2204
2476
  try {
2205
- plan = await planRequest(goal, client2, planRoute.model, (result) => {
2206
- const entry = logCompletion(result, "plan", sessionId);
2207
- emit({ type: "usage", entry });
2208
- totalCostUsd += entry.costUsd;
2209
- totalTokens += entry.totalTokens;
2210
- totalPromptTokens += entry.promptTokens;
2211
- totalCompletionTokens += entry.completionTokens;
2212
- calls++;
2213
- });
2477
+ plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
2214
2478
  } catch {
2215
2479
  plan = heuristicPlan(goal);
2216
2480
  }
@@ -2218,164 +2482,276 @@ async function runAgent(goal, deps, emit) {
2218
2482
  plan = heuristicPlan(goal);
2219
2483
  }
2220
2484
  emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
2485
+ let startRung = 0;
2486
+ let learned = false;
2487
+ if (verifyOn) {
2488
+ const tier = optimalStartTier(plan.goalType);
2489
+ if (tier) {
2490
+ const r = rungForTier(tier);
2491
+ if (r > 0) {
2492
+ startRung = r;
2493
+ learned = true;
2494
+ }
2495
+ }
2496
+ }
2497
+ const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
2498
+ emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
2221
2499
  startSession({
2222
- id: sessionId,
2500
+ id: deps.sessionId,
2223
2501
  ts: sessionStart,
2224
2502
  date: localDate2(),
2225
2503
  goal,
2226
2504
  command: "run",
2227
- objective: policy.objective,
2228
- plannedSteps: plan.steps.length
2505
+ objective: deps.policy.objective,
2506
+ plannedSteps: plan.steps.length,
2507
+ goalType: plan.goalType,
2508
+ startTier
2229
2509
  });
2230
- const toolCtx = {
2231
- cwd,
2232
- allowWrite: deps.allowWrite,
2233
- allowCommands: deps.allowCommands
2234
- };
2510
+ let rung = startRung;
2511
+ let attemptNo = 0;
2512
+ let verdict = null;
2513
+ let completedSteps = 0;
2514
+ let failedSteps = 0;
2235
2515
  const priorSummaries = [];
2236
- for (const step of plan.steps) {
2237
- const r = route(step.type, models, policy, {
2238
- promptTokens: step.estPromptTokens,
2239
- completionTokens: step.estCompletionTokens
2240
- });
2241
- if (!r) {
2242
- failedSteps++;
2243
- emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
2244
- continue;
2245
- }
2246
- const model = r.model;
2247
- emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
2248
- const useTools = model.capabilities.tools;
2249
- const messages = [
2250
- { role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
2251
- { role: "user", content: step.description }
2252
- ];
2253
- const stepStart = Date.now();
2254
- let stepPrompt = 0;
2255
- let stepCompletion = 0;
2256
- let stepCost = 0;
2257
- let stepToolCalls = 0;
2258
- let iterations = 0;
2259
- let finishedBy = "max-iters";
2260
- let summary = "";
2261
- try {
2262
- for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
2263
- iterations = iter + 1;
2264
- const gen = client2.stream(
2265
- {
2266
- model: model.id,
2267
- messages,
2268
- tools: useTools ? TOOL_SCHEMAS : void 0,
2269
- temperature: 0.2,
2270
- maxTokens: 2e3
2271
- },
2272
- model.pricing
2273
- );
2274
- let next = await gen.next();
2275
- while (!next.done) {
2276
- emit({ type: "text", delta: next.value });
2277
- next = await gen.next();
2278
- }
2279
- const result = next.value;
2280
- const entry = logCompletion(result, step.type, sessionId);
2281
- emit({ type: "usage", entry });
2282
- totalCostUsd += entry.costUsd;
2283
- totalTokens += entry.totalTokens;
2284
- totalPromptTokens += entry.promptTokens;
2285
- totalCompletionTokens += entry.completionTokens;
2286
- stepPrompt += entry.promptTokens;
2287
- stepCompletion += entry.completionTokens;
2288
- stepCost += entry.costUsd;
2289
- calls++;
2290
- if (result.toolCalls.length && useTools) {
2291
- messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
2292
- let finished = false;
2293
- for (const tc of result.toolCalls) {
2294
- stepToolCalls++;
2295
- emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
2296
- const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
2297
- emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
2298
- messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
2299
- if (outcome.finishSummary != null) {
2300
- summary = outcome.finishSummary;
2301
- finished = true;
2302
- }
2303
- }
2304
- if (finished) {
2305
- finishedBy = "finish-tool";
2306
- break;
2307
- }
2308
- continue;
2309
- }
2310
- const textCall = useTools ? parseTextToolCall(result.content) : null;
2311
- if (textCall) {
2312
- stepToolCalls++;
2313
- emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
2314
- const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
2315
- emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
2316
- if (outcome.finishSummary != null) {
2317
- summary = outcome.finishSummary;
2318
- finishedBy = "finish-tool";
2319
- break;
2320
- }
2321
- messages.push({ role: "assistant", content: result.content });
2322
- messages.push({
2323
- role: "user",
2324
- content: `Tool ${textCall.function.name} returned:
2325
- ${outcome.result}
2326
- Continue with this step. When the objective is met, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
2327
- });
2328
- continue;
2329
- }
2330
- summary = result.content || summary;
2331
- if (summary) finishedBy = "text";
2332
- break;
2516
+ while (attemptNo < maxAttempts) {
2517
+ const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
2518
+ const rungPolicy = applyRung(deps.policy, rungDef);
2519
+ const attemptStart = Date.now();
2520
+ const before = { ...acc };
2521
+ if (attemptNo === 0) {
2522
+ for (const step of plan.steps) {
2523
+ const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
2524
+ if (res.success) completedSteps++;
2525
+ else failedSteps++;
2333
2526
  }
2334
- } catch (err) {
2335
- finishedBy = "error";
2336
- emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
2527
+ } else {
2528
+ await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
2337
2529
  }
2338
- const success = finishedBy === "finish-tool" || finishedBy === "text";
2339
- if (success) completedSteps++;
2340
- else failedSteps++;
2341
- recordStepRun({
2342
- sessionId,
2343
- stepNo: step.id,
2344
- taskType: step.type,
2345
- skill: TASK_SKILL[step.type],
2346
- model: model.id,
2347
- provider: model.provider,
2348
- iterations,
2349
- toolCalls: stepToolCalls,
2350
- promptTokens: stepPrompt,
2351
- completionTokens: stepCompletion,
2352
- costUsd: stepCost,
2353
- finishedBy,
2354
- success,
2355
- durationMs: Date.now() - stepStart
2530
+ if (!verifyOn) {
2531
+ attemptNo++;
2532
+ break;
2533
+ }
2534
+ const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
2535
+ const verifier = routeOrBest("verify", models, verifyPolicy);
2536
+ if (!verifier) {
2537
+ emit({ type: "error", message: "No model available to verify." });
2538
+ attemptNo++;
2539
+ break;
2540
+ }
2541
+ emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
2542
+ verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
2543
+ onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
2544
+ onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
2545
+ onUsage: (r) => logUsage(r, "review")
2356
2546
  });
2357
- if (!summary) summary = "(no summary)";
2358
- priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
2359
- emit({ type: "step-end", step, summary });
2547
+ emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
2548
+ recordAttempt({
2549
+ sessionId: deps.sessionId,
2550
+ attemptNo: attemptNo + 1,
2551
+ goalType: plan.goalType,
2552
+ tierFloor: rungDef.tierFloor ?? null,
2553
+ objective: rungDef.objective,
2554
+ promptTokens: acc.prompt - before.prompt,
2555
+ completionTokens: acc.completion - before.completion,
2556
+ costUsd: acc.cost - before.cost,
2557
+ criteriaTotal: verdict.total,
2558
+ criteriaMet: verdict.metCount,
2559
+ passed: verdict.allMet,
2560
+ durationMs: Date.now() - attemptStart
2561
+ });
2562
+ attemptNo++;
2563
+ if (verdict.allMet) break;
2564
+ if (attemptNo < maxAttempts) {
2565
+ const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
2566
+ rung = next;
2567
+ emit({
2568
+ type: "escalate",
2569
+ toRung: ESCALATION_LADDER[next].label,
2570
+ reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
2571
+ });
2572
+ }
2360
2573
  }
2361
- finishSession(sessionId, {
2574
+ const passed = verifyOn ? verdict ? verdict.allMet : false : null;
2575
+ finishSession(deps.sessionId, {
2362
2576
  plannedSteps: plan.steps.length,
2363
2577
  completedSteps,
2364
2578
  failedSteps,
2365
- autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
2366
- promptTokens: totalPromptTokens,
2367
- completionTokens: totalCompletionTokens,
2368
- costUsd: totalCostUsd,
2369
- durationMs: Date.now() - sessionStart
2579
+ autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
2580
+ promptTokens: acc.prompt,
2581
+ completionTokens: acc.completion,
2582
+ costUsd: acc.cost,
2583
+ durationMs: Date.now() - sessionStart,
2584
+ attempts: attemptNo,
2585
+ finalPassed: passed
2370
2586
  });
2371
- emit({ type: "done", totalCostUsd, totalTokens, calls });
2372
- return { totalCostUsd, totalTokens, calls };
2587
+ emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
2588
+ return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
2373
2589
  }
2374
- function localDate2(d = /* @__PURE__ */ new Date()) {
2375
- const y = d.getFullYear();
2376
- const m = String(d.getMonth() + 1).padStart(2, "0");
2377
- const day = String(d.getDate()).padStart(2, "0");
2378
- return `${y}-${m}-${day}`;
/**
 * Execute a single planned step: route it to a model, run the tool loop,
 * persist the step record and report the outcome.
 *
 * Side effects visible here: emits `error` (no model) or `step-start` /
 * `step-end` events, calls `recordStepRun`, and appends a one-line summary
 * to `priorSummaries`.
 *
 * @param {object} step - Planned step; reads `id`, `type`, `description`,
 *   `estPromptTokens` and `estCompletionTokens`.
 * @param {object} policy - Routing policy handed to `routeOrBest`.
 * @param {object} rungDef - Escalation rung, forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; reads `models` and `sessionId`.
 * @param {object} toolCtx - Tool execution context, forwarded to `runToolLoop`.
 * @param {string[]} priorSummaries - Summaries of earlier steps; mutated in place.
 * @param {Function} emit - Event sink.
 * @param {Function} logUsage - Usage logger, forwarded to `runToolLoop`.
 * @param {string} goal - Overall goal text used in the system prompt.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
  const { id: stepNo, type: taskType } = step;
  const estimate = {
    promptTokens: step.estPromptTokens,
    completionTokens: step.estCompletionTokens
  };
  const route = routeOrBest(taskType, deps.models, policy, estimate);
  if (!route) {
    emit({ type: "error", message: `No capable model for step ${stepNo} (${taskType}).` });
    return { summary: "(no model)", success: false };
  }

  const { model } = route;
  emit({ type: "step-start", step, model, estCostUsd: route.estCostUsd });

  const systemPrompt = stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools);
  const conversation = [
    { role: "system", content: systemPrompt },
    { role: "user", content: step.description }
  ];
  const outcome = await runToolLoop(model, conversation, taskType, rungDef, deps, toolCtx, emit, logUsage);

  const stepRecord = {
    sessionId: deps.sessionId,
    stepNo,
    taskType,
    skill: TASK_SKILL[taskType],
    model: model.id,
    provider: model.provider,
    iterations: outcome.iterations,
    toolCalls: outcome.toolCalls,
    promptTokens: outcome.prompt,
    completionTokens: outcome.completion,
    costUsd: outcome.cost,
    finishedBy: outcome.finishedBy,
    success: outcome.success,
    durationMs: outcome.durationMs
  };
  recordStepRun(stepRecord);

  // An empty loop summary is replaced by a placeholder before it is shared.
  const summary = outcome.summary ? outcome.summary : "(no summary)";
  priorSummaries.push(`Step ${stepNo} (${taskType}): ${summary}`);
  emit({ type: "step-end", step, summary });
  return { summary, success: outcome.success };
}
/**
 * Run one "fix" pass after the verify gate reported unmet criteria.
 *
 * Routes an "edit" task, builds a prompt listing every unmet criterion plus
 * the verifier feedback, runs the tool loop, records the run and emits
 * `step-start` / `step-end` events for a synthetic step.
 *
 * @param {string} goal - Overall goal text, embedded in the system prompt.
 * @param {object} plan - Current plan (not read by this function).
 * @param {object} verdict - Last verdict; reads `unmet` (items with
 *   `criterion` and `reason`) and `feedback`.
 * @param {object} policy - Routing policy handed to `routeOrBest`.
 * @param {object} rungDef - Escalation rung, forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; reads `models` and `sessionId`.
 * @param {object} toolCtx - Tool execution context, forwarded to `runToolLoop`.
 * @param {Function} emit - Event sink.
 * @param {Function} logUsage - Usage logger, forwarded to `runToolLoop`.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
  const route = routeOrBest("edit", deps.models, policy);
  if (!route) {
    // Report the failure like runStep does instead of skipping the pass silently.
    emit({ type: "error", message: "No capable model for the fix pass (edit)." });
    return { summary: "(no model)", success: false };
  }
  const model = route.model;

  // Synthetic step: every fix pass is reported and recorded under id 100.
  const fixStep = {
    id: 100,
    type: "edit",
    description: "Fix the unmet acceptance criteria",
    estPromptTokens: 9e3,
    estCompletionTokens: 1500
  };
  emit({ type: "step-start", step: fixStep, model, estCostUsd: route.estCostUsd });

  const unmet = verdict.unmet
    .map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`)
    .join("\n");
  // Avoid sending the literal text "undefined" when the verifier gave no feedback.
  const feedback = verdict.feedback ?? "(none)";

  const systemPrompt = [
    "You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.",
    `Overall goal: ${goal}`,
    "You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call `finish` with a one-line summary when all listed criteria should now pass.",
    'If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}'
  ].join("\n");
  const messages = [
    { role: "system", content: systemPrompt },
    { role: "user", content: `Unmet criteria:\n${unmet}\n\nVerifier feedback: ${feedback}` }
  ];

  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: fixStep.id,
    taskType: "edit",
    skill: TASK_SKILL.edit,
    model: model.id,
    provider: model.provider,
    iterations: loop.iterations,
    toolCalls: loop.toolCalls,
    promptTokens: loop.prompt,
    completionTokens: loop.completion,
    costUsd: loop.cost,
    finishedBy: loop.finishedBy,
    success: loop.success,
    durationMs: loop.durationMs
  });
  emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" });
  return { summary: loop.summary, success: loop.success };
}
/**
 * Run a bounded model/tool conversation and report how it ended.
 *
 * Each iteration streams one completion from `deps.client.stream`, forwards
 * every streamed chunk as a `text` event and logs usage via `logUsage`. It
 * then either executes the requested tool calls (appending the exchange to
 * `messages`) or treats the reply as the final answer.
 *
 * `finishedBy` starts as "max-iters" and becomes "finish-tool" when a tool
 * outcome carries a `finishSummary`, "text" when the model returns non-empty
 * plain content, or "error" when anything inside the loop throws. An empty
 * plain reply ends the loop without changing `finishedBy`.
 *
 * @param {object} model - Routed model; reads `id`, `pricing`, `capabilities.tools`.
 * @param {Array<object>} messages - Conversation so far; mutated in place.
 * @param {string} taskTypeForLog - Label passed to `logUsage` and used in the error event.
 * @param {object} rungDef - Reads `maxIters` and `maxTokens`.
 * @param {object} deps - Reads `client.stream`.
 * @param {object} toolCtx - Passed through to `executeTool`.
 * @param {Function} emit - Event sink.
 * @param {Function} logUsage - Called once per completion; its return value
 *   must expose `promptTokens`, `completionTokens` and `costUsd`.
 * @returns {Promise<{summary: string, success: boolean, finishedBy: string,
 *   iterations: number, toolCalls: number, prompt: number, completion: number,
 *   cost: number, durationMs: number}>}
 */
async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
  const useTools = model.capabilities.tools;
  const start = Date.now();
  let prompt = 0;
  let completion = 0;
  let cost = 0;
  let toolCalls = 0;
  let iterations = 0;
  let summary = "";
  let finishedBy = "max-iters";
  try {
    for (let iter = 0; iter < rungDef.maxIters; iter++) {
      iterations = iter + 1;
      const gen = deps.client.stream(
        { model: model.id, messages, tools: useTools ? TOOL_SCHEMAS : void 0, temperature: 0.2, maxTokens: rungDef.maxTokens },
        model.pricing
      );
      // Drain the stream: yielded values are text deltas, the return value is the result.
      let next = await gen.next();
      while (!next.done) {
        emit({ type: "text", delta: next.value });
        next = await gen.next();
      }
      const result = next.value;
      const entry = logUsage(result, taskTypeForLog);
      prompt += entry.promptTokens;
      completion += entry.completionTokens;
      cost += entry.costUsd;

      // Only honour native tool calls when tools were offered. A result without
      // a `toolCalls` array is treated as "no calls" instead of throwing.
      const nativeCalls = useTools ? (result.toolCalls ?? []) : [];
      if (nativeCalls.length) {
        messages.push({ role: "assistant", content: result.content, tool_calls: nativeCalls });
        let finished = false;
        for (const tc of nativeCalls) {
          toolCalls++;
          emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
          const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
          emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
          if (outcome.finishSummary != null) {
            summary = outcome.finishSummary;
            finished = true;
          }
        }
        if (finished) {
          finishedBy = "finish-tool";
          break;
        }
        continue;
      }

      // Fallback: a tool-capable model may have written the call as JSON text.
      const textCall = useTools ? parseTextToolCall(result.content) : null;
      if (textCall) {
        toolCalls++;
        emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
        const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
        emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
        if (outcome.finishSummary != null) {
          summary = outcome.finishSummary;
          finishedBy = "finish-tool";
          break;
        }
        messages.push({ role: "assistant", content: result.content });
        messages.push({
          role: "user",
          content: `Tool ${textCall.function.name} returned:\n${outcome.result}\nContinue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
        });
        continue;
      }

      // Plain reply: it is the final answer when non-empty.
      summary = result.content || summary;
      if (summary) finishedBy = "text";
      break;
    }
  } catch (err) {
    finishedBy = "error";
    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
  }
  return {
    summary,
    success: finishedBy === "finish-tool" || finishedBy === "text",
    finishedBy,
    iterations,
    toolCalls,
    prompt,
    completion,
    cost,
    durationMs: Date.now() - start
  };
}
2380
2756
  function stepSystemPrompt(goal, step, priorSummaries, useTools) {
2381
2757
  const context = priorSummaries.length ? `
@@ -2389,7 +2765,7 @@ Return a concise result for this step. Do not ask the user questions.`;
2389
2765
  return `You are the "${step.type}" stage of an autonomous coding agent.
2390
2766
  Overall goal: ${goal}
2391
2767
  Your current step: ${step.description}${context}${toolNote}
2392
- Be efficient \u2014 you were selected as the cheapest capable model for this step.`;
2768
+ Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
2393
2769
  }
2394
2770
 
2395
2771
  // src/tui/App.tsx
@@ -2405,6 +2781,8 @@ function App(props) {
2405
2781
  const [tok, setTok] = useState(0);
2406
2782
  const [calls, setCalls] = useState(0);
2407
2783
  const [rated, setRated] = useState(null);
2784
+ const [passed, setPassed] = useState(null);
2785
+ const [attempts, setAttempts] = useState(0);
2408
2786
  const push = useCallback((text, color) => {
2409
2787
  setLog((l) => [...l, { key: l.length, text, color }]);
2410
2788
  }, []);
@@ -2422,7 +2800,9 @@ function App(props) {
2422
2800
  sessionId: props.sessionId,
2423
2801
  cwd: props.cwd,
2424
2802
  allowWrite: props.allowWrite,
2425
- allowCommands: props.allowCommands
2803
+ allowCommands: props.allowCommands,
2804
+ verify: props.verify,
2805
+ maxAttempts: props.maxAttempts
2426
2806
  };
2427
2807
  let textBuf = "";
2428
2808
  const flush = () => {
@@ -2434,6 +2814,24 @@ function App(props) {
2434
2814
  case "plan":
2435
2815
  push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
2436
2816
  break;
2817
+ case "criteria":
2818
+ push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
2819
+ e.criteria.forEach((cr, i) => push(` ${i + 1}. ${cr}`, "gray"));
2820
+ break;
2821
+ case "verify-start":
2822
+ flush();
2823
+ push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
2824
+ break;
2825
+ case "verdict":
2826
+ flush();
2827
+ push(
2828
+ `${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
2829
+ e.allMet ? "green" : "red"
2830
+ );
2831
+ break;
2832
+ case "escalate":
2833
+ push(`\u23EB Escalate \u2192 ${e.toRung} (${e.reason})`, "magenta");
2834
+ break;
2437
2835
  case "step-start":
2438
2836
  flush();
2439
2837
  push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id} ~${usd(e.estCostUsd)}`, "yellow");
@@ -2463,6 +2861,8 @@ function App(props) {
2463
2861
  break;
2464
2862
  case "done":
2465
2863
  flush();
2864
+ setPassed(e.passed);
2865
+ setAttempts(e.attempts);
2466
2866
  break;
2467
2867
  }
2468
2868
  };
@@ -2522,10 +2922,14 @@ function App(props) {
2522
2922
  " working\u2026"
2523
2923
  ] }),
2524
2924
  phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
2525
- /* @__PURE__ */ jsxs(Text, { color: "green", children: [
2526
- "\u2713 Done \xB7 ",
2925
+ /* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
2926
+ passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
2927
+ " \xB7 ",
2928
+ attempts,
2929
+ " attempt(s) \xB7 ",
2527
2930
  calls,
2528
- " calls \xB7 ",
2931
+ " calls \xB7",
2932
+ " ",
2529
2933
  tokens(tok),
2530
2934
  " tokens \xB7 ",
2531
2935
  usd(cost)
@@ -2610,7 +3014,7 @@ function truncate2(s, n) {
2610
3014
 
2611
3015
  // src/index.ts
2612
3016
  var program = new Command();
2613
- program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.3.1");
3017
+ program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.4.0");
2614
3018
  function client(config) {
2615
3019
  return new OpenRouterClient({
2616
3020
  apiKey: resolveApiKey(config),
@@ -2684,7 +3088,7 @@ async function loadCatalog(config, refresh = false) {
2684
3088
  program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
2685
3089
  await runLogin();
2686
3090
  });
2687
- program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
3091
+ program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
2688
3092
  const startedAt = Date.now();
2689
3093
  const config = loadConfig();
2690
3094
  if (!config.local.enabled || resolveApiKey(config)) {
@@ -2709,6 +3113,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
2709
3113
  allowWrite: !!opts.write,
2710
3114
  allowCommands: !!opts.commands,
2711
3115
  objectiveLabel: policy.objective,
3116
+ verify: opts.verify !== false,
3117
+ maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
2712
3118
  initialGoal: goal
2713
3119
  })
2714
3120
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polymath-agent",
3
- "version": "0.3.1",
3
+ "version": "0.4.0",
4
4
  "description": "Polymath — a cost-optimized, multi-model TUI coding agent. Decomposes work into typed tasks, routes each task to the cheapest capable model via OpenRouter, and logs real usage/cost by date + model.",
5
5
  "type": "module",
6
6
  "bin": {