polymath-agent 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/cli.js +604 -158
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -100,6 +100,37 @@ poly usage # cost by date + model
|
|
|
100
100
|
After each `poly run`, rate the result 0–9 (one keypress) — your goal-achievement
|
|
101
101
|
rating joins the auto score (completed/planned steps) to power `poly analyze`.
|
|
102
102
|
|
|
103
|
+
### Outcome-driven loop (verify → escalate → repeat)
|
|
104
|
+
|
|
105
|
+
`poly run` doesn't stop at "code written" — it measures the result and keeps going
|
|
106
|
+
until the goal is actually met:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
command → plan + acceptance criteria → code (cheapest model)
|
|
110
|
+
→ VERIFY result against criteria (inspects files, runs tests)
|
|
111
|
+
→ if unmet: ESCALATE (higher tier, more tokens, cost cap lifted) → fix → re-verify
|
|
112
|
+
→ repeat until all criteria pass (or --max-attempts)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
The cheapest model gets first crack; only the criteria it *fails* trigger a pricier
|
|
116
|
+
model — so you pay for frontier capability exactly when (and only when) it's needed.
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
poly run -w -x "add an add(a,b) to calc.js and make the tests pass"
|
|
120
|
+
poly run --no-verify "..." # single pass, no verify/escalate
|
|
121
|
+
poly run --max-attempts 5 "..." # try harder before giving up
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
After each run you'll see `✓ goal met · 2 attempts` (or `⚠ goal not fully met`).
|
|
125
|
+
|
|
126
|
+
### Statistical model optimization (learned starting tier)
|
|
127
|
+
|
|
128
|
+
Every attempt is recorded with its goal type, starting tier, tokens, and pass/fail.
|
|
129
|
+
`poly analyze` then shows, per goal type, **which starting model reaches the goal
|
|
130
|
+
with the fewest total tokens** — and once there's enough evidence (≥3 verified
|
|
131
|
+
sessions), `poly run` **auto-starts at that tier**, skipping cheap attempts for goal
|
|
132
|
+
types that historically need a stronger model from the start.
|
|
133
|
+
|
|
103
134
|
### The efficiency playbook (learned routing)
|
|
104
135
|
|
|
105
136
|
Everything is captured locally (SQLite). `poly analyze` distills it into a **playbook**
|
package/dist/cli.js
CHANGED
|
@@ -599,11 +599,14 @@ var TASK_SPECS = {
|
|
|
599
599
|
command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
|
|
600
600
|
review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
|
|
601
601
|
reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
|
|
602
|
+
// The verify gate inspects files / runs tests — it MUST have tools.
|
|
603
|
+
verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
|
|
602
604
|
explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
|
|
603
605
|
summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
|
|
604
606
|
chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
|
|
605
607
|
};
|
|
606
608
|
var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
|
|
609
|
+
var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"];
|
|
607
610
|
|
|
608
611
|
// src/planner/planner.ts
|
|
609
612
|
var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
|
|
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
|
|
|
619
622
|
summarize - condense long content
|
|
620
623
|
chat - a simple conversational reply
|
|
621
624
|
|
|
625
|
+
Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
|
|
626
|
+
|
|
622
627
|
Return ONLY minified JSON of the form:
|
|
623
|
-
{"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
|
|
628
|
+
{"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
|
|
624
629
|
Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
|
|
630
|
+
/**
 * Heuristically classify a free-text goal into a goal type.
 *
 * Rules are tried in a fixed order (bugfix, refactor, test, docs, chore,
 * feature) and the first matching rule wins; anything else is "other".
 * Keywords accept common plurals and inflections ("tests", "failing",
 * "dependencies", ...) so that a goal is not classified differently merely
 * because it uses the plural form of a keyword.
 *
 * @param {string} goal - The user's request text. Non-string or nullish
 *   values are coerced to a string instead of throwing.
 * @returns {"bugfix"|"refactor"|"test"|"docs"|"chore"|"feature"|"other"}
 */
function classifyGoalType(goal) {
  const g = String(goal ?? "").toLowerCase();
  // Order matters: earlier rules take precedence over later ones.
  const rules = [
    ["bugfix", /\b(fix(es|ed|ing)?|bugs?|broken|errors?|crash(es|ed|ing)?|regressions?|fail(s|ed|ing|ures?)?)\b/],
    // No trailing \b here on purpose: "simplif" must match simplify/simplification.
    ["refactor", /\b(refactor|rename|clean ?up|restructure|extract|simplif)/],
    ["test", /\b(tests?|specs?|coverage|unit tests?|e2e)\b/],
    ["docs", /\b(docs?|readme|comments?|documentation)\b/],
    ["chore", /\b(bump|upgrade|dependenc(y|ies)|deps|config|chore|lint|format)\b/],
    ["feature", /\b(add|create|implement|build|features?|support|new)\b/]
  ];
  for (const [type, pattern] of rules) {
    if (pattern.test(g)) return type;
  }
  return "other";
}
|
|
625
640
|
function heuristicPlan(goal) {
|
|
626
641
|
const steps = [
|
|
627
642
|
{ id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
|
|
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
|
|
|
630
645
|
{ id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
|
|
631
646
|
{ id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
|
|
632
647
|
];
|
|
633
|
-
return {
|
|
648
|
+
return {
|
|
649
|
+
goal,
|
|
650
|
+
steps,
|
|
651
|
+
goalType: classifyGoalType(goal),
|
|
652
|
+
criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
|
|
653
|
+
};
|
|
634
654
|
}
|
|
635
655
|
async function planRequest(goal, client2, planModel, onUsage) {
|
|
636
656
|
const result = await client2.complete(
|
|
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
|
|
|
648
668
|
onUsage?.(result);
|
|
649
669
|
const parsed = extractPlan(result.content);
|
|
650
670
|
if (!parsed) return heuristicPlan(goal);
|
|
651
|
-
return { goal,
|
|
671
|
+
return { goal, ...parsed };
|
|
652
672
|
}
|
|
653
673
|
function extractPlan(text) {
|
|
654
674
|
const json = extractJson(text);
|
|
@@ -663,7 +683,10 @@ function extractPlan(text) {
|
|
|
663
683
|
estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
|
|
664
684
|
estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
|
|
665
685
|
}));
|
|
666
|
-
|
|
686
|
+
if (!steps.length) return null;
|
|
687
|
+
const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
|
|
688
|
+
const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
|
|
689
|
+
return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
|
|
667
690
|
} catch {
|
|
668
691
|
return null;
|
|
669
692
|
}
|
|
@@ -705,10 +728,29 @@ function extractJson(text) {
|
|
|
705
728
|
}
|
|
706
729
|
|
|
707
730
|
// src/router/policy.ts
|
|
731
|
+
/**
 * Escalation rungs, ordered from the first (no tier floor) to the last
 * ("frontier" floor). `objective`, `tierFloor` and `liftCostCap` are consumed
 * by applyRung; `label` is a display string.
 * NOTE(review): maxTokens / maxIters are presumably per-step limits applied by
 * the agent loop - confirm against the step runner.
 */
var ESCALATION_LADDER = [
  { objective: "value", maxTokens: 2e3, maxIters: 6, liftCostCap: false, label: "value \xB7 cheapest-capable" },
  { tierFloor: "standard", objective: "value", maxTokens: 4e3, maxIters: 8, liftCostCap: true, label: "standard+ \xB7 more tokens" },
  { tierFloor: "frontier", objective: "quality", maxTokens: 8e3, maxIters: 10, liftCostCap: true, label: "frontier \xB7 strongest" }
];
/**
 * Find the ladder index whose tier floor corresponds to `tier`.
 *
 * A rung without a tier floor stands for the "cheap" tier.
 *
 * @param {string} tier - Tier name ("cheap", "standard", "frontier").
 * @returns {number} Index into ESCALATION_LADDER, or -1 when no rung matches.
 */
function rungForTier(tier) {
  for (let i = 0; i < ESCALATION_LADDER.length; i++) {
    const floor = ESCALATION_LADDER[i].tierFloor;
    if (floor === tier) return i;
    if (!floor && tier === "cheap") return i;
  }
  return -1;
}
|
|
739
|
+
/**
 * Derive the routing policy for one escalation rung.
 *
 * The result is a shallow copy of `base` whose `objective` and `tierFloor`
 * come from the rung (so `tierFloor` becomes undefined for a rung that has
 * none). The per-call cost cap is cleared when the rung lifts it and kept
 * from `base` otherwise. `base` itself is not modified.
 *
 * @param {object} base - The session's base policy.
 * @param {object} rung - An ESCALATION_LADDER entry.
 * @returns {object} A new policy object.
 */
function applyRung(base, rung) {
  const { objective, tierFloor, liftCostCap } = rung;
  const maxCostPerCallUsd = liftCostCap ? void 0 : base.maxCostPerCallUsd;
  return { ...base, objective, tierFloor, maxCostPerCallUsd };
}
|
|
708
747
|
/** Numeric ordering of model tiers; a higher number is a stronger tier. */
var TIER_RANK = { cheap: 0, standard: 1, frontier: 2 };
/**
 * Tell whether `tier` ranks at or above `min`.
 *
 * @param {string} tier - Tier being tested.
 * @param {string} min - Minimum acceptable tier.
 * @returns {boolean} False when either name is not a known tier.
 */
function tierAtLeast(tier, min) {
  const have = TIER_RANK[tier];
  const need = TIER_RANK[min];
  return have >= need;
}
/**
 * Look up the numeric rank of a tier.
 *
 * @param {string} tier - Tier name.
 * @returns {number|undefined} The rank, or undefined for an unknown tier.
 */
function tierRank(tier) {
  const rank = TIER_RANK[tier];
  return rank;
}
|
|
712
754
|
/**
 * Blend a model's prompt and completion prices into one figure, weighting the
 * prompt price three times as heavily as the completion price.
 *
 * @param {{pricing: {promptUsdPerMTok: number, completionUsdPerMTok: number}}} m
 * @returns {number} Weighted average price in USD per million tokens.
 */
function blendedPrice(m) {
  const { promptUsdPerMTok, completionUsdPerMTok } = m.pricing;
  const weighted = promptUsdPerMTok * 3 + completionUsdPerMTok;
  return weighted / 4;
}
|
|
@@ -755,6 +797,7 @@ var TASK_SKILL = {
|
|
|
755
797
|
command: "speed",
|
|
756
798
|
review: "reasoning",
|
|
757
799
|
reason: "reasoning",
|
|
800
|
+
verify: "reasoning",
|
|
758
801
|
explain: "general",
|
|
759
802
|
summarize: "speed",
|
|
760
803
|
chat: "speed"
|
|
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
|
|
|
781
824
|
edit: 1.4,
|
|
782
825
|
review: 1.5,
|
|
783
826
|
reason: 1.5,
|
|
827
|
+
verify: 1.4,
|
|
784
828
|
plan: 1.2
|
|
785
829
|
};
|
|
786
830
|
var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
|
|
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
|
|
|
798
842
|
function candidatesFor(taskType, models, policy, est) {
|
|
799
843
|
const spec = TASK_SPECS[taskType];
|
|
800
844
|
const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
|
|
845
|
+
const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
|
|
801
846
|
return models.filter((m) => {
|
|
802
847
|
if (m.id === "openrouter/auto") return false;
|
|
803
|
-
const covers = tierAtLeast(m.tier,
|
|
848
|
+
const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
|
|
804
849
|
if (!covers) return false;
|
|
805
850
|
if (spec.needsTools && !m.capabilities.tools) return false;
|
|
806
851
|
if (policy.maxCostPerCallUsd != null && est) {
|
|
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
|
|
|
846
891
|
const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
|
|
847
892
|
return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
|
|
848
893
|
}
|
|
894
|
+
/**
 * Route a task like `route`, but fall back to the strongest usable model
 * when normal routing finds no candidate.
 *
 * Fallback rules: "openrouter/auto" is never chosen; models without tool
 * support are excluded when the task needs tools; among the remaining models
 * a tool-capable one is preferred, ranked by taskStrength for this task type.
 *
 * @param {string} taskType - Key of TASK_SPECS.
 * @param {Array<object>} models - Available models.
 * @param {object} policy - Routing policy passed through to `route`.
 * @param {{promptTokens: number, completionTokens: number}} [est] - Token
 *   estimate used for the projected cost.
 * @returns {{model: object, reason: string, estCostUsd: number}|null} Null
 *   only when no model is usable at all.
 */
function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
  const routed = route(taskType, models, policy, est);
  if (routed) return routed;
  const spec = TASK_SPECS[taskType];
  const usable = models.filter((m) => {
    if (m.id === "openrouter/auto") return false;
    return !spec.needsTools || m.capabilities.tools;
  });
  if (!usable.length) return null;
  const strongestFirst = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
  const toolCapable = usable.filter((m) => m.capabilities.tools).sort(strongestFirst);
  // Copy before sorting so the filtered list is not reordered in place.
  const ranked = toolCapable.length ? toolCapable : [...usable].sort(strongestFirst);
  const best = ranked[0];
  return {
    model: best,
    reason: `best available for ${TASK_SKILL[taskType]} (fallback)`,
    estCostUsd: projectCost(best, est)
  };
}
|
|
849
907
|
|
|
850
908
|
// src/recommend/recommend.ts
|
|
851
909
|
var OBJECTIVES = [
|
|
@@ -1075,6 +1133,27 @@ function getDb() {
|
|
|
1075
1133
|
);
|
|
1076
1134
|
CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
|
|
1077
1135
|
|
|
1136
|
+
-- One row per verify-and-escalate attempt within a session. Powers the
|
|
1137
|
+
-- "optimal starting model per goal type" statistical learning.
|
|
1138
|
+
CREATE TABLE IF NOT EXISTS attempts (
|
|
1139
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1140
|
+
session_id TEXT NOT NULL,
|
|
1141
|
+
attempt_no INTEGER NOT NULL,
|
|
1142
|
+
goal_type TEXT NOT NULL,
|
|
1143
|
+
tier_floor TEXT,
|
|
1144
|
+
objective TEXT NOT NULL,
|
|
1145
|
+
prompt_tokens INTEGER NOT NULL,
|
|
1146
|
+
completion_tokens INTEGER NOT NULL,
|
|
1147
|
+
cost_usd REAL NOT NULL,
|
|
1148
|
+
criteria_total INTEGER NOT NULL,
|
|
1149
|
+
criteria_met INTEGER NOT NULL,
|
|
1150
|
+
passed INTEGER NOT NULL,
|
|
1151
|
+
duration_ms INTEGER NOT NULL,
|
|
1152
|
+
synced INTEGER NOT NULL DEFAULT 0
|
|
1153
|
+
);
|
|
1154
|
+
CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
|
|
1155
|
+
CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
|
|
1156
|
+
|
|
1078
1157
|
-- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
|
|
1079
1158
|
-- This is what syncs to the cloud by default (raw logs stay local).
|
|
1080
1159
|
CREATE TABLE IF NOT EXISTS insights (
|
|
@@ -1096,6 +1175,15 @@ function getDb() {
|
|
|
1096
1175
|
if (!cols.some((c2) => c2.name === "command")) {
|
|
1097
1176
|
db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
|
|
1098
1177
|
}
|
|
1178
|
+
const conn = db;
|
|
1179
|
+
const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
|
|
1180
|
+
const addSession = (name, decl) => {
|
|
1181
|
+
if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
|
|
1182
|
+
};
|
|
1183
|
+
addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
|
|
1184
|
+
addSession("start_tier", "TEXT");
|
|
1185
|
+
addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
|
|
1186
|
+
addSession("final_passed", "INTEGER");
|
|
1099
1187
|
return db;
|
|
1100
1188
|
}
|
|
1101
1189
|
function recordUsage(e) {
|
|
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
|
|
|
1188
1276
|
}
|
|
1189
1277
|
/**
 * Insert (or replace) the sessions row for a run that is starting.
 *
 * @param {object} s - Session fields: id, ts, date, goal, command, objective,
 *   plannedSteps, goalType and an optional startTier (stored as NULL when
 *   absent).
 */
function startSession(s) {
  const insert = getDb().prepare(
    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  const row = [
    s.id,
    s.ts,
    s.date,
    s.goal,
    s.command,
    s.objective,
    s.plannedSteps,
    s.goalType,
    s.startTier ?? null
  ];
  insert.run(...row);
}
|
|
1195
1283
|
function finishSession(id, u) {
|
|
1196
1284
|
getDb().prepare(
|
|
1197
1285
|
`UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
|
|
1198
|
-
prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
|
|
1286
|
+
prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
|
|
1199
1287
|
).run(
|
|
1200
1288
|
u.plannedSteps,
|
|
1201
1289
|
u.completedSteps,
|
|
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
|
|
|
1205
1293
|
u.completionTokens,
|
|
1206
1294
|
u.costUsd,
|
|
1207
1295
|
u.durationMs,
|
|
1296
|
+
u.attempts ?? 1,
|
|
1297
|
+
u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
|
|
1208
1298
|
id
|
|
1209
1299
|
);
|
|
1210
1300
|
}
|
|
1301
|
+
/**
 * Append one row to the `attempts` table for a verify-and-escalate attempt.
 *
 * `tier_floor` is a nullable column and the first escalation rung has no tier
 * floor, so an absent `tierFloor` is stored as NULL - the same normalisation
 * startSession applies to `start_tier`. Passing `undefined` straight to the
 * statement is avoided because SQLite drivers may refuse to bind it.
 *
 * @param {object} a - Attempt fields: sessionId, attemptNo, goalType,
 *   tierFloor (optional), objective, promptTokens, completionTokens, costUsd,
 *   criteriaTotal, criteriaMet, passed (stored as 1/0), durationMs.
 */
function recordAttempt(a) {
  getDb().prepare(
    `INSERT INTO attempts
     (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
      cost_usd, criteria_total, criteria_met, passed, duration_ms)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(
    a.sessionId,
    a.attemptNo,
    a.goalType,
    a.tierFloor ?? null,
    a.objective,
    a.promptTokens,
    a.completionTokens,
    a.costUsd,
    a.criteriaTotal,
    a.criteriaMet,
    a.passed ? 1 : 0,
    a.durationMs
  );
}
|
|
1322
|
+
/**
 * Aggregate verified sessions by goal type and starting tier.
 *
 * Only sessions with a recorded verdict (final_passed IS NOT NULL) are
 * counted; a NULL start_tier is reported as "cheap". Rows come back ordered
 * by goal type, then by ascending average total tokens.
 *
 * @returns {Array<{goalType: string, startTier: string, sessions: number,
 *   passRate: number, avgTotalTokens: number, avgAttempts: number}>}
 */
function goalTierStats() {
  const query = getDb().prepare(
    `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
     COUNT(*) AS sessions,
     AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
     AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
     AVG(attempts) AS avgAttempts
     FROM sessions
     WHERE final_passed IS NOT NULL
     GROUP BY goal_type, startTier
     ORDER BY goal_type, avgTotalTokens ASC`
  );
  const stats = [];
  for (const row of query.all()) {
    // Coerce driver values to plain JS types; missing averages become 0.
    stats.push({
      goalType: String(row.goalType),
      startTier: String(row.startTier),
      sessions: Number(row.sessions),
      passRate: Number(row.passRate ?? 0),
      avgTotalTokens: Number(row.avgTotalTokens ?? 0),
      avgAttempts: Number(row.avgAttempts ?? 0)
    });
  }
  return stats;
}
|
|
1343
|
+
/**
 * Pick the learned starting tier for a goal type.
 *
 * Considers only (goal type, start tier) groups with at least `minSessions`
 * verified sessions and a pass rate of at least 0.6, and returns the start
 * tier of the group with the lowest average total tokens.
 *
 * @param {string} goalType - Goal type to look up.
 * @param {number} [minSessions=3] - Minimum sessions required per group.
 * @returns {string|null} The start tier, or null when no group qualifies.
 */
function optimalStartTier(goalType, minSessions = 3) {
  let best = null;
  for (const s of goalTierStats()) {
    if (s.goalType !== goalType) continue;
    if (!(s.sessions >= minSessions)) continue;
    if (!(s.passRate >= 0.6)) continue;
    // Strict "<" keeps the earliest group on ties, like a stable ascending sort.
    if (best === null || s.avgTotalTokens < best.avgTotalTokens) best = s;
  }
  return best === null ? null : best.startTier;
}
|
|
1211
1350
|
/**
 * Store the user's rating on a session row.
 *
 * @param {string} sessionId - Id of the session to update.
 * @param {number} score - Rating to write into `user_score`.
 */
function setUserScore(sessionId, score) {
  const update = getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`);
  update.run(score, sessionId);
}
|
|
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
|
|
|
1651
1790
|
}
|
|
1652
1791
|
out.push("");
|
|
1653
1792
|
}
|
|
1793
|
+
const tierStats = goalTierStats();
|
|
1794
|
+
if (tierStats.length) {
|
|
1795
|
+
out.push(c.bold("Optimal starting model per goal type") + c.dim(" (pass rate vs total tokens to reach the goal)"));
|
|
1796
|
+
out.push(
|
|
1797
|
+
table(
|
|
1798
|
+
["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
|
|
1799
|
+
tierStats.map((s) => [
|
|
1800
|
+
s.goalType,
|
|
1801
|
+
tierColor(s.startTier),
|
|
1802
|
+
String(s.sessions),
|
|
1803
|
+
`${Math.round(s.passRate * 100)}%`,
|
|
1804
|
+
tokens(Math.round(s.avgTotalTokens)),
|
|
1805
|
+
s.avgAttempts.toFixed(1)
|
|
1806
|
+
])
|
|
1807
|
+
)
|
|
1808
|
+
);
|
|
1809
|
+
const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
|
|
1810
|
+
const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
|
|
1811
|
+
if (learned.length) {
|
|
1812
|
+
out.push(
|
|
1813
|
+
c.green(
|
|
1814
|
+
"\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
|
|
1815
|
+
)
|
|
1816
|
+
);
|
|
1817
|
+
} else {
|
|
1818
|
+
out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
|
|
1819
|
+
}
|
|
1820
|
+
out.push("");
|
|
1821
|
+
}
|
|
1654
1822
|
if (byCommand.length) {
|
|
1655
1823
|
out.push(c.bold("Usage by command"));
|
|
1656
1824
|
out.push(
|
|
@@ -2086,6 +2254,28 @@ var TOOL_SCHEMAS = [
|
|
|
2086
2254
|
}
|
|
2087
2255
|
}
|
|
2088
2256
|
];
|
|
2257
|
+
/** Names of every tool declared in TOOL_SCHEMAS. */
var KNOWN_TOOLS = new Set();
for (const schema of TOOL_SCHEMAS) KNOWN_TOOLS.add(schema.function.name);
/**
 * Subset of TOOL_SCHEMAS offered to the verify stage.
 * NOTE(review): this includes run_command, so "read-only" presumably relies on
 * the tool context (allowWrite / allowCommands) to restrict what a command may
 * do - confirm against executeTool.
 */
var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter((schema) => {
  const toolName = schema.function.name;
  return toolName === "read_file" || toolName === "list_dir" || toolName === "run_command";
});
|
|
2261
|
+
/**
 * Recover a tool call that a model wrote as JSON text instead of emitting a
 * native tool call.
 *
 * The tool name is read from `name`, `tool` or `function.name`, and the
 * arguments from `arguments`, `parameters` or `function.arguments`
 * (defaulting to an empty object). Only names present in KNOWN_TOOLS are
 * accepted.
 *
 * @param {string} content - Raw assistant text.
 * @returns {{id: string, type: "function", function: {name: string,
 *   arguments: string}}|null} A tool-call object whose arguments are a JSON
 *   string, or null when the text does not contain a recognisable call.
 */
function parseTextToolCall(content) {
  if (!content) return null;
  const raw = extractJson(content);
  if (!raw) return null;
  try {
    const payload = JSON.parse(raw);
    const name = payload?.name ?? payload?.tool ?? payload?.function?.name;
    const recognised = typeof name === "string" && KNOWN_TOOLS.has(name);
    if (!recognised) return null;
    const args = payload.arguments ?? payload.parameters ?? payload.function?.arguments ?? {};
    // Arguments already given as a string are passed through untouched.
    const serialised = typeof args === "string" ? args : JSON.stringify(args);
    return {
      id: `textcall_${name}`,
      type: "function",
      function: { name, arguments: serialised }
    };
  } catch {
    return null;
  }
}
|
|
2089
2279
|
var MAX_OUTPUT = 8e3;
|
|
2090
2280
|
function clip(s) {
|
|
2091
2281
|
return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
|
|
@@ -2167,31 +2357,124 @@ ${stderr}`)) };
|
|
|
2167
2357
|
}
|
|
2168
2358
|
}
|
|
2169
2359
|
|
|
2360
|
+
// src/agent/verify.ts
|
|
2361
|
+
var VERIFY_MAX_ITERS = 8;
|
|
2362
|
+
var VERIFY_SYSTEM = `You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.
|
|
2363
|
+
Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.
|
|
2364
|
+
When done, reply with ONLY this JSON (no prose, no code fence):
|
|
2365
|
+
{"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}`;
|
|
2366
|
+
/**
 * Run the verify stage: ask a model to judge each acceptance criterion and
 * return its verdict.
 *
 * The model is offered READONLY_TOOL_SCHEMAS when it supports tools. Tool
 * calls are executed with writes disabled (`allowWrite: false`); whether
 * commands may run follows `deps.allowCommands`. The loop ends as soon as a
 * reply parses as a verdict, or after VERIFY_MAX_ITERS model calls, in which
 * case the all-unmet fallback verdict is returned.
 *
 * @param {string} goal - The user's goal text.
 * @param {string[]} criteria - Acceptance criteria to judge.
 * @param {object} deps - Provides `client`, `model`, `cwd`, `allowCommands`.
 * @param {object} [ev] - Optional callbacks: onUsage(result),
 *   onToolCall(name, args), onToolResult(name, result).
 * @returns {Promise<object>} The parsed verdict, or fallbackVerdict(criteria).
 */
async function verifyGoal(goal, criteria, deps, ev = {}) {
  const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
  const useTools = deps.model.capabilities.tools;
  const numbered = criteria.map((c2, i) => `${i + 1}. ${c2}`).join("\n");
  const messages = [
    { role: "system", content: VERIFY_SYSTEM },
    {
      role: "user",
      content: `Goal: ${goal}

Acceptance criteria:
${numbered}

Inspect the workspace, then return the verdict JSON.`
    }
  ];
  for (let iter = 0; iter < VERIFY_MAX_ITERS; iter++) {
    const request = {
      model: deps.model.id,
      messages,
      tools: useTools ? READONLY_TOOL_SCHEMAS : void 0,
      temperature: 0,
      maxTokens: 1500
    };
    const gen = deps.client.stream(request, deps.model.pricing);
    // Drain the stream; only the generator's final return value is used.
    let chunk = await gen.next();
    while (!chunk.done) chunk = await gen.next();
    const result = chunk.value;
    ev.onUsage?.(result);
    const native = result.toolCalls.length > 0;
    let calls = result.toolCalls;
    if (!native) {
      // Some models write the tool call as JSON text instead of a native call.
      const textCall = useTools ? parseTextToolCall(result.content) : null;
      calls = textCall ? [textCall] : [];
    }
    const parsed = parseVerdict(result.content, criteria);
    if (parsed) return parsed;
    if (!calls.length) {
      // Neither a verdict nor a tool call: nudge the model and try again.
      messages.push({ role: "assistant", content: result.content });
      messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
      continue;
    }
    if (native) messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
    for (const tc of calls) {
      const { name, arguments: args } = tc.function;
      ev.onToolCall?.(name, args);
      const outcome = executeTool(name, args, toolCtx);
      ev.onToolResult?.(name, outcome.result);
      if (native) {
        messages.push({ role: "tool", tool_call_id: tc.id, name, content: outcome.result });
      } else {
        // Text-parsed calls have no tool_call_id, so replay them as a plain exchange.
        messages.push({ role: "assistant", content: result.content });
        messages.push({ role: "user", content: `Tool ${name} returned:
${outcome.result}
Continue, then return the verdict JSON.` });
      }
    }
  }
  return fallbackVerdict(criteria);
}
|
|
2419
|
+
/**
 * Parse the verifier's reply into a verdict object.
 *
 * Expects JSON of the form {"results":[{criterion, met, reason}], "feedback"}.
 * `met` counts as true only for boolean true or the string "true"
 * (case-insensitive). When the reply judges fewer criteria than were
 * requested, the unjudged ones are appended as unmet, so a partial reply can
 * never yield `allMet: true`.
 *
 * @param {string} text - Raw assistant text.
 * @param {string[]} [criteria] - The criteria the verifier was asked to judge.
 *   When omitted, only the entries present in the reply are considered.
 * @returns {{total: number, metCount: number, allMet: boolean,
 *   results: Array<{criterion: string, met: boolean, reason: string}>,
 *   unmet: Array<object>, feedback: string}|null} Null when the text holds no
 *   usable verdict.
 */
function parseVerdict(text, criteria) {
  const json = extractJson(text);
  if (!json) return null;
  try {
    const obj = JSON.parse(json);
    if (!Array.isArray(obj.results)) return null;
    const results = obj.results.map((r) => ({
      criterion: String(r.criterion ?? ""),
      met: r.met === true || String(r.met).toLowerCase() === "true",
      reason: String(r.reason ?? "").slice(0, 300)
    }));
    if (!results.length) return null;
    const expected = Array.isArray(criteria) ? criteria.map(String) : [];
    const shortfall = expected.length - results.length;
    if (shortfall > 0) {
      // Prefer criteria whose text the verifier did not echo back; cap the
      // padding at the shortfall so paraphrased criteria are not double-counted.
      const norm = (s) => s.trim().toLowerCase();
      const judged = new Set(results.map((r) => norm(r.criterion)));
      const missing = expected.filter((c2) => !judged.has(norm(c2)));
      for (const criterion of missing.slice(0, shortfall)) {
        results.push({ criterion, met: false, reason: "not evaluated by verifier" });
      }
    }
    const unmet = results.filter((r) => !r.met);
    return {
      total: results.length,
      metCount: results.length - unmet.length,
      allMet: unmet.length === 0,
      results,
      unmet,
      // Fall back to the unmet reasons when the verifier gave no feedback.
      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
    };
  } catch {
    // Malformed JSON or malformed entries: treat as "no verdict".
    return null;
  }
}
|
|
2444
|
+
/**
 * Build the verdict used when the verifier never produced a parseable one:
 * every criterion is marked unmet.
 *
 * @param {string[]} criteria - The acceptance criteria.
 * @returns {object} A verdict with `allMet: false`; `results` and `unmet`
 *   refer to the same array.
 */
function fallbackVerdict(criteria) {
  const results = [];
  for (const criterion of criteria) {
    results.push({ criterion, met: false, reason: "verifier produced no verdict" });
  }
  return {
    total: results.length,
    metCount: 0,
    allMet: false,
    results,
    unmet: results,
    feedback: "Verification inconclusive; re-attempt with a stronger model."
  };
}
|
|
2448
|
+
|
|
2170
2449
|
// src/agent/loop.ts
|
|
2171
|
-
|
|
2450
|
+
/**
 * Format a date as YYYY-MM-DD using the local time zone.
 *
 * @param {Date} [d] - Date to format; defaults to the current time.
 * @returns {string} The local calendar date, month and day zero-padded.
 */
function localDate2(d = /* @__PURE__ */ new Date()) {
  const twoDigits = (n) => String(n).padStart(2, "0");
  // getMonth() is zero-based, hence the +1.
  return [d.getFullYear(), twoDigits(d.getMonth() + 1), twoDigits(d.getDate())].join("-");
}
|
|
2172
2456
|
async function runAgent(goal, deps, emit) {
|
|
2173
|
-
const { client: client2, models,
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
let totalCompletionTokens = 0;
|
|
2178
|
-
let calls = 0;
|
|
2457
|
+
const { client: client2, models, cwd } = deps;
|
|
2458
|
+
const verifyOn = deps.verify ?? true;
|
|
2459
|
+
const maxAttempts = deps.maxAttempts ?? 3;
|
|
2460
|
+
const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
|
|
2179
2461
|
const sessionStart = Date.now();
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2462
|
+
const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
|
|
2463
|
+
const logUsage = (r, taskType) => {
|
|
2464
|
+
const entry = logCompletion(r, taskType, deps.sessionId);
|
|
2465
|
+
emit({ type: "usage", entry });
|
|
2466
|
+
acc.cost += entry.costUsd;
|
|
2467
|
+
acc.tokens += entry.totalTokens;
|
|
2468
|
+
acc.prompt += entry.promptTokens;
|
|
2469
|
+
acc.completion += entry.completionTokens;
|
|
2470
|
+
acc.calls++;
|
|
2471
|
+
return entry;
|
|
2472
|
+
};
|
|
2473
|
+
const planRoute = route("plan", models, deps.policy);
|
|
2183
2474
|
let plan;
|
|
2184
2475
|
if (planRoute) {
|
|
2185
2476
|
try {
|
|
2186
|
-
plan = await planRequest(goal, client2, planRoute.model, (
|
|
2187
|
-
const entry = logCompletion(result, "plan", sessionId);
|
|
2188
|
-
emit({ type: "usage", entry });
|
|
2189
|
-
totalCostUsd += entry.costUsd;
|
|
2190
|
-
totalTokens += entry.totalTokens;
|
|
2191
|
-
totalPromptTokens += entry.promptTokens;
|
|
2192
|
-
totalCompletionTokens += entry.completionTokens;
|
|
2193
|
-
calls++;
|
|
2194
|
-
});
|
|
2477
|
+
plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
|
|
2195
2478
|
} catch {
|
|
2196
2479
|
plan = heuristicPlan(goal);
|
|
2197
2480
|
}
|
|
@@ -2199,144 +2482,276 @@ async function runAgent(goal, deps, emit) {
|
|
|
2199
2482
|
plan = heuristicPlan(goal);
|
|
2200
2483
|
}
|
|
2201
2484
|
emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
|
|
2485
|
+
let startRung = 0;
|
|
2486
|
+
let learned = false;
|
|
2487
|
+
if (verifyOn) {
|
|
2488
|
+
const tier = optimalStartTier(plan.goalType);
|
|
2489
|
+
if (tier) {
|
|
2490
|
+
const r = rungForTier(tier);
|
|
2491
|
+
if (r > 0) {
|
|
2492
|
+
startRung = r;
|
|
2493
|
+
learned = true;
|
|
2494
|
+
}
|
|
2495
|
+
}
|
|
2496
|
+
}
|
|
2497
|
+
const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
|
|
2498
|
+
emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
|
|
2202
2499
|
startSession({
|
|
2203
|
-
id: sessionId,
|
|
2500
|
+
id: deps.sessionId,
|
|
2204
2501
|
ts: sessionStart,
|
|
2205
2502
|
date: localDate2(),
|
|
2206
2503
|
goal,
|
|
2207
2504
|
command: "run",
|
|
2208
|
-
objective: policy.objective,
|
|
2209
|
-
plannedSteps: plan.steps.length
|
|
2505
|
+
objective: deps.policy.objective,
|
|
2506
|
+
plannedSteps: plan.steps.length,
|
|
2507
|
+
goalType: plan.goalType,
|
|
2508
|
+
startTier
|
|
2210
2509
|
});
|
|
2211
|
-
|
|
2212
|
-
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2510
|
+
let rung = startRung;
|
|
2511
|
+
let attemptNo = 0;
|
|
2512
|
+
let verdict = null;
|
|
2513
|
+
let completedSteps = 0;
|
|
2514
|
+
let failedSteps = 0;
|
|
2216
2515
|
const priorSummaries = [];
|
|
2217
|
-
|
|
2218
|
-
const
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
}
|
|
2222
|
-
if (
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
const model = r.model;
|
|
2228
|
-
emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
|
|
2229
|
-
const useTools = model.capabilities.tools;
|
|
2230
|
-
const messages = [
|
|
2231
|
-
{ role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
|
|
2232
|
-
{ role: "user", content: step.description }
|
|
2233
|
-
];
|
|
2234
|
-
const stepStart = Date.now();
|
|
2235
|
-
let stepPrompt = 0;
|
|
2236
|
-
let stepCompletion = 0;
|
|
2237
|
-
let stepCost = 0;
|
|
2238
|
-
let stepToolCalls = 0;
|
|
2239
|
-
let iterations = 0;
|
|
2240
|
-
let finishedBy = "max-iters";
|
|
2241
|
-
let summary = "";
|
|
2242
|
-
try {
|
|
2243
|
-
for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
|
|
2244
|
-
iterations = iter + 1;
|
|
2245
|
-
const gen = client2.stream(
|
|
2246
|
-
{
|
|
2247
|
-
model: model.id,
|
|
2248
|
-
messages,
|
|
2249
|
-
tools: useTools ? TOOL_SCHEMAS : void 0,
|
|
2250
|
-
temperature: 0.2,
|
|
2251
|
-
maxTokens: 2e3
|
|
2252
|
-
},
|
|
2253
|
-
model.pricing
|
|
2254
|
-
);
|
|
2255
|
-
let next = await gen.next();
|
|
2256
|
-
while (!next.done) {
|
|
2257
|
-
emit({ type: "text", delta: next.value });
|
|
2258
|
-
next = await gen.next();
|
|
2259
|
-
}
|
|
2260
|
-
const result = next.value;
|
|
2261
|
-
const entry = logCompletion(result, step.type, sessionId);
|
|
2262
|
-
emit({ type: "usage", entry });
|
|
2263
|
-
totalCostUsd += entry.costUsd;
|
|
2264
|
-
totalTokens += entry.totalTokens;
|
|
2265
|
-
totalPromptTokens += entry.promptTokens;
|
|
2266
|
-
totalCompletionTokens += entry.completionTokens;
|
|
2267
|
-
stepPrompt += entry.promptTokens;
|
|
2268
|
-
stepCompletion += entry.completionTokens;
|
|
2269
|
-
stepCost += entry.costUsd;
|
|
2270
|
-
calls++;
|
|
2271
|
-
if (result.toolCalls.length && useTools) {
|
|
2272
|
-
messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
|
|
2273
|
-
let finished = false;
|
|
2274
|
-
for (const tc of result.toolCalls) {
|
|
2275
|
-
stepToolCalls++;
|
|
2276
|
-
emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
|
|
2277
|
-
const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
|
|
2278
|
-
emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
|
|
2279
|
-
messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
|
|
2280
|
-
if (outcome.finishSummary != null) {
|
|
2281
|
-
summary = outcome.finishSummary;
|
|
2282
|
-
finished = true;
|
|
2283
|
-
}
|
|
2284
|
-
}
|
|
2285
|
-
if (finished) {
|
|
2286
|
-
finishedBy = "finish-tool";
|
|
2287
|
-
break;
|
|
2288
|
-
}
|
|
2289
|
-
continue;
|
|
2290
|
-
}
|
|
2291
|
-
summary = result.content || summary;
|
|
2292
|
-
if (summary) finishedBy = "text";
|
|
2293
|
-
break;
|
|
2516
|
+
while (attemptNo < maxAttempts) {
|
|
2517
|
+
const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
|
|
2518
|
+
const rungPolicy = applyRung(deps.policy, rungDef);
|
|
2519
|
+
const attemptStart = Date.now();
|
|
2520
|
+
const before = { ...acc };
|
|
2521
|
+
if (attemptNo === 0) {
|
|
2522
|
+
for (const step of plan.steps) {
|
|
2523
|
+
const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
|
|
2524
|
+
if (res.success) completedSteps++;
|
|
2525
|
+
else failedSteps++;
|
|
2294
2526
|
}
|
|
2295
|
-
}
|
|
2296
|
-
|
|
2297
|
-
emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
|
|
2527
|
+
} else {
|
|
2528
|
+
await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
|
|
2298
2529
|
}
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2530
|
+
if (!verifyOn) {
|
|
2531
|
+
attemptNo++;
|
|
2532
|
+
break;
|
|
2533
|
+
}
|
|
2534
|
+
const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
|
|
2535
|
+
const verifier = routeOrBest("verify", models, verifyPolicy);
|
|
2536
|
+
if (!verifier) {
|
|
2537
|
+
emit({ type: "error", message: "No model available to verify." });
|
|
2538
|
+
attemptNo++;
|
|
2539
|
+
break;
|
|
2540
|
+
}
|
|
2541
|
+
emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
|
|
2542
|
+
verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
|
|
2543
|
+
onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
|
|
2544
|
+
onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
|
|
2545
|
+
onUsage: (r) => logUsage(r, "review")
|
|
2546
|
+
});
|
|
2547
|
+
emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
|
|
2548
|
+
recordAttempt({
|
|
2549
|
+
sessionId: deps.sessionId,
|
|
2550
|
+
attemptNo: attemptNo + 1,
|
|
2551
|
+
goalType: plan.goalType,
|
|
2552
|
+
tierFloor: rungDef.tierFloor ?? null,
|
|
2553
|
+
objective: rungDef.objective,
|
|
2554
|
+
promptTokens: acc.prompt - before.prompt,
|
|
2555
|
+
completionTokens: acc.completion - before.completion,
|
|
2556
|
+
costUsd: acc.cost - before.cost,
|
|
2557
|
+
criteriaTotal: verdict.total,
|
|
2558
|
+
criteriaMet: verdict.metCount,
|
|
2559
|
+
passed: verdict.allMet,
|
|
2560
|
+
durationMs: Date.now() - attemptStart
|
|
2317
2561
|
});
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2562
|
+
attemptNo++;
|
|
2563
|
+
if (verdict.allMet) break;
|
|
2564
|
+
if (attemptNo < maxAttempts) {
|
|
2565
|
+
const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
|
|
2566
|
+
rung = next;
|
|
2567
|
+
emit({
|
|
2568
|
+
type: "escalate",
|
|
2569
|
+
toRung: ESCALATION_LADDER[next].label,
|
|
2570
|
+
reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
|
|
2571
|
+
});
|
|
2572
|
+
}
|
|
2321
2573
|
}
|
|
2322
|
-
|
|
2574
|
+
const passed = verifyOn ? verdict ? verdict.allMet : false : null;
|
|
2575
|
+
finishSession(deps.sessionId, {
|
|
2323
2576
|
plannedSteps: plan.steps.length,
|
|
2324
2577
|
completedSteps,
|
|
2325
2578
|
failedSteps,
|
|
2326
|
-
autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
|
|
2327
|
-
promptTokens:
|
|
2328
|
-
completionTokens:
|
|
2329
|
-
costUsd:
|
|
2330
|
-
durationMs: Date.now() - sessionStart
|
|
2579
|
+
autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
|
|
2580
|
+
promptTokens: acc.prompt,
|
|
2581
|
+
completionTokens: acc.completion,
|
|
2582
|
+
costUsd: acc.cost,
|
|
2583
|
+
durationMs: Date.now() - sessionStart,
|
|
2584
|
+
attempts: attemptNo,
|
|
2585
|
+
finalPassed: passed
|
|
2331
2586
|
});
|
|
2332
|
-
emit({ type: "done", totalCostUsd, totalTokens, calls });
|
|
2333
|
-
return { totalCostUsd, totalTokens, calls };
|
|
2587
|
+
emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
|
|
2588
|
+
return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
|
|
2334
2589
|
}
|
|
2335
|
-
function
|
|
2336
|
-
const
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2590
|
+
/**
 * Executes a single planned step.
 *
 * Picks a model with `routeOrBest`, builds the system/user conversation,
 * hands it to `runToolLoop`, persists the run via `recordStepRun`, and
 * appends a one-line summary to `priorSummaries` (mutated in place).
 *
 * @param {object} step - Plan step; `id`, `type`, `description`,
 *   `estPromptTokens` and `estCompletionTokens` are read here.
 * @param {object} policy - Routing policy forwarded to `routeOrBest`.
 * @param {object} rungDef - Escalation rung forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; `models` and `sessionId` are read here.
 * @param {object} toolCtx - Tool execution context forwarded to `runToolLoop`.
 * @param {string[]} priorSummaries - Summaries of earlier steps; receives this step's summary.
 * @param {Function} emit - Event sink (`error`, `step-start`, `step-end`).
 * @param {Function} logUsage - Usage logger forwarded to `runToolLoop`.
 * @param {string} goal - Overall goal, embedded in the system prompt.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
  const routed = routeOrBest(step.type, deps.models, policy, {
    promptTokens: step.estPromptTokens,
    completionTokens: step.estCompletionTokens
  });

  // Nothing can run this step: report it and let the caller count a failure.
  if (!routed) {
    emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
    return { summary: "(no model)", success: false };
  }

  const { model } = routed;
  emit({ type: "step-start", step, model, estCostUsd: routed.estCostUsd });

  const systemPrompt = stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools);
  const conversation = [
    { role: "system", content: systemPrompt },
    { role: "user", content: step.description }
  ];

  const outcome = await runToolLoop(model, conversation, step.type, rungDef, deps, toolCtx, emit, logUsage);

  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: step.id,
    taskType: step.type,
    skill: TASK_SKILL[step.type],
    model: model.id,
    provider: model.provider,
    iterations: outcome.iterations,
    toolCalls: outcome.toolCalls,
    promptTokens: outcome.prompt,
    completionTokens: outcome.completion,
    costUsd: outcome.cost,
    finishedBy: outcome.finishedBy,
    success: outcome.success,
    durationMs: outcome.durationMs
  });

  // An empty summary is replaced by a placeholder so later prompts stay readable.
  const summary = outcome.summary ? outcome.summary : "(no summary)";
  priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
  emit({ type: "step-end", step, summary });

  return { summary, success: outcome.success };
}
|
|
2627
|
+
/**
 * Runs one "fix" pass after a failed verification.
 *
 * Routes an `edit` task through `routeOrBest`, feeds the unmet criteria and
 * the verifier feedback to the model via `runToolLoop`, and records the run
 * with `recordStepRun` under the synthetic step id 100.
 *
 * @param {string} goal - Overall goal, embedded in the system prompt.
 * @param {object} plan - Current plan (not read by this function).
 * @param {object} verdict - Last verification result; `unmet` (items with
 *   `criterion` and `reason`) and `feedback` are read here.
 * @param {object} policy - Routing policy forwarded to `routeOrBest`.
 * @param {object} rungDef - Escalation rung forwarded to `runToolLoop`.
 * @param {object} deps - Shared dependencies; `models` and `sessionId` are read here.
 * @param {object} toolCtx - Tool execution context forwarded to `runToolLoop`.
 * @param {Function} emit - Event sink (`error`, `step-start`, `step-end`).
 * @param {Function} logUsage - Usage logger forwarded to `runToolLoop`.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
  const r = routeOrBest("edit", deps.models, policy);
  if (!r) {
    // Report the skipped fix pass (as runStep does for a missing model) so the
    // following re-verification of unchanged code is explainable to the user.
    emit({ type: "error", message: "No capable model for the fix stage (edit)." });
    return { summary: "(no model)", success: false };
  }
  const model = r.model;
  // Synthetic step used only for events and run records; it is not part of plan.steps.
  const fixStep = {
    id: 100,
    type: "edit",
    description: "Fix the unmet acceptance criteria",
    estPromptTokens: 9e3,
    estCompletionTokens: 1500
  };
  emit({ type: "step-start", step: fixStep, model, estCostUsd: r.estCostUsd });
  // Numbered list of "<criterion> — <reason>" lines for the user message.
  const unmet = verdict.unmet.map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`).join("\n");
  // NOTE(review): verdict.feedback is interpolated as-is; confirm verifyGoal always sets it.
  const messages = [
    {
      role: "system",
      content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
Overall goal: ${goal}
You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
    },
    { role: "user", content: `Unmet criteria:
${unmet}

Verifier feedback: ${verdict.feedback}` }
  ];
  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: fixStep.id,
    taskType: "edit",
    skill: TASK_SKILL.edit,
    model: model.id,
    provider: model.provider,
    iterations: loop.iterations,
    toolCalls: loop.toolCalls,
    promptTokens: loop.prompt,
    completionTokens: loop.completion,
    costUsd: loop.cost,
    finishedBy: loop.finishedBy,
    success: loop.success,
    durationMs: loop.durationMs
  });
  emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" });
  return { summary: loop.summary, success: loop.success };
}
|
|
2673
|
+
/**
 * Drives the model/tool conversation for one step.
 *
 * Each turn streams a completion from `deps.client.stream`, forwards text
 * deltas through `emit`, logs usage, and then either:
 *  - executes the native tool calls the model requested,
 *  - executes a tool call parsed from the reply text (`parseTextToolCall`),
 *  - or treats the reply as the final plain-text answer.
 * The loop ends when a tool outcome carries `finishSummary`, on a plain-text
 * answer, when an exception is thrown, or after `rungDef.maxIters` turns.
 * `messages` is mutated in place with the assistant/tool turns.
 *
 * @param {object} model - Routed model; `id`, `pricing` and `capabilities.tools` are read.
 * @param {object[]} messages - Conversation so far; extended in place.
 * @param {string} taskTypeForLog - Task type passed to `logUsage` and used in error messages.
 * @param {object} rungDef - Escalation rung; `maxIters` and `maxTokens` are read.
 * @param {object} deps - Shared dependencies; `client` is read here.
 * @param {object} toolCtx - Context forwarded to `executeTool`.
 * @param {Function} emit - Event sink (`text`, `tool-call`, `tool-result`, `error`).
 * @param {Function} logUsage - Called once per completion; its return value must expose
 *   `promptTokens`, `completionTokens` and `costUsd`.
 * @returns {Promise<{summary: string, success: boolean, finishedBy: string, iterations: number,
 *   toolCalls: number, prompt: number, completion: number, cost: number, durationMs: number}>}
 *   `success` is true only when the loop ended by the finish tool or a plain-text answer.
 */
async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
  const useTools = model.capabilities.tools;
  const start = Date.now();
  let prompt = 0;
  let completion = 0;
  let cost = 0;
  let toolCalls = 0;
  let iterations = 0;
  let summary = "";
  // Stays "max-iters" unless one of the exits below overrides it.
  let finishedBy = "max-iters";
  try {
    for (let iter = 0; iter < rungDef.maxIters; iter++) {
      iterations = iter + 1;
      const gen = deps.client.stream(
        { model: model.id, messages, tools: useTools ? TOOL_SCHEMAS : void 0, temperature: 0.2, maxTokens: rungDef.maxTokens },
        model.pricing
      );
      // Manual iteration (instead of for-await) because the generator's
      // return value carries the final completion result.
      let next = await gen.next();
      while (!next.done) {
        emit({ type: "text", delta: next.value });
        next = await gen.next();
      }
      const result = next.value;
      const entry = logUsage(result, taskTypeForLog);
      prompt += entry.promptTokens;
      completion += entry.completionTokens;
      cost += entry.costUsd;

      // Only look at native tool calls when tools were offered, and tolerate a
      // result that carries no `toolCalls` array: a plain text answer must not
      // be turned into an error by a missing field.
      const nativeCalls = useTools ? result.toolCalls ?? [] : [];
      if (nativeCalls.length) {
        messages.push({ role: "assistant", content: result.content, tool_calls: nativeCalls });
        let finished = false;
        // Every requested call is executed and answered, even after `finish`,
        // so each tool_call id gets a matching tool message.
        for (const tc of nativeCalls) {
          toolCalls++;
          emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
          const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
          emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
          if (outcome.finishSummary != null) {
            summary = outcome.finishSummary;
            finished = true;
          }
        }
        if (finished) {
          finishedBy = "finish-tool";
          break;
        }
        continue;
      }

      // Fallback: the model wrote the tool call as JSON text instead of
      // using the native tool-call channel.
      const textCall = useTools ? parseTextToolCall(result.content) : null;
      if (textCall) {
        toolCalls++;
        emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
        const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
        emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
        if (outcome.finishSummary != null) {
          summary = outcome.finishSummary;
          finishedBy = "finish-tool";
          break;
        }
        // No native tool role here: feed the result back as a user turn.
        messages.push({ role: "assistant", content: result.content });
        messages.push({
          role: "user",
          content: `Tool ${textCall.function.name} returned:
${outcome.result}
Continue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
        });
        continue;
      }

      // Plain-text reply: it becomes the summary and ends the loop.
      summary = result.content || summary;
      if (summary) finishedBy = "text";
      break;
    }
  } catch (err) {
    // Best effort: report the failure and still return the usage gathered so far.
    finishedBy = "error";
    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
  }
  return {
    summary,
    success: finishedBy === "finish-tool" || finishedBy === "text",
    finishedBy,
    iterations,
    toolCalls,
    prompt,
    completion,
    cost,
    durationMs: Date.now() - start
  };
}
|
|
2341
2756
|
function stepSystemPrompt(goal, step, priorSummaries, useTools) {
|
|
2342
2757
|
const context = priorSummaries.length ? `
|
|
@@ -2344,12 +2759,13 @@ function stepSystemPrompt(goal, step, priorSummaries, useTools) {
|
|
|
2344
2759
|
What previous steps accomplished:
|
|
2345
2760
|
${priorSummaries.join("\n")}` : "";
|
|
2346
2761
|
const toolNote = useTools ? `
|
|
2347
|
-
You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met
|
|
2762
|
+
You may use the provided tools (read_file, write_file, list_dir, run_command). Call the \`finish\` tool with a one-line summary when this step's objective is met.
|
|
2763
|
+
If you cannot call tools natively, reply with ONLY one JSON object per turn, no prose: {"name":"<tool>","arguments":{...}}` : `
|
|
2348
2764
|
Return a concise result for this step. Do not ask the user questions.`;
|
|
2349
2765
|
return `You are the "${step.type}" stage of an autonomous coding agent.
|
|
2350
2766
|
Overall goal: ${goal}
|
|
2351
2767
|
Your current step: ${step.description}${context}${toolNote}
|
|
2352
|
-
Be efficient \u2014 you were selected as the
|
|
2768
|
+
Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
|
|
2353
2769
|
}
|
|
2354
2770
|
|
|
2355
2771
|
// src/tui/App.tsx
|
|
@@ -2365,6 +2781,8 @@ function App(props) {
|
|
|
2365
2781
|
const [tok, setTok] = useState(0);
|
|
2366
2782
|
const [calls, setCalls] = useState(0);
|
|
2367
2783
|
const [rated, setRated] = useState(null);
|
|
2784
|
+
const [passed, setPassed] = useState(null);
|
|
2785
|
+
const [attempts, setAttempts] = useState(0);
|
|
2368
2786
|
const push = useCallback((text, color) => {
|
|
2369
2787
|
setLog((l) => [...l, { key: l.length, text, color }]);
|
|
2370
2788
|
}, []);
|
|
@@ -2382,7 +2800,9 @@ function App(props) {
|
|
|
2382
2800
|
sessionId: props.sessionId,
|
|
2383
2801
|
cwd: props.cwd,
|
|
2384
2802
|
allowWrite: props.allowWrite,
|
|
2385
|
-
allowCommands: props.allowCommands
|
|
2803
|
+
allowCommands: props.allowCommands,
|
|
2804
|
+
verify: props.verify,
|
|
2805
|
+
maxAttempts: props.maxAttempts
|
|
2386
2806
|
};
|
|
2387
2807
|
let textBuf = "";
|
|
2388
2808
|
const flush = () => {
|
|
@@ -2394,6 +2814,24 @@ function App(props) {
|
|
|
2394
2814
|
case "plan":
|
|
2395
2815
|
push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
|
|
2396
2816
|
break;
|
|
2817
|
+
case "criteria":
|
|
2818
|
+
push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
|
|
2819
|
+
e.criteria.forEach((cr, i) => push(` ${i + 1}. ${cr}`, "gray"));
|
|
2820
|
+
break;
|
|
2821
|
+
case "verify-start":
|
|
2822
|
+
flush();
|
|
2823
|
+
push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
|
|
2824
|
+
break;
|
|
2825
|
+
case "verdict":
|
|
2826
|
+
flush();
|
|
2827
|
+
push(
|
|
2828
|
+
`${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
|
|
2829
|
+
e.allMet ? "green" : "red"
|
|
2830
|
+
);
|
|
2831
|
+
break;
|
|
2832
|
+
case "escalate":
|
|
2833
|
+
push(`\u23EB Escalate \u2192 ${e.toRung} (${e.reason})`, "magenta");
|
|
2834
|
+
break;
|
|
2397
2835
|
case "step-start":
|
|
2398
2836
|
flush();
|
|
2399
2837
|
push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id} ~${usd(e.estCostUsd)}`, "yellow");
|
|
@@ -2423,6 +2861,8 @@ function App(props) {
|
|
|
2423
2861
|
break;
|
|
2424
2862
|
case "done":
|
|
2425
2863
|
flush();
|
|
2864
|
+
setPassed(e.passed);
|
|
2865
|
+
setAttempts(e.attempts);
|
|
2426
2866
|
break;
|
|
2427
2867
|
}
|
|
2428
2868
|
};
|
|
@@ -2482,10 +2922,14 @@ function App(props) {
|
|
|
2482
2922
|
" working\u2026"
|
|
2483
2923
|
] }),
|
|
2484
2924
|
phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
|
|
2485
|
-
/* @__PURE__ */ jsxs(Text, { color: "green", children: [
|
|
2486
|
-
"\
|
|
2925
|
+
/* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
|
|
2926
|
+
passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
|
|
2927
|
+
" \xB7 ",
|
|
2928
|
+
attempts,
|
|
2929
|
+
" attempt(s) \xB7 ",
|
|
2487
2930
|
calls,
|
|
2488
|
-
" calls \xB7
|
|
2931
|
+
" calls \xB7",
|
|
2932
|
+
" ",
|
|
2489
2933
|
tokens(tok),
|
|
2490
2934
|
" tokens \xB7 ",
|
|
2491
2935
|
usd(cost)
|
|
@@ -2570,7 +3014,7 @@ function truncate2(s, n) {
|
|
|
2570
3014
|
|
|
2571
3015
|
// src/index.ts
|
|
2572
3016
|
var program = new Command();
|
|
2573
|
-
program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.
|
|
3017
|
+
program.name("poly").description("Polymath \u2014 cost-optimized, multi-model TUI coding agent").version("0.4.0");
|
|
2574
3018
|
function client(config) {
|
|
2575
3019
|
return new OpenRouterClient({
|
|
2576
3020
|
apiKey: resolveApiKey(config),
|
|
@@ -2644,7 +3088,7 @@ async function loadCatalog(config, refresh = false) {
|
|
|
2644
3088
|
program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
|
|
2645
3089
|
await runLogin();
|
|
2646
3090
|
});
|
|
2647
|
-
program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
|
|
3091
|
+
program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
|
|
2648
3092
|
const startedAt = Date.now();
|
|
2649
3093
|
const config = loadConfig();
|
|
2650
3094
|
if (!config.local.enabled || resolveApiKey(config)) {
|
|
@@ -2669,6 +3113,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
|
|
|
2669
3113
|
allowWrite: !!opts.write,
|
|
2670
3114
|
allowCommands: !!opts.commands,
|
|
2671
3115
|
objectiveLabel: policy.objective,
|
|
3116
|
+
verify: opts.verify !== false,
|
|
3117
|
+
maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
|
|
2672
3118
|
initialGoal: goal
|
|
2673
3119
|
})
|
|
2674
3120
|
);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "polymath-agent",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Polymath — a cost-optimized, multi-model TUI coding agent. Decomposes work into typed tasks, routes each task to the cheapest capable model via OpenRouter, and logs real usage/cost by date + model.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|