polymath-agent 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -1
- package/dist/cli.js +876 -190
- package/package.json +3 -2
package/dist/cli.js
CHANGED
|
@@ -599,11 +599,14 @@ var TASK_SPECS = {
|
|
|
599
599
|
command: { type: "command", minTier: "cheap", needsTools: true, label: "Run command" },
|
|
600
600
|
review: { type: "review", minTier: "frontier", needsTools: false, label: "Review / critique" },
|
|
601
601
|
reason: { type: "reason", minTier: "frontier", needsTools: false, label: "Hard reasoning" },
|
|
602
|
+
// The verify gate inspects files / runs tests — it MUST have tools.
|
|
603
|
+
verify: { type: "verify", minTier: "frontier", needsTools: true, label: "Verify result" },
|
|
602
604
|
explain: { type: "explain", minTier: "cheap", needsTools: false, label: "Explain" },
|
|
603
605
|
summarize: { type: "summarize", minTier: "cheap", needsTools: false, label: "Summarize" },
|
|
604
606
|
chat: { type: "chat", minTier: "cheap", needsTools: false, label: "Chat" }
|
|
605
607
|
};
|
|
606
608
|
var ALL_TASK_TYPES = Object.keys(TASK_SPECS);
|
|
609
|
+
var ALL_GOAL_TYPES = ["feature", "bugfix", "refactor", "test", "docs", "chore", "other"];
|
|
607
610
|
|
|
608
611
|
// src/planner/planner.ts
|
|
609
612
|
var PLAN_SYSTEM = `You are the planning stage of a coding agent. Break the user's request into a short, ordered list of concrete steps.
|
|
@@ -619,9 +622,21 @@ Each step must be classified by type, chosen from EXACTLY this set:
|
|
|
619
622
|
summarize - condense long content
|
|
620
623
|
chat - a simple conversational reply
|
|
621
624
|
|
|
625
|
+
Also classify the request's goalType (one of: feature, bugfix, refactor, test, docs, chore, other) and write 2-5 MEASURABLE acceptance criteria \u2014 concrete, checkable conditions that mean the goal is fully achieved (e.g. "hello.js exists and prints the greeting", "npm test passes", "the function handles empty input").
|
|
626
|
+
|
|
622
627
|
Return ONLY minified JSON of the form:
|
|
623
|
-
{"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
|
|
628
|
+
{"goalType":"<type>","criteria":["...","..."],"steps":[{"type":"<type>","description":"...","estPromptTokens":<int>,"estCompletionTokens":<int>}]}
|
|
624
629
|
Use 3-8 steps for non-trivial work, fewer for simple requests. Estimate tokens realistically (prompts often 2000-15000, completions 200-3000).`;
|
|
630
|
+
/**
 * Heuristically classify a free-text goal into a goal type.
 *
 * The rules are tried in order and the first keyword match wins, so a goal
 * that mentions both "fix" and "add" is classified as "bugfix".
 *
 * @param {string} goal - The user's request. Nullish or non-string values are
 *   coerced to a string instead of throwing.
 * @returns {"bugfix"|"refactor"|"test"|"docs"|"chore"|"feature"|"other"}
 */
function classifyGoalType(goal) {
  // Coerce defensively: calling .toLowerCase() on undefined/null would throw.
  const text = String(goal ?? "").toLowerCase();
  const rules = [
    ["bugfix", /\b(fix|bug|broken|error|crash|regression|fails?)\b/],
    // No trailing \b here: "simplif" is a prefix match (simplify, simplification, ...).
    ["refactor", /\b(refactor|rename|clean ?up|restructure|extract|simplif)/],
    ["test", /\b(test|spec|coverage|unit test|e2e)\b/],
    ["docs", /\b(docs?|readme|comment|documentation)\b/],
    ["chore", /\b(bump|upgrade|dependency|deps|config|chore|lint|format)\b/],
    ["feature", /\b(add|create|implement|build|feature|support|new)\b/]
  ];
  for (const [goalType, pattern] of rules) {
    if (pattern.test(text)) return goalType;
  }
  return "other";
}
|
|
625
640
|
function heuristicPlan(goal) {
|
|
626
641
|
const steps = [
|
|
627
642
|
{ id: 1, type: "plan", description: "Decompose the request", estPromptTokens: 2e3, estCompletionTokens: 600 },
|
|
@@ -630,7 +645,12 @@ function heuristicPlan(goal) {
|
|
|
630
645
|
{ id: 4, type: "edit", description: "Implement the change", estPromptTokens: 9e3, estCompletionTokens: 1500 },
|
|
631
646
|
{ id: 5, type: "review", description: "Review the change", estPromptTokens: 6e3, estCompletionTokens: 800 }
|
|
632
647
|
];
|
|
633
|
-
return {
|
|
648
|
+
return {
|
|
649
|
+
goal,
|
|
650
|
+
steps,
|
|
651
|
+
goalType: classifyGoalType(goal),
|
|
652
|
+
criteria: ["The stated goal is fully implemented and works", "No obvious errors or omissions remain"]
|
|
653
|
+
};
|
|
634
654
|
}
|
|
635
655
|
async function planRequest(goal, client2, planModel, onUsage) {
|
|
636
656
|
const result = await client2.complete(
|
|
@@ -648,7 +668,7 @@ async function planRequest(goal, client2, planModel, onUsage) {
|
|
|
648
668
|
onUsage?.(result);
|
|
649
669
|
const parsed = extractPlan(result.content);
|
|
650
670
|
if (!parsed) return heuristicPlan(goal);
|
|
651
|
-
return { goal,
|
|
671
|
+
return { goal, ...parsed };
|
|
652
672
|
}
|
|
653
673
|
function extractPlan(text) {
|
|
654
674
|
const json = extractJson(text);
|
|
@@ -663,7 +683,10 @@ function extractPlan(text) {
|
|
|
663
683
|
estPromptTokens: clampInt(s.estPromptTokens, 500, 6e4, 4e3),
|
|
664
684
|
estCompletionTokens: clampInt(s.estCompletionTokens, 100, 8e3, 800)
|
|
665
685
|
}));
|
|
666
|
-
|
|
686
|
+
if (!steps.length) return null;
|
|
687
|
+
const goalType = ALL_GOAL_TYPES.includes(String(obj.goalType)) ? obj.goalType : "other";
|
|
688
|
+
const criteria = Array.isArray(obj.criteria) ? obj.criteria.map((x) => String(x).slice(0, 200)).filter(Boolean).slice(0, 6) : [];
|
|
689
|
+
return { steps, goalType, criteria: criteria.length ? criteria : ["The stated goal is fully achieved"] };
|
|
667
690
|
} catch {
|
|
668
691
|
return null;
|
|
669
692
|
}
|
|
@@ -705,10 +728,29 @@ function extractJson(text) {
|
|
|
705
728
|
}
|
|
706
729
|
|
|
707
730
|
// src/router/policy.ts
|
|
731
|
+
// Escalation rungs, listed from the first attempt to the last. The first rung
// defines no tierFloor; later rungs raise the floor and lift the per-call cost cap.
var ESCALATION_LADDER = [
  {
    objective: "value",
    maxTokens: 2e3,
    maxIters: 6,
    liftCostCap: false,
    label: "value \xB7 cheapest-capable"
  },
  {
    tierFloor: "standard",
    objective: "value",
    maxTokens: 4e3,
    maxIters: 8,
    liftCostCap: true,
    label: "standard+ \xB7 more tokens"
  },
  {
    tierFloor: "frontier",
    objective: "quality",
    maxTokens: 8e3,
    maxIters: 10,
    liftCostCap: true,
    label: "frontier \xB7 strongest"
  }
];

/**
 * Find the index of the ladder rung that corresponds to a tier.
 *
 * @param {string} tier - Tier name; "cheap" maps to the rung without a tierFloor.
 * @returns {number} Index into ESCALATION_LADDER, or -1 when no rung matches.
 */
function rungForTier(tier) {
  for (let i = 0; i < ESCALATION_LADDER.length; i++) {
    const floor = ESCALATION_LADDER[i].tierFloor;
    if (floor === tier) return i;
    if (!floor && tier === "cheap") return i;
  }
  return -1;
}
|
|
739
|
+
/**
 * Build a routing policy for one escalation rung.
 *
 * Copies `base` and overrides objective and tierFloor with the rung's values
 * (tierFloor becomes undefined when the rung has none). When the rung lifts the
 * cost cap, maxCostPerCallUsd is cleared; otherwise the base cap is kept.
 *
 * @param {object} base - Base policy; not mutated.
 * @param {object} rung - Rung with objective, optional tierFloor, liftCostCap.
 * @returns {object} New policy object.
 */
function applyRung(base, rung) {
  const { objective, tierFloor, liftCostCap } = rung;
  const maxCostPerCallUsd = liftCostCap ? undefined : base.maxCostPerCallUsd;
  return { ...base, objective, tierFloor, maxCostPerCallUsd };
}
|
|
708
747
|
// Numeric ordering of model tiers; a higher number is a stronger tier.
var TIER_RANK = {
  cheap: 0,
  standard: 1,
  frontier: 2
};

/**
 * Whether `tier` ranks at or above `min`.
 * Unknown tier names have no rank, so the comparison yields false.
 *
 * @param {string} tier
 * @param {string} min
 * @returns {boolean}
 */
function tierAtLeast(tier, min) {
  const have = TIER_RANK[tier];
  const need = TIER_RANK[min];
  return have >= need;
}

/**
 * Numeric rank of a tier name.
 *
 * @param {string} tier
 * @returns {number|undefined} undefined for names missing from TIER_RANK.
 */
function tierRank(tier) {
  const rank = TIER_RANK[tier];
  return rank;
}
|
|
712
754
|
/**
 * Single blended price for a model: prompt price weighted 3x against
 * completion price, divided by 4.
 *
 * @param {{pricing: {promptUsdPerMTok: number, completionUsdPerMTok: number}}} m
 * @returns {number}
 */
function blendedPrice(m) {
  const { promptUsdPerMTok, completionUsdPerMTok } = m.pricing;
  const weighted = promptUsdPerMTok * 3 + completionUsdPerMTok;
  return weighted / 4;
}
|
|
@@ -755,6 +797,7 @@ var TASK_SKILL = {
|
|
|
755
797
|
command: "speed",
|
|
756
798
|
review: "reasoning",
|
|
757
799
|
reason: "reasoning",
|
|
800
|
+
verify: "reasoning",
|
|
758
801
|
explain: "general",
|
|
759
802
|
summarize: "speed",
|
|
760
803
|
chat: "speed"
|
|
@@ -781,6 +824,7 @@ var TASK_MIN_STRENGTH = {
|
|
|
781
824
|
edit: 1.4,
|
|
782
825
|
review: 1.5,
|
|
783
826
|
reason: 1.5,
|
|
827
|
+
verify: 1.4,
|
|
784
828
|
plan: 1.2
|
|
785
829
|
};
|
|
786
830
|
var HEADLINE_SKILLS = ["coding", "reasoning", "retrieval", "speed"];
|
|
@@ -798,9 +842,10 @@ function taskValue(m, taskType, empirical) {
|
|
|
798
842
|
function candidatesFor(taskType, models, policy, est) {
|
|
799
843
|
const spec = TASK_SPECS[taskType];
|
|
800
844
|
const strengthFloor = TASK_MIN_STRENGTH[taskType] ?? 0;
|
|
845
|
+
const minTier = policy.tierFloor && tierRank(policy.tierFloor) > tierRank(spec.minTier) ? policy.tierFloor : spec.minTier;
|
|
801
846
|
return models.filter((m) => {
|
|
802
847
|
if (m.id === "openrouter/auto") return false;
|
|
803
|
-
const covers = tierAtLeast(m.tier,
|
|
848
|
+
const covers = tierAtLeast(m.tier, minTier) || !policy.tierFloor && taskStrength(m, taskType) >= strengthFloor;
|
|
804
849
|
if (!covers) return false;
|
|
805
850
|
if (spec.needsTools && !m.capabilities.tools) return false;
|
|
806
851
|
if (policy.maxCostPerCallUsd != null && est) {
|
|
@@ -846,6 +891,19 @@ function route(taskType, models, policy, est = { promptTokens: 4e3, completionTo
|
|
|
846
891
|
const reason = policy.objective === "cheapest" ? `cheapest model that covers ${skill}` : policy.objective === "quality" ? `strongest at ${skill}` : proven ? `proven ${Math.round(proven)}% fewer tokens on ${taskType} (playbook)` : `best ${skill}-per-dollar`;
|
|
847
892
|
return { model: chosen, reason, estCostUsd: projectCost(chosen, est) };
|
|
848
893
|
}
|
|
894
|
+
/**
 * Route a task, falling back to the strongest available model when the
 * normal policy-based routing yields nothing.
 *
 * Fallback: exclude "openrouter/auto" and, for tasks that need tools, models
 * without tool support; prefer tool-capable models; pick the highest
 * taskStrength for this task type.
 *
 * @param {string} taskType - Key of TASK_SPECS.
 * @param {Array<object>} models - Candidate models.
 * @param {object} policy - Routing policy passed through to route().
 * @param {{promptTokens: number, completionTokens: number}} [est] - Token estimate used for cost projection.
 * @returns {{model: object, reason: string, estCostUsd: number}|null} null when no model is usable.
 */
function routeOrBest(taskType, models, policy, est = { promptTokens: 4e3, completionTokens: 1e3 }) {
  const routed = route(taskType, models, policy, est);
  if (routed) return routed;

  const { needsTools } = TASK_SPECS[taskType];
  const usable = models.filter((m) => {
    if (m.id === "openrouter/auto") return false;
    return !needsTools || m.capabilities.tools;
  });
  if (usable.length === 0) return null;

  const strongestFirst = (a, b) => taskStrength(b, taskType) - taskStrength(a, taskType);
  const toolCapable = usable.filter((m) => m.capabilities.tools);
  // Prefer tool-capable models even when the task does not strictly need tools.
  const pool = toolCapable.length > 0 ? toolCapable : [...usable];
  pool.sort(strongestFirst);
  const best = pool[0];

  return {
    model: best,
    reason: `best available for ${TASK_SKILL[taskType]} (fallback)`,
    estCostUsd: projectCost(best, est)
  };
}
|
|
849
907
|
|
|
850
908
|
// src/recommend/recommend.ts
|
|
851
909
|
var OBJECTIVES = [
|
|
@@ -1075,6 +1133,27 @@ function getDb() {
|
|
|
1075
1133
|
);
|
|
1076
1134
|
CREATE INDEX IF NOT EXISTS idx_cmd_date ON command_runs(date);
|
|
1077
1135
|
|
|
1136
|
+
-- One row per verify-and-escalate attempt within a session. Powers the
|
|
1137
|
+
-- "optimal starting model per goal type" statistical learning.
|
|
1138
|
+
CREATE TABLE IF NOT EXISTS attempts (
|
|
1139
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
1140
|
+
session_id TEXT NOT NULL,
|
|
1141
|
+
attempt_no INTEGER NOT NULL,
|
|
1142
|
+
goal_type TEXT NOT NULL,
|
|
1143
|
+
tier_floor TEXT,
|
|
1144
|
+
objective TEXT NOT NULL,
|
|
1145
|
+
prompt_tokens INTEGER NOT NULL,
|
|
1146
|
+
completion_tokens INTEGER NOT NULL,
|
|
1147
|
+
cost_usd REAL NOT NULL,
|
|
1148
|
+
criteria_total INTEGER NOT NULL,
|
|
1149
|
+
criteria_met INTEGER NOT NULL,
|
|
1150
|
+
passed INTEGER NOT NULL,
|
|
1151
|
+
duration_ms INTEGER NOT NULL,
|
|
1152
|
+
synced INTEGER NOT NULL DEFAULT 0
|
|
1153
|
+
);
|
|
1154
|
+
CREATE INDEX IF NOT EXISTS idx_attempts_session ON attempts(session_id);
|
|
1155
|
+
CREATE INDEX IF NOT EXISTS idx_attempts_goal ON attempts(goal_type, tier_floor);
|
|
1156
|
+
|
|
1078
1157
|
-- Distilled efficiency insights: ONLY the notably cost-efficient approaches.
|
|
1079
1158
|
-- This is what syncs to the cloud by default (raw logs stay local).
|
|
1080
1159
|
CREATE TABLE IF NOT EXISTS insights (
|
|
@@ -1096,6 +1175,15 @@ function getDb() {
|
|
|
1096
1175
|
if (!cols.some((c2) => c2.name === "command")) {
|
|
1097
1176
|
db.exec(`ALTER TABLE usage_log ADD COLUMN command TEXT NOT NULL DEFAULT 'run'`);
|
|
1098
1177
|
}
|
|
1178
|
+
const conn = db;
|
|
1179
|
+
const scols = conn.prepare(`PRAGMA table_info(sessions)`).all();
|
|
1180
|
+
const addSession = (name, decl) => {
|
|
1181
|
+
if (!scols.some((c2) => c2.name === name)) conn.exec(`ALTER TABLE sessions ADD COLUMN ${name} ${decl}`);
|
|
1182
|
+
};
|
|
1183
|
+
addSession("goal_type", "TEXT NOT NULL DEFAULT 'other'");
|
|
1184
|
+
addSession("start_tier", "TEXT");
|
|
1185
|
+
addSession("attempts", "INTEGER NOT NULL DEFAULT 1");
|
|
1186
|
+
addSession("final_passed", "INTEGER");
|
|
1099
1187
|
return db;
|
|
1100
1188
|
}
|
|
1101
1189
|
function recordUsage(e) {
|
|
@@ -1188,14 +1276,14 @@ function markSynced(ids) {
|
|
|
1188
1276
|
}
|
|
1189
1277
|
/**
 * Insert (or replace) the row for a session at the moment it starts.
 *
 * @param {object} s - Session fields: id, ts, date, goal, command, objective,
 *   plannedSteps, and optionally goalType and startTier.
 */
function startSession(s) {
  getDb().prepare(
    `INSERT OR REPLACE INTO sessions (id, ts, date, goal, command, objective, planned_steps, goal_type, start_tier)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(
    s.id,
    s.ts,
    s.date,
    s.goal,
    s.command,
    s.objective,
    s.plannedSteps,
    // goal_type is declared NOT NULL DEFAULT 'other'; mirror that default when the
    // caller supplies no goalType instead of binding undefined.
    // NOTE(review): presumably the SQLite driver rejects undefined bindings — confirm.
    s.goalType ?? "other",
    s.startTier ?? null
  );
}
|
|
1195
1283
|
function finishSession(id, u) {
|
|
1196
1284
|
getDb().prepare(
|
|
1197
1285
|
`UPDATE sessions SET planned_steps=?, completed_steps=?, failed_steps=?, auto_score=?,
|
|
1198
|
-
prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=? WHERE id=?`
|
|
1286
|
+
prompt_tokens=?, completion_tokens=?, cost_usd=?, duration_ms=?, attempts=?, final_passed=? WHERE id=?`
|
|
1199
1287
|
).run(
|
|
1200
1288
|
u.plannedSteps,
|
|
1201
1289
|
u.completedSteps,
|
|
@@ -1205,9 +1293,60 @@ function finishSession(id, u) {
|
|
|
1205
1293
|
u.completionTokens,
|
|
1206
1294
|
u.costUsd,
|
|
1207
1295
|
u.durationMs,
|
|
1296
|
+
u.attempts ?? 1,
|
|
1297
|
+
u.finalPassed == null ? null : u.finalPassed ? 1 : 0,
|
|
1208
1298
|
id
|
|
1209
1299
|
);
|
|
1210
1300
|
}
|
|
1301
|
+
/**
 * Append one row to the `attempts` table for a verify-and-escalate attempt.
 *
 * @param {object} a - Attempt fields: sessionId, attemptNo, goalType,
 *   tierFloor (optional), objective, promptTokens, completionTokens, costUsd,
 *   criteriaTotal, criteriaMet, passed, durationMs.
 */
function recordAttempt(a) {
  getDb().prepare(
    `INSERT INTO attempts
       (session_id, attempt_no, goal_type, tier_floor, objective, prompt_tokens, completion_tokens,
        cost_usd, criteria_total, criteria_met, passed, duration_ms)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(
    a.sessionId,
    a.attemptNo,
    a.goalType,
    // tier_floor is nullable and the first escalation rung defines no tierFloor,
    // so bind an explicit NULL rather than undefined (same approach startSession
    // uses for start_tier).
    a.tierFloor ?? null,
    a.objective,
    a.promptTokens,
    a.completionTokens,
    a.costUsd,
    a.criteriaTotal,
    a.criteriaMet,
    // Stored as an integer flag.
    a.passed ? 1 : 0,
    a.durationMs
  );
}
|
|
1322
|
+
/**
 * Aggregate finished sessions by (goal type, starting tier).
 *
 * Only sessions with a recorded final_passed value are counted; a missing
 * start_tier is reported as "cheap". Rows are ordered by goal type, then by
 * ascending average total tokens.
 *
 * @returns {Array<{goalType: string, startTier: string, sessions: number,
 *   passRate: number, avgTotalTokens: number, avgAttempts: number}>}
 */
function goalTierStats() {
  const sql = `SELECT goal_type AS goalType, COALESCE(start_tier,'cheap') AS startTier,
       COUNT(*) AS sessions,
       AVG(CASE WHEN final_passed=1 THEN 1.0 ELSE 0.0 END) AS passRate,
       AVG(prompt_tokens + completion_tokens) AS avgTotalTokens,
       AVG(attempts) AS avgAttempts
     FROM sessions
     WHERE final_passed IS NOT NULL
     GROUP BY goal_type, startTier
     ORDER BY goal_type, avgTotalTokens ASC`;
  const stats = [];
  for (const row of getDb().prepare(sql).all()) {
    // Normalise driver values to plain JS primitives; NULL aggregates become 0.
    stats.push({
      goalType: String(row.goalType),
      startTier: String(row.startTier),
      sessions: Number(row.sessions),
      passRate: Number(row.passRate ?? 0),
      avgTotalTokens: Number(row.avgTotalTokens ?? 0),
      avgAttempts: Number(row.avgAttempts ?? 0)
    });
  }
  return stats;
}
|
|
1343
|
+
/**
 * Pick the learned starting tier for a goal type.
 *
 * Considers only tiers with at least `minSessions` sessions and a pass rate of
 * at least 0.6, and returns the one with the lowest average total tokens.
 *
 * @param {string} goalType
 * @param {number} [minSessions=3]
 * @returns {string|null} Tier name, or null when no tier qualifies.
 */
function optimalStartTier(goalType, minSessions = 3) {
  const eligible = [];
  for (const s of goalTierStats()) {
    if (s.goalType === goalType && s.sessions >= minSessions && s.passRate >= 0.6) {
      eligible.push(s);
    }
  }
  if (eligible.length === 0) return null;
  eligible.sort((a, b) => a.avgTotalTokens - b.avgTotalTokens);
  const [cheapest] = eligible;
  return cheapest.startTier;
}
|
|
1211
1350
|
/**
 * Store a user-provided score on a session row.
 *
 * @param {string} sessionId
 * @param {number} score
 */
function setUserScore(sessionId, score) {
  const statement = getDb().prepare(`UPDATE sessions SET user_score=? WHERE id=?`);
  statement.run(score, sessionId);
}
|
|
@@ -1651,6 +1790,35 @@ function renderAnalysis(filter = {}) {
|
|
|
1651
1790
|
}
|
|
1652
1791
|
out.push("");
|
|
1653
1792
|
}
|
|
1793
|
+
const tierStats = goalTierStats();
|
|
1794
|
+
if (tierStats.length) {
|
|
1795
|
+
out.push(c.bold("Optimal starting model per goal type") + c.dim(" (pass rate vs total tokens to reach the goal)"));
|
|
1796
|
+
out.push(
|
|
1797
|
+
table(
|
|
1798
|
+
["Goal type", "Start tier", "Sessions", "Pass rate", "Avg total tok", "Avg attempts"],
|
|
1799
|
+
tierStats.map((s) => [
|
|
1800
|
+
s.goalType,
|
|
1801
|
+
tierColor(s.startTier),
|
|
1802
|
+
String(s.sessions),
|
|
1803
|
+
`${Math.round(s.passRate * 100)}%`,
|
|
1804
|
+
tokens(Math.round(s.avgTotalTokens)),
|
|
1805
|
+
s.avgAttempts.toFixed(1)
|
|
1806
|
+
])
|
|
1807
|
+
)
|
|
1808
|
+
);
|
|
1809
|
+
const goalTypes = [...new Set(tierStats.map((s) => s.goalType))];
|
|
1810
|
+
const learned = goalTypes.map((g) => ({ g, tier: optimalStartTier(g) })).filter((x) => x.tier);
|
|
1811
|
+
if (learned.length) {
|
|
1812
|
+
out.push(
|
|
1813
|
+
c.green(
|
|
1814
|
+
"\u2192 Learned starts (auto-applied on `poly run`): " + learned.map((x) => `${x.g}\u2192${x.tier}`).join(", ")
|
|
1815
|
+
)
|
|
1816
|
+
);
|
|
1817
|
+
} else {
|
|
1818
|
+
out.push(c.dim("\u2192 Not enough evidence yet to auto-pick a starting tier (needs \u22653 verified sessions per goal type)."));
|
|
1819
|
+
}
|
|
1820
|
+
out.push("");
|
|
1821
|
+
}
|
|
1654
1822
|
if (byCommand.length) {
|
|
1655
1823
|
out.push(c.bold("Usage by command"));
|
|
1656
1824
|
out.push(
|
|
@@ -2012,6 +2180,274 @@ function logCompletion(result, taskType, sessionId, command = "run") {
|
|
|
2012
2180
|
return entry;
|
|
2013
2181
|
}
|
|
2014
2182
|
|
|
2183
|
+
// src/setup/commands.ts
|
|
2184
|
+
import { execSync as execSync2 } from "node:child_process";
|
|
2185
|
+
|
|
2186
|
+
// src/util/prompt.ts
|
|
2187
|
+
import readline2 from "node:readline";
|
|
2188
|
+
/**
 * Whether both stdin and stdout are attached to a TTY.
 *
 * @returns {boolean}
 */
function interactive() {
  if (process.stdin.isTTY !== true) return false;
  return process.stdout.isTTY === true;
}
|
|
2191
|
+
/**
 * Prompt on stdin/stdout and resolve with the trimmed answer.
 * A readline interface is created per question and closed once answered.
 *
 * @param {string} question - Prompt text.
 * @returns {Promise<string>}
 */
function ask2(question) {
  return new Promise((resolve2) => {
    const io = { input: process.stdin, output: process.stdout };
    const rl = readline2.createInterface(io);
    const onAnswer = (answer) => {
      rl.close();
      resolve2(answer.trim());
    };
    rl.question(question, onAnswer);
  });
}
|
|
2200
|
+
/**
 * Ask a yes/no question.
 *
 * Returns `def` without prompting when the session is not interactive, and
 * also when the user answers with an empty line. Otherwise any answer starting
 * with "y" (case-insensitive) counts as yes.
 *
 * @param {string} question
 * @param {boolean} [def=true] - Default answer.
 * @returns {Promise<boolean>}
 */
async function confirm(question, def = true) {
  if (!interactive()) return def;
  const hint = def ? "[Y/n]" : "[y/N]";
  const raw = await ask2(`${question} ${hint} `);
  const answer = raw.toLowerCase();
  if (answer === "") return def;
  return answer.startsWith("y");
}
|
|
2207
|
+
/**
 * Let the user choose one item from a numbered list.
 *
 * Returns the first item without prompting when the session is not
 * interactive or there is at most one item. An answer that is not a number
 * within range also falls back to the first item.
 *
 * @param {string} question - Heading printed above the list.
 * @param {Array<*>} items - Choices.
 * @param {function(*): string} render2 - Renders one item as a line of text.
 * @returns {Promise<*>} The chosen item.
 */
async function select(question, items, render2) {
  const fallback = items[0];
  if (!interactive() || items.length <= 1) return fallback;

  console.log(question);
  items.forEach((item, index) => {
    console.log(` ${index + 1}) ${render2(item)}`);
  });

  const answer = await ask2(`Choose [1-${items.length}] (default 1): `);
  const choice = parseInt(answer, 10);
  const inRange = Number.isInteger(choice) && choice >= 1 && choice <= items.length;
  return inRange ? items[choice - 1] : fallback;
}
|
|
2215
|
+
|
|
2216
|
+
// src/setup/localllm.ts
|
|
2217
|
+
import { execSync, spawn } from "node:child_process";
|
|
2218
|
+
import os2 from "node:os";
|
|
2219
|
+
/**
 * Suggest local models sized for this machine's total RAM.
 *
 * The 3B model is always offered; the 7B model needs at least 13 GB (rounded)
 * and the 14B model at least 30 GB. Order is 7B, 3B, 14B.
 *
 * @returns {Array<{id: string, label: string, sizeGb: number, note: string}>}
 */
function suggestModels() {
  const ramGb = Math.round(os2.totalmem() / 1024 ** 3);
  const catalog = [
    {
      minRamGb: 13,
      model: { id: "qwen2.5-coder:7b", label: "Qwen2.5 Coder 7B", sizeGb: 4.7, note: "best coding pick for ~16GB" }
    },
    {
      minRamGb: 0,
      model: { id: "llama3.2:3b", label: "Llama 3.2 3B", sizeGb: 2, note: "fast, light; great for cheap tasks" }
    },
    {
      minRamGb: 30,
      model: { id: "qwen2.5-coder:14b", label: "Qwen2.5 Coder 14B", sizeGb: 9, note: "stronger coding for 32GB+" }
    }
  ];
  return catalog.filter((entry) => ramGb >= entry.minRamGb).map((entry) => entry.model);
}
|
|
2227
|
+
/**
 * Total system memory in GiB, rounded to the nearest integer.
 *
 * @returns {number}
 */
function totalRamGb() {
  const bytesPerGb = 1024 ** 3;
  const totalBytes = os2.totalmem();
  return Math.round(totalBytes / bytesPerGb);
}
|
|
2230
|
+
/**
 * Check whether a command is available on PATH.
 *
 * Runs `where <cmd>` on Windows and `command -v <cmd>` elsewhere, with all
 * output discarded; any failure counts as "not found".
 *
 * NOTE(review): `cmd` is interpolated into a shell command line. The callers
 * visible in this file pass fixed literals only; do not pass untrusted input.
 *
 * @param {string} cmd - Command name.
 * @returns {boolean}
 */
function which(cmd) {
  let found = false;
  try {
    const isWindows = process.platform === "win32";
    const probe = isWindows ? `where ${cmd}` : `command -v ${cmd}`;
    execSync(probe, { stdio: "ignore" });
    found = true;
  } catch {
    found = false;
  }
  return found;
}
|
|
2238
|
+
/**
 * Whether the `ollama` executable can be found on PATH.
 *
 * @returns {boolean}
 */
function ollamaInstalled() {
  const onPath = which("ollama");
  return onPath;
}
|
|
2241
|
+
/**
 * Return the trimmed output of `ollama --version`, or null when the command
 * cannot be run successfully.
 *
 * @returns {string|null}
 */
function ollamaVersion() {
  try {
    return execSync("ollama --version", {
      encoding: "utf8",
      // Capture stdout only. Without an explicit stdio setting the child's stderr
      // is forwarded to the user's terminal, so a missing or broken binary would
      // print shell errors even though the failure is handled here.
      stdio: ["ignore", "pipe", "ignore"]
    }).trim();
  } catch {
    // Any failure (not installed, non-zero exit) is reported as "unknown".
    return null;
  }
}
|
|
2248
|
+
/**
 * Probe whether an Ollama server answers at `baseUrl`.
 *
 * A trailing "/v1" (with optional slash) is stripped from the base URL before
 * requesting `/api/version`. Network errors, timeouts and non-2xx responses
 * all yield false.
 *
 * @param {string} [baseUrl="http://localhost:11434"]
 * @returns {Promise<boolean>}
 */
async function ollamaServerUp(baseUrl = "http://localhost:11434") {
  const root = baseUrl.replace(/\/v1\/?$/, "");
  try {
    // Bound the probe: without a timeout an unresponsive host would stall the
    // caller (and the polling loop in ensureServer) indefinitely.
    const res = await fetch(`${root}/api/version`, { signal: AbortSignal.timeout(2e3) });
    return res.ok;
  } catch {
    return false;
  }
}
|
|
2256
|
+
/**
 * List the model names reported by the Ollama server's `/api/tags` endpoint.
 *
 * A trailing "/v1" (with optional slash) is stripped from the base URL first.
 * Any failure — unreachable server, timeout, non-2xx response, unexpected
 * payload — yields an empty list.
 *
 * @param {string} [baseUrl="http://localhost:11434"]
 * @returns {Promise<string[]>}
 */
async function installedModels(baseUrl = "http://localhost:11434") {
  const root = baseUrl.replace(/\/v1\/?$/, "");
  try {
    // Bound the request so an unresponsive host cannot hang setup/update.
    const res = await fetch(`${root}/api/tags`, { signal: AbortSignal.timeout(3e3) });
    if (!res.ok) return [];
    const json = await res.json();
    return (json.models ?? []).map((m) => m.name);
  } catch {
    return [];
  }
}
|
|
2266
|
+
/**
 * Run a command with inherited stdio and report whether it succeeded.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments.
 * @returns {Promise<boolean>} true only when the process exits with code 0;
 *   false on a non-zero exit or when spawning fails.
 */
function run(cmd, args) {
  return new Promise((resolve2) => {
    spawn(cmd, args, { stdio: "inherit" })
      .on("error", () => {
        resolve2(false);
      })
      .on("exit", (code) => {
        resolve2(code === 0);
      });
  });
}
|
|
2273
|
+
/**
 * Describe how Ollama can be installed on the current platform.
 *
 * @returns {{canAuto: boolean, command?: {cmd: string, args: string[]}, manual: string}}
 *   `canAuto` is true when `command` can be executed directly; `manual` is the
 *   instruction shown to the user.
 */
function ollamaInstallPlan() {
  switch (process.platform) {
    case "darwin": {
      if (!which("brew")) {
        return {
          canAuto: false,
          manual: "Install Homebrew (https://brew.sh) then `brew install ollama`, or download https://ollama.com/download"
        };
      }
      return {
        canAuto: true,
        command: { cmd: "brew", args: ["install", "ollama"] },
        manual: "brew install ollama"
      };
    }
    case "linux":
      return {
        canAuto: true,
        command: { cmd: "sh", args: ["-c", "curl -fsSL https://ollama.com/install.sh | sh"] },
        manual: "curl -fsSL https://ollama.com/install.sh | sh"
      };
    case "win32": {
      if (!which("winget")) {
        return { canAuto: false, manual: "Download the installer from https://ollama.com/download" };
      }
      return {
        canAuto: true,
        command: { cmd: "winget", args: ["install", "-e", "--id", "Ollama.Ollama"] },
        manual: "winget install Ollama.Ollama"
      };
    }
    default:
      return { canAuto: false, manual: "See https://ollama.com/download" };
  }
}
|
|
2288
|
+
/**
 * Make sure an Ollama server is reachable, starting one if necessary.
 *
 * If the server is not already up, it is started via `brew services` on macOS
 * with Homebrew, or as a detached `ollama serve` process otherwise. The server
 * is then polled up to 10 times, 500 ms apart.
 *
 * @param {string} [baseUrl="http://localhost:11434"]
 * @returns {Promise<boolean>} true once the server answers, false if it never does.
 */
async function ensureServer(baseUrl = "http://localhost:11434") {
  if (await ollamaServerUp(baseUrl)) return true;
  if (process.platform === "darwin" && which("brew")) {
    await run("brew", ["services", "start", "ollama"]);
  } else {
    try {
      const child = spawn("ollama", ["serve"], { stdio: "ignore", detached: true });
      // Spawn failures (e.g. ENOENT) are delivered as an asynchronous 'error'
      // event, which the surrounding try/catch cannot see; without a listener
      // the event would crash the process. Starting the server is best effort —
      // the polling loop below reports the real outcome.
      child.on("error", () => {
      });
      child.unref();
    } catch {
      // Best effort: synchronous spawn errors are ignored for the same reason.
    }
  }
  for (let i = 0; i < 10; i++) {
    if (await ollamaServerUp(baseUrl)) return true;
    await delay(500);
  }
  return false;
}
|
|
2305
|
+
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
2308
|
+
|
|
2309
|
+
// src/setup/commands.ts
|
|
2310
|
+
/**
 * Interactive first-run setup: optionally install a local LLM via Ollama and
 * optionally connect an OpenRouter API key.
 *
 * @param {{local?: boolean, yes?: boolean, model?: string}} opts - CLI options.
 *   When `local` is undefined the user is asked; `yes` skips the API-key prompt.
 * @returns {Promise<void>}
 */
async function runSetup(opts) {
  console.log(c.bold("\n\u{1F527} Polymath setup\n"));
  const config = loadConfig();

  const wantLocal = opts.local === undefined
    ? await confirm(
        `Install a local LLM (Ollama) for $0, offline, no-API-key runs? (RAM detected: ${totalRamGb()}GB)`,
        true
      )
    : opts.local;

  if (!wantLocal) {
    config.local.enabled = false;
    saveConfig(config);
    console.log(c.dim("Skipping local LLM. (You can run `poly setup --local` later.)"));
  } else {
    await setupLocal(opts, config);
  }

  // Reload: setupLocal may have changed and saved the configuration.
  const freshConfig = loadConfig();
  if (!resolveApiKey(freshConfig)) {
    let wantKey = false;
    if (!opts.yes) {
      wantKey = await confirm("Connect an OpenRouter API key for cloud models (300+ models)?", !wantLocal);
    }
    if (wantKey) {
      await runLogin();
    } else if (!wantLocal) {
      console.log(c.yellow("No models configured yet \u2014 run `poly login` or `poly setup --local`."));
    }
  }

  console.log(c.green("\n\u2713 Setup complete.") + c.dim(' Try: poly recommend "add a dark-mode toggle" \xB7 poly run -w "..."'));
}
|
|
2335
|
+
/**
 * Install Ollama if needed, start its server, make sure a model is present,
 * and enable the local provider in the config.
 *
 * Returns early (without enabling local mode) when Ollama is still not on
 * PATH after the install step or when the model pull fails.
 *
 * @param {{yes?: boolean, model?: string}} opts - `yes` accepts defaults
 *   without prompting; `model` forces a specific model id.
 * @param {object} config - Loaded config; `config.local.enabled` is set and saved on success.
 * @returns {Promise<void>}
 */
async function setupLocal(opts, config) {
  if (ollamaInstalled()) {
    console.log(c.green("\u2713 Ollama present ") + c.dim(ollamaVersion() ?? ""));
  } else {
    const plan = ollamaInstallPlan();
    console.log(c.cyan("Local LLM runtime: Ollama is not installed."));
    if (!(plan.canAuto && plan.command)) {
      console.log(c.yellow("Install manually: " + plan.manual));
    } else {
      const go = opts.yes || await confirm(`Install Ollama via \`${plan.command.cmd} ${plan.command.args.join(" ")}\`?`, true);
      if (!go) {
        console.log(c.dim("Manual install: " + plan.manual));
      } else {
        const installed = await run(plan.command.cmd, plan.command.args);
        if (!installed) console.log(c.yellow("Auto-install failed. Manual: " + plan.manual));
      }
    }
  }

  // Re-check: the install step above may have failed or been declined.
  if (!ollamaInstalled()) {
    console.log(c.yellow("Ollama still not on PATH \u2014 re-run `poly setup --local` after installing."));
    return;
  }

  process.stdout.write("Starting Ollama server\u2026 ");
  const up = await ensureServer(config.local.baseUrl);
  console.log(up ? c.green("ok") : c.yellow("could not confirm (start it with `ollama serve`)"));

  const have = await installedModels(config.local.baseUrl);
  let modelId = opts.model;
  if (!modelId) {
    const anyMissing = suggestModels().some((s) => !have.includes(s.id));
    if (have.length && !anyMissing) {
      // Every suggestion is already installed: reuse the first installed model.
      modelId = have[0];
      console.log(c.dim(`Using already-installed model ${modelId}.`));
    } else {
      const describe = (s) => `${s.label} (~${s.sizeGb}GB) \u2014 ${s.note}${have.includes(s.id) ? " [installed]" : ""}`;
      const pick = opts.yes
        ? suggestModels()[0]
        : await select("Pick a model to download:", suggestModels(), describe);
      modelId = pick.id;
    }
  }

  if (!have.includes(modelId)) {
    console.log(c.cyan(`Downloading ${modelId}\u2026`));
    const pulled = await run("ollama", ["pull", modelId]);
    if (!pulled) {
      console.log(c.yellow(`Could not pull ${modelId}. Run \`ollama pull ${modelId}\` manually.`));
      return;
    }
  }

  config.local.enabled = true;
  saveConfig(config);
  console.log(c.green(`\u2713 Local LLM ready: ${modelId} \u2192 local/${modelId} ($0). `) + c.dim("Enabled in config."));
}
|
|
2388
|
+
/**
 * Compare two dotted version strings by their first three numeric parts.
 *
 * A leading "v" is ignored; missing or non-numeric parts count as 0.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Positive when a > b, negative when a < b, 0 when equal
 *   (the value is the difference of the first differing part).
 */
function cmp(a, b) {
  const toParts = (version) => version.replace(/^v/, "").split(".").map((part) => parseInt(part, 10) || 0);
  const left = toParts(a);
  const right = toParts(b);
  for (const i of [0, 1, 2]) {
    const diff = (left[i] ?? 0) - (right[i] ?? 0);
    if (diff !== 0) return diff;
  }
  return 0;
}
|
|
2396
|
+
/**
 * Update the CLI itself, the Ollama runtime, and installed local models.
 *
 * With none of `self` / `ollama` / `models` set, all three are processed.
 * With `check`, nothing is installed — only the current state is reported.
 *
 * @param {string} currentVersion - Version of the running CLI.
 * @param {{self?: boolean, ollama?: boolean, models?: boolean, check?: boolean}} opts
 * @returns {Promise<void>}
 */
async function runUpdate(currentVersion, opts) {
  const all = !(opts.self || opts.ollama || opts.models);
  console.log(c.bold("\n\u2B06\uFE0F Polymath update") + (opts.check ? c.dim(" (check only)") : "") + "\n");

  if (all || opts.self) {
    let latest;
    try {
      latest = execSync2("npm view polymath-agent version", { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] }).trim();
    } catch {
      latest = "";
    }
    if (!latest) {
      console.log(c.dim("CLI: could not reach npm registry."));
    } else if (cmp(latest, currentVersion) <= 0) {
      console.log(c.green(`\u2713 CLI is up to date (${currentVersion}).`));
    } else {
      console.log(c.yellow(`CLI: ${currentVersion} \u2192 ${latest} available.`));
      if (opts.check) {
        console.log(c.dim(" Run `poly update` to install."));
      } else {
        const ok = await run("npm", ["install", "-g", `polymath-agent@${latest}`]);
        console.log(ok ? c.green(`\u2713 Updated to ${latest}.`) : c.red("npm update failed (try: sudo npm i -g polymath-agent@latest)."));
      }
    }
  }

  if (all || opts.ollama) {
    if (!ollamaInstalled()) {
      console.log(c.dim("Ollama: not installed (run `poly setup --local`)."));
    } else if (opts.check) {
      console.log(c.dim(`Ollama: ${ollamaVersion() ?? "present"} (update with \`poly update --ollama\`).`));
    } else {
      switch (process.platform) {
        case "darwin": {
          console.log(c.cyan("Updating Ollama\u2026"));
          const upgraded = await run("brew", ["upgrade", "ollama"]);
          if (!upgraded) console.log(c.dim(" (brew upgrade skipped/failed)"));
          break;
        }
        case "linux":
          // Re-running the install script is how the update is performed here.
          await run("sh", ["-c", "curl -fsSL https://ollama.com/install.sh | sh"]);
          break;
        default:
          console.log(c.dim("Ollama: update via your installer (winget upgrade Ollama.Ollama)."));
      }
    }
  }

  if (all || opts.models) {
    const config = loadConfig();
    const models = await installedModels(config.local.baseUrl);
    if (models.length === 0) {
      console.log(c.dim("Models: none installed."));
    } else if (opts.check) {
      console.log(c.dim(`Models: ${models.join(", ")} (re-pull to update).`));
    } else {
      // Pull sequentially so each download's progress output stays readable.
      for (const name of models) {
        console.log(c.cyan(`Updating ${name}\u2026`));
        await run("ollama", ["pull", name]);
      }
    }
  }

  console.log("");
}
|
|
2450
|
+
|
|
2015
2451
|
// src/tui/App.tsx
|
|
2016
2452
|
import { useState, useEffect, useCallback } from "react";
|
|
2017
2453
|
import { Box, Text, useApp, useInput } from "ink";
|
|
@@ -2021,7 +2457,7 @@ import Spinner from "ink-spinner";
|
|
|
2021
2457
|
// src/agent/tools.ts
|
|
2022
2458
|
import fs4 from "node:fs";
|
|
2023
2459
|
import path2 from "node:path";
|
|
2024
|
-
import { execSync } from "node:child_process";
|
|
2460
|
+
import { execSync as execSync3 } from "node:child_process";
|
|
2025
2461
|
var TOOL_SCHEMAS = [
|
|
2026
2462
|
{
|
|
2027
2463
|
type: "function",
|
|
@@ -2086,6 +2522,28 @@ var TOOL_SCHEMAS = [
|
|
|
2086
2522
|
}
|
|
2087
2523
|
}
|
|
2088
2524
|
];
|
|
2525
|
+
// Names of every tool declared in TOOL_SCHEMAS.
var KNOWN_TOOLS = new Set();
for (const schema of TOOL_SCHEMAS) {
  KNOWN_TOOLS.add(schema.function.name);
}
// Subset of schemas exposed as "read-only" tools.
// NOTE(review): run_command is included here — confirm callers gate it separately.
var READONLY_TOOL_SCHEMAS = TOOL_SCHEMAS.filter((schema) => {
  const readonlyNames = ["read_file", "list_dir", "run_command"];
  return readonlyNames.includes(schema.function.name);
});
|
|
2529
|
+
/**
 * Recover a tool call that a model emitted as plain JSON text.
 *
 * Accepts objects that name the tool via `name`, `tool`, or `function.name`
 * and carry arguments under `arguments`, `parameters`, or
 * `function.arguments`. Only tools listed in KNOWN_TOOLS are accepted.
 *
 * @param {string} content - Raw model output.
 * @returns {{id: string, type: "function", function: {name: string, arguments: string}}|null}
 *   A tool-call object with JSON-string arguments, or null when no valid call is found.
 */
function parseTextToolCall(content) {
  if (!content) return null;
  const json = extractJson(content);
  if (!json) return null;
  try {
    const parsed = JSON.parse(json);
    const name = parsed?.name ?? parsed?.tool ?? parsed?.function?.name;
    const isKnown = typeof name === "string" && KNOWN_TOOLS.has(name);
    if (!isKnown) return null;

    const rawArgs = parsed.arguments ?? parsed.parameters ?? parsed.function?.arguments ?? {};
    // Arguments are always passed on as a JSON string.
    const serialized = typeof rawArgs === "string" ? rawArgs : JSON.stringify(rawArgs);
    return {
      id: `textcall_${name}`,
      type: "function",
      function: { name, arguments: serialized }
    };
  } catch {
    return null;
  }
}
|
|
2089
2547
|
var MAX_OUTPUT = 8e3;
|
|
2090
2548
|
function clip(s) {
|
|
2091
2549
|
return s.length > MAX_OUTPUT ? s.slice(0, MAX_OUTPUT) + `
|
|
@@ -2142,7 +2600,7 @@ function executeTool(name, argsJson, ctx) {
|
|
|
2142
2600
|
}
|
|
2143
2601
|
case "run_command": {
|
|
2144
2602
|
if (!ctx.allowCommands) return { result: "Denied: run_command is disabled." };
|
|
2145
|
-
const out =
|
|
2603
|
+
const out = execSync3(String(args.command), {
|
|
2146
2604
|
cwd: ctx.cwd,
|
|
2147
2605
|
encoding: "utf8",
|
|
2148
2606
|
env: scrubbedEnv(),
|
|
@@ -2167,50 +2625,124 @@ ${stderr}`)) };
|
|
|
2167
2625
|
}
|
|
2168
2626
|
}
|
|
2169
2627
|
|
|
2170
|
-
// src/agent/
|
|
2171
|
-
var
|
|
2172
|
-
var
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2628
|
+
// src/agent/verify.ts
|
|
2629
|
+
// Upper bound on model round-trips inside one verifyGoal call.
var VERIFY_MAX_ITERS = 8;
// System prompt for the verify stage, one entry per prompt line.
var VERIFY_SYSTEM = [
  "You are the VERIFY stage of an autonomous coding agent. Your job is to MEASURE whether the goal was actually achieved \u2014 be skeptical and check the real workspace, do not assume.",
  "Use the read-only tools (read_file, list_dir, run_command) to inspect files and, where relevant, run build/test commands. Then judge EACH acceptance criterion against what you actually observed.",
  "When done, reply with ONLY this JSON (no prose, no code fence):",
  '{"results":[{"criterion":"<verbatim>","met":true|false,"reason":"<evidence>"}],"feedback":"<concrete guidance to fix any unmet criteria>"}'
].join("\n");
|
|
2634
|
+
/**
 * Run the verify gate: let a model inspect the workspace with the read-only
 * tool set and judge each acceptance criterion.
 *
 * @param {string} goal - The user's overall goal.
 * @param {string[]} criteria - Acceptance criteria to judge.
 * @param {object} deps - { client, model, cwd, allowCommands }.
 * @param {object} [ev] - Optional hooks: onUsage(result), onToolCall(name, args),
 *   onToolResult(name, result).
 * @returns {Promise<object>} The parsed verdict, or fallbackVerdict(criteria)
 *   when no verdict was produced within VERIFY_MAX_ITERS rounds.
 */
async function verifyGoal(goal, criteria, deps, ev = {}) {
  const { client, model } = deps;
  // Writes are always disabled for the verifier's tool context.
  const toolCtx = { cwd: deps.cwd, allowWrite: false, allowCommands: deps.allowCommands };
  const useTools = model.capabilities.tools;
  const numbered = criteria.map((text, idx) => `${idx + 1}. ${text}`).join("\n");
  const messages = [
    { role: "system", content: VERIFY_SYSTEM },
    {
      role: "user",
      content: `Goal: ${goal}\n\nAcceptance criteria:\n${numbered}\n\nInspect the workspace, then return the verdict JSON.`
    }
  ];

  for (let round = 0; round < VERIFY_MAX_ITERS; round++) {
    const stream = client.stream(
      {
        model: model.id,
        messages,
        tools: useTools ? READONLY_TOOL_SCHEMAS : void 0,
        temperature: 0,
        maxTokens: 1500
      },
      model.pricing
    );
    // Drain the stream; only the final return value is needed here.
    let step = await stream.next();
    while (!step.done) step = await stream.next();
    const reply = step.value;
    ev.onUsage?.(reply);

    // Native tool calls win; otherwise look for a single call written as text.
    const native = reply.toolCalls.length > 0;
    let pending = reply.toolCalls;
    if (!native) {
      const textCall = useTools ? parseTextToolCall(reply.content) : null;
      pending = textCall ? [textCall] : [];
    }

    // A parseable verdict ends verification, even if tool calls came with it.
    const parsed = parseVerdict(reply.content, criteria);
    if (parsed) return parsed;

    if (pending.length === 0) {
      // Neither a verdict nor a tool call: nudge the model towards the JSON.
      messages.push({ role: "assistant", content: reply.content });
      messages.push({ role: "user", content: `Return ONLY the verdict JSON now.` });
      continue;
    }

    if (native) {
      messages.push({ role: "assistant", content: reply.content, tool_calls: reply.toolCalls });
    }
    for (const call of pending) {
      const { name, arguments: rawArgs } = call.function;
      ev.onToolCall?.(name, rawArgs);
      const outcome = executeTool(name, rawArgs, toolCtx);
      ev.onToolResult?.(name, outcome.result);
      if (native) {
        messages.push({ role: "tool", tool_call_id: call.id, name, content: outcome.result });
      } else {
        // Text-encoded call: replay the result as an ordinary user turn.
        messages.push({ role: "assistant", content: reply.content });
        messages.push({
          role: "user",
          content: `Tool ${name} returned:\n${outcome.result}\nContinue, then return the verdict JSON.`
        });
      }
    }
  }
  return fallbackVerdict(criteria);
}
|
|
2687
|
+
/**
 * Parse the verifier's reply into a verdict object.
 *
 * Criteria the verifier left out are counted as unmet, so a partial verdict
 * can never report `allMet: true`. Padding only happens when the reply holds
 * fewer results than there are criteria; a complete reply is never second-guessed,
 * even if it paraphrases the criterion text.
 *
 * @param {string} text - Raw assistant reply expected to contain the verdict JSON.
 * @param {string[]} criteria - The acceptance criteria the verifier was asked to judge.
 * @returns {{total: number, metCount: number, allMet: boolean, results: object[],
 *   unmet: object[], feedback: string} | null}
 *   The verdict, or null when the text holds no usable verdict JSON.
 */
function parseVerdict(text, criteria) {
  const json = extractJson(text);
  if (!json) return null;
  try {
    const obj = JSON.parse(json);
    if (!Array.isArray(obj.results)) return null;
    const results = obj.results.map((r) => ({
      criterion: String(r.criterion ?? ""),
      // Accept both a real boolean and the string "true" (any casing).
      met: r.met === true || String(r.met).toLowerCase() === "true",
      reason: String(r.reason ?? "").slice(0, 300)
    }));
    if (!results.length) return null;

    const expected = Array.isArray(criteria) ? criteria : [];
    if (results.length < expected.length) {
      const normalize = (s) => String(s).trim().toLowerCase();
      const reported = new Set(results.map((r) => normalize(r.criterion)));
      let shortfall = expected.length - results.length;
      for (const criterion of expected) {
        if (shortfall === 0) break;
        if (reported.has(normalize(criterion))) continue;
        results.push({
          criterion: String(criterion),
          met: false,
          reason: "verifier did not report on this criterion"
        });
        shortfall--;
      }
    }

    const unmet = results.filter((r) => !r.met);
    return {
      total: results.length,
      metCount: results.length - unmet.length,
      allMet: unmet.length === 0,
      results,
      unmet,
      // Fall back to the joined unmet reasons when the model gave no feedback.
      feedback: String(obj.feedback ?? "").slice(0, 1e3) || unmet.map((u) => u.reason).join("; ")
    };
  } catch {
    // Malformed JSON or malformed entries: treat as "no verdict yet".
    return null;
  }
}
|
|
2712
|
+
/**
 * Build the verdict used when the verifier never produced a parseable one:
 * every criterion is marked unmet.
 *
 * @param {string[]} criteria - Acceptance criteria that were to be judged.
 * @returns {object} Verdict with `allMet: false`; `unmet` is the same array as `results`.
 */
function fallbackVerdict(criteria) {
  const results = [];
  for (const criterion of criteria) {
    results.push({ criterion, met: false, reason: "verifier produced no verdict" });
  }
  return {
    total: results.length,
    metCount: 0,
    allMet: false,
    results,
    unmet: results,
    feedback: "Verification inconclusive; re-attempt with a stronger model."
  };
}
|
|
2716
|
+
|
|
2717
|
+
// src/agent/loop.ts
|
|
2718
|
+
/**
 * Format a date as YYYY-MM-DD using the local time zone.
 *
 * @param {Date} [d] - Date to format; defaults to the current time.
 * @returns {string} Local calendar date; month and day are zero-padded to two digits.
 */
function localDate2(d = new Date()) {
  const pad2 = (n) => String(n).padStart(2, "0");
  // getMonth() is zero-based, hence the +1.
  return [d.getFullYear(), pad2(d.getMonth() + 1), pad2(d.getDate())].join("-");
}
|
|
2191
2724
|
async function runAgent(goal, deps, emit) {
|
|
2192
|
-
const { client: client2, models,
|
|
2193
|
-
|
|
2194
|
-
|
|
2195
|
-
|
|
2196
|
-
let totalCompletionTokens = 0;
|
|
2197
|
-
let calls = 0;
|
|
2725
|
+
const { client: client2, models, cwd } = deps;
|
|
2726
|
+
const verifyOn = deps.verify ?? true;
|
|
2727
|
+
const maxAttempts = deps.maxAttempts ?? 3;
|
|
2728
|
+
const acc = { cost: 0, tokens: 0, prompt: 0, completion: 0, calls: 0 };
|
|
2198
2729
|
const sessionStart = Date.now();
|
|
2199
|
-
|
|
2200
|
-
|
|
2201
|
-
|
|
2730
|
+
const toolCtx = { cwd, allowWrite: deps.allowWrite, allowCommands: deps.allowCommands };
|
|
2731
|
+
const logUsage = (r, taskType) => {
|
|
2732
|
+
const entry = logCompletion(r, taskType, deps.sessionId);
|
|
2733
|
+
emit({ type: "usage", entry });
|
|
2734
|
+
acc.cost += entry.costUsd;
|
|
2735
|
+
acc.tokens += entry.totalTokens;
|
|
2736
|
+
acc.prompt += entry.promptTokens;
|
|
2737
|
+
acc.completion += entry.completionTokens;
|
|
2738
|
+
acc.calls++;
|
|
2739
|
+
return entry;
|
|
2740
|
+
};
|
|
2741
|
+
const planRoute = route("plan", models, deps.policy);
|
|
2202
2742
|
let plan;
|
|
2203
2743
|
if (planRoute) {
|
|
2204
2744
|
try {
|
|
2205
|
-
plan = await planRequest(goal, client2, planRoute.model, (
|
|
2206
|
-
const entry = logCompletion(result, "plan", sessionId);
|
|
2207
|
-
emit({ type: "usage", entry });
|
|
2208
|
-
totalCostUsd += entry.costUsd;
|
|
2209
|
-
totalTokens += entry.totalTokens;
|
|
2210
|
-
totalPromptTokens += entry.promptTokens;
|
|
2211
|
-
totalCompletionTokens += entry.completionTokens;
|
|
2212
|
-
calls++;
|
|
2213
|
-
});
|
|
2745
|
+
plan = await planRequest(goal, client2, planRoute.model, (r) => logUsage(r, "plan"));
|
|
2214
2746
|
} catch {
|
|
2215
2747
|
plan = heuristicPlan(goal);
|
|
2216
2748
|
}
|
|
@@ -2218,164 +2750,276 @@ async function runAgent(goal, deps, emit) {
|
|
|
2218
2750
|
plan = heuristicPlan(goal);
|
|
2219
2751
|
}
|
|
2220
2752
|
emit({ type: "plan", plan, planModel: planRoute?.model.id ?? "heuristic" });
|
|
2753
|
+
let startRung = 0;
|
|
2754
|
+
let learned = false;
|
|
2755
|
+
if (verifyOn) {
|
|
2756
|
+
const tier = optimalStartTier(plan.goalType);
|
|
2757
|
+
if (tier) {
|
|
2758
|
+
const r = rungForTier(tier);
|
|
2759
|
+
if (r > 0) {
|
|
2760
|
+
startRung = r;
|
|
2761
|
+
learned = true;
|
|
2762
|
+
}
|
|
2763
|
+
}
|
|
2764
|
+
}
|
|
2765
|
+
const startTier = ESCALATION_LADDER[startRung].tierFloor ?? "cheap";
|
|
2766
|
+
emit({ type: "criteria", goalType: plan.goalType, criteria: plan.criteria, startTier, learned });
|
|
2221
2767
|
startSession({
|
|
2222
|
-
id: sessionId,
|
|
2768
|
+
id: deps.sessionId,
|
|
2223
2769
|
ts: sessionStart,
|
|
2224
2770
|
date: localDate2(),
|
|
2225
2771
|
goal,
|
|
2226
2772
|
command: "run",
|
|
2227
|
-
objective: policy.objective,
|
|
2228
|
-
plannedSteps: plan.steps.length
|
|
2773
|
+
objective: deps.policy.objective,
|
|
2774
|
+
plannedSteps: plan.steps.length,
|
|
2775
|
+
goalType: plan.goalType,
|
|
2776
|
+
startTier
|
|
2229
2777
|
});
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2778
|
+
let rung = startRung;
|
|
2779
|
+
let attemptNo = 0;
|
|
2780
|
+
let verdict = null;
|
|
2781
|
+
let completedSteps = 0;
|
|
2782
|
+
let failedSteps = 0;
|
|
2235
2783
|
const priorSummaries = [];
|
|
2236
|
-
|
|
2237
|
-
const
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
}
|
|
2241
|
-
if (
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
const model = r.model;
|
|
2247
|
-
emit({ type: "step-start", step, model, estCostUsd: r.estCostUsd });
|
|
2248
|
-
const useTools = model.capabilities.tools;
|
|
2249
|
-
const messages = [
|
|
2250
|
-
{ role: "system", content: stepSystemPrompt(goal, step, priorSummaries, useTools) },
|
|
2251
|
-
{ role: "user", content: step.description }
|
|
2252
|
-
];
|
|
2253
|
-
const stepStart = Date.now();
|
|
2254
|
-
let stepPrompt = 0;
|
|
2255
|
-
let stepCompletion = 0;
|
|
2256
|
-
let stepCost = 0;
|
|
2257
|
-
let stepToolCalls = 0;
|
|
2258
|
-
let iterations = 0;
|
|
2259
|
-
let finishedBy = "max-iters";
|
|
2260
|
-
let summary = "";
|
|
2261
|
-
try {
|
|
2262
|
-
for (let iter = 0; iter < MAX_ITERS_PER_STEP; iter++) {
|
|
2263
|
-
iterations = iter + 1;
|
|
2264
|
-
const gen = client2.stream(
|
|
2265
|
-
{
|
|
2266
|
-
model: model.id,
|
|
2267
|
-
messages,
|
|
2268
|
-
tools: useTools ? TOOL_SCHEMAS : void 0,
|
|
2269
|
-
temperature: 0.2,
|
|
2270
|
-
maxTokens: 2e3
|
|
2271
|
-
},
|
|
2272
|
-
model.pricing
|
|
2273
|
-
);
|
|
2274
|
-
let next = await gen.next();
|
|
2275
|
-
while (!next.done) {
|
|
2276
|
-
emit({ type: "text", delta: next.value });
|
|
2277
|
-
next = await gen.next();
|
|
2278
|
-
}
|
|
2279
|
-
const result = next.value;
|
|
2280
|
-
const entry = logCompletion(result, step.type, sessionId);
|
|
2281
|
-
emit({ type: "usage", entry });
|
|
2282
|
-
totalCostUsd += entry.costUsd;
|
|
2283
|
-
totalTokens += entry.totalTokens;
|
|
2284
|
-
totalPromptTokens += entry.promptTokens;
|
|
2285
|
-
totalCompletionTokens += entry.completionTokens;
|
|
2286
|
-
stepPrompt += entry.promptTokens;
|
|
2287
|
-
stepCompletion += entry.completionTokens;
|
|
2288
|
-
stepCost += entry.costUsd;
|
|
2289
|
-
calls++;
|
|
2290
|
-
if (result.toolCalls.length && useTools) {
|
|
2291
|
-
messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
|
|
2292
|
-
let finished = false;
|
|
2293
|
-
for (const tc of result.toolCalls) {
|
|
2294
|
-
stepToolCalls++;
|
|
2295
|
-
emit({ type: "tool-call", name: tc.function.name, args: tc.function.arguments });
|
|
2296
|
-
const outcome = executeTool(tc.function.name, tc.function.arguments, toolCtx);
|
|
2297
|
-
emit({ type: "tool-result", name: tc.function.name, result: outcome.result });
|
|
2298
|
-
messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
|
|
2299
|
-
if (outcome.finishSummary != null) {
|
|
2300
|
-
summary = outcome.finishSummary;
|
|
2301
|
-
finished = true;
|
|
2302
|
-
}
|
|
2303
|
-
}
|
|
2304
|
-
if (finished) {
|
|
2305
|
-
finishedBy = "finish-tool";
|
|
2306
|
-
break;
|
|
2307
|
-
}
|
|
2308
|
-
continue;
|
|
2309
|
-
}
|
|
2310
|
-
const textCall = useTools ? parseTextToolCall(result.content) : null;
|
|
2311
|
-
if (textCall) {
|
|
2312
|
-
stepToolCalls++;
|
|
2313
|
-
emit({ type: "tool-call", name: textCall.function.name, args: textCall.function.arguments });
|
|
2314
|
-
const outcome = executeTool(textCall.function.name, textCall.function.arguments, toolCtx);
|
|
2315
|
-
emit({ type: "tool-result", name: textCall.function.name, result: outcome.result });
|
|
2316
|
-
if (outcome.finishSummary != null) {
|
|
2317
|
-
summary = outcome.finishSummary;
|
|
2318
|
-
finishedBy = "finish-tool";
|
|
2319
|
-
break;
|
|
2320
|
-
}
|
|
2321
|
-
messages.push({ role: "assistant", content: result.content });
|
|
2322
|
-
messages.push({
|
|
2323
|
-
role: "user",
|
|
2324
|
-
content: `Tool ${textCall.function.name} returned:
|
|
2325
|
-
${outcome.result}
|
|
2326
|
-
Continue with this step. When the objective is met, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
|
|
2327
|
-
});
|
|
2328
|
-
continue;
|
|
2329
|
-
}
|
|
2330
|
-
summary = result.content || summary;
|
|
2331
|
-
if (summary) finishedBy = "text";
|
|
2332
|
-
break;
|
|
2784
|
+
while (attemptNo < maxAttempts) {
|
|
2785
|
+
const rungDef = ESCALATION_LADDER[Math.min(rung, ESCALATION_LADDER.length - 1)];
|
|
2786
|
+
const rungPolicy = applyRung(deps.policy, rungDef);
|
|
2787
|
+
const attemptStart = Date.now();
|
|
2788
|
+
const before = { ...acc };
|
|
2789
|
+
if (attemptNo === 0) {
|
|
2790
|
+
for (const step of plan.steps) {
|
|
2791
|
+
const res = await runStep(step, rungPolicy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal);
|
|
2792
|
+
if (res.success) completedSteps++;
|
|
2793
|
+
else failedSteps++;
|
|
2333
2794
|
}
|
|
2334
|
-
}
|
|
2335
|
-
|
|
2336
|
-
emit({ type: "error", message: `Step ${step.id} failed: ${err?.message ?? err}` });
|
|
2795
|
+
} else {
|
|
2796
|
+
await runFix(goal, plan, verdict, rungPolicy, rungDef, deps, toolCtx, emit, logUsage);
|
|
2337
2797
|
}
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
|
|
2344
|
-
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
|
|
2798
|
+
if (!verifyOn) {
|
|
2799
|
+
attemptNo++;
|
|
2800
|
+
break;
|
|
2801
|
+
}
|
|
2802
|
+
const verifyPolicy = { ...deps.policy, objective: "quality", tierFloor: rungDef.tierFloor };
|
|
2803
|
+
const verifier = routeOrBest("verify", models, verifyPolicy);
|
|
2804
|
+
if (!verifier) {
|
|
2805
|
+
emit({ type: "error", message: "No model available to verify." });
|
|
2806
|
+
attemptNo++;
|
|
2807
|
+
break;
|
|
2808
|
+
}
|
|
2809
|
+
emit({ type: "verify-start", model: verifier.model.id, attempt: attemptNo + 1 });
|
|
2810
|
+
verdict = await verifyGoal(goal, plan.criteria, { client: client2, model: verifier.model, cwd, allowCommands: deps.allowCommands }, {
|
|
2811
|
+
onToolCall: (name, args) => emit({ type: "tool-call", name, args }),
|
|
2812
|
+
onToolResult: (name, result) => emit({ type: "tool-result", name, result }),
|
|
2813
|
+
onUsage: (r) => logUsage(r, "review")
|
|
2814
|
+
});
|
|
2815
|
+
emit({ type: "verdict", attempt: attemptNo + 1, metCount: verdict.metCount, total: verdict.total, allMet: verdict.allMet, unmet: verdict.unmet });
|
|
2816
|
+
recordAttempt({
|
|
2817
|
+
sessionId: deps.sessionId,
|
|
2818
|
+
attemptNo: attemptNo + 1,
|
|
2819
|
+
goalType: plan.goalType,
|
|
2820
|
+
tierFloor: rungDef.tierFloor ?? null,
|
|
2821
|
+
objective: rungDef.objective,
|
|
2822
|
+
promptTokens: acc.prompt - before.prompt,
|
|
2823
|
+
completionTokens: acc.completion - before.completion,
|
|
2824
|
+
costUsd: acc.cost - before.cost,
|
|
2825
|
+
criteriaTotal: verdict.total,
|
|
2826
|
+
criteriaMet: verdict.metCount,
|
|
2827
|
+
passed: verdict.allMet,
|
|
2828
|
+
durationMs: Date.now() - attemptStart
|
|
2356
2829
|
});
|
|
2357
|
-
|
|
2358
|
-
|
|
2359
|
-
|
|
2830
|
+
attemptNo++;
|
|
2831
|
+
if (verdict.allMet) break;
|
|
2832
|
+
if (attemptNo < maxAttempts) {
|
|
2833
|
+
const next = Math.min(rung + 1, ESCALATION_LADDER.length - 1);
|
|
2834
|
+
rung = next;
|
|
2835
|
+
emit({
|
|
2836
|
+
type: "escalate",
|
|
2837
|
+
toRung: ESCALATION_LADDER[next].label,
|
|
2838
|
+
reason: `${verdict.unmet.length}/${verdict.total} criteria unmet`
|
|
2839
|
+
});
|
|
2840
|
+
}
|
|
2360
2841
|
}
|
|
2361
|
-
|
|
2842
|
+
const passed = verifyOn ? verdict ? verdict.allMet : false : null;
|
|
2843
|
+
finishSession(deps.sessionId, {
|
|
2362
2844
|
plannedSteps: plan.steps.length,
|
|
2363
2845
|
completedSteps,
|
|
2364
2846
|
failedSteps,
|
|
2365
|
-
autoScore: plan.steps.length ? completedSteps / plan.steps.length : null,
|
|
2366
|
-
promptTokens:
|
|
2367
|
-
completionTokens:
|
|
2368
|
-
costUsd:
|
|
2369
|
-
durationMs: Date.now() - sessionStart
|
|
2847
|
+
autoScore: verdict ? verdict.metCount / Math.max(verdict.total, 1) : plan.steps.length ? completedSteps / plan.steps.length : null,
|
|
2848
|
+
promptTokens: acc.prompt,
|
|
2849
|
+
completionTokens: acc.completion,
|
|
2850
|
+
costUsd: acc.cost,
|
|
2851
|
+
durationMs: Date.now() - sessionStart,
|
|
2852
|
+
attempts: attemptNo,
|
|
2853
|
+
finalPassed: passed
|
|
2370
2854
|
});
|
|
2371
|
-
emit({ type: "done", totalCostUsd, totalTokens, calls });
|
|
2372
|
-
return { totalCostUsd, totalTokens, calls };
|
|
2855
|
+
emit({ type: "done", totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed, attempts: attemptNo });
|
|
2856
|
+
return { totalCostUsd: acc.cost, totalTokens: acc.tokens, calls: acc.calls, passed };
|
|
2373
2857
|
}
|
|
2374
|
-
function
|
|
2375
|
-
const
|
|
2376
|
-
|
|
2377
|
-
|
|
2378
|
-
|
|
2858
|
+
/**
 * Execute one planned step: route it to a model, run the tool loop, record the
 * run, and append its summary to the shared list of prior summaries.
 *
 * @param {object} step - Plan step (id, type, description, token estimates).
 * @param {object} policy - Routing policy for this attempt.
 * @param {object} rungDef - Escalation rung, passed through to runToolLoop.
 * @param {object} deps - Agent dependencies (models, client, sessionId, ...).
 * @param {object} toolCtx - Tool execution context.
 * @param {string[]} priorSummaries - Summaries of earlier steps; mutated (appended to).
 * @param {Function} emit - Event sink.
 * @param {Function} logUsage - Usage logger, passed through to runToolLoop.
 * @param {string} goal - Overall goal, included in the system prompt.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runStep(step, policy, rungDef, deps, toolCtx, priorSummaries, emit, logUsage, goal) {
  const estimate = {
    promptTokens: step.estPromptTokens,
    completionTokens: step.estCompletionTokens
  };
  const routed = routeOrBest(step.type, deps.models, policy, estimate);
  if (!routed) {
    emit({ type: "error", message: `No capable model for step ${step.id} (${step.type}).` });
    return { summary: "(no model)", success: false };
  }

  const { model, estCostUsd } = routed;
  emit({ type: "step-start", step, model, estCostUsd });

  const systemPrompt = stepSystemPrompt(goal, step, priorSummaries, model.capabilities.tools);
  const messages = [
    { role: "system", content: systemPrompt },
    { role: "user", content: step.description }
  ];
  const loop = await runToolLoop(model, messages, step.type, rungDef, deps, toolCtx, emit, logUsage);

  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: step.id,
    taskType: step.type,
    skill: TASK_SKILL[step.type],
    model: model.id,
    provider: model.provider,
    iterations: loop.iterations,
    toolCalls: loop.toolCalls,
    promptTokens: loop.prompt,
    completionTokens: loop.completion,
    costUsd: loop.cost,
    finishedBy: loop.finishedBy,
    success: loop.success,
    durationMs: loop.durationMs
  });

  const summary = loop.summary || "(no summary)";
  // Later steps see this line in their system prompt.
  priorSummaries.push(`Step ${step.id} (${step.type}): ${summary}`);
  emit({ type: "step-end", step, summary });
  return { summary, success: loop.success };
}
|
|
2895
|
+
/**
 * Run one fix pass after a failed verification: route an "edit" model, show it
 * the unmet criteria plus the verifier's feedback, and let it work through the
 * tool loop.
 *
 * When no model can be routed, an `error` event is emitted (as runStep does)
 * so the pass does not fail silently.
 *
 * @param {string} goal - Overall goal, included in the system prompt.
 * @param {object} plan - Accepted for call-site compatibility; not read here.
 * @param {object} verdict - Previous verdict; `unmet` and `feedback` are used.
 * @param {object} policy - Routing policy for this attempt.
 * @param {object} rungDef - Escalation rung, passed through to runToolLoop.
 * @param {object} deps - Agent dependencies (models, client, sessionId, ...).
 * @param {object} toolCtx - Tool execution context.
 * @param {Function} emit - Event sink.
 * @param {Function} logUsage - Usage logger, passed through to runToolLoop.
 * @returns {Promise<{summary: string, success: boolean}>}
 */
async function runFix(goal, plan, verdict, policy, rungDef, deps, toolCtx, emit, logUsage) {
  const r = routeOrBest("edit", deps.models, policy);
  if (!r) {
    emit({ type: "error", message: "No capable model for the fix pass (edit)." });
    return { summary: "(no model)", success: false };
  }
  const model = r.model;
  // Synthetic step so the fix pass shows up in step events and step records.
  const fixStep = {
    id: 100,
    type: "edit",
    description: "Fix the unmet acceptance criteria",
    estPromptTokens: 9e3,
    estCompletionTokens: 1500
  };
  emit({ type: "step-start", step: fixStep, model, estCostUsd: r.estCostUsd });
  const unmet = verdict.unmet.map((u, i) => `${i + 1}. ${u.criterion} \u2014 ${u.reason}`).join("\n");
  const messages = [
    {
      role: "system",
      content: `You are the FIX stage of an autonomous coding agent (escalated model). The verify gate found unmet acceptance criteria; resolve them.
Overall goal: ${goal}
You may use the tools (read_file, write_file, list_dir, run_command). Inspect what's there, then make the changes. Call \`finish\` with a one-line summary when all listed criteria should now pass.
If you cannot call tools natively, reply with ONLY one JSON object per turn: {"name":"<tool>","arguments":{...}}`
    },
    {
      role: "user",
      content: `Unmet criteria:
${unmet}

Verifier feedback: ${verdict.feedback}`
    }
  ];
  const loop = await runToolLoop(model, messages, "edit", rungDef, deps, toolCtx, emit, logUsage);
  recordStepRun({
    sessionId: deps.sessionId,
    stepNo: fixStep.id,
    taskType: "edit",
    skill: TASK_SKILL.edit,
    model: model.id,
    provider: model.provider,
    iterations: loop.iterations,
    toolCalls: loop.toolCalls,
    promptTokens: loop.prompt,
    completionTokens: loop.completion,
    costUsd: loop.cost,
    finishedBy: loop.finishedBy,
    success: loop.success,
    durationMs: loop.durationMs
  });
  emit({ type: "step-end", step: fixStep, summary: loop.summary || "(fix pass)" });
  return { summary: loop.summary, success: loop.success };
}
|
|
2941
|
+
/**
 * Drive one model through a chat/tool loop until it finishes, answers in plain
 * text, errors, or exhausts `rungDef.maxIters` iterations.
 *
 * @param {object} model - Routed model (id, pricing, capabilities).
 * @param {object[]} messages - Conversation so far; mutated as the loop proceeds.
 * @param {string} taskTypeForLog - Task type passed to logUsage and used in error text.
 * @param {object} rungDef - Supplies `maxIters` and `maxTokens`.
 * @param {object} deps - Provides `client.stream`.
 * @param {object} toolCtx - Tool execution context.
 * @param {Function} emit - Event sink (text, tool-call, tool-result, error).
 * @param {Function} logUsage - Logs a completion and returns its usage entry.
 * @returns {Promise<object>} Summary, success flag, finishedBy
 *   ("finish-tool" | "text" | "max-iters" | "error"), counters and duration.
 */
async function runToolLoop(model, messages, taskTypeForLog, rungDef, deps, toolCtx, emit, logUsage) {
  const useTools = model.capabilities.tools;
  const startedAt = Date.now();
  let prompt = 0;
  let completion = 0;
  let cost = 0;
  let toolCalls = 0;
  let iterations = 0;
  let summary = "";
  let finishedBy = "max-iters";

  // Count, announce and execute a single tool call, then announce its result.
  const invoke = (name, rawArgs) => {
    toolCalls++;
    emit({ type: "tool-call", name, args: rawArgs });
    const outcome = executeTool(name, rawArgs, toolCtx);
    emit({ type: "tool-result", name, result: outcome.result });
    return outcome;
  };

  try {
    for (let iter = 0; iter < rungDef.maxIters; iter++) {
      iterations = iter + 1;
      const stream = deps.client.stream(
        {
          model: model.id,
          messages,
          tools: useTools ? TOOL_SCHEMAS : void 0,
          temperature: 0.2,
          maxTokens: rungDef.maxTokens
        },
        model.pricing
      );
      // Forward text deltas as they arrive; the generator's return value is the result.
      let chunk = await stream.next();
      while (!chunk.done) {
        emit({ type: "text", delta: chunk.value });
        chunk = await stream.next();
      }
      const result = chunk.value;

      const entry = logUsage(result, taskTypeForLog);
      prompt += entry.promptTokens;
      completion += entry.completionTokens;
      cost += entry.costUsd;

      // Case 1: native tool calls.
      if (result.toolCalls.length && useTools) {
        messages.push({ role: "assistant", content: result.content, tool_calls: result.toolCalls });
        let finished = false;
        for (const tc of result.toolCalls) {
          const outcome = invoke(tc.function.name, tc.function.arguments);
          messages.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content: outcome.result });
          if (outcome.finishSummary != null) {
            summary = outcome.finishSummary;
            finished = true;
          }
        }
        if (finished) {
          finishedBy = "finish-tool";
          break;
        }
        continue;
      }

      // Case 2: a tool call written as JSON text.
      const textCall = useTools ? parseTextToolCall(result.content) : null;
      if (textCall) {
        const outcome = invoke(textCall.function.name, textCall.function.arguments);
        if (outcome.finishSummary != null) {
          summary = outcome.finishSummary;
          finishedBy = "finish-tool";
          break;
        }
        messages.push({ role: "assistant", content: result.content });
        messages.push({
          role: "user",
          content: `Tool ${textCall.function.name} returned:\n${outcome.result}\nContinue. When done, reply with ONLY {"name":"finish","arguments":{"summary":"<one line>"}}.`
        });
        continue;
      }

      // Case 3: plain text ends the loop; empty text leaves finishedBy untouched.
      summary = result.content || summary;
      if (summary) finishedBy = "text";
      break;
    }
  } catch (err) {
    finishedBy = "error";
    emit({ type: "error", message: `${taskTypeForLog} failed: ${err?.message ?? err}` });
  }

  const success = finishedBy === "finish-tool" || finishedBy === "text";
  return {
    summary,
    success,
    finishedBy,
    iterations,
    toolCalls,
    prompt,
    completion,
    cost,
    durationMs: Date.now() - startedAt
  };
}
|
|
2380
3024
|
function stepSystemPrompt(goal, step, priorSummaries, useTools) {
|
|
2381
3025
|
const context = priorSummaries.length ? `
|
|
@@ -2389,7 +3033,7 @@ Return a concise result for this step. Do not ask the user questions.`;
|
|
|
2389
3033
|
return `You are the "${step.type}" stage of an autonomous coding agent.
|
|
2390
3034
|
Overall goal: ${goal}
|
|
2391
3035
|
Your current step: ${step.description}${context}${toolNote}
|
|
2392
|
-
Be efficient \u2014 you were selected as the
|
|
3036
|
+
Be efficient \u2014 you were selected as the most cost-effective capable model for this step.`;
|
|
2393
3037
|
}
|
|
2394
3038
|
|
|
2395
3039
|
// src/tui/App.tsx
|
|
@@ -2405,6 +3049,8 @@ function App(props) {
|
|
|
2405
3049
|
const [tok, setTok] = useState(0);
|
|
2406
3050
|
const [calls, setCalls] = useState(0);
|
|
2407
3051
|
const [rated, setRated] = useState(null);
|
|
3052
|
+
const [passed, setPassed] = useState(null);
|
|
3053
|
+
const [attempts, setAttempts] = useState(0);
|
|
2408
3054
|
const push = useCallback((text, color) => {
|
|
2409
3055
|
setLog((l) => [...l, { key: l.length, text, color }]);
|
|
2410
3056
|
}, []);
|
|
@@ -2422,7 +3068,9 @@ function App(props) {
|
|
|
2422
3068
|
sessionId: props.sessionId,
|
|
2423
3069
|
cwd: props.cwd,
|
|
2424
3070
|
allowWrite: props.allowWrite,
|
|
2425
|
-
allowCommands: props.allowCommands
|
|
3071
|
+
allowCommands: props.allowCommands,
|
|
3072
|
+
verify: props.verify,
|
|
3073
|
+
maxAttempts: props.maxAttempts
|
|
2426
3074
|
};
|
|
2427
3075
|
let textBuf = "";
|
|
2428
3076
|
const flush = () => {
|
|
@@ -2434,6 +3082,24 @@ function App(props) {
|
|
|
2434
3082
|
case "plan":
|
|
2435
3083
|
push(`\u{1F4CB} Plan (${e.plan.steps.length} steps) \xB7 planner: ${e.planModel}`, "cyan");
|
|
2436
3084
|
break;
|
|
3085
|
+
case "criteria":
|
|
3086
|
+
push(`\u{1F3AF} ${e.goalType} \xB7 ${e.criteria.length} criteria \xB7 start: ${e.startTier}${e.learned ? " (learned)" : ""}`, "cyan");
|
|
3087
|
+
e.criteria.forEach((cr, i) => push(` ${i + 1}. ${cr}`, "gray"));
|
|
3088
|
+
break;
|
|
3089
|
+
case "verify-start":
|
|
3090
|
+
flush();
|
|
3091
|
+
push(`\u{1F50D} Verify (attempt ${e.attempt}) \u2192 ${e.model}`, "cyan");
|
|
3092
|
+
break;
|
|
3093
|
+
case "verdict":
|
|
3094
|
+
flush();
|
|
3095
|
+
push(
|
|
3096
|
+
`${e.allMet ? "\u2705" : "\u274C"} ${e.metCount}/${e.total} criteria met` + (e.unmet.length ? " \u2014 unmet: " + e.unmet.map((u) => u.criterion).join("; ").slice(0, 100) : ""),
|
|
3097
|
+
e.allMet ? "green" : "red"
|
|
3098
|
+
);
|
|
3099
|
+
break;
|
|
3100
|
+
case "escalate":
|
|
3101
|
+
push(`\u23EB Escalate \u2192 ${e.toRung} (${e.reason})`, "magenta");
|
|
3102
|
+
break;
|
|
2437
3103
|
case "step-start":
|
|
2438
3104
|
flush();
|
|
2439
3105
|
push(`\u25B6 Step ${e.step.id} [${e.step.type}] \u2192 ${e.model.id} ~${usd(e.estCostUsd)}`, "yellow");
|
|
@@ -2463,6 +3129,8 @@ function App(props) {
|
|
|
2463
3129
|
break;
|
|
2464
3130
|
case "done":
|
|
2465
3131
|
flush();
|
|
3132
|
+
setPassed(e.passed);
|
|
3133
|
+
setAttempts(e.attempts);
|
|
2466
3134
|
break;
|
|
2467
3135
|
}
|
|
2468
3136
|
};
|
|
@@ -2522,10 +3190,14 @@ function App(props) {
|
|
|
2522
3190
|
" working\u2026"
|
|
2523
3191
|
] }),
|
|
2524
3192
|
phase === "rate" && /* @__PURE__ */ jsxs(Text, { children: [
|
|
2525
|
-
/* @__PURE__ */ jsxs(Text, { color: "green", children: [
|
|
2526
|
-
"\
|
|
3193
|
+
/* @__PURE__ */ jsxs(Text, { color: passed === false ? "yellow" : "green", children: [
|
|
3194
|
+
passed === false ? "\u26A0 goal not fully met" : "\u2713 goal met",
|
|
3195
|
+
" \xB7 ",
|
|
3196
|
+
attempts,
|
|
3197
|
+
" attempt(s) \xB7 ",
|
|
2527
3198
|
calls,
|
|
2528
|
-
" calls \xB7
|
|
3199
|
+
" calls \xB7",
|
|
3200
|
+
" ",
|
|
2529
3201
|
tokens(tok),
|
|
2530
3202
|
" tokens \xB7 ",
|
|
2531
3203
|
usd(cost)
|
|
@@ -2609,8 +3281,9 @@ function truncate2(s, n) {
|
|
|
2609
3281
|
}
|
|
2610
3282
|
|
|
2611
3283
|
// src/index.ts
|
|
3284
|
+
// CLI version string; reported by `--version` and passed to the update command.
var VERSION = "0.5.0";
var program = new Command();
program
  .name("poly")
  .description("Polymath \u2014 cost-optimized, multi-model TUI coding agent")
  .version(VERSION);
|
|
2614
3287
|
function client(config) {
|
|
2615
3288
|
return new OpenRouterClient({
|
|
2616
3289
|
apiKey: resolveApiKey(config),
|
|
@@ -2681,10 +3354,18 @@ async function loadCatalog(config, refresh = false) {
|
|
|
2681
3354
|
}
|
|
2682
3355
|
return models;
|
|
2683
3356
|
}
|
|
3357
|
+
// `poly setup`: first-run setup, optionally installing a local LLM.
program
  .command("setup")
  .description("First-run setup: optionally install a local LLM (Ollama) and connect models")
  .option("--local", "install a local LLM (Ollama) \u2014 skips the prompt")
  .option("--no-local", "skip the local LLM \u2014 skips the prompt")
  .option("-m, --model <id>", "local model to pull (e.g. qwen2.5-coder:7b)")
  .option("-y, --yes", "accept defaults / auto-install without prompts", false)
  .action(async (opts) => {
    // Tri-state read straight from argv: true, false, or undefined when neither
    // flag was given.
    const argv = process.argv;
    let local;
    if (argv.includes("--local")) {
      local = true;
    } else if (argv.includes("--no-local")) {
      local = false;
    }
    await runSetup({ local, model: opts.model, yes: Boolean(opts.yes) });
  });

// `poly update`: update the CLI, the Ollama runtime, and local models.
program
  .command("update")
  .description("Update Polymath, the Ollama runtime, and local models")
  .option("--check", "report available updates without installing", false)
  .option("--self", "only the Polymath CLI", false)
  .option("--ollama", "only the Ollama runtime", false)
  .option("--models", "only the local models", false)
  .action(async (opts) => {
    const selection = {
      check: Boolean(opts.check),
      self: Boolean(opts.self),
      ollama: Boolean(opts.ollama),
      models: Boolean(opts.models)
    };
    await runUpdate(VERSION, selection);
  });
|
|
2684
3365
|
program.command("login").description("Connect Polymath to OpenRouter (set/replace your API key)").action(async () => {
|
|
2685
3366
|
await runLogin();
|
|
2686
3367
|
});
|
|
2687
|
-
program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).action(async (goalParts, opts) => {
|
|
3368
|
+
program.command("run", { isDefault: true }).description("Launch the interactive agent (TUI)").argument("[goal...]", "what to do (optional; prompts if omitted)").option("-o, --objective <name>", "routing objective: cheapest | value | quality").option("--max-cost <usd>", "exclude models whose projected per-call cost exceeds this").option("-w, --write", "allow the agent to write files (confined to --cwd)", false).option("-x, --commands", "DANGER: let the model run arbitrary shell commands in --cwd", false).option("-C, --cwd <dir>", "working directory", process.cwd()).option("--no-verify", "skip the verify-and-escalate loop (single pass)").option("--max-attempts <n>", "max code\u2192verify\u2192escalate attempts until goals met", "3").action(async (goalParts, opts) => {
|
|
2688
3369
|
const startedAt = Date.now();
|
|
2689
3370
|
const config = loadConfig();
|
|
2690
3371
|
if (!config.local.enabled || resolveApiKey(config)) {
|
|
@@ -2709,6 +3390,8 @@ program.command("run", { isDefault: true }).description("Launch the interactive
|
|
|
2709
3390
|
allowWrite: !!opts.write,
|
|
2710
3391
|
allowCommands: !!opts.commands,
|
|
2711
3392
|
objectiveLabel: policy.objective,
|
|
3393
|
+
verify: opts.verify !== false,
|
|
3394
|
+
maxAttempts: Math.max(1, parseInt(opts.maxAttempts, 10) || 3),
|
|
2712
3395
|
initialGoal: goal
|
|
2713
3396
|
})
|
|
2714
3397
|
);
|
|
@@ -2888,3 +3571,6 @@ program.parseAsync().catch((err) => {
|
|
|
2888
3571
|
console.error(c.red(err?.message ?? String(err)));
|
|
2889
3572
|
process.exit(1);
|
|
2890
3573
|
});
|
|
3574
|
+
export {
|
|
3575
|
+
VERSION
|
|
3576
|
+
};
|