@caupulican/pi-adaptative 0.80.29 → 0.80.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,906 @@
1
+ import { createHash } from "node:crypto";
2
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
3
+ import { dirname, join } from "node:path";
4
+ import { StringEnum } from "@caupulican/pi-ai";
5
+ import { Type } from "typebox";
6
+ import { getAgentDir } from "../config.js";
7
+ import { defineTool } from "./extensions/types.js";
8
/** Leading token that marks a line of command output as a metric report. */
const METRIC_LINE_PREFIX = "METRIC";

/** Metric names that are rejected even though they match METRIC_NAME_RE. */
const DENIED_METRIC_NAMES = new Set(["__proto__", "constructor", "prototype"]);

/** Accepted metric names: one or more word characters, dots, or the micro sign. */
const METRIC_NAME_RE = /^[\w.µ]+$/u;

/** Plain decimal literal with optional sign, fraction, and exponent (no hex, no Infinity/NaN). */
const DECIMAL_NUMBER_RE = /^[+-]?(?:(?:\d+(?:\.\d*)?)|(?:\.\d+))(?:[eE][+-]?\d+)?$/;
12
/**
 * Pass a finite number through unchanged; map everything else to null.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number | null} The value when it is a finite number, otherwise null.
 */
function normalizeFiniteNumber(value) {
    const isUsable = typeof value === "number" && Number.isFinite(value);
    return isUsable ? value : null;
}
17
/**
 * Accept a finite, non-negative number; otherwise substitute the fallback.
 *
 * @param {unknown} value - Candidate threshold value.
 * @param {number} fallback - Returned when value is not a finite number >= 0.
 * @returns {number} The accepted value or the fallback.
 */
function normalizeNonNegative(value, fallback) {
    if (typeof value === "number" && Number.isFinite(value) && value >= 0) {
        return value;
    }
    return fallback;
}
22
/**
 * Coerce a direction value to "higher" or "lower"; anything other than
 * the exact string "higher" becomes "lower".
 *
 * @param {unknown} direction - Requested optimization direction.
 * @returns {"higher" | "lower"} Normalized direction.
 */
function normalizeDirection(direction) {
    if (direction === "higher") {
        return "higher";
    }
    return "lower";
}
25
/**
 * Extract `METRIC name=value` lines from command output.
 *
 * A line counts only when, after trimming, it starts with the metric prefix
 * followed by a space, contains `=` after a non-empty name, the name matches
 * METRIC_NAME_RE and is not denied, and the value is a finite decimal literal.
 * A later line with the same name overwrites an earlier one.
 *
 * @param {string} output - Raw command output (LF or CRLF separated).
 * @returns {Map<string, number>} Parsed metrics keyed by name.
 */
export function parseMetricLines(output) {
    const marker = `${METRIC_LINE_PREFIX} `;
    const parsed = new Map();
    output.split(/\r?\n/).forEach((rawLine) => {
        const text = rawLine.trim();
        if (!text.startsWith(marker)) {
            return;
        }
        const assignment = text.slice(METRIC_LINE_PREFIX.length).trim();
        const separatorIndex = assignment.indexOf("=");
        // An index of 0 means the name is empty; -1 means there is no "=".
        if (separatorIndex <= 0) {
            return;
        }
        const metricName = assignment.slice(0, separatorIndex);
        const valueText = assignment.slice(separatorIndex + 1).trim();
        const nameAccepted = METRIC_NAME_RE.test(metricName) && !DENIED_METRIC_NAMES.has(metricName);
        if (!nameAccepted || !DECIMAL_NUMBER_RE.test(valueText)) {
            return;
        }
        const numeric = Number(valueText);
        // Literals such as 1e999 match the pattern but overflow to Infinity.
        if (Number.isFinite(numeric)) {
            parsed.set(metricName, numeric);
        }
    });
    return parsed;
}
47
/**
 * Parse metric lines from output and return them as a plain object.
 *
 * @param {string} output - Raw command output.
 * @returns {Record<string, number>} Metric values keyed by name.
 */
export function metricMapFromOutput(output) {
    const metricsByName = parseMetricLines(output);
    return Object.fromEntries(metricsByName.entries());
}
50
/**
 * Look up the primary metric in a Map or plain object of metrics.
 *
 * @param {Map<string, number> | Record<string, number>} metrics - Parsed metrics.
 * @param {string} metricName - Name of the primary metric.
 * @returns {number | null} The finite metric value, or null when absent or not a finite number.
 */
export function selectPrimaryMetric(metrics, metricName) {
    let candidate;
    if (metrics instanceof Map) {
        candidate = metrics.get(metricName);
    }
    else {
        candidate = metrics[metricName];
    }
    return normalizeFiniteNumber(candidate);
}
54
/**
 * Compare a candidate metric against the best known value.
 *
 * `delta` is positive when the candidate is better in the given direction.
 * Without a finite best value the candidate counts as improved, with an
 * infinite delta and no relative delta.
 *
 * @param {number} current - Candidate metric value.
 * @param {number | null | undefined} best - Best metric so far, if any.
 * @param {"lower" | "higher"} [direction="lower"] - Which way is better.
 * @returns {{current: number, best: number | null, direction: string, delta: number, relativeDelta: number | null, improved: boolean}}
 */
export function compareMetric(current, best, direction = "lower") {
    const reference = normalizeFiniteNumber(best);
    if (reference === null) {
        return {
            current,
            best: null,
            direction,
            delta: Number.POSITIVE_INFINITY,
            relativeDelta: null,
            improved: true,
        };
    }
    const gain = direction === "lower" ? reference - current : current - reference;
    // A relative delta is undefined when the reference is zero.
    let relativeGain = null;
    if (reference !== 0) {
        relativeGain = gain / Math.abs(reference);
    }
    return {
        current,
        best: reference,
        direction,
        delta: gain,
        relativeDelta: relativeGain,
        improved: gain > 0,
    };
}
77
/**
 * Median of the finite numbers in `values`; non-finite entries are ignored.
 *
 * @param {unknown[]} values - Sample values.
 * @returns {number | null} The median, or null when no finite numbers remain.
 */
export function median(values) {
    const ordered = values.filter((value) => Number.isFinite(value));
    ordered.sort((left, right) => left - right);
    const count = ordered.length;
    if (count === 0) {
        return null;
    }
    const upper = Math.floor(count / 2);
    if (count % 2 === 1) {
        return ordered[upper];
    }
    // Even count: average the two middle samples.
    return (ordered[upper - 1] + ordered[upper]) / 2;
}
84
/**
 * Median absolute deviation (MAD) of the finite numbers in `values`.
 *
 * @param {unknown[]} values - Sample values; non-finite entries are ignored.
 * @returns {number | null} The MAD, or null when there are no finite samples.
 */
export function medianAbsoluteDeviation(values) {
    const center = median(values);
    if (center === null) {
        return null;
    }
    const deviations = [];
    for (const value of values) {
        if (Number.isFinite(value)) {
            deviations.push(Math.abs(value - center));
        }
    }
    return median(deviations);
}
90
/**
 * Express an improvement as a multiple of the MAD noise floor of the samples.
 *
 * `value` stays null when fewer than three finite samples exist or when the
 * MAD is null or zero.
 *
 * @param {number} improvementDelta - Improvement size; its absolute value is used.
 * @param {unknown[]} samples - Metric samples used to estimate noise.
 * @returns {{mode: "mad", value: number | null, noiseFloor: number | null, sampleCount: number}}
 */
export function computeMadConfidence(improvementDelta, samples) {
    const usable = samples.filter((sample) => Number.isFinite(sample));
    const result = { mode: "mad", value: null, noiseFloor: null, sampleCount: usable.length };
    if (usable.length < 3) {
        return result;
    }
    const noiseFloor = medianAbsoluteDeviation(usable);
    result.noiseFloor = noiseFloor;
    if (noiseFloor !== null && noiseFloor !== 0) {
        result.value = Math.abs(improvementDelta) / noiseFloor;
    }
    return result;
}
106
/**
 * Decide keep / discard / retry / blocked for a measured candidate.
 *
 * Gates are applied in order: failed checks, missing or invalid metric,
 * baseline (no best yet), not better than best, below minDelta, below
 * minRelativeDelta, then the optional MAD confidence gate.
 *
 * @param {object} input - Decision inputs.
 * @param {unknown} input.currentMetric - Candidate metric value.
 * @param {unknown} [input.bestMetric] - Best kept metric so far.
 * @param {string} [input.direction] - "higher" or anything else for "lower".
 * @param {boolean | null} [input.checksPass] - Exactly false forces discard.
 * @param {number} [input.minDelta] - Minimum absolute improvement.
 * @param {number} [input.minRelativeDelta] - Minimum relative improvement.
 * @param {string} [input.confidenceMode] - Defaults to "mad" when minConfidence > 0, else "none".
 * @param {number} [input.minConfidence] - Minimum multiple of the MAD noise floor.
 * @param {string} [input.lowConfidenceAction] - Decision used when confidence is missing or low; default "retry".
 * @param {number[]} [input.noiseMetrics] - Prior samples; the current metric is appended before computing MAD.
 * @returns {{decision: string, reason: string, comparison: object | null, confidence: object}}
 */
export function decideImprovement(input) {
    const direction = normalizeDirection(input.direction);
    const minDelta = normalizeNonNegative(input.minDelta, 0);
    const minRelativeDelta = normalizeNonNegative(input.minRelativeDelta, 0);
    const minConfidence = normalizeNonNegative(input.minConfidence, 0);
    const confidenceMode = input.confidenceMode ?? (minConfidence > 0 ? "mad" : "none");
    const lowConfidenceAction = input.lowConfidenceAction ?? "retry";
    const noConfidence = { mode: confidenceMode, value: null, noiseFloor: null, sampleCount: 0 };
    const verdict = (decision, reason, comparison, confidence = noConfidence) => ({
        decision,
        reason,
        comparison,
        confidence,
    });

    if (input.checksPass === false) {
        return verdict("discard", "checks_failed", null);
    }
    const current = normalizeFiniteNumber(input.currentMetric);
    if (current === null) {
        const reason = input.currentMetric == null ? "metric_missing" : "metric_invalid";
        return verdict("blocked", reason, null);
    }
    const best = normalizeFiniteNumber(input.bestMetric);
    const comparison = compareMetric(current, best, direction);
    if (best === null) {
        return verdict("keep", "baseline", comparison);
    }
    if (!comparison.improved) {
        return verdict("discard", "not_better_than_best", comparison);
    }
    if (comparison.delta < minDelta) {
        return verdict("discard", "below_min_delta", comparison);
    }
    if (comparison.relativeDelta !== null && comparison.relativeDelta < minRelativeDelta) {
        return verdict("discard", "below_min_relative_delta", comparison);
    }

    const confidenceRequired = confidenceMode === "mad" && minConfidence > 0;
    if (!confidenceRequired) {
        return verdict("keep", "metric_improved", comparison);
    }
    const samples = [...(input.noiseMetrics ?? []), current];
    const confidence = computeMadConfidence(comparison.delta, samples);
    if (confidence.value === null) {
        return verdict(lowConfidenceAction, "insufficient_noise_evidence", comparison, confidence);
    }
    if (confidence.value < minConfidence) {
        return verdict(lowConfidenceAction, "below_confidence", comparison, confidence);
    }
    return verdict("keep", "metric_improved", comparison, confidence);
}
163
/**
 * Serialize a run record as a single-line JSON string.
 *
 * @param {unknown} record - Record to serialize.
 * @returns {string} JSON text as produced by JSON.stringify.
 */
export function serializeRunRecord(record) {
    const json = JSON.stringify(record);
    return json;
}
166
/**
 * Append one run record as a JSON line, creating parent directories first.
 *
 * @param {string} filePath - JSONL file to append to.
 * @param {unknown} record - Record to serialize and append.
 * @returns {Promise<void>}
 */
export async function appendRunRecord(filePath, record) {
    const parentDir = dirname(filePath);
    await mkdir(parentDir, { recursive: true });
    const line = `${serializeRunRecord(record)}\n`;
    await writeFile(filePath, line, { flag: "a" });
}
170
/**
 * Read every record from a JSONL file.
 *
 * Blank lines are skipped. A missing file yields an empty list.
 *
 * @param {string} filePath - JSONL file to read.
 * @returns {Promise<unknown[]>} Parsed records in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON. The message
 *   names the file and 1-based line number; the parser error is kept as `cause`.
 * @throws Any read error other than ENOENT is rethrown unchanged.
 */
export async function readRunRecords(filePath) {
    let text;
    try {
        text = await readFile(filePath, "utf8");
    }
    catch (error) {
        if (error.code === "ENOENT") {
            return [];
        }
        throw error;
    }
    const records = [];
    for (const [index, line] of text.split("\n").entries()) {
        if (!line.trim()) {
            continue;
        }
        try {
            records.push(JSON.parse(line));
        }
        catch (error) {
            // A bare SyntaxError gives no hint which file or line is damaged
            // (for example a torn line after an interrupted append).
            throw new SyntaxError(`Invalid JSON in ${filePath} at line ${index + 1}: ${error.message}`, { cause: error });
        }
    }
    return records;
}
189
/**
 * Turn a caller-supplied id into a single safe path segment.
 *
 * Runs of characters outside `[a-zA-Z0-9._-]` become "-", and leading and
 * trailing dashes are stripped. An empty result falls back to "default".
 * Ids made only of dots (".", "..") also fall back to "default": the result is
 * joined into file-system paths, where those segments would refer to the
 * current or parent directory.
 *
 * @param {string | null | undefined} loopId - Requested id.
 * @returns {string} Sanitized id, never empty and never dots-only.
 */
function normalizeLoopId(loopId) {
    const normalized = (loopId ?? "default")
        .trim()
        .replace(/[^a-zA-Z0-9._-]+/g, "-")
        .replace(/^-+|-+$/g, "");
    if (normalized === "" || /^\.+$/.test(normalized)) {
        return "default";
    }
    return normalized;
}
196
/**
 * Derive a workspace key from a working directory path.
 *
 * @param {string} cwd - Workspace directory path.
 * @returns {string} First 16 hex characters of the SHA-256 digest of `cwd`.
 */
function workspaceKeyFor(cwd) {
    const hasher = createHash("sha256");
    hasher.update(cwd);
    return hasher.digest("hex").substring(0, 16);
}
199
/**
 * Compute the file-system locations used by one improvement loop.
 *
 * Everything lives under `<agentDir>/improvement-loop/workspaces/<workspaceKey>`,
 * where the key is derived from `input.cwd`.
 *
 * @param {object} input - Path inputs.
 * @param {string} input.cwd - Workspace directory the loop belongs to.
 * @param {string} [input.agentDir] - Overrides the directory from getAgentDir().
 * @param {string} [input.loopId] - Loop id; normalized before use.
 * @returns {{rootDir: string, workspaceDir: string, logPath: string, sandboxDir: string, artifactDir: string, workspaceKey: string, loopId: string}}
 */
export function improvementLoopPaths(input) {
    const agentDir = input.agentDir ?? getAgentDir();
    const rootDir = join(agentDir, "improvement-loop");
    const workspaceKey = workspaceKeyFor(input.cwd);
    const loopId = normalizeLoopId(input.loopId);
    const workspaceDir = join(rootDir, "workspaces", workspaceKey);
    const logPath = join(workspaceDir, `${loopId}.jsonl`);
    const sandboxDir = join(workspaceDir, "sandboxes", loopId);
    const artifactDir = join(workspaceDir, "artifacts", loopId);
    return { rootDir, workspaceDir, logPath, sandboxDir, artifactDir, workspaceKey, loopId };
}
214
/**
 * Read a run's primary metric as a finite number.
 *
 * @param {{metric?: unknown}} run - Run record.
 * @returns {number | null} The metric, or null when it is not a finite number.
 */
function finiteRunMetric(run) {
    const { metric } = run;
    return normalizeFiniteNumber(metric);
}
217
/**
 * Pick the better of two runs for the given direction.
 *
 * With no incumbent the challenger wins. When either metric is not finite
 * the incumbent is kept; otherwise the challenger must strictly improve.
 *
 * @param {object} current - Challenger run.
 * @param {object | null} best - Incumbent run, if any.
 * @param {"lower" | "higher"} direction - Which way is better.
 * @returns {object} The winning run.
 */
function betterRun(current, best, direction) {
    if (!best) {
        return current;
    }
    const challenger = finiteRunMetric(current);
    const incumbent = finiteRunMetric(best);
    const comparable = challenger !== null && incumbent !== null;
    if (comparable && compareMetric(challenger, incumbent, direction).improved) {
        return current;
    }
    return best;
}
226
/**
 * Build the persisted loop configuration from init input, applying defaults.
 *
 * @param {object} input - Init parameters (objective, metric policy, cwd).
 * @param {string} loopId - Already-normalized loop id.
 * @returns {object} Configuration record, with `createdAt` set to the current time.
 */
function configFromInit(input, loopId) {
    const minConfidence = normalizeNonNegative(input.minConfidence, 0);
    // A confidence threshold implies the MAD policy unless a mode was given.
    const defaultConfidenceMode = minConfidence > 0 ? "mad" : "none";
    return {
        loopId,
        objective: input.objective,
        metricName: input.metricName,
        metricUnit: input.metricUnit ?? "",
        direction: normalizeDirection(input.direction),
        minDelta: normalizeNonNegative(input.minDelta, 0),
        minRelativeDelta: normalizeNonNegative(input.minRelativeDelta, 0),
        confidenceMode: input.confidenceMode ?? defaultConfidenceMode,
        minConfidence,
        lowConfidenceAction: input.lowConfidenceAction ?? "retry",
        createdAt: Date.now(),
        cwd: input.cwd,
    };
}
242
/**
 * Read all entries from a loop's JSONL log.
 *
 * Blank lines are skipped. A missing log yields an empty list.
 *
 * @param {string} logPath - Path of the loop log.
 * @returns {Promise<object[]>} Parsed entries in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON. The message
 *   names the log and 1-based line number; the parser error is kept as `cause`.
 * @throws Any read error other than ENOENT is rethrown unchanged.
 */
async function readLoopLogEntries(logPath) {
    let text;
    try {
        text = await readFile(logPath, "utf8");
    }
    catch (error) {
        if (error.code === "ENOENT") {
            return [];
        }
        throw error;
    }
    const entries = [];
    for (const [index, line] of text.split("\n").entries()) {
        if (line.trim().length === 0) {
            continue;
        }
        try {
            entries.push(JSON.parse(line));
        }
        catch (error) {
            // Name the damaged log and line so a corrupted state file can be
            // located and repaired.
            throw new SyntaxError(`Invalid JSON in ${logPath} at line ${index + 1}: ${error.message}`, { cause: error });
        }
    }
    return entries;
}
257
/**
 * Rebuild loop state by replaying log entries in order.
 *
 * A "config" entry starts a fresh epoch and drops earlier runs and sandboxes.
 * "run" entries accumulate. "sandbox" entries are keyed by sandboxId, so a
 * later record for the same id replaces the earlier one.
 *
 * @param {object[]} entries - Parsed log entries.
 * @param {string} logPath - Log path, echoed in the returned state.
 * @returns {object | null} Reconstructed state, or null when no config entry was seen.
 */
function reconstructImprovementLoopState(entries, logPath) {
    let config = null;
    let runs = [];
    const sandboxById = new Map();
    for (const entry of entries) {
        const { type: kind, ...payload } = entry;
        switch (kind) {
            case "config":
                config = payload;
                runs = [];
                sandboxById.clear();
                break;
            case "run":
                runs.push(payload);
                break;
            case "sandbox":
                sandboxById.set(payload.sandboxId, payload);
                break;
            default:
                break;
        }
    }
    if (!config) {
        return null;
    }
    // The baseline is the first recorded run, whatever its decision.
    const baselineMetric = finiteRunMetric(runs[0] ?? {});
    // Only kept runs with a finite metric can be the best run.
    const bestRun = runs
        .filter((run) => run.decision === "keep" && finiteRunMetric(run) !== null)
        .reduce((best, run) => betterRun(run, best, config.direction), null);
    const sandboxes = Array.from(sandboxById.values()).sort((a, b) => a.createdAt - b.createdAt);
    // The active sandbox is the most recently created one still marked active.
    let activeSandbox = null;
    for (let i = sandboxes.length - 1; i >= 0; i -= 1) {
        if (sandboxes[i].status === "active") {
            activeSandbox = sandboxes[i];
            break;
        }
    }
    return {
        config,
        runs,
        sandboxes,
        activeSandbox,
        baselineMetric,
        bestMetric: bestRun ? finiteRunMetric(bestRun) : null,
        bestRunId: bestRun?.runId ?? null,
        lastDecision: null,
        logPath,
    };
}
302
/**
 * Create (or, with `reset`, replace) the log for an improvement loop.
 *
 * The log is overwritten with a single config entry. Without `reset`, an
 * existing non-empty log is left untouched and an error is thrown.
 *
 * With `reset`, the existing log is not read at all, so a log that can no
 * longer be parsed can still be replaced.
 *
 * @param {object} input - Init parameters: cwd, loopId, objective, metric policy, reset.
 * @returns {Promise<object>} Fresh loop state containing only the new config.
 * @throws {Error} When the loop already exists and `reset` is not set.
 */
export async function initImprovementLoop(input) {
    const paths = improvementLoopPaths(input);
    if (!input.reset) {
        const existing = await readLoopLogEntries(paths.logPath);
        if (existing.length > 0) {
            throw new Error(`Improvement loop already exists at ${paths.logPath}; pass reset=true to replace user-level loop state.`);
        }
    }
    const config = configFromInit(input, paths.loopId);
    const configEntry = { type: "config", ...config };
    await mkdir(dirname(paths.logPath), { recursive: true });
    // No append flag: this truncates any previous log contents.
    await writeFile(paths.logPath, `${JSON.stringify(configEntry)}\n`);
    const state = reconstructImprovementLoopState([configEntry], paths.logPath);
    if (!state) {
        throw new Error("Failed to initialize improvement loop state");
    }
    return state;
}
316
/**
 * Load the current state of an improvement loop from its log.
 *
 * @param {object} input - Identifies the loop (cwd, optional loopId and agentDir).
 * @returns {Promise<object | null>} Reconstructed state, or null when the loop has no config entry.
 */
export async function readImprovementLoopState(input) {
    const { logPath } = improvementLoopPaths(input);
    const entries = await readLoopLogEntries(logPath);
    return reconstructImprovementLoopState(entries, logPath);
}
320
/**
 * Choose the id for a new sandbox.
 *
 * A caller-supplied id is normalized and used as-is. Otherwise the id is the
 * current timestamp plus 8 hex characters of a hash over cwd, loop id, time,
 * and a random number.
 *
 * @param {object} input - Sandbox creation input (sandboxId, cwd, loopId).
 * @returns {string} Sandbox id.
 */
function createSandboxId(input) {
    const requested = input.sandboxId;
    if (requested) {
        return normalizeLoopId(requested);
    }
    const loopLabel = input.loopId ?? "default";
    const seed = `${input.cwd}:${loopLabel}:${Date.now()}:${Math.random()}`;
    const suffix = createHash("sha256").update(seed).digest("hex").slice(0, 8);
    return `${Date.now()}-${suffix}`;
}
326
/**
 * Append a sandbox record to the loop log and return the refreshed state.
 *
 * @param {object} input - Identifies the loop (cwd, optional loopId and agentDir).
 * @param {object} sandbox - Sandbox record to persist.
 * @returns {Promise<object>} State re-read from the log after the append.
 * @throws {Error} When the state cannot be reconstructed after the append.
 */
async function appendSandboxRecord(input, sandbox) {
    const { logPath } = improvementLoopPaths(input);
    await mkdir(dirname(logPath), { recursive: true });
    const entry = { type: "sandbox", ...sandbox };
    await writeFile(logPath, `${JSON.stringify(entry)}\n`, { flag: "a" });
    const refreshed = await readImprovementLoopState(input);
    if (refreshed) {
        return refreshed;
    }
    throw new Error("Failed to read improvement loop after sandbox update");
}
335
/**
 * Run a git subcommand through the injected exec function.
 *
 * @param {Function} exec - Called as exec(command, args, options).
 * @param {string} cwd - Directory in which git runs.
 * @param {string[]} args - Arguments passed to git.
 * @param {AbortSignal} [signal] - Forwarded to exec.
 * @returns {Promise<object>} Whatever exec resolves to.
 */
async function runGit(exec, cwd, args, signal) {
    // Fixed limits: 60 s timeout and a 64 KiB output buffer.
    const options = { cwd, timeout: 60_000, signal, maxBuffer: 64 * 1024 };
    return exec("git", args, options);
}
338
/**
 * Create a detached git worktree as a disposable sandbox and record it.
 *
 * Fails when the loop is not initialized, when an active sandbox already
 * exists, when `git status` fails, or when the repository is dirty and
 * `allowDirtyRepo` is not set.
 *
 * @param {object} input - Sandbox creation input.
 * @param {Function} input.exec - Command runner used for git.
 * @param {string} input.cwd - Repository directory.
 * @param {string} [input.baseRef] - Commit-ish to check out; defaults to "HEAD".
 * @param {string} [input.sandboxId] - Explicit sandbox id.
 * @param {boolean} [input.allowDirtyRepo] - Allow creation from a dirty repository.
 * @param {AbortSignal} [input.signal] - Forwarded to git invocations.
 * @returns {Promise<object>} Loop state after the sandbox record is appended.
 * @throws {Error} On any precondition failure or git error.
 */
export async function createImprovementSandbox(input) {
    const state = await readImprovementLoopState(input);
    if (!state) {
        throw new Error("Improvement loop is not initialized; call init first.");
    }
    if (state.activeSandbox) {
        throw new Error(`Active sandbox already exists at ${state.activeSandbox.worktreePath}; clean it before creating another.`);
    }
    const baseRef = input.baseRef?.trim() || "HEAD";
    // baseRef is passed to git as a positional argument. A value starting with
    // "-" would be parsed as an option, so reject it before running git.
    if (baseRef.startsWith("-")) {
        throw new Error(`Invalid baseRef "${baseRef}": a git ref cannot start with "-".`);
    }
    const status = await runGit(input.exec, input.cwd, ["status", "--porcelain"], input.signal);
    if (status.code !== 0) {
        throw new Error(`Cannot inspect git status: ${(status.stderr || status.stdout).trim()}`);
    }
    if (status.stdout.trim() && !input.allowDirtyRepo) {
        throw new Error("Refusing to create sandbox from dirty repository; commit/stash changes or pass allowDirtyRepo=true.");
    }
    const sandboxId = createSandboxId(input);
    const paths = improvementLoopPaths(input);
    const worktreePath = join(paths.sandboxDir, sandboxId);
    await mkdir(dirname(worktreePath), { recursive: true });
    const add = await runGit(input.exec, input.cwd, ["worktree", "add", "--detach", worktreePath, baseRef], input.signal);
    if (add.code !== 0) {
        throw new Error(`Failed to create git worktree sandbox: ${(add.stderr || add.stdout).trim()}`);
    }
    const sandbox = {
        sandboxId,
        status: "active",
        repoPath: input.cwd,
        worktreePath,
        baseRef,
        createdAt: Date.now(),
    };
    return appendSandboxRecord(input, sandbox);
}
369
/**
 * Write the sandbox's diff against HEAD to a patch artifact and record it.
 *
 * The diff is captured with a 64 MiB buffer rather than the 64 KiB limit used
 * for other git calls, and a truncated capture is refused, so a partial patch
 * is never written.
 *
 * NOTE(review): `git diff HEAD` presumably omits untracked files, so files
 * newly created in the sandbox may be missing from the patch unless staged —
 * confirm and decide whether to stage them first.
 *
 * @param {object} input - Export input.
 * @param {Function} input.exec - Command runner used for git.
 * @param {string} [input.sandboxId] - Sandbox to export; defaults to the active sandbox.
 * @param {boolean} [input.allowEmptyPatch] - Allow exporting when the diff is empty.
 * @param {AbortSignal} [input.signal] - Forwarded to git.
 * @returns {Promise<object>} Loop state after the export record is appended.
 * @throws {Error} When the loop or sandbox is missing, git fails, the diff is
 *   empty (and not allowed), or the diff output was truncated.
 */
export async function exportImprovementSandboxPatch(input) {
    const state = await readImprovementLoopState(input);
    if (!state) {
        throw new Error("Improvement loop is not initialized; call init first.");
    }
    const sandbox = input.sandboxId
        ? state.sandboxes.find((candidate) => candidate.sandboxId === normalizeLoopId(input.sandboxId))
        : state.activeSandbox;
    if (!sandbox || sandbox.status !== "active") {
        throw new Error("No active sandbox found to export.");
    }
    const patchMaxBuffer = 64 * 1024 * 1024;
    const diff = await input.exec("git", ["diff", "--binary", "HEAD"], {
        cwd: sandbox.worktreePath,
        timeout: 60_000,
        signal: input.signal,
        maxBuffer: patchMaxBuffer,
    });
    if (diff.code !== 0) {
        throw new Error(`Failed to export sandbox patch: ${(diff.stderr || diff.stdout).trim()}`);
    }
    // Assumes exec reports clipped output via `stdoutTruncated`, as the
    // measurement path in this file reads from the same runner — TODO confirm.
    if (diff.stdoutTruncated) {
        throw new Error(`Sandbox patch exceeds ${patchMaxBuffer} bytes; refusing to write a truncated patch.`);
    }
    if (!diff.stdout.trim() && !input.allowEmptyPatch) {
        throw new Error("Sandbox has no changes to export.");
    }
    const paths = improvementLoopPaths(input);
    await mkdir(paths.artifactDir, { recursive: true });
    const patchPath = join(paths.artifactDir, `${sandbox.sandboxId}.patch`);
    await writeFile(patchPath, diff.stdout);
    return appendSandboxRecord(input, {
        ...sandbox,
        exportedAt: Date.now(),
        patchPath,
        patchBytes: Buffer.byteLength(diff.stdout),
    });
}
394
/**
 * Remove a sandbox worktree and record it as cleaned.
 *
 * @param {object} input - Cleanup input.
 * @param {Function} input.exec - Command runner used for git.
 * @param {string} [input.sandboxId] - Sandbox to remove; defaults to the active sandbox.
 * @param {string} [input.reason] - Reason stored with the cleanup record.
 * @param {AbortSignal} [input.signal] - Forwarded to git.
 * @returns {Promise<object>} Loop state after the cleanup record is appended.
 * @throws {Error} When the loop or sandbox is missing, or `git worktree remove` fails.
 */
export async function cleanupImprovementSandbox(input) {
    const state = await readImprovementLoopState(input);
    if (!state) {
        throw new Error("Improvement loop is not initialized; call init first.");
    }
    let sandbox = state.activeSandbox;
    if (input.sandboxId) {
        sandbox = state.sandboxes.find((candidate) => candidate.sandboxId === normalizeLoopId(input.sandboxId));
    }
    if (sandbox?.status !== "active") {
        throw new Error("No active sandbox found to clean up.");
    }
    const { repoPath, worktreePath } = sandbox;
    const removal = await runGit(input.exec, repoPath, ["worktree", "remove", "--force", worktreePath], input.signal);
    if (removal.code !== 0) {
        throw new Error(`Failed to remove git worktree sandbox: ${(removal.stderr || removal.stdout).trim()}`);
    }
    // The prune result is not inspected.
    await runGit(input.exec, repoPath, ["worktree", "prune"], input.signal);
    const cleaned = {
        ...sandbox,
        status: "cleaned",
        cleanedAt: Date.now(),
        reason: input.reason,
    };
    return appendSandboxRecord(input, cleaned);
}
414
/**
 * Judge a measured run against the loop's policy and append it to the log.
 *
 * Finite metrics of all earlier runs are used as noise samples for the decision.
 *
 * @param {object} input - Run input: metric, checksPass, hypothesis,
 *   secondaryMetrics, changedFiles, evidenceRef, nextHint, optional runId,
 *   plus the fields that identify the loop.
 * @returns {Promise<object>} Refreshed state with `lastDecision` set to the new decision.
 * @throws {Error} When the loop is not initialized or cannot be re-read afterwards.
 */
export async function recordImprovementRun(input) {
    const state = await readImprovementLoopState(input);
    if (!state) {
        throw new Error("Improvement loop is not initialized; call init first.");
    }
    const { config, runs } = state;
    const runId = input.runId ?? runs.length + 1;
    const checksPass = input.checksPass ?? null;
    const noiseMetrics = [];
    for (const { metric } of runs) {
        if (Number.isFinite(metric)) {
            noiseMetrics.push(metric);
        }
    }
    const decision = decideImprovement({
        currentMetric: input.metric,
        bestMetric: state.bestMetric,
        direction: config.direction,
        checksPass,
        minDelta: config.minDelta,
        minRelativeDelta: config.minRelativeDelta,
        confidenceMode: config.confidenceMode,
        minConfidence: config.minConfidence,
        lowConfidenceAction: config.lowConfidenceAction,
        noiseMetrics,
    });
    const record = {
        runId,
        objective: config.objective,
        hypothesis: input.hypothesis,
        metricName: config.metricName,
        metricUnit: config.metricUnit,
        direction: config.direction,
        metric: normalizeFiniteNumber(input.metric),
        secondaryMetrics: input.secondaryMetrics,
        checksPass,
        decision: decision.decision,
        reason: decision.reason,
        confidence: decision.confidence.value,
        changedFiles: input.changedFiles,
        evidenceRef: input.evidenceRef,
        nextHint: input.nextHint,
        timestamp: Date.now(),
    };
    await mkdir(dirname(state.logPath), { recursive: true });
    const logLine = `${JSON.stringify({ type: "run", ...record })}\n`;
    await writeFile(state.logPath, logLine, { flag: "a" });
    const refreshed = await readImprovementLoopState(input);
    if (!refreshed) {
        throw new Error("Failed to read improvement loop after recording run");
    }
    return { ...refreshed, lastDecision: decision };
}
459
/**
 * Return every parsed metric except the primary one.
 *
 * @param {Record<string, number>} parsedMetrics - All parsed metrics.
 * @param {string} primaryName - Name of the primary metric to leave out.
 * @returns {Record<string, number> | undefined} Remaining metrics, or undefined when none remain.
 */
function secondaryMetricsFrom(parsedMetrics, primaryName) {
    const others = Object.entries(parsedMetrics).filter(([name]) => name !== primaryName);
    if (others.length === 0) {
        return undefined;
    }
    return Object.fromEntries(others);
}
463
/**
 * Run a measurement command, and optionally a checks command, via `bash -c`.
 *
 * Metrics are parsed from the measurement's stdout and stderr combined. The
 * checks command runs only when the measurement exited 0 and was not killed.
 * `checksPass` then reflects the checks result; otherwise it reflects the
 * measurement result.
 *
 * @param {object} input - Measurement input.
 * @param {Function} input.exec - Command runner.
 * @param {string} input.command - Measurement command (required, trimmed).
 * @param {string} input.metricName - Primary metric name to select.
 * @param {string} [input.cwd] - Working directory for both commands.
 * @param {string} [input.checksCommand] - Optional correctness command.
 * @param {number} [input.timeoutSeconds=600] - Measurement timeout; at least 1 s.
 * @param {number} [input.checksTimeoutSeconds=300] - Checks timeout; at least 1 s.
 * @param {number} [input.maxOutputBytes=65536] - Output buffer size; at least 1024.
 * @param {AbortSignal} [input.signal] - Forwarded to exec.
 * @returns {Promise<object>} Measurement result, parsed metrics, and checks outcome.
 * @throws {Error} When the command is empty after trimming.
 */
export async function runImprovementMeasurement(input) {
    const command = input.command.trim();
    if (!command) {
        throw new Error("command is required");
    }
    const secondsToMs = (seconds) => Math.max(1, Math.floor(seconds)) * 1000;
    const timeoutMs = secondsToMs(input.timeoutSeconds ?? 600);
    const checksTimeoutMs = secondsToMs(input.checksTimeoutSeconds ?? 300);
    const maxBuffer = Math.max(1024, Math.floor(input.maxOutputBytes ?? 64 * 1024));
    const runShell = (script, timeout) => input.exec("bash", ["-c", script], {
        cwd: input.cwd,
        timeout,
        signal: input.signal,
        maxBuffer,
    });

    const started = Date.now();
    const result = await runShell(command, timeoutMs);
    const durationMs = Date.now() - started;

    const parsedMetrics = metricMapFromOutput(`${result.stdout}\n${result.stderr}`);
    const primaryMetric = selectPrimaryMetric(parsedMetrics, input.metricName);

    let checksPass = result.code === 0 && !result.killed;
    const checksOutcome = { exitCode: undefined, timedOut: undefined, stdout: undefined, stderr: undefined };
    const checksCommand = input.checksCommand?.trim();
    if (checksPass && checksCommand) {
        const checks = await runShell(checksCommand, checksTimeoutMs);
        checksOutcome.exitCode = checks.code;
        checksOutcome.timedOut = checks.killed;
        checksOutcome.stdout = checks.stdout;
        checksOutcome.stderr = checks.stderr;
        checksPass = checks.code === 0 && !checks.killed;
    }
    return {
        command,
        exitCode: result.code,
        timedOut: result.killed,
        durationMs,
        stdout: result.stdout,
        stderr: result.stderr,
        stdoutTruncated: !!result.stdoutTruncated,
        stderrTruncated: !!result.stderrTruncated,
        parsedMetrics,
        primaryMetric,
        checksCommand: checksCommand || undefined,
        checksExitCode: checksOutcome.exitCode,
        checksTimedOut: checksOutcome.timedOut,
        checksPass,
        checksStdout: checksOutcome.stdout,
        checksStderr: checksOutcome.stderr,
    };
}
519
/**
 * Decode a path field that git emitted in C-quoted form.
 * Unquoted input is returned unchanged.
 */
function unquoteGitPath(rawPath) {
    if (rawPath.length < 2 || !rawPath.startsWith('"') || !rawPath.endsWith('"')) {
        return rawPath;
    }
    const simpleEscapes = new Map([
        ["a", 7], ["b", 8], ["f", 12], ["n", 10], ["r", 13], ["t", 9], ["v", 11], ['"', 34], ["\\", 92],
    ]);
    const inner = rawPath.slice(1, -1);
    const encoder = new TextEncoder();
    // Octal escapes are raw bytes of a UTF-8 sequence, so collect bytes first
    // and decode once at the end.
    const bytes = [];
    let position = 0;
    while (position < inner.length) {
        if (inner[position] === "\\") {
            const octal = /^[0-7]{1,3}/.exec(inner.slice(position + 1, position + 4));
            if (octal) {
                bytes.push(Number.parseInt(octal[0], 8) & 0xff);
                position += 1 + octal[0].length;
                continue;
            }
            const escaped = simpleEscapes.get(inner[position + 1]);
            if (escaped !== undefined) {
                bytes.push(escaped);
                position += 2;
                continue;
            }
        }
        const literal = String.fromCodePoint(inner.codePointAt(position));
        bytes.push(...encoder.encode(literal));
        position += literal.length;
    }
    return new TextDecoder().decode(Uint8Array.from(bytes));
}

/**
 * Split the path field of a status line into [path] or [origPath, path].
 * Quote-aware, so " -> " inside a quoted path is not treated as a rename arrow.
 */
function splitGitRenamePaths(rawPath) {
    const arrow = " -> ";
    if (!rawPath.includes('"')) {
        // No quoting involved: keep the original rule (exactly one arrow).
        const parts = rawPath.split(arrow);
        return parts.length === 2 ? parts : [rawPath];
    }
    let splitAt;
    if (rawPath.startsWith('"')) {
        // Find the closing quote of the first path, skipping escaped characters.
        let cursor = 1;
        while (cursor < rawPath.length && rawPath[cursor] !== '"') {
            cursor += rawPath[cursor] === "\\" ? 2 : 1;
        }
        splitAt = cursor + 1;
    }
    else {
        splitAt = rawPath.indexOf(arrow);
    }
    if (splitAt < 0 || !rawPath.startsWith(arrow, splitAt)) {
        return [rawPath];
    }
    return [rawPath.slice(0, splitAt), rawPath.slice(splitAt + arrow.length)];
}

/**
 * Parse `git status --porcelain` (v1) output into structured entries.
 *
 * Each non-empty line gives the index status (column 1), the working-tree
 * status (column 2), and the path from column 4 onward. Rename lines
 * (`ORIG -> NEW`) produce both `origPath` and `path`. Paths that git wrapped
 * in double quotes are unquoted and their backslash escapes decoded.
 *
 * @param {string} status - Raw porcelain output.
 * @returns {Array<{index: string, workingTree: string, path: string, origPath?: string}>}
 */
export function parseGitPorcelainStatus(status) {
    return status
        .split("\n")
        .map((line) => line.trimEnd())
        .filter(Boolean)
        .map((line) => {
            const index = line[0] ?? " ";
            const workingTree = line[1] ?? " ";
            const [first, second] = splitGitRenamePaths(line.slice(3));
            if (second !== undefined) {
                return { index, workingTree, origPath: unquoteGitPath(first), path: unquoteGitPath(second) };
            }
            return { index, workingTree, path: unquoteGitPath(first) };
        });
}
535
/**
 * Accept either parsed status entries or raw porcelain text.
 *
 * @param {object[] | string} status - Parsed entries or raw `git status --porcelain` output.
 * @returns {object[]} Status entries.
 */
function entriesFromStatus(status) {
    if (Array.isArray(status)) {
        return status;
    }
    return parseGitPorcelainStatus(status);
}
538
/**
 * Normalize a path for comparison: backslashes become forward slashes and a
 * single leading "./" is removed.
 *
 * @param {string} path - Path to normalize.
 * @returns {string} Normalized path.
 */
function normalizePath(path) {
    const forwardSlashed = path.split("\\").join("/");
    return forwardSlashed.startsWith("./") ? forwardSlashed.slice(2) : forwardSlashed;
}
541
/**
 * Build a set of normalized, non-empty paths.
 *
 * @param {string[]} paths - Raw paths.
 * @returns {Set<string>} Normalized paths with empty strings dropped.
 */
function pathSet(paths) {
    const normalized = paths.map((raw) => normalizePath(raw));
    return new Set(normalized.filter((path) => Boolean(path)));
}
544
/**
 * List the normalized paths a status entry touches: `path`, then `origPath`
 * when present (renames).
 *
 * @param {{path?: unknown, origPath?: unknown}} entry - Status entry.
 * @returns {string[]} Normalized string paths of the entry.
 */
function entryPaths(entry) {
    const touched = [];
    for (const candidate of [entry.path, entry.origPath]) {
        if (typeof candidate === "string") {
            touched.push(candidate);
        }
    }
    return touched.map((path) => normalizePath(path));
}
547
/**
 * Collect every path mentioned by a list of status entries.
 *
 * @param {Iterable<object>} entries - Status entries.
 * @returns {Set<string>} Normalized paths, including rename sources.
 */
function statusPathSet(entries) {
    const collected = new Set();
    for (const entry of entries) {
        entryPaths(entry).forEach((path) => collected.add(path));
    }
    return collected;
}
555
/**
 * Tell whether a path equals a preserved path or lies underneath one.
 *
 * @param {string} path - Normalized path to test.
 * @param {Iterable<string>} preservePaths - Preserved files or directories.
 * @returns {boolean} True when the path is covered by a preserve entry.
 */
function isPreserved(path, preservePaths) {
    return [...preservePaths].some((root) => path === root || path.startsWith(`${root}/`));
}
562
/**
 * Classify the paths changed after a run to plan a safe discard.
 *
 * Each path in the "after" status lands in exactly one bucket, checked in
 * this order: preserved, not owned by the run, owned but already dirty before
 * the run (protected), otherwise safe to revert.
 *
 * @param {object} input - Planning input.
 * @param {object[] | string} input.beforeStatus - Status before the run (entries or porcelain text).
 * @param {object[] | string} input.afterStatus - Status after the run.
 * @param {string[]} input.ownedPaths - Paths the run claims to have changed.
 * @param {string[]} [input.preservePaths] - Paths or directories never to revert.
 * @returns {{revertPaths: string[], preservePaths: string[], protectedUserDirtyPaths: string[], unownedChangedPaths: string[], canDiscardOwnedChanges: boolean}}
 */
export function planOwnedDiscard(input) {
    const beforePaths = statusPathSet(entriesFromStatus(input.beforeStatus));
    const afterPaths = statusPathSet(entriesFromStatus(input.afterStatus));
    const ownedPaths = pathSet(input.ownedPaths);
    const preservePaths = pathSet(input.preservePaths ?? []);

    const buckets = { preserved: [], unowned: [], userDirty: [], revert: [] };
    const bucketFor = (path) => {
        if (isPreserved(path, preservePaths)) {
            return "preserved";
        }
        if (!ownedPaths.has(path)) {
            return "unowned";
        }
        // Owned, but dirty before the run: reverting would destroy user edits.
        return beforePaths.has(path) ? "userDirty" : "revert";
    };
    for (const path of afterPaths) {
        buckets[bucketFor(path)].push(path);
    }

    const sortedUnique = (paths) => [...new Set(paths)].sort();
    return {
        revertPaths: sortedUnique(buckets.revert),
        preservePaths: sortedUnique(buckets.preserved),
        protectedUserDirtyPaths: sortedUnique(buckets.userDirty),
        unownedChangedPaths: sortedUnique(buckets.unowned),
        canDiscardOwnedChanges: buckets.userDirty.length === 0,
    };
}
594
/**
 * Parameter schema for the improvement_decision tool.
 *
 * Only `currentMetric` is required. `direction` is optional because its
 * description documents a default of "lower" and decideImprovement applies
 * that default when it is omitted.
 */
const DecisionToolParams = Type.Object({
    currentMetric: Type.Number({ description: "Measured primary metric for the candidate run." }),
    bestMetric: Type.Optional(Type.Number({ description: "Best kept primary metric so far. Omit for baseline." })),
    direction: Type.Optional(StringEnum(["lower", "higher"], {
        description: "Whether lower or higher metric values are better. Default lower.",
    })),
    checksPass: Type.Optional(Type.Boolean({ description: "Whether correctness checks passed. false forces discard." })),
    minDelta: Type.Optional(Type.Number({ description: "Minimum absolute improvement required to keep. Default 0." })),
    minRelativeDelta: Type.Optional(Type.Number({ description: "Minimum relative improvement required to keep, e.g. 0.01 for 1%. Default 0." })),
    confidenceMode: Type.Optional(StringEnum(["none", "mad"], {
        description: "Noise/confidence policy. Use mad with minConfidence to reject noisy wins.",
    })),
    minConfidence: Type.Optional(Type.Number({ description: "Minimum confidence multiple over MAD noise floor. Default 0." })),
    lowConfidenceAction: Type.Optional(StringEnum(["retry", "discard"], {
        description: "Decision when confidence evidence is missing or too low. Default retry.",
    })),
    noiseMetrics: Type.Optional(Type.Array(Type.Number(), { description: "Previous metric samples for MAD confidence calculation." })),
}, { additionalProperties: false });
612
/**
 * Build the `improvement_decision` tool, a thin wrapper around
 * decideImprovement that returns a short text summary plus the full decision.
 *
 * @returns {object} Tool definition produced by defineTool.
 */
export function createImprovementDecisionTool() {
    const execute = async (_toolCallId, params) => {
        const decision = decideImprovement(params);
        const { comparison, confidence } = decision;
        const deltaText = comparison ? comparison.delta.toString() : "n/a";
        const confidenceText = confidence.value === null ? "n/a" : confidence.value.toFixed(2);
        const summary = [
            `Decision: ${decision.decision} (${decision.reason})`,
            `Delta: ${deltaText}`,
            `Confidence: ${confidenceText}`,
        ].join("\n");
        return {
            content: [{ type: "text", text: summary }],
            details: decision,
        };
    };
    return defineTool({
        name: "improvement_decision",
        label: "Improvement Decision",
        description: "Deterministically decide keep/discard/retry/blocked for an improvement candidate from metric, direction, thresholds, checks, and optional noise samples. The model proposes; code judges.",
        promptSnippet: "Decide keep/discard for a measured improvement candidate using deterministic metric gates",
        promptGuidelines: [
            "Use improvement_decision after measuring a candidate when keep/discard must be based on metric evidence rather than model intuition.",
            "Do not claim a candidate is kept unless this tool returns decision=keep or a stronger project-specific validator passes.",
            "Correctness failures override metric wins; checksPass=false always discards.",
        ],
        parameters: DecisionToolParams,
        execute,
    });
}
641
/**
 * Parameter schema for the improvement_loop tool.
 *
 * Only `action` is required by the schema. Which of the optional fields
 * matter depends on the action, as noted in each description.
 */
const LoopToolParams = (() => {
    // Small builders for the repeated "optional field with a description" shape.
    const optionalString = (description) => Type.Optional(Type.String({ description }));
    const optionalNumber = (description) => Type.Optional(Type.Number({ description }));
    const optionalBoolean = (description) => Type.Optional(Type.Boolean({ description }));
    const optionalEnum = (values, description) => Type.Optional(StringEnum(values, { description }));
    return Type.Object({
        action: StringEnum(["init", "status", "record", "measure", "sandbox_create", "sandbox_export", "sandbox_cleanup"], {
            description: "init creates user-level loop state, status reads it, record appends a supplied measurement, measure runs and records a bounded command, sandbox_create/export/cleanup manage disposable git worktrees and keep patches.",
        }),
        loopId: optionalString("Loop id within the current workspace. Default: default."),
        objective: optionalString("Loop objective. Required for action=init."),
        metricName: optionalString("Primary metric name. Required for action=init."),
        metricUnit: optionalString("Display unit for the primary metric."),
        direction: optionalEnum(["lower", "higher"], "Whether lower or higher metric values are better. Default lower."),
        reset: optionalBoolean("For action=init, replace existing user-level loop state for this workspace/loop id."),
        minDelta: optionalNumber("Minimum absolute improvement required to keep. Default 0."),
        minRelativeDelta: optionalNumber("Minimum relative improvement required to keep. Default 0."),
        confidenceMode: optionalEnum(["none", "mad"], "Noise/confidence policy. Default none unless minConfidence > 0."),
        minConfidence: optionalNumber("Minimum confidence multiple over MAD noise floor. Default 0."),
        lowConfidenceAction: optionalEnum(["retry", "discard"], "Decision when confidence evidence is missing or low. Default retry."),
        currentMetric: optionalNumber("Measured primary metric for action=record."),
        checksPass: optionalBoolean("Correctness check result for action=record. false forces discard."),
        hypothesis: optionalString("What this run tried."),
        secondaryMetrics: Type.Optional(Type.Object({}, { additionalProperties: Type.Number(), description: "Secondary metric map for action=record." })),
        changedFiles: Type.Optional(Type.Array(Type.String(), { description: "Files changed by this candidate run." })),
        evidenceRef: optionalString("Path, command, or artifact reference proving the measurement/check result."),
        nextHint: optionalString("Useful next-step hint for later runs, especially after discard/retry/blocked."),
        command: optionalString("Measurement command for action=measure. Runs via bash -c in ctx.cwd or active sandbox when useSandbox=true."),
        checksCommand: optionalString("Optional correctness command for action=measure. Runs only if the measurement command exits 0."),
        timeoutSeconds: optionalNumber("Measurement command timeout in seconds. Default 600."),
        checksTimeoutSeconds: optionalNumber("Checks command timeout in seconds. Default 300."),
        maxOutputBytes: optionalNumber("Maximum stdout/stderr tail retained per command. Default 65536."),
        useSandbox: optionalBoolean("For action=measure, run command/checks inside the active sandbox worktree. Default false."),
        sandboxId: optionalString("Sandbox id for sandbox_create/sandbox_export/sandbox_cleanup. Defaults to generated id or active sandbox."),
        baseRef: optionalString("Git base ref for sandbox_create. Default HEAD."),
        allowDirtyRepo: optionalBoolean("Allow sandbox_create when the real repo has uncommitted changes. Default false."),
        cleanupReason: optionalString("Optional reason recorded when sandbox_cleanup removes a sandbox."),
        allowEmptyPatch: optionalBoolean("For sandbox_export, allow writing an empty patch. Default false."),
    }, { additionalProperties: false });
})();
693
/**
 * Return the trimmed value, or throw when it is missing or blank.
 *
 * @param {string | null | undefined} value - Candidate string.
 * @param {string} label - Field name used in the error message.
 * @returns {string} Trimmed, non-empty string.
 * @throws {Error} `<label> is required` when the value is nullish or blank.
 */
function requireString(value, label) {
    const text = value?.trim();
    if (text) {
        return text;
    }
    throw new Error(`${label} is required`);
}
699
/**
 * Render loop state as a short multi-line text summary.
 *
 * @param {object | null} state - Loop state, or a falsy value when uninitialized.
 * @returns {string} Summary lines joined with newlines.
 */
function summarizeLoopState(state) {
    if (!state) {
        return "Improvement loop is not initialized.";
    }
    const { config, runs, baselineMetric, bestMetric, bestRunId, logPath, activeSandbox } = state;
    const unitLabel = config.metricUnit || "unitless";
    const bestRunSuffix = bestRunId === null ? "" : ` (#${bestRunId})`;
    const summary = [
        `Loop ${config.loopId}: ${config.objective}`,
        `Metric: ${config.metricName} (${unitLabel}, ${config.direction} is better)`,
        `Runs: ${runs.length}`,
        `Baseline: ${baselineMetric ?? "none"}`,
        `Best: ${bestMetric ?? "none"}${bestRunSuffix}`,
        `Log: ${logPath}`,
    ];
    const lastRun = runs.at(-1);
    if (lastRun) {
        summary.push(`Last: ${lastRun.decision} (${lastRun.reason}) metric=${lastRun.metric ?? "missing"}`);
    }
    if (activeSandbox) {
        summary.push(`Active sandbox: ${activeSandbox.worktreePath}`);
    }
    return summary.join("\n");
}
717
/**
 * Build the `improvement_loop` tool definition.
 *
 * The tool dispatches on `params.action`:
 *   - "init"            initialize loop state from objective/metric policy
 *   - "record"          record an externally measured run
 *   - "measure"         run a measurement (optionally in the active sandbox) and record it
 *   - "sandbox_create"  create a sandbox for the loop
 *   - "sandbox_export"  export the sandbox patch
 *   - "sandbox_cleanup" clean up a sandbox
 *   - anything else     report current status
 *
 * @param {Function} [exec] - Command executor; required by "measure" and the
 *   sandbox actions, which throw when it was not supplied.
 * @returns The value produced by `defineTool` for this tool.
 */
export function createImprovementLoopTool(exec) {
    // Guard for the actions that need to run commands.
    const assertExec = (action) => {
        if (!exec)
            throw new Error(`improvement_loop ${action} requires createImprovementLoopTool(pi.exec)`);
    };
    // Wrap a message in the single-text-part content shape every action returns.
    const textContent = (text) => [{ type: "text", text }];
    // "decision (reason)" with "unknown" placeholders when no decision is recorded.
    const formatDecision = (decision) => `${decision?.decision ?? "unknown"} (${decision?.reason ?? "unknown"})`;
    return defineTool({
        name: "improvement_loop",
        label: "Improvement Loop",
        description: "Persist and evaluate a deterministic improvement loop in user-level state: init objective/metric policy, measure bounded commands, record measured runs, manage disposable git worktree sandboxes, and status current baseline/best/decision log. Does not commit/apply/revert real-repo files.",
        promptSnippet: "Track deterministic improvement-loop state and record measured keep/discard decisions",
        promptGuidelines: [
            "Use improvement_loop init before iterative optimization that needs metric-based keep/discard state.",
            "Use improvement_loop measure to run a bounded measurement command and optional checks command, then record the deterministic decision.",
            "Use improvement_loop sandbox_create before risky self-modifying experiments so edits happen in a disposable git worktree, not the real repository.",
            "Use improvement_loop sandbox_export to capture the sandbox diff as a user-level patch artifact before cleanup when the decision is keep.",
            "Use improvement_loop sandbox_cleanup after discard or after exporting/approving a patch; cleanup removes the disposable worktree record, not the real repository.",
            "Use improvement_loop record when measurement already happened elsewhere; code decides keep/discard/retry/blocked from metrics and gates.",
            "This tool does not commit/apply/revert real-repo files; keep/discard is a decision record until a later approved executor applies it.",
            "Operational state is stored under the user-level agent directory, not in the target repository.",
        ],
        parameters: LoopToolParams,
        executionMode: "sequential",
        async execute(_toolCallId, params, signal, _onUpdate, ctx) {
            const { cwd } = ctx;
            switch (params.action) {
                case "init": {
                    // Validate in the same order as the fields are consumed: objective, then metricName.
                    const objective = requireString(params.objective, "objective");
                    const metricName = requireString(params.metricName, "metricName");
                    const state = await initImprovementLoop({
                        cwd,
                        loopId: params.loopId,
                        objective,
                        metricName,
                        metricUnit: params.metricUnit,
                        direction: params.direction,
                        minDelta: params.minDelta,
                        minRelativeDelta: params.minRelativeDelta,
                        confidenceMode: params.confidenceMode,
                        minConfidence: params.minConfidence,
                        lowConfidenceAction: params.lowConfidenceAction,
                        reset: params.reset,
                    });
                    return {
                        content: textContent(`Initialized improvement loop.\n${summarizeLoopState(state)}`),
                        details: { action: "init", state, logPath: state.logPath },
                    };
                }
                case "record": {
                    const state = await recordImprovementRun({
                        cwd,
                        loopId: params.loopId,
                        metric: params.currentMetric,
                        secondaryMetrics: params.secondaryMetrics,
                        checksPass: params.checksPass ?? null,
                        hypothesis: params.hypothesis,
                        changedFiles: params.changedFiles,
                        evidenceRef: params.evidenceRef,
                        nextHint: params.nextHint,
                    });
                    const decision = state.lastDecision;
                    return {
                        content: textContent(`Recorded run: ${formatDecision(decision)}.\n${summarizeLoopState(state)}`),
                        details: { action: "record", state, decision: decision ?? undefined, logPath: state.logPath },
                    };
                }
                case "measure": {
                    assertExec("measure");
                    const before = await readImprovementLoopState({ cwd, loopId: params.loopId });
                    if (!before)
                        throw new Error("Improvement loop is not initialized; call init first.");
                    // Measure in the real cwd unless the caller opted into the active sandbox.
                    let measurementCwd = cwd;
                    if (params.useSandbox) {
                        const sandboxPath = before.activeSandbox?.worktreePath;
                        // Only null/undefined count as "no sandbox" (nullish semantics).
                        if (sandboxPath === null || sandboxPath === undefined)
                            throw new Error("useSandbox=true requires an active sandbox.");
                        measurementCwd = sandboxPath;
                    }
                    const command = requireString(params.command, "command");
                    const primaryMetricName = before.config.metricName;
                    const measurement = await runImprovementMeasurement({
                        exec,
                        cwd: measurementCwd,
                        command,
                        metricName: primaryMetricName,
                        checksCommand: params.checksCommand,
                        timeoutSeconds: params.timeoutSeconds,
                        checksTimeoutSeconds: params.checksTimeoutSeconds,
                        maxOutputBytes: params.maxOutputBytes,
                        signal,
                    });
                    // State is always recorded against the real cwd, even for sandbox measurements.
                    const state = await recordImprovementRun({
                        cwd,
                        loopId: params.loopId,
                        metric: measurement.primaryMetric,
                        secondaryMetrics: secondaryMetricsFrom(measurement.parsedMetrics, before.config.metricName),
                        checksPass: measurement.checksPass,
                        hypothesis: params.hypothesis,
                        changedFiles: params.changedFiles,
                        evidenceRef: params.evidenceRef ?? `command:${measurement.command};cwd:${measurementCwd}`,
                        nextHint: params.nextHint,
                    });
                    const decision = state.lastDecision;
                    const headline = `Measured ${before.config.metricName}=${measurement.primaryMetric ?? "missing"}; decision: ${formatDecision(decision)}.`;
                    return {
                        content: textContent(`${headline}\n${summarizeLoopState(state)}`),
                        details: {
                            action: "measure",
                            state,
                            decision: decision ?? undefined,
                            measurement,
                            logPath: state.logPath,
                        },
                    };
                }
                case "sandbox_create": {
                    assertExec("sandbox_create");
                    const state = await createImprovementSandbox({
                        cwd,
                        loopId: params.loopId,
                        exec,
                        baseRef: params.baseRef,
                        allowDirtyRepo: params.allowDirtyRepo,
                        sandboxId: params.sandboxId,
                        signal,
                    });
                    const sandbox = state.activeSandbox ?? undefined;
                    return {
                        content: textContent(`Created sandbox: ${sandbox?.worktreePath ?? "unknown"}\n${summarizeLoopState(state)}`),
                        details: { action: "sandbox_create", state, sandbox, logPath: state.logPath },
                    };
                }
                case "sandbox_export": {
                    assertExec("sandbox_export");
                    const state = await exportImprovementSandboxPatch({
                        cwd,
                        loopId: params.loopId,
                        exec,
                        sandboxId: params.sandboxId,
                        allowEmptyPatch: params.allowEmptyPatch,
                        signal,
                    });
                    // Explicit id wins; otherwise report the active sandbox, else the most recent record.
                    let sandbox;
                    if (params.sandboxId) {
                        sandbox = state.sandboxes.find((candidate) => candidate.sandboxId === normalizeLoopId(params.sandboxId));
                    }
                    else {
                        sandbox = state.activeSandbox ?? state.sandboxes.at(-1);
                    }
                    return {
                        content: textContent(`Exported sandbox patch: ${sandbox?.patchPath ?? "unknown"}\n${summarizeLoopState(state)}`),
                        details: { action: "sandbox_export", state, sandbox, logPath: state.logPath },
                    };
                }
                case "sandbox_cleanup": {
                    assertExec("sandbox_cleanup");
                    const state = await cleanupImprovementSandbox({
                        cwd,
                        loopId: params.loopId,
                        exec,
                        sandboxId: params.sandboxId,
                        reason: params.cleanupReason,
                        signal,
                    });
                    // Explicit id wins; otherwise report the most recent sandbox record.
                    let sandbox;
                    if (params.sandboxId) {
                        sandbox = state.sandboxes.find((candidate) => candidate.sandboxId === normalizeLoopId(params.sandboxId));
                    }
                    else {
                        sandbox = state.sandboxes.at(-1);
                    }
                    return {
                        content: textContent(`Cleaned sandbox.\n${summarizeLoopState(state)}`),
                        details: { action: "sandbox_cleanup", state, sandbox, logPath: state.logPath },
                    };
                }
                default: {
                    // Any other action is answered with a status report.
                    const state = await readImprovementLoopState({ cwd, loopId: params.loopId });
                    const paths = improvementLoopPaths({ cwd, loopId: params.loopId });
                    return {
                        content: textContent(summarizeLoopState(state)),
                        details: { action: "status", state, logPath: state?.logPath ?? paths.logPath },
                    };
                }
            }
        },
    });
}
906
+ //# sourceMappingURL=improvement-loop.js.map