@spacek33z/autoauto 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -0
- package/package.json +51 -0
- package/src/App.tsx +224 -0
- package/src/cli.ts +772 -0
- package/src/components/AgentPanel.tsx +254 -0
- package/src/components/Chat.test.tsx +71 -0
- package/src/components/Chat.tsx +308 -0
- package/src/components/CycleField.tsx +23 -0
- package/src/components/ModelPicker.tsx +97 -0
- package/src/components/PostUpdatePrompt.tsx +46 -0
- package/src/components/ResultsTable.tsx +172 -0
- package/src/components/RunCompletePrompt.tsx +90 -0
- package/src/components/RunSettingsOverlay.tsx +49 -0
- package/src/components/RunsTable.tsx +219 -0
- package/src/components/StatsHeader.tsx +100 -0
- package/src/daemon.ts +264 -0
- package/src/index.tsx +8 -0
- package/src/lib/agent/agent-provider.test.ts +133 -0
- package/src/lib/agent/claude-provider.ts +277 -0
- package/src/lib/agent/codex-provider.ts +413 -0
- package/src/lib/agent/default-providers.ts +10 -0
- package/src/lib/agent/index.ts +32 -0
- package/src/lib/agent/mock-provider.ts +61 -0
- package/src/lib/agent/opencode-provider.ts +424 -0
- package/src/lib/agent/types.ts +73 -0
- package/src/lib/auth.ts +11 -0
- package/src/lib/config.ts +152 -0
- package/src/lib/daemon-callbacks.ts +59 -0
- package/src/lib/daemon-client.ts +16 -0
- package/src/lib/daemon-lifecycle.ts +368 -0
- package/src/lib/daemon-spawn.ts +122 -0
- package/src/lib/daemon-status.ts +189 -0
- package/src/lib/daemon-watcher.ts +192 -0
- package/src/lib/experiment-loop.ts +679 -0
- package/src/lib/experiment.ts +356 -0
- package/src/lib/finalize.test.ts +143 -0
- package/src/lib/finalize.ts +511 -0
- package/src/lib/format.test.ts +32 -0
- package/src/lib/format.ts +44 -0
- package/src/lib/git.ts +176 -0
- package/src/lib/ideas-backlog.test.ts +54 -0
- package/src/lib/ideas-backlog.ts +109 -0
- package/src/lib/measure.ts +472 -0
- package/src/lib/model-options.ts +24 -0
- package/src/lib/programs.ts +247 -0
- package/src/lib/push-stream.ts +48 -0
- package/src/lib/run-context.ts +112 -0
- package/src/lib/run-setup.ts +34 -0
- package/src/lib/run.ts +383 -0
- package/src/lib/syntax-theme.ts +39 -0
- package/src/lib/system-prompts/experiment.ts +77 -0
- package/src/lib/system-prompts/finalize.ts +90 -0
- package/src/lib/system-prompts/index.ts +7 -0
- package/src/lib/system-prompts/setup.ts +516 -0
- package/src/lib/system-prompts/update.ts +188 -0
- package/src/lib/tool-events.ts +99 -0
- package/src/lib/validate-measurement.ts +326 -0
- package/src/lib/worktree.ts +40 -0
- package/src/screens/AuthErrorScreen.tsx +31 -0
- package/src/screens/ExecutionScreen.tsx +851 -0
- package/src/screens/FirstSetupScreen.tsx +168 -0
- package/src/screens/HomeScreen.tsx +406 -0
- package/src/screens/PreRunScreen.tsx +206 -0
- package/src/screens/SettingsScreen.tsx +189 -0
- package/src/screens/SetupScreen.tsx +226 -0
- package/src/tui.tsx +17 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,679 @@
|
|
|
1
|
+
import { chmod } from "node:fs/promises"
|
|
2
|
+
import { join, relative } from "node:path"
|
|
3
|
+
import type { RunState, ExperimentResult, TerminationReason } from "./run.ts"
|
|
4
|
+
import type { ProgramConfig } from "./programs.ts"
|
|
5
|
+
import type { ModelSlot } from "./config.ts"
|
|
6
|
+
import {
|
|
7
|
+
readState,
|
|
8
|
+
writeState,
|
|
9
|
+
appendResult,
|
|
10
|
+
serializeSecondaryValues,
|
|
11
|
+
serializeDiffStats,
|
|
12
|
+
} from "./run.ts"
|
|
13
|
+
import { unlockMeasurement, MEASUREMENT_FILES } from "./run-setup.ts"
|
|
14
|
+
import {
|
|
15
|
+
getFullSha,
|
|
16
|
+
resetHard,
|
|
17
|
+
isWorkingTreeClean,
|
|
18
|
+
countCommitsBetween,
|
|
19
|
+
} from "./git.ts"
|
|
20
|
+
import {
|
|
21
|
+
runMeasurementSeries,
|
|
22
|
+
compareMetric,
|
|
23
|
+
} from "./measure.ts"
|
|
24
|
+
import type { DiffStats } from "./git.ts"
|
|
25
|
+
import {
|
|
26
|
+
buildContextPacket,
|
|
27
|
+
buildExperimentPrompt,
|
|
28
|
+
runExperimentAgent,
|
|
29
|
+
checkLockViolation,
|
|
30
|
+
type ExperimentCost,
|
|
31
|
+
} from "./experiment.ts"
|
|
32
|
+
import { appendIdeasBacklog, type ExperimentNotes } from "./ideas-backlog.ts"
|
|
33
|
+
import { readRunConfig } from "./daemon-lifecycle.ts"
|
|
34
|
+
import { getExperimentSystemPrompt } from "./system-prompts/index.ts"
|
|
35
|
+
|
|
36
|
+
/** Re-measure the baseline every time the consecutive-discard count reaches a multiple of this value, to check for environment drift. */
|
|
37
|
+
const REBASELINE_AFTER_DISCARDS = 5
|
|
38
|
+
|
|
39
|
+
/** Default stagnation limit, used when the program config does not set `max_consecutive_discards`: stop after this many consecutive experiments that were not kept. */
|
|
40
|
+
const DEFAULT_MAX_CONSECUTIVE_DISCARDS = 10
|
|
41
|
+
|
|
42
|
+
// --- Types ---
|
|
43
|
+
|
|
44
|
+
// TerminationReason is now defined in run.ts and re-exported here for backwards compat
|
|
45
|
+
export type { TerminationReason } from "./run.ts"
|
|
46
|
+
|
|
47
|
+
/**
 * Callbacks through which the experiment loop reports live progress (e.g. to the TUI).
 * The loop invokes them synchronously and does not use their return values.
 */
|
|
48
|
+
export interface LoopCallbacks {
|
|
49
|
+
/** Called on each phase transition; `detail` is an optional human-readable reason. */
onPhaseChange: (phase: RunState["phase"], detail?: string) => void
|
|
50
|
+
/** Called with the new experiment number before the agent for that experiment is spawned. */
onExperimentStart: (experimentNumber: number) => void
|
|
51
|
+
/** Called with every result row right after it has been appended to the run's results. */
onExperimentEnd: (result: ExperimentResult) => void
|
|
52
|
+
/** Called with the latest run state after the loop has updated it. */
onStateUpdate: (state: RunState) => void
|
|
53
|
+
/** Receives text forwarded from runExperimentAgent's text callback (presumably streamed agent output). */
onAgentStream: (text: string) => void
|
|
54
|
+
/** Receives status strings forwarded from runExperimentAgent's tool-use callback. */
onAgentToolUse: (status: string) => void
|
|
55
|
+
/** Receives error and warning messages; in this file it carries drift, stagnation and multi-commit warnings. */
onError: (error: string) => void
|
|
56
|
+
/** Called with the agent's cost data when the experiment outcome includes it. */
onExperimentCost?: (cost: ExperimentCost) => void
|
|
57
|
+
/** Called when the baseline value changes; `reason` is "drift", "keep" or "simplification" in this file. */
onRebaseline?: (oldBaseline: number, newBaseline: number, reason: string) => void
|
|
58
|
+
/** Called once after the final state has been written, with the reason the loop terminated. */
onLoopComplete?: (state: RunState, reason: TerminationReason) => void
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/** Options that control when and how the experiment loop stops, plus optional inputs for the first iteration. */
|
|
62
|
+
export interface LoopOptions {
|
|
63
|
+
/**
 * Initial cap on the number of experiments. Each iteration the loop replaces it with
 * `max_experiments` from the run config when one can be read; a falsy value disables the cap check.
 */
maxExperiments: number
|
|
64
|
+
/** Hard abort — kills agent mid-execution, reverts, crash row */
|
|
65
|
+
signal?: AbortSignal
|
|
66
|
+
/** Soft stop — checked at iteration boundary, finishes current experiment normally */
|
|
67
|
+
stopRequested?: () => boolean
|
|
68
|
+
/** Durable ideas.md experiment memory. Disable to use results.tsv/git history only. Defaults to enabled when omitted. */
|
|
69
|
+
ideasBacklogEnabled?: boolean
|
|
70
|
+
/** Diagnostics from the baseline measurement, to pass to the first experiment */
|
|
71
|
+
baselineDiagnostics?: string
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// --- Helpers ---
|
|
75
|
+
|
|
76
|
+
/** Returns the current instant as an ISO-8601 UTC timestamp string (used for `updated_at`). */
const now = (): string => {
  const instant = new Date()
  return instant.toISOString()
}
|
|
77
|
+
|
|
78
|
+
/** Captured contents of one measurement file, used to detect and undo tampering by the agent. */
interface MeasurementFileSnapshot {
|
|
79
|
+
/** File location: the program directory joined with the measurement file name. */
path: string
|
|
80
|
+
/** Path relative to the program directory; used in violation messages. */
label: string
|
|
81
|
+
/** Text content of the file at the time the snapshot was taken. */
content: string
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
 * Captures the current text of every file listed in MEASUREMENT_FILES under `programDir`.
 *
 * Files that do not exist (build.sh may be absent) or cannot be read are left out
 * of the returned snapshot rather than reported as errors.
 *
 * @param programDir Directory that contains the measurement files.
 * @returns One snapshot entry per file that was read successfully.
 */
async function readMeasurementSnapshot(
  programDir: string,
): Promise<MeasurementFileSnapshot[]> {
  const capture = async (path: string): Promise<MeasurementFileSnapshot | null> => {
    try {
      const handle = Bun.file(path)
      const present = await handle.exists()
      if (!present) return null // build.sh may not exist
      const content = await handle.text()
      const label = relative(programDir, path)
      return { path, label, content }
    } catch {
      // Unreadable files are treated the same as missing ones.
      return null
    }
  }

  const targets = MEASUREMENT_FILES.map((name) => join(programDir, name))
  const captured = await Promise.all(targets.map((target) => capture(target)))
  return captured.filter((entry): entry is MeasurementFileSnapshot => entry !== null)
}
|
|
100
|
+
|
|
101
|
+
/**
 * Compares each snapshotted measurement file with what is on disk now.
 *
 * A file counts as violated when its content differs from the snapshot, when any
 * write permission bit is set on it, or when it can no longer be read or stat'ed.
 *
 * @param snapshot Entries previously captured by readMeasurementSnapshot.
 * @returns Labels of the violated files, in snapshot order.
 */
async function getMeasurementViolations(snapshot: MeasurementFileSnapshot[]): Promise<string[]> {
  const WRITE_BITS = 0o222

  const inspect = async (entry: MeasurementFileSnapshot): Promise<string | null> => {
    try {
      const onDisk = await Bun.file(entry.path).text()
      // The permission bits come from stat(); the text read above does not provide them.
      const { mode } = await Bun.file(entry.path).stat()
      const contentChanged = onDisk !== entry.content
      const writable = (mode & WRITE_BITS) !== 0
      return contentChanged || writable ? entry.label : null
    } catch {
      // A file that vanished or became unreadable is reported as a violation.
      return entry.label
    }
  }

  const verdicts = await Promise.all(snapshot.map((entry) => inspect(entry)))
  return verdicts.filter((label): label is string => label !== null)
}
|
|
120
|
+
|
|
121
|
+
/**
 * Writes every snapshotted measurement file back to disk and makes it read-only (0o444).
 *
 * Files are restored concurrently. A failure to make a file writable first is ignored;
 * a failure of the write or of the final chmod rejects the returned promise.
 *
 * @param snapshot Entries previously captured by readMeasurementSnapshot.
 */
async function restoreMeasurementSnapshot(snapshot: MeasurementFileSnapshot[]): Promise<void> {
  const rewriteLocked = async ({ path, content }: MeasurementFileSnapshot): Promise<void> => {
    try {
      await chmod(path, 0o644)
    } catch {
      // Best effort: the write below surfaces the problem if the file is still not writable.
    }
    await Bun.write(path, content)
    await chmod(path, 0o444)
  }

  await Promise.all(snapshot.map((entry) => rewriteLocked(entry)))
}
|
|
128
|
+
|
|
129
|
+
/**
 * Hard-resets the worktree at `cwd` to `startSha` and verifies that it ended up clean.
 *
 * @param cwd Working directory of the repository.
 * @param startSha Commit to reset to.
 * @param errorContext Short description of the reset, embedded in the error message.
 * @throws Error when the working tree is still dirty after the reset.
 */
async function resetAndVerify(cwd: string, startSha: string, errorContext: string): Promise<void> {
  await resetHard(cwd, startSha)

  const clean = await isWorkingTreeClean(cwd)
  if (clean) return

  throw new Error(`Working tree still dirty after ${errorContext}; stopping to avoid contaminating the next experiment.`)
}
|
|
135
|
+
|
|
136
|
+
/**
 * Appends an experiment result (and optional agent notes) to the ideas backlog.
 *
 * This is best effort: a rejected append is ignored so backlog problems never
 * interrupt the experiment loop.
 *
 * @param enabled When false, nothing is written.
 * @param runDir Run directory passed through to appendIdeasBacklog.
 * @param result Result row to record.
 * @param notes Optional notes produced by the experiment agent.
 */
async function recordIdeasBacklog(
  enabled: boolean,
  runDir: string,
  result: ExperimentResult,
  notes?: ExperimentNotes,
): Promise<void> {
  if (enabled) {
    const pending = appendIdeasBacklog(runDir, result, notes)
    await pending.catch(() => {})
  }
}
|
|
145
|
+
|
|
146
|
+
/**
 * Re-measures the baseline after a streak of discards to detect environment drift.
 *
 * The check only runs when `consecutiveDiscards` is a positive multiple of
 * REBASELINE_AFTER_DISCARDS. If the fresh measurement fails, or its median is within
 * noise of the current baseline, the state is returned unchanged. Otherwise the new
 * median becomes `current_baseline`, the state is persisted, and the callbacks are
 * notified (onRebaseline with reason "drift", onError with a warning, onStateUpdate).
 *
 * @returns The state to continue with: either `state` itself or the re-baselined copy.
 */
async function maybeRebaseline(
  consecutiveDiscards: number,
  measureShPath: string,
  buildShPath: string,
  cwd: string,
  config: ProgramConfig,
  state: RunState,
  runDir: string,
  callbacks: LoopCallbacks,
  signal?: AbortSignal,
): Promise<RunState> {
  const dueForCheck = consecutiveDiscards > 0 && consecutiveDiscards % REBASELINE_AFTER_DISCARDS === 0
  if (!dueForCheck) return state

  callbacks.onPhaseChange("measuring", `re-baselining after ${consecutiveDiscards} consecutive discards`)
  const remeasured = await runMeasurementSeries(measureShPath, cwd, config, signal, buildShPath)
  if (!remeasured.success) return state

  const previousBaseline = state.current_baseline
  const freshBaseline = remeasured.median_metric
  const drift = compareMetric(previousBaseline, freshBaseline, config.noise_threshold, config.direction)
  if (drift === "noise") return state

  const rebased: RunState = { ...state, current_baseline: freshBaseline, updated_at: now() }
  await writeState(runDir, rebased)

  callbacks.onRebaseline?.(previousBaseline, freshBaseline, "drift")
  const driftWarning =
    `Baseline drift detected: ${previousBaseline} → ${freshBaseline}. ` +
    `Recent discards may have been compared against a stale baseline.`
  callbacks.onError(driftWarning)
  callbacks.onStateUpdate(rebased)

  return rebased
}
|
|
191
|
+
|
|
192
|
+
// --- Measurement + Decision ---
|
|
193
|
+
|
|
194
|
+
/**
 * Measures a committed candidate and decides whether it is kept or reverted.
 *
 * Outcomes, checked in this order:
 *  1. measurement failed      → reset to `startSha`, "measurement_failure" row, crash counter +1
 *  2. quality gate failed     → reset to `startSha`, "discard" row, discard counter +1
 *  3. metric improved, or it is within noise while removing more lines than it adds
 *                             → "keep" row, then a fresh measurement becomes the new baseline
 *  4. regressed / plain noise → reset to `startSha`, "discard" row, discard counter +1
 *
 * Every outcome appends the result row, records it in the backlog, fires
 * onExperimentEnd and persists the resulting idle state.
 *
 * @param state Run state for the experiment being measured (supplies number and baseline).
 * @param startSha Commit the experiment started from; reset target when reverting.
 * @param candidateSha Commit produced by the agent.
 * @param recordBacklog Called with each result row after it has been appended.
 * @returns The persisted state, whether the candidate was kept, and measurement diagnostics.
 * @throws Error (from resetAndVerify) when a revert leaves the working tree dirty.
 */
async function runMeasurementAndDecide(
  cwd: string,
  runDir: string,
  measureShPath: string,
  buildShPath: string,
  config: ProgramConfig,
  state: RunState,
  startSha: string,
  candidateSha: string,
  description: string,
  diffStats: DiffStats | undefined,
  callbacks: LoopCallbacks,
  recordBacklog: (result: ExperimentResult) => Promise<void>,
  signal?: AbortSignal,
): Promise<{ state: RunState; kept: boolean; diagnostics?: string }> {
  const diffSummary = serializeDiffStats(diffStats)
  const experimentNumber = state.experiment_number
  const shortSha = candidateSha.slice(0, 7)

  // 1. Measure
  callbacks.onPhaseChange("measuring")
  let tracked: RunState = { ...state, phase: "measuring", updated_at: now() }
  await writeState(runDir, tracked)

  const series = await runMeasurementSeries(measureShPath, cwd, config, signal, buildShPath)

  // Persist a result row and notify listeners, always in this order.
  const publish = async (row: ExperimentResult): Promise<void> => {
    await appendResult(runDir, row)
    await recordBacklog(row)
    callbacks.onExperimentEnd(row)
  }

  // Announce and persist the "reverting" phase, then reset to the pre-experiment commit.
  const beginRevert = async (detail: string, resetContext: string): Promise<void> => {
    callbacks.onPhaseChange("reverting", detail)
    tracked = { ...tracked, phase: "reverting", updated_at: now() }
    await writeState(runDir, tracked)
    await resetAndVerify(cwd, startSha, resetContext)
  }

  // Bump the given counter, return to idle, persist, and build the "not kept" return value.
  const finishRejected = async (
    counter: "total_crashes" | "total_discards",
  ): Promise<{ state: RunState; kept: boolean; diagnostics?: string }> => {
    const bumped = counter === "total_crashes"
      ? { total_crashes: tracked.total_crashes + 1 }
      : { total_discards: tracked.total_discards + 1 }
    const settled: RunState = {
      ...tracked,
      ...bumped,
      candidate_sha: null,
      phase: "idle",
      updated_at: now(),
    }
    await writeState(runDir, settled)
    return { state: settled, kept: false, diagnostics: series.diagnostics }
  }

  // 2. Handle measurement failure
  if (!series.success) {
    const failureReason = series.failure_reason ?? "unknown measurement error"
    await beginRevert("measurement failed", "measurement failure reset")
    await publish({
      experiment_number: experimentNumber,
      commit: shortSha,
      metric_value: 0,
      secondary_values: "",
      status: "measurement_failure",
      description: `measurement failed (${failureReason}): ${description}`,
      measurement_duration_ms: series.duration_ms,
      diff_stats: diffSummary,
    })
    return finishRejected("total_crashes")
  }

  // 3. Check quality gates
  if (!series.quality_gates_passed) {
    await beginRevert(`quality gate: ${series.gate_violations.join(", ")}`, "quality gate failure reset")
    await publish({
      experiment_number: experimentNumber,
      commit: shortSha,
      metric_value: series.median_metric,
      secondary_values: serializeSecondaryValues(series.median_quality_gates, series.median_secondary_metrics),
      status: "discard",
      description: `quality gate failed: ${description}`,
      measurement_duration_ms: series.duration_ms,
      diff_stats: diffSummary,
    })
    return finishRejected("total_discards")
  }

  // 4. Compare against baseline
  const measured = series.median_metric
  const verdict = compareMetric(state.current_baseline, measured, config.noise_threshold, config.direction)

  // 5. Simplification auto-keep: within noise, but the change removes more lines than it adds
  const isSimplification =
    verdict === "noise" && diffStats != null && diffStats.lines_removed > diffStats.lines_added

  if (verdict !== "keep" && !isSimplification) {
    // DISCARD (regressed, or noise without simplification)
    const regressed = verdict === "regressed"
    const reason = regressed ? "regressed" : "within noise"
    await beginRevert(`${reason}: ${state.current_baseline} → ${measured}`, "discard reset")
    await publish({
      experiment_number: experimentNumber,
      commit: shortSha,
      metric_value: measured,
      secondary_values: serializeSecondaryValues(series.median_quality_gates, series.median_secondary_metrics),
      status: "discard",
      description: regressed ? description : `noise: ${description}`,
      measurement_duration_ms: series.duration_ms,
      diff_stats: diffSummary,
    })
    return finishRejected("total_discards")
  }

  // KEEP (metric improvement or simplification)
  const keepReason = isSimplification ? "simplification" : "keep"
  callbacks.onPhaseChange("kept", `${keepReason}: ${state.current_baseline} → ${measured}`)

  // A simplification is never a new best; otherwise compare in the configured direction.
  let isBest = false
  if (!isSimplification) {
    isBest = config.direction === "lower" ? measured < state.best_metric : measured > state.best_metric
  }

  await publish({
    experiment_number: experimentNumber,
    commit: shortSha,
    metric_value: measured,
    secondary_values: serializeSecondaryValues(series.median_quality_gates, series.median_secondary_metrics),
    status: "keep",
    description: isSimplification ? `simplification: ${description}` : description,
    measurement_duration_ms: series.duration_ms,
    diff_stats: diffSummary,
  })

  // Re-baseline: fresh measurement on the kept code; fall back to the candidate's median if it fails.
  callbacks.onPhaseChange("measuring", `re-baselining after ${keepReason}`)
  const rebaseline = await runMeasurementSeries(measureShPath, cwd, config, signal, buildShPath)
  const newBaseline = rebaseline.success ? rebaseline.median_metric : measured

  if (rebaseline.success && newBaseline !== measured) {
    callbacks.onRebaseline?.(measured, newBaseline, keepReason)
  }

  const keptState: RunState = {
    ...tracked,
    total_keeps: tracked.total_keeps + 1,
    current_baseline: newBaseline,
    best_metric: isBest ? measured : tracked.best_metric,
    best_experiment: isBest ? experimentNumber : tracked.best_experiment,
    last_known_good_sha: candidateSha,
    candidate_sha: null,
    phase: "idle",
    updated_at: now(),
  }
  await writeState(runDir, keptState)

  const diagnostics = rebaseline.success ? rebaseline.diagnostics : series.diagnostics
  return { state: keptState, kept: true, diagnostics }
}
|
|
382
|
+
|
|
383
|
+
// --- Main Experiment Loop ---
|
|
384
|
+
|
|
385
|
+
/**
 * The main experiment loop. Called after startRun() has established the baseline.
 * Iterates: build context → spawn agent → check locks → measure → decide → repeat.
 *
 * The loop ends when the abort signal fires, a soft stop is requested, the number of
 * consecutive non-kept experiments reaches the stagnation limit, or the experiment
 * cap is reached. The termination reason is recorded at the point where the loop is
 * left, so it always matches the stop condition that actually fired; an aborted
 * signal still takes priority over every other reason.
 *
 * @param cwd Repository working directory the agent and measurements run in.
 * @param programDir Directory containing measure.sh and (optionally) build.sh.
 * @param runDir Directory holding the run state, results and run config.
 * @param config Program configuration (noise threshold, direction, limits).
 * @param modelConfig Model slot passed through to the experiment agent.
 * @param callbacks Live-update callbacks.
 * @param options Stop controls and first-iteration inputs.
 * @returns The final persisted run state, including `termination_reason`.
 * @throws Error when a revert leaves the worktree dirty, or when the agent leaves
 *   uncommitted files that a reset to its own commit cannot remove. Measurement
 *   files are unlocked in all cases, including when an error is thrown.
 */
export async function runExperimentLoop(
  cwd: string,
  programDir: string,
  runDir: string,
  config: ProgramConfig,
  modelConfig: ModelSlot,
  callbacks: LoopCallbacks,
  options: LoopOptions,
): Promise<RunState> {
  const measureShPath = join(programDir, "measure.sh")
  const buildShPath = join(programDir, "build.sh")
  let state = await readState(runDir)
  let consecutiveDiscards = 0
  let lastDiagnostics: string | undefined = options.baselineDiagnostics
  const ideasBacklogEnabled = options.ideasBacklogEnabled ?? true
  const maxConsecutiveDiscards = config.max_consecutive_discards ?? DEFAULT_MAX_CONSECUTIVE_DISCARDS

  // Re-read from run-config.json each iteration to support mid-run TUI changes
  let effectiveMaxExperiments = options.maxExperiments

  // Why the loop was left; set at every `break` so it cannot disagree with the branch taken.
  let exitReason: TerminationReason = "stopped"

  try {
    while (true) {
      const runConfig = await readRunConfig(runDir)
      if (runConfig) effectiveMaxExperiments = runConfig.max_experiments

      // --- Check stop conditions ---
      if (options.signal?.aborted) {
        state = { ...state, phase: "stopping", updated_at: now() }
        await writeState(runDir, state)
        callbacks.onPhaseChange("stopping", "aborted")
        exitReason = "aborted"
        break
      }

      if (options.stopRequested?.()) {
        state = { ...state, phase: "stopping", updated_at: now() }
        await writeState(runDir, state)
        callbacks.onPhaseChange("stopping", "stop requested — finishing after current experiment")
        exitReason = "stopped"
        break
      }

      // Stagnation detection — stop after too many consecutive non-improving experiments
      if (consecutiveDiscards >= maxConsecutiveDiscards) {
        state = { ...state, phase: "stopping", updated_at: now() }
        await writeState(runDir, state)
        callbacks.onPhaseChange("stopping", `stagnation — ${consecutiveDiscards} consecutive discards with no improvement`)
        exitReason = "stagnation"
        break
      }

      // Warn once per streak at ~2/3 of the stagnation limit
      const warningThreshold = Math.floor(maxConsecutiveDiscards * 2 / 3)
      if (warningThreshold > 0 && consecutiveDiscards === warningThreshold) {
        callbacks.onError(`Warning: ${consecutiveDiscards}/${maxConsecutiveDiscards} consecutive discards. Agent may be stuck — consider stopping and reviewing results.`)
      }

      // A falsy cap (e.g. 0) means "no limit", so the comparison is skipped entirely.
      if (effectiveMaxExperiments && state.experiment_number >= effectiveMaxExperiments) {
        state = { ...state, phase: "complete", updated_at: now() }
        await writeState(runDir, state)
        callbacks.onPhaseChange("complete", `reached max experiments (${effectiveMaxExperiments})`)
        exitReason = "max_experiments"
        break
      }

      // --- Start new experiment ---
      const experimentNumber = state.experiment_number + 1
      callbacks.onExperimentStart(experimentNumber)

      state = { ...state, phase: "agent_running", experiment_number: experimentNumber, updated_at: now() }
      await writeState(runDir, state)
      callbacks.onPhaseChange("agent_running")
      callbacks.onStateUpdate(state)

      // --- Build context packet ---
      const packet = await buildContextPacket(
        cwd, programDir, runDir, state, config, { ideasBacklogEnabled, consecutiveDiscards, maxConsecutiveDiscards, measurementDiagnostics: lastDiagnostics },
      )
      const systemPrompt = getExperimentSystemPrompt(packet.program_md, { ideasBacklogEnabled })
      const userPrompt = buildExperimentPrompt(packet)

      // --- Spawn experiment agent ---
      // Capture the starting commit and the measurement files first so both can be restored later.
      const startSha = await getFullSha(cwd)
      const measurementSnapshot = await readMeasurementSnapshot(programDir)

      const outcome = await runExperimentAgent(
        cwd,
        systemPrompt,
        userPrompt,
        modelConfig,
        startSha,
        (text) => callbacks.onAgentStream(text),
        (status) => callbacks.onAgentToolUse(status),
        options.signal,
        config.max_turns ?? 50,
      )

      // Log cost data if available + accumulate tokens on run state
      if (outcome.cost) {
        callbacks.onExperimentCost?.(outcome.cost)
        state = {
          ...state,
          total_tokens: (state.total_tokens ?? 0) + outcome.cost.input_tokens + outcome.cost.output_tokens,
          total_cost_usd: (state.total_cost_usd ?? 0) + outcome.cost.total_cost_usd,
        }
        await writeState(runDir, state)
      }

      // --- Abort detection + cleanup ---
      if (options.signal?.aborted) {
        callbacks.onPhaseChange("stopping", "aborted by user")

        // Restore unconditionally: the agent was interrupted, so the files' state is unknown.
        await restoreMeasurementSnapshot(measurementSnapshot)
        await resetAndVerify(cwd, startSha, "abort cleanup")

        const abortResult: ExperimentResult = {
          experiment_number: experimentNumber,
          commit: startSha.slice(0, 7),
          metric_value: 0,
          secondary_values: "",
          status: "crash",
          description: "aborted by user",
          measurement_duration_ms: 0,
        }
        await appendResult(runDir, abortResult)
        await recordIdeasBacklog(ideasBacklogEnabled, runDir, abortResult)
        callbacks.onExperimentEnd(abortResult)

        state = {
          ...state,
          total_crashes: state.total_crashes + 1,
          candidate_sha: null,
          phase: "stopping",
          updated_at: now(),
        }
        await writeState(runDir, state)
        exitReason = "aborted"
        break
      }

      // --- Handle no-commit or error (no code change) ---
      if (outcome.type === "no_commit" || outcome.type === "agent_error") {
        const measurementViolations = await getMeasurementViolations(measurementSnapshot)
        if (measurementViolations.length > 0) {
          await restoreMeasurementSnapshot(measurementSnapshot)
        }

        await resetAndVerify(cwd, startSha, "failed experiment cleanup")

        // Tampering with measurement files is recorded as a discard; anything else as a crash.
        const isLockViolation = measurementViolations.length > 0
        const crashDesc = isLockViolation
          ? `lock violation: modified ${measurementViolations.join(", ")}`
          : outcome.type === "no_commit"
            ? "no commit produced"
            : `agent error: ${outcome.error}`
        const crashResult: ExperimentResult = {
          experiment_number: experimentNumber,
          commit: startSha.slice(0, 7),
          metric_value: 0,
          secondary_values: "",
          status: isLockViolation ? "discard" : "crash",
          description: crashDesc,
          measurement_duration_ms: 0,
        }
        await appendResult(runDir, crashResult)
        await recordIdeasBacklog(ideasBacklogEnabled, runDir, crashResult, outcome.notes)
        callbacks.onExperimentEnd(crashResult)

        state = {
          ...state,
          total_crashes: isLockViolation ? state.total_crashes : state.total_crashes + 1,
          total_discards: isLockViolation ? state.total_discards + 1 : state.total_discards,
          candidate_sha: null,
          phase: "idle",
          updated_at: now(),
        }
        await writeState(runDir, state)
        callbacks.onStateUpdate(state)
        // Crashes count toward the stagnation streak just like discards.
        consecutiveDiscards++
        state = await maybeRebaseline(consecutiveDiscards, measureShPath, buildShPath, cwd, config, state, runDir, callbacks, options.signal)
        continue
      }

      // --- Agent committed. Use SHA from outcome. ---
      const candidateSha = outcome.sha
      state = { ...state, candidate_sha: candidateSha, updated_at: now() }
      await writeState(runDir, state)

      // Drop anything the agent left uncommitted; if the tree is still dirty, stop the run.
      if (!(await isWorkingTreeClean(cwd))) {
        await resetHard(cwd, candidateSha)
        if (!(await isWorkingTreeClean(cwd))) {
          const measurementViolations = await getMeasurementViolations(measurementSnapshot)
          if (measurementViolations.length > 0) {
            await restoreMeasurementSnapshot(measurementSnapshot)
          }
          throw new Error("Agent left uncommitted files after committing; stopping to avoid measuring a dirty worktree.")
        }
      }

      // --- Check lock violation ---
      // Combine files flagged from the commit's changed-file list with on-disk tampering.
      const lockCheck = checkLockViolation(outcome.files_changed)
      const measurementViolations = await getMeasurementViolations(measurementSnapshot)
      const lockViolationFiles = [...new Set([...lockCheck.files, ...measurementViolations])]
      if (lockViolationFiles.length > 0) {
        callbacks.onPhaseChange("reverting", `lock violation: ${lockViolationFiles.join(", ")}`)

        state = { ...state, phase: "reverting", updated_at: now() }
        await writeState(runDir, state)

        if (measurementViolations.length > 0) {
          await restoreMeasurementSnapshot(measurementSnapshot)
        }
        await resetAndVerify(cwd, startSha, "lock violation reset")

        const lockResult: ExperimentResult = {
          experiment_number: experimentNumber,
          commit: candidateSha.slice(0, 7),
          metric_value: 0,
          secondary_values: "",
          status: "discard",
          description: `lock violation: modified ${lockViolationFiles.join(", ")} — ${outcome.description}`,
          measurement_duration_ms: 0,
        }
        await appendResult(runDir, lockResult)
        await recordIdeasBacklog(ideasBacklogEnabled, runDir, lockResult, outcome.notes)
        callbacks.onExperimentEnd(lockResult)

        state = { ...state, total_discards: state.total_discards + 1, candidate_sha: null, phase: "idle", updated_at: now() }
        await writeState(runDir, state)
        callbacks.onStateUpdate(state)
        consecutiveDiscards++
        state = await maybeRebaseline(consecutiveDiscards, measureShPath, buildShPath, cwd, config, state, runDir, callbacks, options.signal)
        continue
      }

      // --- Check commit count (warn if multiple) ---
      const commitCount = await countCommitsBetween(cwd, startSha, candidateSha)
      if (commitCount > 1) {
        callbacks.onError(`Warning: agent made ${commitCount} commits (expected 1). Proceeding with measurement.`)
      }

      // --- Hand off to measurement ---
      const measurementResult = await runMeasurementAndDecide(
        cwd, runDir, measureShPath, buildShPath,
        config, state, startSha, candidateSha, outcome.description,
        outcome.diff_stats,
        callbacks,
        (result) => recordIdeasBacklog(ideasBacklogEnabled, runDir, result, outcome.notes),
        options.signal,
      )

      // Check if abort fired during measurement
      if (options.signal?.aborted) {
        state = { ...measurementResult.state, phase: "stopping", updated_at: now() }
        await writeState(runDir, state)
        exitReason = "aborted"
        break
      }

      state = measurementResult.state
      lastDiagnostics = measurementResult.diagnostics
      if (measurementResult.kept) {
        consecutiveDiscards = 0
      } else {
        consecutiveDiscards++
        state = await maybeRebaseline(consecutiveDiscards, measureShPath, buildShPath, cwd, config, state, runDir, callbacks, options.signal)
      }

      callbacks.onStateUpdate(state)
    }
  } finally {
    // Always release the measurement files, including when the loop throws.
    await unlockMeasurement(programDir)
  }

  // --- Finalize ---

  // An abort wins over whatever stop condition ended the loop; otherwise report the
  // reason recorded at the break site instead of re-deriving it from counters.
  const reason: TerminationReason = options.signal?.aborted ? "aborted" : exitReason

  const finalState: RunState = {
    ...state,
    phase: state.phase === "stopping" ? "complete" as const : state.phase,
    termination_reason: reason,
    updated_at: now(),
  }
  await writeState(runDir, finalState)
  callbacks.onStateUpdate(finalState)
  callbacks.onLoopComplete?.(finalState, reason)

  return finalState
}
|