@spacek33z/autoauto 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -0
- package/package.json +51 -0
- package/src/App.tsx +224 -0
- package/src/cli.ts +772 -0
- package/src/components/AgentPanel.tsx +254 -0
- package/src/components/Chat.test.tsx +71 -0
- package/src/components/Chat.tsx +308 -0
- package/src/components/CycleField.tsx +23 -0
- package/src/components/ModelPicker.tsx +97 -0
- package/src/components/PostUpdatePrompt.tsx +46 -0
- package/src/components/ResultsTable.tsx +172 -0
- package/src/components/RunCompletePrompt.tsx +90 -0
- package/src/components/RunSettingsOverlay.tsx +49 -0
- package/src/components/RunsTable.tsx +219 -0
- package/src/components/StatsHeader.tsx +100 -0
- package/src/daemon.ts +264 -0
- package/src/index.tsx +8 -0
- package/src/lib/agent/agent-provider.test.ts +133 -0
- package/src/lib/agent/claude-provider.ts +277 -0
- package/src/lib/agent/codex-provider.ts +413 -0
- package/src/lib/agent/default-providers.ts +10 -0
- package/src/lib/agent/index.ts +32 -0
- package/src/lib/agent/mock-provider.ts +61 -0
- package/src/lib/agent/opencode-provider.ts +424 -0
- package/src/lib/agent/types.ts +73 -0
- package/src/lib/auth.ts +11 -0
- package/src/lib/config.ts +152 -0
- package/src/lib/daemon-callbacks.ts +59 -0
- package/src/lib/daemon-client.ts +16 -0
- package/src/lib/daemon-lifecycle.ts +368 -0
- package/src/lib/daemon-spawn.ts +122 -0
- package/src/lib/daemon-status.ts +189 -0
- package/src/lib/daemon-watcher.ts +192 -0
- package/src/lib/experiment-loop.ts +679 -0
- package/src/lib/experiment.ts +356 -0
- package/src/lib/finalize.test.ts +143 -0
- package/src/lib/finalize.ts +511 -0
- package/src/lib/format.test.ts +32 -0
- package/src/lib/format.ts +44 -0
- package/src/lib/git.ts +176 -0
- package/src/lib/ideas-backlog.test.ts +54 -0
- package/src/lib/ideas-backlog.ts +109 -0
- package/src/lib/measure.ts +472 -0
- package/src/lib/model-options.ts +24 -0
- package/src/lib/programs.ts +247 -0
- package/src/lib/push-stream.ts +48 -0
- package/src/lib/run-context.ts +112 -0
- package/src/lib/run-setup.ts +34 -0
- package/src/lib/run.ts +383 -0
- package/src/lib/syntax-theme.ts +39 -0
- package/src/lib/system-prompts/experiment.ts +77 -0
- package/src/lib/system-prompts/finalize.ts +90 -0
- package/src/lib/system-prompts/index.ts +7 -0
- package/src/lib/system-prompts/setup.ts +516 -0
- package/src/lib/system-prompts/update.ts +188 -0
- package/src/lib/tool-events.ts +99 -0
- package/src/lib/validate-measurement.ts +326 -0
- package/src/lib/worktree.ts +40 -0
- package/src/screens/AuthErrorScreen.tsx +31 -0
- package/src/screens/ExecutionScreen.tsx +851 -0
- package/src/screens/FirstSetupScreen.tsx +168 -0
- package/src/screens/HomeScreen.tsx +406 -0
- package/src/screens/PreRunScreen.tsx +206 -0
- package/src/screens/SettingsScreen.tsx +189 -0
- package/src/screens/SetupScreen.tsx +226 -0
- package/src/tui.tsx +17 -0
- package/tsconfig.json +17 -0
package/src/lib/run.ts
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
import { rename, readdir, appendFile, rm } from "node:fs/promises"
|
|
2
|
+
import { join } from "node:path"
|
|
3
|
+
import { $ } from "bun"
|
|
4
|
+
import { getProgramDir, type ProgramConfig } from "./programs.ts"
|
|
5
|
+
|
|
6
|
+
// --- Types ---

/** Status values for results.tsv rows */
export type ExperimentStatus = "keep" | "discard" | "measurement_failure" | "crash"

/** Phases the daemon/orchestrator can be in */
export type RunPhase =
  | "idle"
  | "baseline"
  | "agent_running"
  | "measuring"
  | "reverting"
  | "kept"
  | "stopping"
  | "complete"
  | "crashed"
  | "finalizing"

/** Termination reason for a completed run */
export type TerminationReason = "aborted" | "max_experiments" | "stopped" | "stagnation"

/** Persisted run state — the checkpoint file */
export interface RunState {
  // Timestamp-formatted identifier produced by generateRunId() (YYYYMMDD-HHMMSS)
  run_id: string
  program_slug: string
  phase: RunPhase
  experiment_number: number
  // Metric measured before any experiment ran
  original_baseline: number
  // Metric of the last kept state (updated as keeps accumulate)
  current_baseline: number
  best_metric: number
  best_experiment: number
  total_keeps: number
  total_discards: number
  total_crashes: number
  branch_name: string
  original_baseline_sha: string
  last_known_good_sha: string
  candidate_sha: string | null
  started_at: string
  updated_at: string
  /** Agent provider used for this run. Legacy runs omit this and default to Claude. */
  provider?: string
  /** Model alias/ID used for this run (e.g. "sonnet" or "anthropic/claude-sonnet-4-5") */
  model?: string
  /** Effort level used for this run */
  effort?: string
  /** Cumulative input+output tokens across all experiments */
  total_tokens?: number
  /** Cumulative cost in USD across all experiments */
  total_cost_usd?: number
  /** Why the run terminated (set on completion) */
  termination_reason?: TerminationReason | null
  /** Branch the user was on before the run started */
  original_branch?: string
  /** Absolute path to the AutoAuto-owned worktree */
  worktree_path?: string
  /** True when running without worktree isolation (experiments run in main checkout) */
  in_place?: boolean
  /** Error message if the run crashed */
  error?: string | null
  /** Which phase the error occurred in */
  error_phase?: RunPhase | null
}

/** A single row in results.tsv */
export interface ExperimentResult {
  experiment_number: number
  commit: string
  metric_value: number
  // Raw JSON string; see SecondaryValuesBlob / parseSecondaryValues for its shape
  secondary_values: string
  status: ExperimentStatus
  description: string
  /** Total wall time for the measurement series (all repeats), in ms */
  measurement_duration_ms: number
  /** Diff stats JSON — e.g. {"lines_added":12,"lines_removed":5}. Absent for old runs. */
  diff_stats?: string
}

/** Structured secondary values stored in results.tsv */
export interface SecondaryValuesBlob {
  quality_gates: Record<string, number>
  secondary_metrics: Record<string, number>
}
|
|
89
|
+
|
|
90
|
+
/** Serializes DiffStats into compact JSON for results.tsv. */
|
|
91
|
+
export function serializeDiffStats(stats: { lines_added: number; lines_removed: number } | undefined): string {
|
|
92
|
+
if (!stats) return ""
|
|
93
|
+
return JSON.stringify({ lines_added: stats.lines_added, lines_removed: stats.lines_removed })
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/** Serializes quality gate and secondary metric medians into the structured JSON format. */
|
|
97
|
+
export function serializeSecondaryValues(
|
|
98
|
+
qualityGates: Record<string, number>,
|
|
99
|
+
secondaryMetrics: Record<string, number>,
|
|
100
|
+
): string {
|
|
101
|
+
return JSON.stringify({ quality_gates: qualityGates, secondary_metrics: secondaryMetrics })
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Parses secondary_values JSON with backward compatibility.
|
|
106
|
+
* New format: { quality_gates: {...}, secondary_metrics: {...} }
|
|
107
|
+
* Old format: flat { field: value, ... } — all values placed under quality_gates.
|
|
108
|
+
*/
|
|
109
|
+
export function parseSecondaryValues(raw: string | undefined): SecondaryValuesBlob {
|
|
110
|
+
const empty: SecondaryValuesBlob = { quality_gates: {}, secondary_metrics: {} }
|
|
111
|
+
if (!raw) return empty
|
|
112
|
+
try {
|
|
113
|
+
const parsed = JSON.parse(raw)
|
|
114
|
+
if (typeof parsed !== "object" || parsed === null) return empty
|
|
115
|
+
// Detect new structured format
|
|
116
|
+
if ("quality_gates" in parsed || "secondary_metrics" in parsed) {
|
|
117
|
+
return {
|
|
118
|
+
quality_gates: (parsed as Record<string, unknown>).quality_gates as Record<string, number> ?? {},
|
|
119
|
+
secondary_metrics: (parsed as Record<string, unknown>).secondary_metrics as Record<string, number> ?? {},
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
// Old flat format — treat all values as quality gates
|
|
123
|
+
return { quality_gates: parsed as Record<string, number>, secondary_metrics: {} }
|
|
124
|
+
} catch {
|
|
125
|
+
return empty
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// --- Run ID ---
|
|
130
|
+
|
|
131
|
+
const pad = (n: number) => String(n).padStart(2, "0")
|
|
132
|
+
|
|
133
|
+
export function generateRunId(): string {
|
|
134
|
+
const now = new Date()
|
|
135
|
+
return `${now.getFullYear()}${pad(now.getMonth() + 1)}${pad(now.getDate())}-${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// --- State Persistence ---
|
|
139
|
+
|
|
140
|
+
/** Atomically writes state.json via temp-file + rename. */
|
|
141
|
+
export async function writeState(runDir: string, state: RunState): Promise<void> {
|
|
142
|
+
const tmpPath = join(runDir, "state.json.tmp")
|
|
143
|
+
await Bun.write(tmpPath, JSON.stringify(state, null, 2) + "\n")
|
|
144
|
+
await rename(tmpPath, join(runDir, "state.json"))
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
export async function readState(runDir: string): Promise<RunState> {
|
|
148
|
+
return Bun.file(join(runDir, "state.json")).json() as Promise<RunState>
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// --- Results ---
|
|
152
|
+
|
|
153
|
+
export async function appendResult(runDir: string, result: ExperimentResult): Promise<void> {
|
|
154
|
+
const secondaryStr = result.secondary_values || ""
|
|
155
|
+
const diffStatsStr = result.diff_stats || ""
|
|
156
|
+
const line = `${result.experiment_number}\t${result.commit}\t${result.metric_value}\t${secondaryStr}\t${result.status}\t${result.description}\t${result.measurement_duration_ms}\t${diffStatsStr}\n`
|
|
157
|
+
await appendFile(join(runDir, "results.tsv"), line)
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// --- Results Parsing (synchronous — operate on pre-read content) ---
|
|
161
|
+
|
|
162
|
+
/** Parses a single TSV row into a typed result. Returns null if malformed. */
|
|
163
|
+
function parseTsvRow(line: string): ExperimentResult | null {
|
|
164
|
+
const parts = line.split("\t")
|
|
165
|
+
if (parts.length < 6) return null
|
|
166
|
+
return {
|
|
167
|
+
experiment_number: parseInt(parts[0], 10),
|
|
168
|
+
commit: parts[1],
|
|
169
|
+
metric_value: parseFloat(parts[2]),
|
|
170
|
+
secondary_values: parts[3],
|
|
171
|
+
status: parts[4] as ExperimentStatus,
|
|
172
|
+
description: parts[5],
|
|
173
|
+
measurement_duration_ms: parts[6] ? parseInt(parts[6], 10) : 0,
|
|
174
|
+
diff_stats: parts[7] || undefined,
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/** Formats header + last N data rows from raw results.tsv content. */
|
|
179
|
+
export function formatRecentResults(raw: string, count = 15): string {
|
|
180
|
+
const lines = raw.split("\n").filter(Boolean)
|
|
181
|
+
if (lines.length <= 1) return lines.join("\n")
|
|
182
|
+
|
|
183
|
+
const header = lines[0]
|
|
184
|
+
const rows = lines.slice(1)
|
|
185
|
+
const recent = rows.slice(-count)
|
|
186
|
+
return [header, ...recent].join("\n")
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/** Parses the last row of raw results.tsv content into a typed object. */
|
|
190
|
+
export function parseLastResult(raw: string): ExperimentResult | null {
|
|
191
|
+
const lines = raw.trim().split("\n")
|
|
192
|
+
if (lines.length <= 1) return null // only header
|
|
193
|
+
return parseTsvRow(lines[lines.length - 1])
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/** Parses the last keep row from raw results.tsv content. */
|
|
197
|
+
export function parseLastKeepResult(raw: string): ExperimentResult | null {
|
|
198
|
+
const lines = raw.trim().split("\n")
|
|
199
|
+
for (let i = lines.length - 1; i >= 1; i--) {
|
|
200
|
+
const row = parseTsvRow(lines[i])
|
|
201
|
+
if (row?.status === "keep") return row
|
|
202
|
+
}
|
|
203
|
+
return null
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/** Extracts SHAs of recent discarded/crashed experiments from raw results.tsv content. */
|
|
207
|
+
export function parseDiscardedShas(raw: string, count = 5): string[] {
|
|
208
|
+
const lines = raw.trim().split("\n")
|
|
209
|
+
const shas: string[] = []
|
|
210
|
+
|
|
211
|
+
for (let i = lines.length - 1; i >= 1 && shas.length < count; i--) {
|
|
212
|
+
const parts = lines[i].split("\t")
|
|
213
|
+
if (parts.length >= 5) {
|
|
214
|
+
const status = parts[4]
|
|
215
|
+
if (status === "discard" || status === "crash" || status === "measurement_failure") {
|
|
216
|
+
shas.push(parts[1]) // commit SHA
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
return shas
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
// --- Results Reading ---
|
|
225
|
+
|
|
226
|
+
/** Parses the entire results.tsv into a typed array. */
|
|
227
|
+
export async function readAllResults(runDir: string): Promise<ExperimentResult[]> {
|
|
228
|
+
const raw = await Bun.file(join(runDir, "results.tsv")).text()
|
|
229
|
+
const lines = raw.trim().split("\n")
|
|
230
|
+
if (lines.length <= 1) return [] // only header
|
|
231
|
+
|
|
232
|
+
const results: ExperimentResult[] = []
|
|
233
|
+
for (let i = 1; i < lines.length; i++) {
|
|
234
|
+
const row = parseTsvRow(lines[i])
|
|
235
|
+
if (row) results.push(row)
|
|
236
|
+
}
|
|
237
|
+
return results
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/** Extracts metric values from keep results for sparkline/chart rendering. */
|
|
241
|
+
export function getMetricHistory(results: ExperimentResult[]): number[] {
|
|
242
|
+
return results
|
|
243
|
+
.filter((r) => r.status === "keep")
|
|
244
|
+
.map((r) => r.metric_value)
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/** Computes average measurement duration from results that have duration data. */
|
|
248
|
+
export function getAvgMeasurementDuration(results: ExperimentResult[]): number | null {
|
|
249
|
+
const durations = results
|
|
250
|
+
.filter((r) => r.measurement_duration_ms > 0)
|
|
251
|
+
.map((r) => r.measurement_duration_ms)
|
|
252
|
+
if (durations.length === 0) return null
|
|
253
|
+
return Math.round(durations.reduce((sum, d) => sum + d, 0) / durations.length)
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/** Derived statistics from state for the TUI dashboard. */
export interface RunStats {
  // Sum of keeps + discards + crashes
  total_experiments: number
  total_keeps: number
  total_discards: number
  total_crashes: number
  // Fraction in [0, 1]; 0 when no experiments have run
  keep_rate: number
  // Improvement of best_metric over the original baseline, in percent
  improvement_pct: number
  // Improvement of current_baseline over the original baseline, in percent
  current_improvement_pct: number
}
|
|
266
|
+
|
|
267
|
+
function computeImprovementPct(
|
|
268
|
+
original: number,
|
|
269
|
+
current: number,
|
|
270
|
+
direction: ProgramConfig["direction"],
|
|
271
|
+
): number {
|
|
272
|
+
if (original === 0) return 0
|
|
273
|
+
return direction === "lower"
|
|
274
|
+
? ((original - current) / Math.abs(original)) * 100
|
|
275
|
+
: ((current - original) / Math.abs(original)) * 100
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/** Computes derived statistics from run state. Counts come from RunState's authoritative counters. */
|
|
279
|
+
export function getRunStats(state: RunState, direction: ProgramConfig["direction"]): RunStats {
|
|
280
|
+
const total = state.total_keeps + state.total_discards + state.total_crashes
|
|
281
|
+
|
|
282
|
+
return {
|
|
283
|
+
total_experiments: total,
|
|
284
|
+
total_keeps: state.total_keeps,
|
|
285
|
+
total_discards: state.total_discards,
|
|
286
|
+
total_crashes: state.total_crashes,
|
|
287
|
+
keep_rate: total > 0 ? state.total_keeps / total : 0,
|
|
288
|
+
improvement_pct: computeImprovementPct(state.original_baseline, state.best_metric, direction),
|
|
289
|
+
current_improvement_pct: computeImprovementPct(state.original_baseline, state.current_baseline, direction),
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
// --- Run Listing ---

/** Metadata for a run, used in list views. */
export interface RunInfo {
  run_id: string
  // Absolute path to the run's directory under <programDir>/runs/
  run_dir: string
  // null when state.json is missing or could not be parsed
  state: RunState | null
}
|
|
301
|
+
|
|
302
|
+
/** Lists all runs for a program, sorted newest first. */
|
|
303
|
+
export async function listRuns(programDir: string): Promise<RunInfo[]> {
|
|
304
|
+
const runsDir = join(programDir, "runs")
|
|
305
|
+
let entries: string[]
|
|
306
|
+
try {
|
|
307
|
+
const dirents = await readdir(runsDir, { withFileTypes: true })
|
|
308
|
+
entries = dirents.filter((e) => e.isDirectory()).map((e) => e.name)
|
|
309
|
+
} catch {
|
|
310
|
+
return []
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
const runs = await Promise.all(
|
|
314
|
+
entries.map(async (runId): Promise<RunInfo> => {
|
|
315
|
+
const runDir = join(runsDir, runId)
|
|
316
|
+
let state: RunState | null = null
|
|
317
|
+
try {
|
|
318
|
+
state = await readState(runDir)
|
|
319
|
+
} catch {
|
|
320
|
+
// state.json missing or corrupt
|
|
321
|
+
}
|
|
322
|
+
return { run_id: runId, run_dir: runDir, state }
|
|
323
|
+
}),
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
runs.sort((a, b) => b.run_id.localeCompare(a.run_id))
|
|
327
|
+
return runs
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
/** Returns the most recent run for a program. */
|
|
331
|
+
export async function getLatestRun(programDir: string): Promise<RunInfo | null> {
|
|
332
|
+
const runs = await listRuns(programDir)
|
|
333
|
+
return runs.length > 0 ? runs[0] : null
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
export function isRunActive(r: RunInfo): boolean {
|
|
337
|
+
const phase = r.state?.phase
|
|
338
|
+
return phase != null && phase !== "complete" && phase !== "crashed"
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// --- Run Deletion ---
|
|
342
|
+
|
|
343
|
+
/** Deletes a completed/crashed run: removes run directory, worktree, and git branch. */
|
|
344
|
+
export async function deleteRun(projectRoot: string, run: RunInfo): Promise<void> {
|
|
345
|
+
if (isRunActive(run)) {
|
|
346
|
+
throw new Error("Cannot delete an active run")
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
const state = run.state
|
|
350
|
+
|
|
351
|
+
// Remove worktree if it exists (skip for in-place runs — there's no worktree)
|
|
352
|
+
if (state?.worktree_path && !state?.in_place) {
|
|
353
|
+
await $`git worktree remove --force ${state.worktree_path}`.cwd(projectRoot).nothrow().quiet()
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
// Delete the experiment branch
|
|
357
|
+
if (state?.branch_name) {
|
|
358
|
+
await $`git branch -D ${state.branch_name}`.cwd(projectRoot).nothrow().quiet()
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// Remove the run directory
|
|
362
|
+
await rm(run.run_dir, { recursive: true, force: true })
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
/** Deletes an entire program: removes all runs (worktrees + branches) and the program directory. */
|
|
366
|
+
export async function deleteProgram(projectRoot: string, slug: string): Promise<void> {
|
|
367
|
+
const programDir = getProgramDir(projectRoot, slug)
|
|
368
|
+
const runs = await listRuns(programDir)
|
|
369
|
+
|
|
370
|
+
const activeRun = runs.find(isRunActive)
|
|
371
|
+
if (activeRun) {
|
|
372
|
+
throw new Error("Cannot delete a program with an active run")
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
// Delete all runs first (cleans up worktrees + branches)
|
|
376
|
+
for (const run of runs) {
|
|
377
|
+
await deleteRun(projectRoot, run)
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// Remove the program directory
|
|
381
|
+
await rm(programDir, { recursive: true, force: true })
|
|
382
|
+
}
|
|
383
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { SyntaxStyle } from "@opentui/core"
|
|
2
|
+
import type { ThemeTokenStyle } from "@opentui/core"
|
|
3
|
+
|
|
4
|
+
// Dark-palette token styles for TUI syntax highlighting. Each entry maps
// highlighter scopes to a terminal text style (foreground color + attributes).
const theme: ThemeTokenStyle[] = [
  // Base text
  { scope: ["default"], style: { foreground: "#a9b1d6" } },

  // Keywords & control flow
  { scope: ["keyword"], style: { foreground: "#bb9af7", bold: true } },
  { scope: ["operator"], style: { foreground: "#89ddff" } },

  // Literals
  { scope: ["string"], style: { foreground: "#9ece6a" } },
  { scope: ["number"], style: { foreground: "#ff9e64" } },
  { scope: ["constant"], style: { foreground: "#ff9e64" } },

  // Functions & types
  { scope: ["function"], style: { foreground: "#7aa2f7" } },
  { scope: ["type"], style: { foreground: "#2ac3de" } },

  // Comments
  { scope: ["comment"], style: { foreground: "#565f89", italic: true } },

  // Variables & properties
  { scope: ["variable"], style: { foreground: "#c0caf5" } },
  { scope: ["property"], style: { foreground: "#73daca" } },
  { scope: ["punctuation"], style: { foreground: "#89ddff" } },
  { scope: ["tag"], style: { foreground: "#f7768e" } },

  // Markdown-specific
  { scope: ["markup.heading"], style: { foreground: "#7aa2f7", bold: true } },
  { scope: ["markup.italic"], style: { italic: true } },
  { scope: ["markup.bold"], style: { bold: true } },
  { scope: ["markup.link"], style: { foreground: "#7aa2f7", underline: true } },
  { scope: ["markup.raw"], style: { foreground: "#9ece6a" } },
  { scope: ["markup.list"], style: { foreground: "#ff7b72" } },
]

// Compiled style object consumed by the TUI's code renderer.
export const syntaxStyle = SyntaxStyle.fromTheme(theme)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
 * Returns the system prompt for the experiment agent. Wraps program.md with framing instructions.
 * When the ideas backlog is enabled (the default), a "Leave Experiment Notes" section is
 * injected as section 6 and the exit section shifts to 7; otherwise exit stays section 6.
 */
export function getExperimentSystemPrompt(
  programMd: string,
  options: { ideasBacklogEnabled?: boolean } = {},
): string {
  // Backlog is opt-out: only an explicit `false` disables it.
  const useIdeasBacklog = options.ideasBacklogEnabled !== false
  const notesInstruction = useIdeasBacklog ? `
### 6. Leave Experiment Notes
At the end of your final response, include exactly one notes block for the orchestrator:

<autoauto_notes>
{"hypothesis":"one sentence describing what you tried and why it should affect the metric","why":"one sentence describing what happened or what failure mode to watch for","avoid":["specific approach to avoid repeating"],"next":["specific follow-up idea to try next"]}
</autoauto_notes>

Keep these notes factual and short. Do not edit any ideas backlog file yourself; the orchestrator persists these notes.
` : ""
  // Section numbering shifts depending on whether the notes section above is present.
  const exitSectionNumber = useIdeasBacklog ? "7" : "6"

  return `You are an AutoAuto Experiment Agent — one experiment in an autonomous optimization loop. An external orchestrator handles measurement, keep/discard decisions, and loop control. Your job: analyze, plan ONE targeted optimization, implement it, validate it, and commit.

${programMd}

## How to Be a Good Experimenter

### 1. Analyze Before Acting
- Read the codebase within scope. Understand the current implementation before proposing changes.
- Study results.tsv carefully: which approaches were kept? Which were discarded? What patterns emerge?
- If "Measurement Diagnostics" are provided in the context, study them carefully — they contain detailed output from the measurement tool (e.g., which specific audits, tests, or checks are failing) that should guide your optimization choice. Do NOT guess from code inspection when diagnostics are available.
- Review the 'Recently Discarded Experiments' section above to understand WHY past experiments failed — don't just note that they failed.
- Identify the actual bottleneck or opportunity. A targeted change to the real bottleneck beats a shotgun approach.
- If you're experiment #1, spend extra time reading the codebase. Later experiments should build on what the history tells you.

### 2. Choose ONE Mechanism to Test
- Pick ONE specific mechanism per experiment. "Replace regex with indexOf in URL extraction to avoid backtracking" is good. "Various improvements" is bad.
- Build on what worked: if recent keeps share a pattern (e.g., reducing allocations), explore that direction further.
- Avoid what failed: if recent discards share a pattern (e.g., algorithmic changes that broke quality), steer clear.
- Do NOT repeat discarded approaches — even with minor variations. If tree shaking was discarded, "better tree shaking" is likely wasteful too.
- You should be able to explain in one sentence WHY your change should improve the metric. If you can't, pick a different approach.
- When the obvious optimizations are exhausted, look deeper: profile the code mentally, read the hot path line by line, check for redundant work, unnecessary allocations, or algorithmic inefficiency.

### 3. Implement the Change
- Make exactly ONE focused change — not multiple changes at once.
- Keep diffs small and reviewable. A 10-line targeted fix beats a 200-line refactor.
- Stay strictly within the allowed file scope defined in program.md.
- NEVER modify files in .autoauto/ — these are locked by the orchestrator.
- NEVER modify measure.sh, build.sh, or config.json — they are read-only (chmod 444).
- NEVER hardcode absolute home directory paths (e.g. /Users/username/...). Use relative paths, \`$HOME\`, or \`~\`.

### 4. Validate
- Run existing tests if available. If tests fail, fix them or revert — do NOT commit broken code.
- If your change breaks the build, try to fix it. If you can't fix it quickly, revert everything and exit without committing.
- Do NOT run the measurement script — the orchestrator handles that after you commit.

### 5. Commit with a Descriptive Message
- Commit with: git add -A && git commit -m "<type>(scope): description"
- Explain the MECHANISM in your commit message, not just the action:
  - Good: "perf(parser): replace regex with indexOf for URL extraction — avoids backtracking on long strings"
  - Bad: "perf: improve performance"
- The commit message is how future experiment agents learn from your work. Make it count.
${notesInstruction}
### ${exitSectionNumber}. When to Exit Without Committing
- If you've analyzed the code and can't find a promising change within scope — exit. A no-commit is better than a low-quality experiment that wastes measurement time.
- If validation fails and you cannot fix it — revert and exit.
- If your proposed change is essentially the same as a recently discarded experiment — exit instead of wasting a cycle.
- Do NOT ask for human input — you are fully autonomous.

## What Makes Experiments Fail (Avoid These)
- **Repeating discarded ideas:** The #1 waste of cycles. Read the history carefully.
- **Shotgun changes:** Multiple unrelated changes in one experiment. The orchestrator can't tell which one helped.
- **Out-of-scope modifications:** Touching files outside your allowed scope gets the entire experiment discarded.
- **Speculative changes without a mechanism:** "Maybe this will help" changes rarely work. Have a clear hypothesis.
- **Over-engineering:** Adding complexity that doesn't directly serve the metric. Simpler is better at equal metric.
- **Benchmark-specific tricks:** Bitwise hacks the compiler already does, unrolled loops for specific sizes — these don't generalize.

## Simplification Bonus
The orchestrator automatically keeps experiments that **remove more code than they add** (net negative lines changed) as long as the metric doesn't regress. You don't need to improve the metric to get a simplification kept — just don't make it worse. Look for dead code, redundant logic, unnecessary abstractions, or verbose patterns that can be tightened. Simplification keeps are valuable and count as real progress.`
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/**
 * Returns the system prompt for the finalize agent. Read-only review + grouping of
 * accumulated experiment changes. The prompt instructs the agent to emit a
 * <finalize_groups> JSON block that downstream code parses into per-group branches.
 */
export function getFinalizeSystemPrompt(): string {
  // Static prompt — no interpolation; run-specific context arrives in the user message.
  return `You are the AutoAuto Finalize Agent — a code reviewer for an autonomous experiment run. An orchestrator ran multiple experiments on a branch, keeping improvements and discarding failures. Your job: review the accumulated changes, assess risks, group them into logical changesets, and produce a structured summary.

## Your Role

You are a READ-ONLY reviewer. You MUST NOT modify any files. You only analyze and report.

## Tools

Use these tools to inspect the changes:
- **Bash**: Run \`git log\`, \`git diff\`, \`git show <sha>\` to inspect individual commits and the overall diff
- **Read**: Read source files to understand context around changes
- **Glob/Grep**: Search the codebase to understand how changed code is used

## Task

1. Review the full diff provided in the user message
2. Inspect individual experiment commits via \`git log --oneline\` and \`git show <sha>\` to understand the evolution
3. Read surrounding source code to assess impact of changes
4. Group the changed files into logical changesets (see Group Analysis below)
5. Produce a structured summary (see Output Format below)

## Group Analysis

Your primary job is to group changed files into logical, independently-reviewable changesets. Each group will become its own git branch that can be reviewed and merged independently.

**Rules:**
- The user message includes a "Changed Files" list — this is the canonical set of files. Use ONLY files from this list.
- Each file must appear in exactly ONE group. You cannot split changes within a single file across groups.
- Group files that form a single logical change together (e.g., a feature + its tests, a refactor across related files).
- Each group should be independently mergeable — it should make sense on its own without the other groups.
- If all changes are tightly coupled and cannot be meaningfully separated, put everything in a single group. That's fine.
- Use kebab-case for group names (e.g., "optimize-image-loading", "remove-unused-deps").

## Output Format

Your final output MUST contain all of these sections:

## Summary
One paragraph overview of what the experiment run accomplished. Mention the metric, improvement achieved, and number of kept changes.

## Changes
Bulleted list of each logical change. For each:
- What was changed (file paths, function names)
- Why it likely improved the metric
- How significant the change is

## Risk Assessment
Flag any concerns:
- **Security**: New attack surfaces, input validation gaps, auth changes
- **User-facing behavior**: UI changes, API contract changes, output format changes
- **Performance**: Potential regressions in non-measured dimensions (memory, startup time)
- **Error handling**: Removed error checks, swallowed exceptions, narrowed error types
- **Correctness**: Logic changes that might break edge cases

If no risks are found, say "No significant risks identified."

## Recommendations
List items that warrant manual review before merging. If none, say "No specific recommendations."

## Finalize Groups
Wrap your grouping in XML tags containing a JSON array. Use conventional commit format for titles.

<finalize_groups>
[
  {
    "name": "optimize-image-loading",
    "title": "perf(images): lazy-load below-fold images and use WebP format",
    "description": "Converted eager image loading to intersection-observer-based lazy loading",
    "files": ["src/components/ImageLoader.tsx", "src/utils/image.ts"],
    "risk": "low"
  },
  {
    "name": "remove-unused-deps",
    "title": "refactor: remove lodash and moment.js dependencies",
    "description": "Replaced lodash utilities with native array methods, moment with Intl.DateTimeFormat",
    "files": ["package.json", "src/utils/date.ts", "src/utils/array.ts"],
    "risk": "low"
  }
]
</finalize_groups>

Each group object must have:
- \`name\`: kebab-case identifier (used in branch name)
- \`title\`: conventional commit message for this group
- \`description\`: 1-2 sentence summary of what changed
- \`files\`: array of file paths (ONLY from the Changed Files list)
- \`risk\`: "low", "medium", or "high"`
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export { getSetupSystemPrompt, type SetupPromptResult } from "./setup.ts"
|
|
2
|
+
export { getUpdateSystemPrompt, type UpdatePromptResult } from "./update.ts"
|
|
3
|
+
export { getExperimentSystemPrompt } from "./experiment.ts"
|
|
4
|
+
export { getFinalizeSystemPrompt } from "./finalize.ts"
|
|
5
|
+
|
|
6
|
+
/** Fallback system prompt used when no task-specific prompt applies. */
export const DEFAULT_SYSTEM_PROMPT =
  "You are AutoAuto, an autoresearch assistant. Be concise."
|