@spacek33z/autoauto 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/README.md +197 -0
  2. package/package.json +51 -0
  3. package/src/App.tsx +224 -0
  4. package/src/cli.ts +772 -0
  5. package/src/components/AgentPanel.tsx +254 -0
  6. package/src/components/Chat.test.tsx +71 -0
  7. package/src/components/Chat.tsx +308 -0
  8. package/src/components/CycleField.tsx +23 -0
  9. package/src/components/ModelPicker.tsx +97 -0
  10. package/src/components/PostUpdatePrompt.tsx +46 -0
  11. package/src/components/ResultsTable.tsx +172 -0
  12. package/src/components/RunCompletePrompt.tsx +90 -0
  13. package/src/components/RunSettingsOverlay.tsx +49 -0
  14. package/src/components/RunsTable.tsx +219 -0
  15. package/src/components/StatsHeader.tsx +100 -0
  16. package/src/daemon.ts +264 -0
  17. package/src/index.tsx +8 -0
  18. package/src/lib/agent/agent-provider.test.ts +133 -0
  19. package/src/lib/agent/claude-provider.ts +277 -0
  20. package/src/lib/agent/codex-provider.ts +413 -0
  21. package/src/lib/agent/default-providers.ts +10 -0
  22. package/src/lib/agent/index.ts +32 -0
  23. package/src/lib/agent/mock-provider.ts +61 -0
  24. package/src/lib/agent/opencode-provider.ts +424 -0
  25. package/src/lib/agent/types.ts +73 -0
  26. package/src/lib/auth.ts +11 -0
  27. package/src/lib/config.ts +152 -0
  28. package/src/lib/daemon-callbacks.ts +59 -0
  29. package/src/lib/daemon-client.ts +16 -0
  30. package/src/lib/daemon-lifecycle.ts +368 -0
  31. package/src/lib/daemon-spawn.ts +122 -0
  32. package/src/lib/daemon-status.ts +189 -0
  33. package/src/lib/daemon-watcher.ts +192 -0
  34. package/src/lib/experiment-loop.ts +679 -0
  35. package/src/lib/experiment.ts +356 -0
  36. package/src/lib/finalize.test.ts +143 -0
  37. package/src/lib/finalize.ts +511 -0
  38. package/src/lib/format.test.ts +32 -0
  39. package/src/lib/format.ts +44 -0
  40. package/src/lib/git.ts +176 -0
  41. package/src/lib/ideas-backlog.test.ts +54 -0
  42. package/src/lib/ideas-backlog.ts +109 -0
  43. package/src/lib/measure.ts +472 -0
  44. package/src/lib/model-options.ts +24 -0
  45. package/src/lib/programs.ts +247 -0
  46. package/src/lib/push-stream.ts +48 -0
  47. package/src/lib/run-context.ts +112 -0
  48. package/src/lib/run-setup.ts +34 -0
  49. package/src/lib/run.ts +383 -0
  50. package/src/lib/syntax-theme.ts +39 -0
  51. package/src/lib/system-prompts/experiment.ts +77 -0
  52. package/src/lib/system-prompts/finalize.ts +90 -0
  53. package/src/lib/system-prompts/index.ts +7 -0
  54. package/src/lib/system-prompts/setup.ts +516 -0
  55. package/src/lib/system-prompts/update.ts +188 -0
  56. package/src/lib/tool-events.ts +99 -0
  57. package/src/lib/validate-measurement.ts +326 -0
  58. package/src/lib/worktree.ts +40 -0
  59. package/src/screens/AuthErrorScreen.tsx +31 -0
  60. package/src/screens/ExecutionScreen.tsx +851 -0
  61. package/src/screens/FirstSetupScreen.tsx +168 -0
  62. package/src/screens/HomeScreen.tsx +406 -0
  63. package/src/screens/PreRunScreen.tsx +206 -0
  64. package/src/screens/SettingsScreen.tsx +189 -0
  65. package/src/screens/SetupScreen.tsx +226 -0
  66. package/src/tui.tsx +17 -0
  67. package/tsconfig.json +17 -0
package/src/lib/run.ts ADDED
@@ -0,0 +1,383 @@
1
+ import { rename, readdir, appendFile, rm } from "node:fs/promises"
2
+ import { join } from "node:path"
3
+ import { $ } from "bun"
4
+ import { getProgramDir, type ProgramConfig } from "./programs.ts"
5
+
6
+ // --- Types ---
7
+
8
+ /** Status values for results.tsv rows */
9
+ export type ExperimentStatus = "keep" | "discard" | "measurement_failure" | "crash"
10
+
11
+ /** Phases the daemon/orchestrator can be in */
12
+ export type RunPhase =
13
+ | "idle"
14
+ | "baseline"
15
+ | "agent_running"
16
+ | "measuring"
17
+ | "reverting"
18
+ | "kept"
19
+ | "stopping"
20
+ | "complete"
21
+ | "crashed"
22
+ | "finalizing"
23
+
24
+ /** Termination reason for a completed run */
25
+ export type TerminationReason = "aborted" | "max_experiments" | "stopped" | "stagnation"
26
+
27
+ /** Persisted run state — the checkpoint file */
28
+ export interface RunState {
29
+ run_id: string
30
+ program_slug: string
31
+ phase: RunPhase
32
+ experiment_number: number
33
+ original_baseline: number
34
+ current_baseline: number
35
+ best_metric: number
36
+ best_experiment: number
37
+ total_keeps: number
38
+ total_discards: number
39
+ total_crashes: number
40
+ branch_name: string
41
+ original_baseline_sha: string
42
+ last_known_good_sha: string
43
+ candidate_sha: string | null
44
+ started_at: string
45
+ updated_at: string
46
+ /** Agent provider used for this run. Legacy runs omit this and default to Claude. */
47
+ provider?: string
48
+ /** Model alias/ID used for this run (e.g. "sonnet" or "anthropic/claude-sonnet-4-5") */
49
+ model?: string
50
+ /** Effort level used for this run */
51
+ effort?: string
52
+ /** Cumulative input+output tokens across all experiments */
53
+ total_tokens?: number
54
+ /** Cumulative cost in USD across all experiments */
55
+ total_cost_usd?: number
56
+ /** Why the run terminated (set on completion) */
57
+ termination_reason?: TerminationReason | null
58
+ /** Branch the user was on before the run started */
59
+ original_branch?: string
60
+ /** Absolute path to the AutoAuto-owned worktree */
61
+ worktree_path?: string
62
+ /** True when running without worktree isolation (experiments run in main checkout) */
63
+ in_place?: boolean
64
+ /** Error message if the run crashed */
65
+ error?: string | null
66
+ /** Which phase the error occurred in */
67
+ error_phase?: RunPhase | null
68
+ }
69
+
70
+ /** A single row in results.tsv */
71
+ export interface ExperimentResult {
72
+ experiment_number: number
73
+ commit: string
74
+ metric_value: number
75
+ secondary_values: string
76
+ status: ExperimentStatus
77
+ description: string
78
+ /** Total wall time for the measurement series (all repeats), in ms */
79
+ measurement_duration_ms: number
80
+ /** Diff stats JSON — e.g. {"lines_added":12,"lines_removed":5}. Absent for old runs. */
81
+ diff_stats?: string
82
+ }
83
+
84
+ /** Structured secondary values stored in results.tsv */
85
+ export interface SecondaryValuesBlob {
86
+ quality_gates: Record<string, number>
87
+ secondary_metrics: Record<string, number>
88
+ }
89
+
90
+ /** Serializes DiffStats into compact JSON for results.tsv. */
91
+ export function serializeDiffStats(stats: { lines_added: number; lines_removed: number } | undefined): string {
92
+ if (!stats) return ""
93
+ return JSON.stringify({ lines_added: stats.lines_added, lines_removed: stats.lines_removed })
94
+ }
95
+
96
+ /** Serializes quality gate and secondary metric medians into the structured JSON format. */
97
+ export function serializeSecondaryValues(
98
+ qualityGates: Record<string, number>,
99
+ secondaryMetrics: Record<string, number>,
100
+ ): string {
101
+ return JSON.stringify({ quality_gates: qualityGates, secondary_metrics: secondaryMetrics })
102
+ }
103
+
104
+ /**
105
+ * Parses secondary_values JSON with backward compatibility.
106
+ * New format: { quality_gates: {...}, secondary_metrics: {...} }
107
+ * Old format: flat { field: value, ... } — all values placed under quality_gates.
108
+ */
109
+ export function parseSecondaryValues(raw: string | undefined): SecondaryValuesBlob {
110
+ const empty: SecondaryValuesBlob = { quality_gates: {}, secondary_metrics: {} }
111
+ if (!raw) return empty
112
+ try {
113
+ const parsed = JSON.parse(raw)
114
+ if (typeof parsed !== "object" || parsed === null) return empty
115
+ // Detect new structured format
116
+ if ("quality_gates" in parsed || "secondary_metrics" in parsed) {
117
+ return {
118
+ quality_gates: (parsed as Record<string, unknown>).quality_gates as Record<string, number> ?? {},
119
+ secondary_metrics: (parsed as Record<string, unknown>).secondary_metrics as Record<string, number> ?? {},
120
+ }
121
+ }
122
+ // Old flat format — treat all values as quality gates
123
+ return { quality_gates: parsed as Record<string, number>, secondary_metrics: {} }
124
+ } catch {
125
+ return empty
126
+ }
127
+ }
128
+
129
+ // --- Run ID ---
130
+
131
+ const pad = (n: number) => String(n).padStart(2, "0")
132
+
133
+ export function generateRunId(): string {
134
+ const now = new Date()
135
+ return `${now.getFullYear()}${pad(now.getMonth() + 1)}${pad(now.getDate())}-${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`
136
+ }
137
+
138
+ // --- State Persistence ---
139
+
140
+ /** Atomically writes state.json via temp-file + rename. */
141
+ export async function writeState(runDir: string, state: RunState): Promise<void> {
142
+ const tmpPath = join(runDir, "state.json.tmp")
143
+ await Bun.write(tmpPath, JSON.stringify(state, null, 2) + "\n")
144
+ await rename(tmpPath, join(runDir, "state.json"))
145
+ }
146
+
147
+ export async function readState(runDir: string): Promise<RunState> {
148
+ return Bun.file(join(runDir, "state.json")).json() as Promise<RunState>
149
+ }
150
+
151
+ // --- Results ---
152
+
153
+ export async function appendResult(runDir: string, result: ExperimentResult): Promise<void> {
154
+ const secondaryStr = result.secondary_values || ""
155
+ const diffStatsStr = result.diff_stats || ""
156
+ const line = `${result.experiment_number}\t${result.commit}\t${result.metric_value}\t${secondaryStr}\t${result.status}\t${result.description}\t${result.measurement_duration_ms}\t${diffStatsStr}\n`
157
+ await appendFile(join(runDir, "results.tsv"), line)
158
+ }
159
+
160
+ // --- Results Parsing (synchronous — operate on pre-read content) ---
161
+
162
+ /** Parses a single TSV row into a typed result. Returns null if malformed. */
163
+ function parseTsvRow(line: string): ExperimentResult | null {
164
+ const parts = line.split("\t")
165
+ if (parts.length < 6) return null
166
+ return {
167
+ experiment_number: parseInt(parts[0], 10),
168
+ commit: parts[1],
169
+ metric_value: parseFloat(parts[2]),
170
+ secondary_values: parts[3],
171
+ status: parts[4] as ExperimentStatus,
172
+ description: parts[5],
173
+ measurement_duration_ms: parts[6] ? parseInt(parts[6], 10) : 0,
174
+ diff_stats: parts[7] || undefined,
175
+ }
176
+ }
177
+
178
+ /** Formats header + last N data rows from raw results.tsv content. */
179
+ export function formatRecentResults(raw: string, count = 15): string {
180
+ const lines = raw.split("\n").filter(Boolean)
181
+ if (lines.length <= 1) return lines.join("\n")
182
+
183
+ const header = lines[0]
184
+ const rows = lines.slice(1)
185
+ const recent = rows.slice(-count)
186
+ return [header, ...recent].join("\n")
187
+ }
188
+
189
+ /** Parses the last row of raw results.tsv content into a typed object. */
190
+ export function parseLastResult(raw: string): ExperimentResult | null {
191
+ const lines = raw.trim().split("\n")
192
+ if (lines.length <= 1) return null // only header
193
+ return parseTsvRow(lines[lines.length - 1])
194
+ }
195
+
196
+ /** Parses the last keep row from raw results.tsv content. */
197
+ export function parseLastKeepResult(raw: string): ExperimentResult | null {
198
+ const lines = raw.trim().split("\n")
199
+ for (let i = lines.length - 1; i >= 1; i--) {
200
+ const row = parseTsvRow(lines[i])
201
+ if (row?.status === "keep") return row
202
+ }
203
+ return null
204
+ }
205
+
206
+ /** Extracts SHAs of recent discarded/crashed experiments from raw results.tsv content. */
207
+ export function parseDiscardedShas(raw: string, count = 5): string[] {
208
+ const lines = raw.trim().split("\n")
209
+ const shas: string[] = []
210
+
211
+ for (let i = lines.length - 1; i >= 1 && shas.length < count; i--) {
212
+ const parts = lines[i].split("\t")
213
+ if (parts.length >= 5) {
214
+ const status = parts[4]
215
+ if (status === "discard" || status === "crash" || status === "measurement_failure") {
216
+ shas.push(parts[1]) // commit SHA
217
+ }
218
+ }
219
+ }
220
+
221
+ return shas
222
+ }
223
+
224
+ // --- Results Reading ---
225
+
226
+ /** Parses the entire results.tsv into a typed array. */
227
+ export async function readAllResults(runDir: string): Promise<ExperimentResult[]> {
228
+ const raw = await Bun.file(join(runDir, "results.tsv")).text()
229
+ const lines = raw.trim().split("\n")
230
+ if (lines.length <= 1) return [] // only header
231
+
232
+ const results: ExperimentResult[] = []
233
+ for (let i = 1; i < lines.length; i++) {
234
+ const row = parseTsvRow(lines[i])
235
+ if (row) results.push(row)
236
+ }
237
+ return results
238
+ }
239
+
240
+ /** Extracts metric values from keep results for sparkline/chart rendering. */
241
+ export function getMetricHistory(results: ExperimentResult[]): number[] {
242
+ return results
243
+ .filter((r) => r.status === "keep")
244
+ .map((r) => r.metric_value)
245
+ }
246
+
247
+ /** Computes average measurement duration from results that have duration data. */
248
+ export function getAvgMeasurementDuration(results: ExperimentResult[]): number | null {
249
+ const durations = results
250
+ .filter((r) => r.measurement_duration_ms > 0)
251
+ .map((r) => r.measurement_duration_ms)
252
+ if (durations.length === 0) return null
253
+ return Math.round(durations.reduce((sum, d) => sum + d, 0) / durations.length)
254
+ }
255
+
256
/** Derived statistics from state for the TUI dashboard. */
export interface RunStats {
  /** keeps + discards + crashes */
  total_experiments: number
  total_keeps: number
  total_discards: number
  total_crashes: number
  /** total_keeps / total_experiments, or 0 when no experiments yet */
  keep_rate: number
  /** Percent improvement of best_metric over original_baseline */
  improvement_pct: number
  /** Percent improvement of current_baseline over original_baseline */
  current_improvement_pct: number
}
266
+
267
+ function computeImprovementPct(
268
+ original: number,
269
+ current: number,
270
+ direction: ProgramConfig["direction"],
271
+ ): number {
272
+ if (original === 0) return 0
273
+ return direction === "lower"
274
+ ? ((original - current) / Math.abs(original)) * 100
275
+ : ((current - original) / Math.abs(original)) * 100
276
+ }
277
+
278
+ /** Computes derived statistics from run state. Counts come from RunState's authoritative counters. */
279
+ export function getRunStats(state: RunState, direction: ProgramConfig["direction"]): RunStats {
280
+ const total = state.total_keeps + state.total_discards + state.total_crashes
281
+
282
+ return {
283
+ total_experiments: total,
284
+ total_keeps: state.total_keeps,
285
+ total_discards: state.total_discards,
286
+ total_crashes: state.total_crashes,
287
+ keep_rate: total > 0 ? state.total_keeps / total : 0,
288
+ improvement_pct: computeImprovementPct(state.original_baseline, state.best_metric, direction),
289
+ current_improvement_pct: computeImprovementPct(state.original_baseline, state.current_baseline, direction),
290
+ }
291
+ }
292
+
293
// --- Run Listing ---

/** Metadata for a run, used in list views. */
export interface RunInfo {
  /** Run identifier — also the name of the run's directory under <program>/runs */
  run_id: string
  /** Path to the run's directory (joined from the program directory) */
  run_dir: string
  /** Parsed state.json, or null when the file is missing or corrupt */
  state: RunState | null
}
301
+
302
+ /** Lists all runs for a program, sorted newest first. */
303
+ export async function listRuns(programDir: string): Promise<RunInfo[]> {
304
+ const runsDir = join(programDir, "runs")
305
+ let entries: string[]
306
+ try {
307
+ const dirents = await readdir(runsDir, { withFileTypes: true })
308
+ entries = dirents.filter((e) => e.isDirectory()).map((e) => e.name)
309
+ } catch {
310
+ return []
311
+ }
312
+
313
+ const runs = await Promise.all(
314
+ entries.map(async (runId): Promise<RunInfo> => {
315
+ const runDir = join(runsDir, runId)
316
+ let state: RunState | null = null
317
+ try {
318
+ state = await readState(runDir)
319
+ } catch {
320
+ // state.json missing or corrupt
321
+ }
322
+ return { run_id: runId, run_dir: runDir, state }
323
+ }),
324
+ )
325
+
326
+ runs.sort((a, b) => b.run_id.localeCompare(a.run_id))
327
+ return runs
328
+ }
329
+
330
+ /** Returns the most recent run for a program. */
331
+ export async function getLatestRun(programDir: string): Promise<RunInfo | null> {
332
+ const runs = await listRuns(programDir)
333
+ return runs.length > 0 ? runs[0] : null
334
+ }
335
+
336
+ export function isRunActive(r: RunInfo): boolean {
337
+ const phase = r.state?.phase
338
+ return phase != null && phase !== "complete" && phase !== "crashed"
339
+ }
340
+
341
+ // --- Run Deletion ---
342
+
343
+ /** Deletes a completed/crashed run: removes run directory, worktree, and git branch. */
344
+ export async function deleteRun(projectRoot: string, run: RunInfo): Promise<void> {
345
+ if (isRunActive(run)) {
346
+ throw new Error("Cannot delete an active run")
347
+ }
348
+
349
+ const state = run.state
350
+
351
+ // Remove worktree if it exists (skip for in-place runs — there's no worktree)
352
+ if (state?.worktree_path && !state?.in_place) {
353
+ await $`git worktree remove --force ${state.worktree_path}`.cwd(projectRoot).nothrow().quiet()
354
+ }
355
+
356
+ // Delete the experiment branch
357
+ if (state?.branch_name) {
358
+ await $`git branch -D ${state.branch_name}`.cwd(projectRoot).nothrow().quiet()
359
+ }
360
+
361
+ // Remove the run directory
362
+ await rm(run.run_dir, { recursive: true, force: true })
363
+ }
364
+
365
+ /** Deletes an entire program: removes all runs (worktrees + branches) and the program directory. */
366
+ export async function deleteProgram(projectRoot: string, slug: string): Promise<void> {
367
+ const programDir = getProgramDir(projectRoot, slug)
368
+ const runs = await listRuns(programDir)
369
+
370
+ const activeRun = runs.find(isRunActive)
371
+ if (activeRun) {
372
+ throw new Error("Cannot delete a program with an active run")
373
+ }
374
+
375
+ // Delete all runs first (cleans up worktrees + branches)
376
+ for (const run of runs) {
377
+ await deleteRun(projectRoot, run)
378
+ }
379
+
380
+ // Remove the program directory
381
+ await rm(programDir, { recursive: true, force: true })
382
+ }
383
+
@@ -0,0 +1,39 @@
1
+ import { SyntaxStyle } from "@opentui/core"
2
+ import type { ThemeTokenStyle } from "@opentui/core"
3
+
4
// Token style theme for syntax highlighting in the TUI.
// NOTE(review): the palette looks like the Tokyo Night color scheme — confirm before documenting it as such.
const theme: ThemeTokenStyle[] = [
  // Base text
  { scope: ["default"], style: { foreground: "#a9b1d6" } },

  // Keywords & control flow
  { scope: ["keyword"], style: { foreground: "#bb9af7", bold: true } },
  { scope: ["operator"], style: { foreground: "#89ddff" } },

  // Literals
  { scope: ["string"], style: { foreground: "#9ece6a" } },
  { scope: ["number"], style: { foreground: "#ff9e64" } },
  { scope: ["constant"], style: { foreground: "#ff9e64" } },

  // Functions & types
  { scope: ["function"], style: { foreground: "#7aa2f7" } },
  { scope: ["type"], style: { foreground: "#2ac3de" } },

  // Comments
  { scope: ["comment"], style: { foreground: "#565f89", italic: true } },

  // Variables & properties
  { scope: ["variable"], style: { foreground: "#c0caf5" } },
  { scope: ["property"], style: { foreground: "#73daca" } },
  { scope: ["punctuation"], style: { foreground: "#89ddff" } },
  { scope: ["tag"], style: { foreground: "#f7768e" } },

  // Markdown-specific
  { scope: ["markup.heading"], style: { foreground: "#7aa2f7", bold: true } },
  { scope: ["markup.italic"], style: { italic: true } },
  { scope: ["markup.bold"], style: { bold: true } },
  { scope: ["markup.link"], style: { foreground: "#7aa2f7", underline: true } },
  { scope: ["markup.raw"], style: { foreground: "#9ece6a" } },
  { scope: ["markup.list"], style: { foreground: "#ff7b72" } },
]

/** Precompiled style object built from the theme above. */
export const syntaxStyle = SyntaxStyle.fromTheme(theme)
@@ -0,0 +1,77 @@
1
/**
 * Returns the system prompt for the experiment agent. Wraps program.md with framing instructions.
 *
 * @param programMd - Raw contents of the program's program.md, interpolated verbatim into the prompt
 * @param options - ideasBacklogEnabled: when not explicitly false, adds a numbered
 *   "Leave Experiment Notes" section instructing the agent to emit an <autoauto_notes> block
 * @returns The complete system prompt string
 */
export function getExperimentSystemPrompt(
  programMd: string,
  options: { ideasBacklogEnabled?: boolean } = {},
): string {
  // Ideas backlog is opt-out: only an explicit `false` disables it.
  const useIdeasBacklog = options.ideasBacklogEnabled !== false
  // Optional section 6 — tells the agent to emit machine-readable notes for the orchestrator.
  const notesInstruction = useIdeasBacklog ? `
### 6. Leave Experiment Notes
At the end of your final response, include exactly one notes block for the orchestrator:

<autoauto_notes>
{"hypothesis":"one sentence describing what you tried and why it should affect the metric","why":"one sentence describing what happened or what failure mode to watch for","avoid":["specific approach to avoid repeating"],"next":["specific follow-up idea to try next"]}
</autoauto_notes>

Keep these notes factual and short. Do not edit any ideas backlog file yourself; the orchestrator persists these notes.
` : ""
  // The "exit" section number shifts depending on whether the notes section is present.
  const exitSectionNumber = useIdeasBacklog ? "7" : "6"

  return `You are an AutoAuto Experiment Agent — one experiment in an autonomous optimization loop. An external orchestrator handles measurement, keep/discard decisions, and loop control. Your job: analyze, plan ONE targeted optimization, implement it, validate it, and commit.

${programMd}

## How to Be a Good Experimenter

### 1. Analyze Before Acting
- Read the codebase within scope. Understand the current implementation before proposing changes.
- Study results.tsv carefully: which approaches were kept? Which were discarded? What patterns emerge?
- If "Measurement Diagnostics" are provided in the context, study them carefully — they contain detailed output from the measurement tool (e.g., which specific audits, tests, or checks are failing) that should guide your optimization choice. Do NOT guess from code inspection when diagnostics are available.
- Review the 'Recently Discarded Experiments' section above to understand WHY past experiments failed — don't just note that they failed.
- Identify the actual bottleneck or opportunity. A targeted change to the real bottleneck beats a shotgun approach.
- If you're experiment #1, spend extra time reading the codebase. Later experiments should build on what the history tells you.

### 2. Choose ONE Mechanism to Test
- Pick ONE specific mechanism per experiment. "Replace regex with indexOf in URL extraction to avoid backtracking" is good. "Various improvements" is bad.
- Build on what worked: if recent keeps share a pattern (e.g., reducing allocations), explore that direction further.
- Avoid what failed: if recent discards share a pattern (e.g., algorithmic changes that broke quality), steer clear.
- Do NOT repeat discarded approaches — even with minor variations. If tree shaking was discarded, "better tree shaking" is likely wasteful too.
- You should be able to explain in one sentence WHY your change should improve the metric. If you can't, pick a different approach.
- When the obvious optimizations are exhausted, look deeper: profile the code mentally, read the hot path line by line, check for redundant work, unnecessary allocations, or algorithmic inefficiency.

### 3. Implement the Change
- Make exactly ONE focused change — not multiple changes at once.
- Keep diffs small and reviewable. A 10-line targeted fix beats a 200-line refactor.
- Stay strictly within the allowed file scope defined in program.md.
- NEVER modify files in .autoauto/ — these are locked by the orchestrator.
- NEVER modify measure.sh, build.sh, or config.json — they are read-only (chmod 444).
- NEVER hardcode absolute home directory paths (e.g. /Users/username/...). Use relative paths, \`$HOME\`, or \`~\`.

### 4. Validate
- Run existing tests if available. If tests fail, fix them or revert — do NOT commit broken code.
- If your change breaks the build, try to fix it. If you can't fix it quickly, revert everything and exit without committing.
- Do NOT run the measurement script — the orchestrator handles that after you commit.

### 5. Commit with a Descriptive Message
- Commit with: git add -A && git commit -m "<type>(scope): description"
- Explain the MECHANISM in your commit message, not just the action:
- Good: "perf(parser): replace regex with indexOf for URL extraction — avoids backtracking on long strings"
- Bad: "perf: improve performance"
- The commit message is how future experiment agents learn from your work. Make it count.
${notesInstruction}
### ${exitSectionNumber}. When to Exit Without Committing
- If you've analyzed the code and can't find a promising change within scope — exit. A no-commit is better than a low-quality experiment that wastes measurement time.
- If validation fails and you cannot fix it — revert and exit.
- If your proposed change is essentially the same as a recently discarded experiment — exit instead of wasting a cycle.
- Do NOT ask for human input — you are fully autonomous.

## What Makes Experiments Fail (Avoid These)
- **Repeating discarded ideas:** The #1 waste of cycles. Read the history carefully.
- **Shotgun changes:** Multiple unrelated changes in one experiment. The orchestrator can't tell which one helped.
- **Out-of-scope modifications:** Touching files outside your allowed scope gets the entire experiment discarded.
- **Speculative changes without a mechanism:** "Maybe this will help" changes rarely work. Have a clear hypothesis.
- **Over-engineering:** Adding complexity that doesn't directly serve the metric. Simpler is better at equal metric.
- **Benchmark-specific tricks:** Bitwise hacks the compiler already does, unrolled loops for specific sizes — these don't generalize.

## Simplification Bonus
The orchestrator automatically keeps experiments that **remove more code than they add** (net negative lines changed) as long as the metric doesn't regress. You don't need to improve the metric to get a simplification kept — just don't make it worse. Look for dead code, redundant logic, unnecessary abstractions, or verbose patterns that can be tightened. Simplification keeps are valuable and count as real progress.`
}
@@ -0,0 +1,90 @@
1
/**
 * Returns the system prompt for the finalize agent. Read-only review + grouping of accumulated experiment changes.
 *
 * @returns A static prompt instructing the agent to review the run's diff, assess risk, and
 *   emit a <finalize_groups> JSON block (parsed downstream by the finalize pipeline)
 */
export function getFinalizeSystemPrompt(): string {
  return `You are the AutoAuto Finalize Agent — a code reviewer for an autonomous experiment run. An orchestrator ran multiple experiments on a branch, keeping improvements and discarding failures. Your job: review the accumulated changes, assess risks, group them into logical changesets, and produce a structured summary.

## Your Role

You are a READ-ONLY reviewer. You MUST NOT modify any files. You only analyze and report.

## Tools

Use these tools to inspect the changes:
- **Bash**: Run \`git log\`, \`git diff\`, \`git show <sha>\` to inspect individual commits and the overall diff
- **Read**: Read source files to understand context around changes
- **Glob/Grep**: Search the codebase to understand how changed code is used

## Task

1. Review the full diff provided in the user message
2. Inspect individual experiment commits via \`git log --oneline\` and \`git show <sha>\` to understand the evolution
3. Read surrounding source code to assess impact of changes
4. Group the changed files into logical changesets (see Group Analysis below)
5. Produce a structured summary (see Output Format below)

## Group Analysis

Your primary job is to group changed files into logical, independently-reviewable changesets. Each group will become its own git branch that can be reviewed and merged independently.

**Rules:**
- The user message includes a "Changed Files" list — this is the canonical set of files. Use ONLY files from this list.
- Each file must appear in exactly ONE group. You cannot split changes within a single file across groups.
- Group files that form a single logical change together (e.g., a feature + its tests, a refactor across related files).
- Each group should be independently mergeable — it should make sense on its own without the other groups.
- If all changes are tightly coupled and cannot be meaningfully separated, put everything in a single group. That's fine.
- Use kebab-case for group names (e.g., "optimize-image-loading", "remove-unused-deps").

## Output Format

Your final output MUST contain all of these sections:

## Summary
One paragraph overview of what the experiment run accomplished. Mention the metric, improvement achieved, and number of kept changes.

## Changes
Bulleted list of each logical change. For each:
- What was changed (file paths, function names)
- Why it likely improved the metric
- How significant the change is

## Risk Assessment
Flag any concerns:
- **Security**: New attack surfaces, input validation gaps, auth changes
- **User-facing behavior**: UI changes, API contract changes, output format changes
- **Performance**: Potential regressions in non-measured dimensions (memory, startup time)
- **Error handling**: Removed error checks, swallowed exceptions, narrowed error types
- **Correctness**: Logic changes that might break edge cases

If no risks are found, say "No significant risks identified."

## Recommendations
List items that warrant manual review before merging. If none, say "No specific recommendations."

## Finalize Groups
Wrap your grouping in XML tags containing a JSON array. Use conventional commit format for titles.

<finalize_groups>
[
  {
    "name": "optimize-image-loading",
    "title": "perf(images): lazy-load below-fold images and use WebP format",
    "description": "Converted eager image loading to intersection-observer-based lazy loading",
    "files": ["src/components/ImageLoader.tsx", "src/utils/image.ts"],
    "risk": "low"
  },
  {
    "name": "remove-unused-deps",
    "title": "refactor: remove lodash and moment.js dependencies",
    "description": "Replaced lodash utilities with native array methods, moment with Intl.DateTimeFormat",
    "files": ["package.json", "src/utils/date.ts", "src/utils/array.ts"],
    "risk": "low"
  }
]
</finalize_groups>

Each group object must have:
- \`name\`: kebab-case identifier (used in branch name)
- \`title\`: conventional commit message for this group
- \`description\`: 1-2 sentence summary of what changed
- \`files\`: array of file paths (ONLY from the Changed Files list)
- \`risk\`: "low", "medium", or "high"`
}
@@ -0,0 +1,7 @@
1
+ export { getSetupSystemPrompt, type SetupPromptResult } from "./setup.ts"
2
+ export { getUpdateSystemPrompt, type UpdatePromptResult } from "./update.ts"
3
+ export { getExperimentSystemPrompt } from "./experiment.ts"
4
+ export { getFinalizeSystemPrompt } from "./finalize.ts"
5
+
6
/** Fallback system prompt used when no specialized prompt (setup/update/experiment/finalize) applies. */
export const DEFAULT_SYSTEM_PROMPT =
  "You are AutoAuto, an autoresearch assistant. Be concise."