@spacek33z/autoauto 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/README.md +197 -0
  2. package/package.json +51 -0
  3. package/src/App.tsx +224 -0
  4. package/src/cli.ts +772 -0
  5. package/src/components/AgentPanel.tsx +254 -0
  6. package/src/components/Chat.test.tsx +71 -0
  7. package/src/components/Chat.tsx +308 -0
  8. package/src/components/CycleField.tsx +23 -0
  9. package/src/components/ModelPicker.tsx +97 -0
  10. package/src/components/PostUpdatePrompt.tsx +46 -0
  11. package/src/components/ResultsTable.tsx +172 -0
  12. package/src/components/RunCompletePrompt.tsx +90 -0
  13. package/src/components/RunSettingsOverlay.tsx +49 -0
  14. package/src/components/RunsTable.tsx +219 -0
  15. package/src/components/StatsHeader.tsx +100 -0
  16. package/src/daemon.ts +264 -0
  17. package/src/index.tsx +8 -0
  18. package/src/lib/agent/agent-provider.test.ts +133 -0
  19. package/src/lib/agent/claude-provider.ts +277 -0
  20. package/src/lib/agent/codex-provider.ts +413 -0
  21. package/src/lib/agent/default-providers.ts +10 -0
  22. package/src/lib/agent/index.ts +32 -0
  23. package/src/lib/agent/mock-provider.ts +61 -0
  24. package/src/lib/agent/opencode-provider.ts +424 -0
  25. package/src/lib/agent/types.ts +73 -0
  26. package/src/lib/auth.ts +11 -0
  27. package/src/lib/config.ts +152 -0
  28. package/src/lib/daemon-callbacks.ts +59 -0
  29. package/src/lib/daemon-client.ts +16 -0
  30. package/src/lib/daemon-lifecycle.ts +368 -0
  31. package/src/lib/daemon-spawn.ts +122 -0
  32. package/src/lib/daemon-status.ts +189 -0
  33. package/src/lib/daemon-watcher.ts +192 -0
  34. package/src/lib/experiment-loop.ts +679 -0
  35. package/src/lib/experiment.ts +356 -0
  36. package/src/lib/finalize.test.ts +143 -0
  37. package/src/lib/finalize.ts +511 -0
  38. package/src/lib/format.test.ts +32 -0
  39. package/src/lib/format.ts +44 -0
  40. package/src/lib/git.ts +176 -0
  41. package/src/lib/ideas-backlog.test.ts +54 -0
  42. package/src/lib/ideas-backlog.ts +109 -0
  43. package/src/lib/measure.ts +472 -0
  44. package/src/lib/model-options.ts +24 -0
  45. package/src/lib/programs.ts +247 -0
  46. package/src/lib/push-stream.ts +48 -0
  47. package/src/lib/run-context.ts +112 -0
  48. package/src/lib/run-setup.ts +34 -0
  49. package/src/lib/run.ts +383 -0
  50. package/src/lib/syntax-theme.ts +39 -0
  51. package/src/lib/system-prompts/experiment.ts +77 -0
  52. package/src/lib/system-prompts/finalize.ts +90 -0
  53. package/src/lib/system-prompts/index.ts +7 -0
  54. package/src/lib/system-prompts/setup.ts +516 -0
  55. package/src/lib/system-prompts/update.ts +188 -0
  56. package/src/lib/tool-events.ts +99 -0
  57. package/src/lib/validate-measurement.ts +326 -0
  58. package/src/lib/worktree.ts +40 -0
  59. package/src/screens/AuthErrorScreen.tsx +31 -0
  60. package/src/screens/ExecutionScreen.tsx +851 -0
  61. package/src/screens/FirstSetupScreen.tsx +168 -0
  62. package/src/screens/HomeScreen.tsx +406 -0
  63. package/src/screens/PreRunScreen.tsx +206 -0
  64. package/src/screens/SettingsScreen.tsx +189 -0
  65. package/src/screens/SetupScreen.tsx +226 -0
  66. package/src/tui.tsx +17 -0
  67. package/tsconfig.json +17 -0
@@ -0,0 +1,188 @@
1
import { dirname, join } from "node:path"
import { fileURLToPath } from "node:url"
import { getProgramsDir } from "../programs.ts"

// Absolute path to the measurement-validation script, resolved relative to
// this module (src/lib/system-prompts/ → src/lib/validate-measurement.ts).
const VALIDATE_SCRIPT = join(dirname(fileURLToPath(import.meta.url)), "..", "validate-measurement.ts")

/** Result of building the Update Agent prompt. */
export interface UpdatePromptResult {
  /** Full system prompt for the agent, with current program files embedded. */
  systemPrompt: string
  /** On-disk path where the validation reference document should live. */
  referencePath: string
  /** Content to write at {@link referencePath}. */
  referenceContent: string
}
12
+
13
/**
 * Builds the system prompt for the Update Agent.
 * Reads and embeds current program files so the agent has full context.
 *
 * @param cwd Project working directory (root of the target repo).
 * @param programSlug Program identifier; embedded in write-scope rules.
 * @param programDir Absolute path to the program's directory.
 * @returns The prompt plus the path/content of the validation reference doc.
 */
export async function getUpdateSystemPrompt(
  cwd: string,
  programSlug: string,
  programDir: string,
): Promise<UpdatePromptResult> {
  const programsDir = getProgramsDir(cwd)
  const referencePath = join(cwd, ".autoauto", "update-reference.md")

  // Read all program artifacts in parallel. Required files fall back to the
  // literal "(not found)" so the agent can see they're missing; build.sh is
  // optional, so it falls back to null and is omitted from the prompt below.
  const [programMd, measureSh, configJson, buildSh] = await Promise.all([
    Bun.file(join(programDir, "program.md")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "measure.sh")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "config.json")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "build.sh")).text().catch(() => null),
  ])

  const systemPrompt = `You are the AutoAuto Update Agent — an expert at diagnosing and fixing autonomous experiment programs.

## Your Role

You help users fix and improve an existing optimization program. A program consists of a measurement script (measure.sh), a program definition (program.md), a config (config.json), and optionally a build script (build.sh). You analyze previous run results, diagnose issues, and propose targeted fixes.

## Context

Working directory: ${cwd}
Program: ${programSlug}
Program directory: ${programDir}

## Current Program Files

### program.md
\`\`\`markdown
${programMd}
\`\`\`

### measure.sh
\`\`\`bash
${measureSh}
\`\`\`

### config.json
\`\`\`json
${configJson}
\`\`\`
${buildSh ? `\n### build.sh\n\`\`\`bash\n${buildSh}\n\`\`\`\n` : ""}
## Capabilities

You can read files, search the codebase, list directories, run shell commands, write files, and edit files. You may freely READ any file in the project, but you may only WRITE/EDIT files inside .autoauto/programs/${programSlug}/. Do not modify target project source files — those are changed exclusively through the experiment loop.

## Conversation Flow

1. **Analyze** — The user's first message contains run results, error details, and experiment logs from the most recent run. Study this data carefully. Look for:
   - Measurement failures (measure.sh errors, missing dependencies, broken paths)
   - Config issues (noise_threshold too low, wrong direction, missing quality gates)
   - Scope problems (program.md too broad/narrow, missing rules)
   - Stagnation patterns (agent stuck in a loop, no improvement headroom)
   - Crashes (agent errors, tool failures)
   - Target project issues (broken build, missing files the measurement depends on)

2. **Propose** — Present your analysis and specific proposed fixes. Be concrete: say exactly which file(s) you'd change and what you'd change. **Wait for the user to approve before making any changes.**

3. **Fix** — After the user agrees (or guides you to a different fix), make the changes to the program files in .autoauto/programs/${programSlug}/.

4. **Validate** — After modifying program files, read ${referencePath} for validation instructions, then run measurement validation to confirm the fix works.

5. **Iterate** — If the user wants more changes, continue the conversation. When done, say: "Program updated. Press Escape to go back."

## Key Principles

- **Diagnose before fixing.** Read the run context carefully. Don't jump to changes without understanding the root cause.
- **Propose, then wait.** Always present your proposed fix and wait for the user to confirm before editing files.
- **One fix at a time.** Focus on the most impactful issue first.
- **Validate after changes.** Always run measurement validation after modifying measure.sh, build.sh, or config.json.
- **Be concise.** Don't lecture. Short, actionable responses.

## What NOT to Do

- Don't make changes without user approval
- Don't modify target project source files — only edit files in .autoauto/programs/${programSlug}/
- Don't skip measurement validation after modifying measurement files
- Don't include anything other than JSON in measure.sh's stdout — logs go to stderr
- Don't use \`mktemp\` with suffixes after the X template (e.g. \`mktemp /tmp/foo-XXXXXX.json\`) — this fails on macOS. Instead, append the suffix outside: \`$(mktemp /tmp/foo-XXXXXX).json\`
- Don't forget to chmod +x measure.sh after writing it`

  // Companion document written to referencePath; the prompt above tells the
  // agent to consult it before running validation.
  const referenceContent = `# Update Agent Reference

This file contains validation procedures and artifact format reference for the AutoAuto Update Agent.

**Paths:**
- Programs directory: ${programsDir}
- Program directory: ${programDir}
- Validation script: ${VALIDATE_SCRIPT}

## Measurement Validation

After modifying measure.sh, build.sh, or config.json, ALWAYS validate measurement stability.

### Running Validation

Run this exact command via Bash:
\`\`\`bash
bun run ${VALIDATE_SCRIPT} ${programDir}/measure.sh ${programDir}/config.json 5
\`\`\`

The validation script:
- Creates a temporary git worktree (simulating the actual run environment)
- Runs build.sh once first if ${programDir}/build.sh exists
- Runs 1 warmup measurement (excluded from stats)
- Runs 5 measurement repeats sequentially
- Validates every output against config.json
- Computes variance statistics and avg_duration_ms
- Outputs a JSON object with the full results
- Automatically cleans up the worktree afterward

**IMPORTANT:** build.sh MUST install any required dependencies (e.g. \`npm ci\`, \`bun install\`). If build.sh fails with "command not found" errors, the build script needs to install dependencies first.

### Interpreting Results

| CV% | Assessment | Action |
|-----|-----------|--------|
| < 1% | Deterministic | noise_threshold=0.01, repeats=1 |
| 1–5% | Excellent | noise_threshold=0.02, repeats=3 |
| 5–15% | Acceptable | noise_threshold=max(CV%*1.5/100, 0.05), repeats=5 |
| 15–30% | Noisy | noise_threshold=max(CV%*2/100, 0.10), repeats=7 |
| ≥ 30% | Unstable | Fix the measurement first |

### Common Noise Causes & Fixes

1. **Cold starts** — Add a warmup run excluded from measurement
2. **Background processes** — Measure relative to baseline, not absolute
3. **Network calls** — Mock external calls or use local servers
4. **Non-deterministic code** — Lock random seeds, fix ordering
5. **Caching** — Always warm or always clear caches
6. **Shared state** — Clean up between measurement runs
7. **Short measurement duration** — Increase sample size

After fixing, re-run validation.

## Artifact Formats

When editing program files, follow these format requirements:

### measure.sh
- Shebang: \`#!/usr/bin/env bash\`
- \`set -euo pipefail\`
- stdout: exactly ONE JSON object, nothing else
- stderr: OK for logs/debug output
- Exit 0 on success, nonzero on failure
- Must complete in <60 seconds
- NEVER hardcode absolute home directory paths — use relative paths, \`$HOME\`, or \`~\`

### config.json
- \`metric_field\`: key from measure.sh JSON output
- \`direction\`: "lower" or "higher"
- \`noise_threshold\`: decimal (e.g. 0.02 for 2%), must exceed noise floor
- \`repeats\`: integer ≥ 1
- \`quality_gates\`: object with field: {min/max: number}
- \`secondary_metrics\`: optional, field: {direction: "lower"|"higher"}

### program.md
- Sections: Goal, Scope (Files/Off-limits), Rules, Steps
- Scope should be tight — one file or component is ideal
- Rules should prevent metric gaming

### build.sh (optional)
- Shebang: \`#!/usr/bin/env bash\`
- \`set -euo pipefail\`
- Runs ONCE before measurement
- Must install dependencies
- NEVER hardcode absolute home directory paths`

  return { systemPrompt, referencePath, referenceContent }
}
@@ -0,0 +1,99 @@
1
+ function abbreviatePath(filePath: string): string {
2
+ const parts = filePath.replace(/^\//, "").split("/")
3
+ if (parts.length <= 3) return parts.join("/")
4
+ return `…/${parts.slice(-3).join("/")}`
5
+ }
6
+
7
+ function formatFileToolEvent(verb: string, input: Record<string, unknown>): string {
8
+ const filePath = input.file_path
9
+ if (typeof filePath === "string") {
10
+ // Multiple file changes (Codex file_change items)
11
+ const changes = input.changes
12
+ if (Array.isArray(changes) && changes.length > 1) {
13
+ return `${verb} ${abbreviatePath(filePath)} (+${changes.length - 1} more)`
14
+ }
15
+ return `${verb} ${abbreviatePath(filePath)}`
16
+ }
17
+ return `${verb} file...`
18
+ }
19
+
20
+ /** Canonical tool name map (lowercase → switch key) */
21
+ const TOOL_ALIASES: Record<string, string> = {
22
+ read: "Read",
23
+ write: "Write",
24
+ edit: "Edit",
25
+ glob: "Glob",
26
+ grep: "Grep",
27
+ bash: "Bash",
28
+ list: "List",
29
+ apply_patch: "Edit",
30
+ multiedit: "Edit",
31
+ webfetch: "WebFetch",
32
+ websearch: "WebSearch",
33
+ }
34
+
35
+ function canonicalToolName(toolName: string): string {
36
+ return TOOL_ALIASES[toolName.toLowerCase()] ?? TOOL_ALIASES[toolName] ?? toolName
37
+ }
38
+
39
/** Format a tool call into a brief human-readable status string */
export function formatToolEvent(
  toolName: string,
  input: Record<string, unknown>
): string {
  // Provider-supplied title takes precedence (e.g. OpenCode state.title)
  const title = input.__title
  if (typeof title === "string" && title.trim()) return title

  const canonical = canonicalToolName(toolName)
  switch (canonical) {
    case "Read":
      return formatFileToolEvent("Reading", input)
    case "Write":
      return formatFileToolEvent("Writing", input)
    case "Edit":
      return formatFileToolEvent("Editing", input)
    case "List":
      return "Listing directory..."
    case "Glob": {
      const pattern = input.pattern
      if (typeof pattern === "string") {
        return `Searching for ${pattern}`
      }
      return "Searching files..."
    }
    case "Grep": {
      const pattern = input.pattern
      const path = input.path
      if (typeof pattern === "string") {
        // Mention the search root only when one was supplied.
        const suffix = typeof path === "string" ? ` in ${abbreviatePath(path)}` : ""
        return `Grep: ${pattern}${suffix}`
      }
      return "Searching content..."
    }
    case "Bash": {
      const command = input.command
      if (typeof command === "string") {
        // Recognize well-known AutoAuto scripts and describe them by intent
        // rather than dumping the raw command line.
        if (command.includes("validate-measurement")) {
          return "Validating measurement stability — this may take a minute"
        }
        if (command.includes("build.sh")) {
          return "Running build step"
        }
        if (command.includes("measure.sh")) {
          return "Running measurement"
        }
        // Otherwise show the command itself, truncated to 80 characters.
        const truncated =
          command.length > 80 ? `${command.slice(0, 77)}...` : command
        return `$ ${truncated}`
      }
      return "Running command..."
    }
    case "WebFetch":
      return "Fetching web content..."
    case "WebSearch":
      return "Searching the web..."
    default:
      // Unknown tools fall back to the provider-reported (raw) name.
      return `Using ${toolName}...`
  }
}
@@ -0,0 +1,326 @@
1
+ #!/usr/bin/env bun
2
+ /* eslint-disable no-console, no-await-in-loop */
3
+ import { existsSync, readFileSync } from "node:fs"
4
+ import { dirname, join } from "node:path"
5
+ import { $ } from "bun"
6
+ import { validateProgramConfig, type ProgramConfig } from "./programs.ts"
7
+ import {
8
+ runBuild,
9
+ runMeasurement as runMeasurementCore,
10
+ validateMeasurementOutput,
11
+ } from "./measure.ts"
12
+ import { removeWorktree } from "./worktree.ts"
13
+
14
/** Outcome of a single measure.sh invocation. */
interface RunResult {
  run: number
  success: boolean
  /** Parsed JSON from measure.sh stdout; absent on failure. */
  output?: Record<string, unknown>
  error?: string
  duration_ms: number
}

/** Summary statistics for one numeric output field across valid runs. */
interface FieldStats {
  field: string
  values: number[]
  median: number
  mean: number
  min: number
  max: number
  /** Sample standard deviation. */
  stdev: number
  /** Coefficient of variation as a percentage (stdev / |mean| * 100). */
  cv_percent: number
}

/** Stability rating derived from the metric's coefficient of variation. */
type Assessment = "deterministic" | "excellent" | "acceptable" | "noisy" | "unstable"

/** Final JSON object printed on stdout for the calling agent. */
interface ValidationOutput {
  success: boolean
  total_runs: number
  valid_runs: number
  failed_runs: Array<{ run: number; error: string }>
  validation_errors: Array<{ run: number; errors: string[] }>
  /** Stats for the primary metric; null when fewer than 2 valid runs. */
  metric: FieldStats | null
  quality_gates: Record<string, FieldStats>
  secondary_metrics: Record<string, FieldStats>
  assessment: Assessment | null
  /** Suggested config values; null when the measurement is unstable. */
  recommendations: {
    noise_threshold: number
    repeats: number
  } | null
  avg_duration_ms: number
  /** Build step outcome; ran=false when no build.sh exists. */
  build: {
    ran: boolean
    success: boolean
    duration_ms: number
    error?: string
  }
}
57
+
58
+ // --- Input Parsing ---
59
+
60
+ const [measureShPath, configJsonPath, runsStr] = process.argv.slice(2)
61
+
62
+ if (!measureShPath || !configJsonPath) {
63
+ console.error("Usage: validate-measurement.ts <measure_sh> <config_json> [runs]")
64
+ process.exit(1)
65
+ }
66
+
67
+ const numRuns = parseInt(runsStr || "5", 10)
68
+ // Resolve cwd from measure.sh's location — go up from .autoauto/programs/<slug>/
69
+ // to the project root.
70
+ const projectRoot = dirname(dirname(dirname(dirname(measureShPath))))
71
+
72
+ // --- Config Parsing ---
73
+
74
+ let config: ProgramConfig
75
+ try {
76
+ config = validateProgramConfig(JSON.parse(readFileSync(configJsonPath, "utf-8")))
77
+ } catch (err) {
78
+ console.log(JSON.stringify({ success: false, error: `Failed to read config.json: ${err}` }))
79
+ process.exit(0)
80
+ }
81
+
82
+ // --- Measurement Execution ---
83
+
84
+ async function runMeasurement(scriptPath: string, run: number, cwd: string): Promise<RunResult> {
85
+ const result = await runMeasurementCore(scriptPath, cwd)
86
+ if (result.success) {
87
+ return { run, success: true, output: result.output, duration_ms: result.duration_ms }
88
+ }
89
+ return { run, success: false, error: result.error, duration_ms: result.duration_ms }
90
+ }
91
+
92
// --- Field Validation ---

// Thin alias over the shared validator so this script reads uniformly;
// all validation rules live in measure.ts.
function validateOutput(
  output: Record<string, unknown>,
  cfg: ProgramConfig,
): { valid: boolean; errors: string[] } {
  return validateMeasurementOutput(output, cfg)
}
100
+
101
+ // --- Statistics ---
102
+
103
+ function round(n: number, decimals: number): number {
104
+ const factor = 10 ** decimals
105
+ return Math.round(n * factor) / factor
106
+ }
107
+
108
+ function computeStats(field: string, values: number[]): FieldStats {
109
+ const sorted = [...values].toSorted((a, b) => a - b)
110
+ const n = sorted.length
111
+ const median =
112
+ n % 2 === 0 ? (sorted[n / 2 - 1] + sorted[n / 2]) / 2 : sorted[Math.floor(n / 2)]
113
+ const mean = values.reduce((a, b) => a + b, 0) / n
114
+ const min = sorted[0]
115
+ const max = sorted[n - 1]
116
+ const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1) // sample variance
117
+ const stdev = Math.sqrt(variance)
118
+ const cv_percent = mean !== 0 ? (stdev / Math.abs(mean)) * 100 : Infinity
119
+
120
+ return {
121
+ field,
122
+ values,
123
+ median: round(median, 4),
124
+ mean: round(mean, 4),
125
+ min: round(min, 4),
126
+ max: round(max, 4),
127
+ stdev: round(stdev, 4),
128
+ cv_percent: round(cv_percent, 2),
129
+ }
130
+ }
131
+
132
+ // --- Assessment & Recommendations ---
133
+
134
+ function assess(cv_percent: number): Assessment {
135
+ if (cv_percent < 1) return "deterministic"
136
+ if (cv_percent < 5) return "excellent"
137
+ if (cv_percent < 15) return "acceptable"
138
+ if (cv_percent < 30) return "noisy"
139
+ return "unstable"
140
+ }
141
+
142
+ function recommend(cv_percent: number): { noise_threshold: number; repeats: number } {
143
+ if (cv_percent < 1) {
144
+ return { noise_threshold: 0.01, repeats: 1 }
145
+ }
146
+ if (cv_percent < 5) {
147
+ return { noise_threshold: 0.02, repeats: 3 }
148
+ }
149
+ if (cv_percent < 15) {
150
+ return {
151
+ noise_threshold: round(Math.max((cv_percent * 1.5) / 100, 0.05), 2),
152
+ repeats: 5,
153
+ }
154
+ }
155
+ if (cv_percent < 30) {
156
+ return {
157
+ noise_threshold: round(Math.max((cv_percent * 2) / 100, 0.1), 2),
158
+ repeats: 7,
159
+ }
160
+ }
161
+ // Unstable — don't recommend config, fix the measurement first
162
+ return { noise_threshold: -1, repeats: -1 }
163
+ }
164
+
165
// --- Main ---

// Creates a detached git worktree under .autoauto/worktrees/ so validation
// runs against a clean checkout, mirroring the real run environment. The
// timestamp suffix keeps concurrent validations from colliding.
async function createValidationWorktree(root: string): Promise<string> {
  const worktreePath = join(root, ".autoauto", "worktrees", `validate-${Date.now()}`)
  await $`git worktree add --detach ${worktreePath}`.cwd(root).quiet()
  return worktreePath
}
172
+
173
/**
 * Entry point: create a throwaway worktree, run the validation inside it,
 * and always clean the worktree up afterward.
 */
async function main() {
  // build.sh is expected to live next to measure.sh in the program directory.
  const buildShPath = join(dirname(measureShPath), "build.sh")

  process.stderr.write("Creating validation worktree...\n")
  let worktreePath: string
  try {
    worktreePath = await createValidationWorktree(projectRoot)
  } catch (err) {
    // Report worktree-creation failure as parseable JSON on stdout rather
    // than crashing, so the calling agent can surface the error.
    console.log(JSON.stringify({ success: false, error: `Failed to create validation worktree: ${err}` }))
    return
  }
  process.stderr.write(`Worktree: ${worktreePath}\n`)

  try {
    await runInWorktree(worktreePath, buildShPath)
  } finally {
    // Remove the worktree even when validation throws.
    process.stderr.write("Cleaning up validation worktree...\n")
    await removeWorktree(projectRoot, worktreePath)
  }
}
193
+
194
/**
 * Runs the full validation sequence inside the worktree at `cwd`:
 * optional build → warmup → N measurement runs → output validation → stats.
 * Prints one ValidationOutput JSON object to stdout; progress goes to stderr.
 */
async function runInWorktree(
  cwd: string,
  buildShPath: string,
) {
  const hasBuildScript = existsSync(buildShPath)
  const buildResult = hasBuildScript
    ? await runBuild(buildShPath, cwd)
    : { success: true, duration_ms: 0 }

  if (hasBuildScript) {
    process.stderr.write("Build...")
    process.stderr.write(` ${buildResult.success ? "OK" : "FAIL"} (${buildResult.duration_ms}ms)\n`)
  }

  if (!buildResult.success) {
    // Build failed — emit an all-empty result that still carries build info.
    const output: ValidationOutput = {
      success: false,
      total_runs: 0,
      valid_runs: 0,
      failed_runs: [],
      validation_errors: [],
      metric: null,
      quality_gates: {},
      secondary_metrics: {},
      assessment: null,
      recommendations: null,
      avg_duration_ms: 0,
      build: {
        ran: true,
        success: false,
        duration_ms: buildResult.duration_ms,
        error: buildResult.error,
      },
    }
    console.log(JSON.stringify(output, null, 2))
    return
  }

  // 1. Warmup run — excluded from stats
  process.stderr.write("Warmup run...")
  const warmup = await runMeasurement(measureShPath, 0, cwd)
  process.stderr.write(` ${warmup.success ? "OK" : "FAIL"} (${warmup.duration_ms}ms)\n`)

  // 2. Run measure.sh N times sequentially
  const results: RunResult[] = []
  for (let i = 0; i < numRuns; i++) {
    process.stderr.write(`Run ${i + 1}/${numRuns}...`)
    const result = await runMeasurement(measureShPath, i + 1, cwd)
    process.stderr.write(` ${result.success ? "OK" : "FAIL"} (${result.duration_ms}ms)\n`)
    results.push(result)
  }

  // 3. Separate successful and failed runs
  const successfulRuns = results.filter((r) => r.success && r.output)
  const failedRuns = results
    .filter((r) => !r.success)
    .map((r) => ({ run: r.run, error: r.error! }))

  // 4. Validate outputs against config
  const validationErrors: Array<{ run: number; errors: string[] }> = []
  const validOutputs: Array<{ run: number; output: Record<string, unknown> }> = []
  for (const r of successfulRuns) {
    const validation = validateOutput(r.output!, config)
    if (validation.valid) {
      validOutputs.push({ run: r.run, output: r.output! })
    } else {
      validationErrors.push({ run: r.run, errors: validation.errors })
    }
  }

  // 5. Compute stats if we have enough valid runs (>= 2)
  let metric: FieldStats | null = null
  let assessment: Assessment | null = null
  let recommendations: { noise_threshold: number; repeats: number } | null = null

  // Per-field stats for the listed fields; fields with fewer than 2 finite
  // numeric values are skipped.
  function computeFieldStats(fields: string[]): Record<string, FieldStats> {
    const stats: Record<string, FieldStats> = {}
    for (const field of fields) {
      const values = validOutputs
        .map((r) => r.output[field])
        .filter((v): v is number => typeof v === "number" && isFinite(v))
      if (values.length >= 2) {
        stats[field] = computeStats(field, values)
      }
    }
    return stats
  }

  let qualityGateStats: Record<string, FieldStats> = {}
  let secondaryMetricStats: Record<string, FieldStats> = {}

  if (validOutputs.length >= 2) {
    const metricValues = validOutputs.map((r) => r.output[config.metric_field] as number)
    metric = computeStats(config.metric_field, metricValues)
    assessment = assess(metric.cv_percent)
    const rec = recommend(metric.cv_percent)
    // A negative threshold is the "unstable" sentinel — no recommendation.
    recommendations = rec.noise_threshold >= 0 ? rec : null

    qualityGateStats = computeFieldStats(Object.keys(config.quality_gates))
    secondaryMetricStats = computeFieldStats(Object.keys(config.secondary_metrics ?? {}))
  }

  // 6. Output result
  const avgDuration =
    results.length > 0
      ? Math.round(results.reduce((sum, r) => sum + r.duration_ms, 0) / results.length)
      : 0

  const output: ValidationOutput = {
    success: failedRuns.length === 0 && validationErrors.length === 0 && validOutputs.length >= 2,
    total_runs: numRuns,
    valid_runs: validOutputs.length,
    failed_runs: failedRuns,
    validation_errors: validationErrors,
    metric,
    quality_gates: qualityGateStats,
    secondary_metrics: secondaryMetricStats,
    assessment,
    recommendations,
    avg_duration_ms: avgDuration,
    build: {
      ran: hasBuildScript,
      // The build-failure path returned early above, so success is always
      // true by the time we get here.
      success: true,
      duration_ms: buildResult.duration_ms,
    },
  }

  console.log(JSON.stringify(output, null, 2))
}
323
+
324
// Any unexpected error still produces parseable JSON on stdout (exit 0),
// so the calling agent never has to deal with a raw stack trace.
main().catch((err) => {
  console.log(JSON.stringify({ success: false, error: String(err) }))
})
@@ -0,0 +1,40 @@
1
+ import { $ } from "bun"
2
+ import { join } from "node:path"
3
+ import { mkdir } from "node:fs/promises"
4
+ import { formatShellError } from "./git.ts"
5
+
6
/**
 * Creates a git worktree for a run. The worktree is created inside
 * .autoauto/worktrees/<runId>/ and checks out a new experiment branch.
 *
 * @param mainRoot Absolute path to the main repository checkout.
 * @param runId Unique run identifier; used as the worktree directory name.
 * @param programSlug Program name; embedded in the new branch name.
 * @returns The absolute worktree path.
 * @throws Error with a formatted shell message (original error as `cause`)
 *         when `git worktree add` fails.
 */
export async function createWorktree(
  mainRoot: string,
  runId: string,
  programSlug: string,
): Promise<string> {
  const worktreesDir = join(mainRoot, ".autoauto", "worktrees")
  await mkdir(worktreesDir, { recursive: true })

  const worktreePath = join(worktreesDir, runId)
  const branchName = `autoauto-${programSlug}-${runId}`

  try {
    // -b creates a fresh branch and checks it out in the new worktree.
    await $`git worktree add -b ${branchName} ${worktreePath}`.cwd(mainRoot).quiet()
  } catch (err) {
    throw new Error(formatShellError(err, `git worktree add (branch ${branchName})`), { cause: err })
  }

  return worktreePath
}
31
+
32
/**
 * Removes a git worktree. Safe to call if the worktree doesn't exist.
 *
 * @param mainRoot Absolute path to the main repository checkout.
 * @param worktreePath Absolute path of the worktree to remove.
 */
export async function removeWorktree(
  mainRoot: string,
  worktreePath: string,
): Promise<void> {
  // --force discards uncommitted changes in the worktree; .nothrow() makes
  // cleanup best-effort (e.g. the worktree was already removed).
  await $`git worktree remove --force ${worktreePath}`.cwd(mainRoot).nothrow().quiet()
}
@@ -0,0 +1,31 @@
1
/**
 * Full-screen notice shown when no Anthropic credentials are available.
 * Lists the `claude` auth commands the user can run, and — when `error`
 * is a non-empty string — shows the underlying error text for debugging.
 */
export function AuthErrorScreen({ error }: { error: string }) {
  return (
    <box
      flexDirection="column"
      flexGrow={1}
      border
      borderStyle="rounded"
      title="AutoAuto"
      justifyContent="center"
      alignItems="center"
    >
      <text>
        <span fg="#ff5555"><strong>Authentication required</strong></span>
      </text>
      <box height={1} />
      <text>AutoAuto needs access to the Anthropic API to run.</text>
      <box height={1} />
      <text>Run one of:</text>
      <text fg="#7aa2f7">{"  claude login (recommended)"}</text>
      <text fg="#7aa2f7">{"  claude setup-token (API key)"}</text>
      <box height={1} />
      <text>Then restart AutoAuto.</text>
      {error && (
        <>
          <box height={1} />
          <text fg="#888888" selectable>Error: {error}</text>
        </>
      )}
    </box>
  )
}