@spacek33z/autoauto 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -0
- package/package.json +51 -0
- package/src/App.tsx +224 -0
- package/src/cli.ts +772 -0
- package/src/components/AgentPanel.tsx +254 -0
- package/src/components/Chat.test.tsx +71 -0
- package/src/components/Chat.tsx +308 -0
- package/src/components/CycleField.tsx +23 -0
- package/src/components/ModelPicker.tsx +97 -0
- package/src/components/PostUpdatePrompt.tsx +46 -0
- package/src/components/ResultsTable.tsx +172 -0
- package/src/components/RunCompletePrompt.tsx +90 -0
- package/src/components/RunSettingsOverlay.tsx +49 -0
- package/src/components/RunsTable.tsx +219 -0
- package/src/components/StatsHeader.tsx +100 -0
- package/src/daemon.ts +264 -0
- package/src/index.tsx +8 -0
- package/src/lib/agent/agent-provider.test.ts +133 -0
- package/src/lib/agent/claude-provider.ts +277 -0
- package/src/lib/agent/codex-provider.ts +413 -0
- package/src/lib/agent/default-providers.ts +10 -0
- package/src/lib/agent/index.ts +32 -0
- package/src/lib/agent/mock-provider.ts +61 -0
- package/src/lib/agent/opencode-provider.ts +424 -0
- package/src/lib/agent/types.ts +73 -0
- package/src/lib/auth.ts +11 -0
- package/src/lib/config.ts +152 -0
- package/src/lib/daemon-callbacks.ts +59 -0
- package/src/lib/daemon-client.ts +16 -0
- package/src/lib/daemon-lifecycle.ts +368 -0
- package/src/lib/daemon-spawn.ts +122 -0
- package/src/lib/daemon-status.ts +189 -0
- package/src/lib/daemon-watcher.ts +192 -0
- package/src/lib/experiment-loop.ts +679 -0
- package/src/lib/experiment.ts +356 -0
- package/src/lib/finalize.test.ts +143 -0
- package/src/lib/finalize.ts +511 -0
- package/src/lib/format.test.ts +32 -0
- package/src/lib/format.ts +44 -0
- package/src/lib/git.ts +176 -0
- package/src/lib/ideas-backlog.test.ts +54 -0
- package/src/lib/ideas-backlog.ts +109 -0
- package/src/lib/measure.ts +472 -0
- package/src/lib/model-options.ts +24 -0
- package/src/lib/programs.ts +247 -0
- package/src/lib/push-stream.ts +48 -0
- package/src/lib/run-context.ts +112 -0
- package/src/lib/run-setup.ts +34 -0
- package/src/lib/run.ts +383 -0
- package/src/lib/syntax-theme.ts +39 -0
- package/src/lib/system-prompts/experiment.ts +77 -0
- package/src/lib/system-prompts/finalize.ts +90 -0
- package/src/lib/system-prompts/index.ts +7 -0
- package/src/lib/system-prompts/setup.ts +516 -0
- package/src/lib/system-prompts/update.ts +188 -0
- package/src/lib/tool-events.ts +99 -0
- package/src/lib/validate-measurement.ts +326 -0
- package/src/lib/worktree.ts +40 -0
- package/src/screens/AuthErrorScreen.tsx +31 -0
- package/src/screens/ExecutionScreen.tsx +851 -0
- package/src/screens/FirstSetupScreen.tsx +168 -0
- package/src/screens/HomeScreen.tsx +406 -0
- package/src/screens/PreRunScreen.tsx +206 -0
- package/src/screens/SettingsScreen.tsx +189 -0
- package/src/screens/SetupScreen.tsx +226 -0
- package/src/tui.tsx +17 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { dirname, join } from "node:path"
|
|
2
|
+
import { fileURLToPath } from "node:url"
|
|
3
|
+
import { getProgramsDir } from "../programs.ts"
|
|
4
|
+
|
|
5
|
+
// Absolute path to the measurement-validation script, resolved relative to this
// module's own location (one directory up) so it works regardless of the
// process's working directory.
const VALIDATE_SCRIPT = join(dirname(fileURLToPath(import.meta.url)), "..", "validate-measurement.ts")
|
|
6
|
+
|
|
7
|
+
/** Result bundle produced by {@link getUpdateSystemPrompt}. */
export interface UpdatePromptResult {
  // Full system prompt with the program's current files embedded.
  systemPrompt: string
  // Where the reference document lives: <cwd>/.autoauto/update-reference.md
  referencePath: string
  // Markdown body of the reference document (validation steps, artifact formats).
  referenceContent: string
}
|
|
12
|
+
|
|
13
|
+
/**
 * Builds the system prompt for the Update Agent.
 * Reads and embeds current program files so the agent has full context.
 *
 * @param cwd - Project working directory (the root containing .autoauto/).
 * @param programSlug - Slug identifying the program being updated.
 * @param programDir - Directory holding the program's artifact files.
 * @returns The system prompt plus the path and content of the reference
 *   document the agent is told to read before running validation.
 */
export async function getUpdateSystemPrompt(
  cwd: string,
  programSlug: string,
  programDir: string,
): Promise<UpdatePromptResult> {
  const programsDir = getProgramsDir(cwd)
  const referencePath = join(cwd, ".autoauto", "update-reference.md")

  // Load all program artifacts in parallel. Required files fall back to a
  // "(not found)" placeholder; build.sh is optional and becomes null so the
  // template below can omit its section entirely.
  const [programMd, measureSh, configJson, buildSh] = await Promise.all([
    Bun.file(join(programDir, "program.md")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "measure.sh")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "config.json")).text().catch(() => "(not found)"),
    Bun.file(join(programDir, "build.sh")).text().catch(() => null),
  ])

  // System prompt shown to the Update Agent; embeds the files read above.
  const systemPrompt = `You are the AutoAuto Update Agent — an expert at diagnosing and fixing autonomous experiment programs.

## Your Role

You help users fix and improve an existing optimization program. A program consists of a measurement script (measure.sh), a program definition (program.md), a config (config.json), and optionally a build script (build.sh). You analyze previous run results, diagnose issues, and propose targeted fixes.

## Context

Working directory: ${cwd}
Program: ${programSlug}
Program directory: ${programDir}

## Current Program Files

### program.md
\`\`\`markdown
${programMd}
\`\`\`

### measure.sh
\`\`\`bash
${measureSh}
\`\`\`

### config.json
\`\`\`json
${configJson}
\`\`\`
${buildSh ? `\n### build.sh\n\`\`\`bash\n${buildSh}\n\`\`\`\n` : ""}
## Capabilities

You can read files, search the codebase, list directories, run shell commands, write files, and edit files. You may freely READ any file in the project, but you may only WRITE/EDIT files inside .autoauto/programs/${programSlug}/. Do not modify target project source files — those are changed exclusively through the experiment loop.

## Conversation Flow

1. **Analyze** — The user's first message contains run results, error details, and experiment logs from the most recent run. Study this data carefully. Look for:
   - Measurement failures (measure.sh errors, missing dependencies, broken paths)
   - Config issues (noise_threshold too low, wrong direction, missing quality gates)
   - Scope problems (program.md too broad/narrow, missing rules)
   - Stagnation patterns (agent stuck in a loop, no improvement headroom)
   - Crashes (agent errors, tool failures)
   - Target project issues (broken build, missing files the measurement depends on)

2. **Propose** — Present your analysis and specific proposed fixes. Be concrete: say exactly which file(s) you'd change and what you'd change. **Wait for the user to approve before making any changes.**

3. **Fix** — After the user agrees (or guides you to a different fix), make the changes to the program files in .autoauto/programs/${programSlug}/.

4. **Validate** — After modifying program files, read ${referencePath} for validation instructions, then run measurement validation to confirm the fix works.

5. **Iterate** — If the user wants more changes, continue the conversation. When done, say: "Program updated. Press Escape to go back."

## Key Principles

- **Diagnose before fixing.** Read the run context carefully. Don't jump to changes without understanding the root cause.
- **Propose, then wait.** Always present your proposed fix and wait for the user to confirm before editing files.
- **One fix at a time.** Focus on the most impactful issue first.
- **Validate after changes.** Always run measurement validation after modifying measure.sh, build.sh, or config.json.
- **Be concise.** Don't lecture. Short, actionable responses.

## What NOT to Do

- Don't make changes without user approval
- Don't modify target project source files — only edit files in .autoauto/programs/${programSlug}/
- Don't skip measurement validation after modifying measurement files
- Don't include anything other than JSON in measure.sh's stdout — logs go to stderr
- Don't use \`mktemp\` with suffixes after the X template (e.g. \`mktemp /tmp/foo-XXXXXX.json\`) — this fails on macOS. Instead, append the suffix outside: \`$(mktemp /tmp/foo-XXXXXX).json\`
- Don't forget to chmod +x measure.sh after writing it`

  // Reference document the prompt instructs the agent to read in step 4
  // ("Validate"); written to referencePath by the caller.
  const referenceContent = `# Update Agent Reference

This file contains validation procedures and artifact format reference for the AutoAuto Update Agent.

**Paths:**
- Programs directory: ${programsDir}
- Program directory: ${programDir}
- Validation script: ${VALIDATE_SCRIPT}

## Measurement Validation

After modifying measure.sh, build.sh, or config.json, ALWAYS validate measurement stability.

### Running Validation

Run this exact command via Bash:
\`\`\`bash
bun run ${VALIDATE_SCRIPT} ${programDir}/measure.sh ${programDir}/config.json 5
\`\`\`

The validation script:
- Creates a temporary git worktree (simulating the actual run environment)
- Runs build.sh once first if ${programDir}/build.sh exists
- Runs 1 warmup measurement (excluded from stats)
- Runs 5 measurement repeats sequentially
- Validates every output against config.json
- Computes variance statistics and avg_duration_ms
- Outputs a JSON object with the full results
- Automatically cleans up the worktree afterward

**IMPORTANT:** build.sh MUST install any required dependencies (e.g. \`npm ci\`, \`bun install\`). If build.sh fails with "command not found" errors, the build script needs to install dependencies first.

### Interpreting Results

| CV% | Assessment | Action |
|-----|-----------|--------|
| < 1% | Deterministic | noise_threshold=0.01, repeats=1 |
| 1–5% | Excellent | noise_threshold=0.02, repeats=3 |
| 5–15% | Acceptable | noise_threshold=max(CV%*1.5/100, 0.05), repeats=5 |
| 15–30% | Noisy | noise_threshold=max(CV%*2/100, 0.10), repeats=7 |
| ≥ 30% | Unstable | Fix the measurement first |

### Common Noise Causes & Fixes

1. **Cold starts** — Add a warmup run excluded from measurement
2. **Background processes** — Measure relative to baseline, not absolute
3. **Network calls** — Mock external calls or use local servers
4. **Non-deterministic code** — Lock random seeds, fix ordering
5. **Caching** — Always warm or always clear caches
6. **Shared state** — Clean up between measurement runs
7. **Short measurement duration** — Increase sample size

After fixing, re-run validation.

## Artifact Formats

When editing program files, follow these format requirements:

### measure.sh
- Shebang: \`#!/usr/bin/env bash\`
- \`set -euo pipefail\`
- stdout: exactly ONE JSON object, nothing else
- stderr: OK for logs/debug output
- Exit 0 on success, nonzero on failure
- Must complete in <60 seconds
- NEVER hardcode absolute home directory paths — use relative paths, \`$HOME\`, or \`~\`

### config.json
- \`metric_field\`: key from measure.sh JSON output
- \`direction\`: "lower" or "higher"
- \`noise_threshold\`: decimal (e.g. 0.02 for 2%), must exceed noise floor
- \`repeats\`: integer ≥ 1
- \`quality_gates\`: object with field: {min/max: number}
- \`secondary_metrics\`: optional, field: {direction: "lower"|"higher"}

### program.md
- Sections: Goal, Scope (Files/Off-limits), Rules, Steps
- Scope should be tight — one file or component is ideal
- Rules should prevent metric gaming

### build.sh (optional)
- Shebang: \`#!/usr/bin/env bash\`
- \`set -euo pipefail\`
- Runs ONCE before measurement
- Must install dependencies
- NEVER hardcode absolute home directory paths`

  return { systemPrompt, referencePath, referenceContent }
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
function abbreviatePath(filePath: string): string {
|
|
2
|
+
const parts = filePath.replace(/^\//, "").split("/")
|
|
3
|
+
if (parts.length <= 3) return parts.join("/")
|
|
4
|
+
return `…/${parts.slice(-3).join("/")}`
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
function formatFileToolEvent(verb: string, input: Record<string, unknown>): string {
|
|
8
|
+
const filePath = input.file_path
|
|
9
|
+
if (typeof filePath === "string") {
|
|
10
|
+
// Multiple file changes (Codex file_change items)
|
|
11
|
+
const changes = input.changes
|
|
12
|
+
if (Array.isArray(changes) && changes.length > 1) {
|
|
13
|
+
return `${verb} ${abbreviatePath(filePath)} (+${changes.length - 1} more)`
|
|
14
|
+
}
|
|
15
|
+
return `${verb} ${abbreviatePath(filePath)}`
|
|
16
|
+
}
|
|
17
|
+
return `${verb} file...`
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/** Canonical tool name map (lowercase → switch key) */
const TOOL_ALIASES: Record<string, string> = {
  read: "Read",
  write: "Write",
  edit: "Edit",
  glob: "Glob",
  grep: "Grep",
  bash: "Bash",
  list: "List",
  // Patch/multi-edit tool names collapse onto the same Edit formatting.
  apply_patch: "Edit",
  multiedit: "Edit",
  webfetch: "WebFetch",
  websearch: "WebSearch",
}
|
|
34
|
+
|
|
35
|
+
function canonicalToolName(toolName: string): string {
|
|
36
|
+
return TOOL_ALIASES[toolName.toLowerCase()] ?? TOOL_ALIASES[toolName] ?? toolName
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/** Format a tool call into a brief human-readable status string */
|
|
40
|
+
export function formatToolEvent(
|
|
41
|
+
toolName: string,
|
|
42
|
+
input: Record<string, unknown>
|
|
43
|
+
): string {
|
|
44
|
+
// Provider-supplied title takes precedence (e.g. OpenCode state.title)
|
|
45
|
+
const title = input.__title
|
|
46
|
+
if (typeof title === "string" && title.trim()) return title
|
|
47
|
+
|
|
48
|
+
const canonical = canonicalToolName(toolName)
|
|
49
|
+
switch (canonical) {
|
|
50
|
+
case "Read":
|
|
51
|
+
return formatFileToolEvent("Reading", input)
|
|
52
|
+
case "Write":
|
|
53
|
+
return formatFileToolEvent("Writing", input)
|
|
54
|
+
case "Edit":
|
|
55
|
+
return formatFileToolEvent("Editing", input)
|
|
56
|
+
case "List":
|
|
57
|
+
return "Listing directory..."
|
|
58
|
+
case "Glob": {
|
|
59
|
+
const pattern = input.pattern
|
|
60
|
+
if (typeof pattern === "string") {
|
|
61
|
+
return `Searching for ${pattern}`
|
|
62
|
+
}
|
|
63
|
+
return "Searching files..."
|
|
64
|
+
}
|
|
65
|
+
case "Grep": {
|
|
66
|
+
const pattern = input.pattern
|
|
67
|
+
const path = input.path
|
|
68
|
+
if (typeof pattern === "string") {
|
|
69
|
+
const suffix = typeof path === "string" ? ` in ${abbreviatePath(path)}` : ""
|
|
70
|
+
return `Grep: ${pattern}${suffix}`
|
|
71
|
+
}
|
|
72
|
+
return "Searching content..."
|
|
73
|
+
}
|
|
74
|
+
case "Bash": {
|
|
75
|
+
const command = input.command
|
|
76
|
+
if (typeof command === "string") {
|
|
77
|
+
if (command.includes("validate-measurement")) {
|
|
78
|
+
return "Validating measurement stability — this may take a minute"
|
|
79
|
+
}
|
|
80
|
+
if (command.includes("build.sh")) {
|
|
81
|
+
return "Running build step"
|
|
82
|
+
}
|
|
83
|
+
if (command.includes("measure.sh")) {
|
|
84
|
+
return "Running measurement"
|
|
85
|
+
}
|
|
86
|
+
const truncated =
|
|
87
|
+
command.length > 80 ? `${command.slice(0, 77)}...` : command
|
|
88
|
+
return `$ ${truncated}`
|
|
89
|
+
}
|
|
90
|
+
return "Running command..."
|
|
91
|
+
}
|
|
92
|
+
case "WebFetch":
|
|
93
|
+
return "Fetching web content..."
|
|
94
|
+
case "WebSearch":
|
|
95
|
+
return "Searching the web..."
|
|
96
|
+
default:
|
|
97
|
+
return `Using ${toolName}...`
|
|
98
|
+
}
|
|
99
|
+
}
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/* eslint-disable no-console, no-await-in-loop */
|
|
3
|
+
import { existsSync, readFileSync } from "node:fs"
|
|
4
|
+
import { dirname, join } from "node:path"
|
|
5
|
+
import { $ } from "bun"
|
|
6
|
+
import { validateProgramConfig, type ProgramConfig } from "./programs.ts"
|
|
7
|
+
import {
|
|
8
|
+
runBuild,
|
|
9
|
+
runMeasurement as runMeasurementCore,
|
|
10
|
+
validateMeasurementOutput,
|
|
11
|
+
} from "./measure.ts"
|
|
12
|
+
import { removeWorktree } from "./worktree.ts"
|
|
13
|
+
|
|
14
|
+
// Outcome of a single measure.sh invocation (run 0 is the warmup).
interface RunResult {
  run: number
  success: boolean
  // Parsed JSON object from measure.sh stdout; present only on success.
  output?: Record<string, unknown>
  // Failure description; present only when success is false.
  error?: string
  duration_ms: number
}

// Descriptive statistics for one numeric output field across valid runs.
interface FieldStats {
  field: string
  // Raw per-run values the statistics were computed from.
  values: number[]
  median: number
  mean: number
  min: number
  max: number
  // Sample (n-1) standard deviation.
  stdev: number
  // Coefficient of variation: stdev / |mean| * 100 (Infinity when mean is 0).
  cv_percent: number
}

// Qualitative stability rating derived from the metric's CV% (see assess()).
type Assessment = "deterministic" | "excellent" | "acceptable" | "noisy" | "unstable"

// Top-level JSON result printed to stdout for the caller to parse.
interface ValidationOutput {
  // True only when every run succeeded, validated, and >= 2 valid outputs exist.
  success: boolean
  total_runs: number
  valid_runs: number
  failed_runs: Array<{ run: number; error: string }>
  validation_errors: Array<{ run: number; errors: string[] }>
  // Stats for the primary metric field, or null when too few valid runs.
  metric: FieldStats | null
  quality_gates: Record<string, FieldStats>
  secondary_metrics: Record<string, FieldStats>
  assessment: Assessment | null
  // Suggested config values; null when the measurement is too unstable.
  recommendations: {
    noise_threshold: number
    repeats: number
  } | null
  avg_duration_ms: number
  // Build step summary (ran is false when no build.sh exists).
  build: {
    ran: boolean
    success: boolean
    duration_ms: number
    error?: string
  }
}
|
|
57
|
+
|
|
58
|
+
// --- Input Parsing ---
|
|
59
|
+
|
|
60
|
+
const [measureShPath, configJsonPath, runsStr] = process.argv.slice(2)
|
|
61
|
+
|
|
62
|
+
if (!measureShPath || !configJsonPath) {
|
|
63
|
+
console.error("Usage: validate-measurement.ts <measure_sh> <config_json> [runs]")
|
|
64
|
+
process.exit(1)
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
const numRuns = parseInt(runsStr || "5", 10)
|
|
68
|
+
// Resolve cwd from measure.sh's location — go up from .autoauto/programs/<slug>/
|
|
69
|
+
// to the project root.
|
|
70
|
+
const projectRoot = dirname(dirname(dirname(dirname(measureShPath))))
|
|
71
|
+
|
|
72
|
+
// --- Config Parsing ---
|
|
73
|
+
|
|
74
|
+
let config: ProgramConfig
|
|
75
|
+
try {
|
|
76
|
+
config = validateProgramConfig(JSON.parse(readFileSync(configJsonPath, "utf-8")))
|
|
77
|
+
} catch (err) {
|
|
78
|
+
console.log(JSON.stringify({ success: false, error: `Failed to read config.json: ${err}` }))
|
|
79
|
+
process.exit(0)
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// --- Measurement Execution ---
|
|
83
|
+
|
|
84
|
+
async function runMeasurement(scriptPath: string, run: number, cwd: string): Promise<RunResult> {
|
|
85
|
+
const result = await runMeasurementCore(scriptPath, cwd)
|
|
86
|
+
if (result.success) {
|
|
87
|
+
return { run, success: true, output: result.output, duration_ms: result.duration_ms }
|
|
88
|
+
}
|
|
89
|
+
return { run, success: false, error: result.error, duration_ms: result.duration_ms }
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// --- Field Validation ---
|
|
93
|
+
|
|
94
|
+
function validateOutput(
|
|
95
|
+
output: Record<string, unknown>,
|
|
96
|
+
cfg: ProgramConfig,
|
|
97
|
+
): { valid: boolean; errors: string[] } {
|
|
98
|
+
return validateMeasurementOutput(output, cfg)
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// --- Statistics ---
|
|
102
|
+
|
|
103
|
+
function round(n: number, decimals: number): number {
|
|
104
|
+
const factor = 10 ** decimals
|
|
105
|
+
return Math.round(n * factor) / factor
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
function computeStats(field: string, values: number[]): FieldStats {
|
|
109
|
+
const sorted = [...values].toSorted((a, b) => a - b)
|
|
110
|
+
const n = sorted.length
|
|
111
|
+
const median =
|
|
112
|
+
n % 2 === 0 ? (sorted[n / 2 - 1] + sorted[n / 2]) / 2 : sorted[Math.floor(n / 2)]
|
|
113
|
+
const mean = values.reduce((a, b) => a + b, 0) / n
|
|
114
|
+
const min = sorted[0]
|
|
115
|
+
const max = sorted[n - 1]
|
|
116
|
+
const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1) // sample variance
|
|
117
|
+
const stdev = Math.sqrt(variance)
|
|
118
|
+
const cv_percent = mean !== 0 ? (stdev / Math.abs(mean)) * 100 : Infinity
|
|
119
|
+
|
|
120
|
+
return {
|
|
121
|
+
field,
|
|
122
|
+
values,
|
|
123
|
+
median: round(median, 4),
|
|
124
|
+
mean: round(mean, 4),
|
|
125
|
+
min: round(min, 4),
|
|
126
|
+
max: round(max, 4),
|
|
127
|
+
stdev: round(stdev, 4),
|
|
128
|
+
cv_percent: round(cv_percent, 2),
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// --- Assessment & Recommendations ---
|
|
133
|
+
|
|
134
|
+
function assess(cv_percent: number): Assessment {
|
|
135
|
+
if (cv_percent < 1) return "deterministic"
|
|
136
|
+
if (cv_percent < 5) return "excellent"
|
|
137
|
+
if (cv_percent < 15) return "acceptable"
|
|
138
|
+
if (cv_percent < 30) return "noisy"
|
|
139
|
+
return "unstable"
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
function recommend(cv_percent: number): { noise_threshold: number; repeats: number } {
|
|
143
|
+
if (cv_percent < 1) {
|
|
144
|
+
return { noise_threshold: 0.01, repeats: 1 }
|
|
145
|
+
}
|
|
146
|
+
if (cv_percent < 5) {
|
|
147
|
+
return { noise_threshold: 0.02, repeats: 3 }
|
|
148
|
+
}
|
|
149
|
+
if (cv_percent < 15) {
|
|
150
|
+
return {
|
|
151
|
+
noise_threshold: round(Math.max((cv_percent * 1.5) / 100, 0.05), 2),
|
|
152
|
+
repeats: 5,
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
if (cv_percent < 30) {
|
|
156
|
+
return {
|
|
157
|
+
noise_threshold: round(Math.max((cv_percent * 2) / 100, 0.1), 2),
|
|
158
|
+
repeats: 7,
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
// Unstable — don't recommend config, fix the measurement first
|
|
162
|
+
return { noise_threshold: -1, repeats: -1 }
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// --- Main ---
|
|
166
|
+
|
|
167
|
+
async function createValidationWorktree(root: string): Promise<string> {
|
|
168
|
+
const worktreePath = join(root, ".autoauto", "worktrees", `validate-${Date.now()}`)
|
|
169
|
+
await $`git worktree add --detach ${worktreePath}`.cwd(root).quiet()
|
|
170
|
+
return worktreePath
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
/**
 * Entry point: creates an isolated validation worktree, runs the full
 * validation sequence inside it, and always tears the worktree down again.
 */
async function main() {
  // build.sh is expected to sit next to measure.sh in the program directory.
  const buildShPath = join(dirname(measureShPath), "build.sh")

  process.stderr.write("Creating validation worktree...\n")
  let worktreePath: string
  try {
    worktreePath = await createValidationWorktree(projectRoot)
  } catch (err) {
    // Report worktree-creation failure as a JSON payload on stdout and bail.
    console.log(JSON.stringify({ success: false, error: `Failed to create validation worktree: ${err}` }))
    return
  }
  process.stderr.write(`Worktree: ${worktreePath}\n`)

  try {
    await runInWorktree(worktreePath, buildShPath)
  } finally {
    // Clean up even when validation throws.
    process.stderr.write("Cleaning up validation worktree...\n")
    await removeWorktree(projectRoot, worktreePath)
  }
}
|
|
193
|
+
|
|
194
|
+
/**
 * Runs the full validation sequence inside the given worktree: optional build,
 * one warmup measurement, N measured runs, per-run output validation against
 * config.json, statistics/assessment, and finally one JSON result object on
 * stdout. Progress lines go to stderr.
 *
 * Note: reads module-level state (measureShPath, numRuns, config).
 */
async function runInWorktree(
  cwd: string,
  buildShPath: string,
) {
  // Build is optional: skip it (and treat it as successful) when no build.sh.
  const hasBuildScript = existsSync(buildShPath)
  const buildResult = hasBuildScript
    ? await runBuild(buildShPath, cwd)
    : { success: true, duration_ms: 0 }

  if (hasBuildScript) {
    process.stderr.write("Build...")
    process.stderr.write(` ${buildResult.success ? "OK" : "FAIL"} (${buildResult.duration_ms}ms)\n`)
  }

  // A failed build short-circuits: emit an empty result carrying the build error.
  if (!buildResult.success) {
    const output: ValidationOutput = {
      success: false,
      total_runs: 0,
      valid_runs: 0,
      failed_runs: [],
      validation_errors: [],
      metric: null,
      quality_gates: {},
      secondary_metrics: {},
      assessment: null,
      recommendations: null,
      avg_duration_ms: 0,
      build: {
        ran: true,
        success: false,
        duration_ms: buildResult.duration_ms,
        error: buildResult.error,
      },
    }
    console.log(JSON.stringify(output, null, 2))
    return
  }

  // 1. Warmup run — excluded from stats
  process.stderr.write("Warmup run...")
  const warmup = await runMeasurement(measureShPath, 0, cwd)
  process.stderr.write(` ${warmup.success ? "OK" : "FAIL"} (${warmup.duration_ms}ms)\n`)

  // 2. Run measure.sh N times sequentially
  const results: RunResult[] = []
  for (let i = 0; i < numRuns; i++) {
    process.stderr.write(`Run ${i + 1}/${numRuns}...`)
    const result = await runMeasurement(measureShPath, i + 1, cwd)
    process.stderr.write(` ${result.success ? "OK" : "FAIL"} (${result.duration_ms}ms)\n`)
    results.push(result)
  }

  // 3. Separate successful and failed runs
  const successfulRuns = results.filter((r) => r.success && r.output)
  const failedRuns = results
    .filter((r) => !r.success)
    .map((r) => ({ run: r.run, error: r.error! }))

  // 4. Validate outputs against config
  const validationErrors: Array<{ run: number; errors: string[] }> = []
  const validOutputs: Array<{ run: number; output: Record<string, unknown> }> = []
  for (const r of successfulRuns) {
    const validation = validateOutput(r.output!, config)
    if (validation.valid) {
      validOutputs.push({ run: r.run, output: r.output! })
    } else {
      validationErrors.push({ run: r.run, errors: validation.errors })
    }
  }

  // 5. Compute stats if we have enough valid runs (>= 2)
  let metric: FieldStats | null = null
  let assessment: Assessment | null = null
  let recommendations: { noise_threshold: number; repeats: number } | null = null

  // Per-field stats over valid outputs; fields with fewer than two finite
  // numeric samples are omitted.
  function computeFieldStats(fields: string[]): Record<string, FieldStats> {
    const stats: Record<string, FieldStats> = {}
    for (const field of fields) {
      const values = validOutputs
        .map((r) => r.output[field])
        .filter((v): v is number => typeof v === "number" && isFinite(v))
      if (values.length >= 2) {
        stats[field] = computeStats(field, values)
      }
    }
    return stats
  }

  let qualityGateStats: Record<string, FieldStats> = {}
  let secondaryMetricStats: Record<string, FieldStats> = {}

  if (validOutputs.length >= 2) {
    const metricValues = validOutputs.map((r) => r.output[config.metric_field] as number)
    metric = computeStats(config.metric_field, metricValues)
    assessment = assess(metric.cv_percent)
    const rec = recommend(metric.cv_percent)
    // recommend() returns -1 sentinels when the measurement is too unstable.
    recommendations = rec.noise_threshold >= 0 ? rec : null

    qualityGateStats = computeFieldStats(Object.keys(config.quality_gates))
    secondaryMetricStats = computeFieldStats(Object.keys(config.secondary_metrics ?? {}))
  }

  // 6. Output result
  const avgDuration =
    results.length > 0
      ? Math.round(results.reduce((sum, r) => sum + r.duration_ms, 0) / results.length)
      : 0

  const output: ValidationOutput = {
    success: failedRuns.length === 0 && validationErrors.length === 0 && validOutputs.length >= 2,
    total_runs: numRuns,
    valid_runs: validOutputs.length,
    failed_runs: failedRuns,
    validation_errors: validationErrors,
    metric,
    quality_gates: qualityGateStats,
    secondary_metrics: secondaryMetricStats,
    assessment,
    recommendations,
    avg_duration_ms: avgDuration,
    build: {
      ran: hasBuildScript,
      success: true,
      duration_ms: buildResult.duration_ms,
    },
  }

  console.log(JSON.stringify(output, null, 2))
}
|
|
323
|
+
|
|
324
|
+
// Top-level error trap: surface any unexpected failure as a JSON payload on
// stdout (success: false) instead of an unhandled rejection, so the calling
// agent can always parse the result.
main().catch((err) => {
  console.log(JSON.stringify({ success: false, error: String(err) }))
})
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { $ } from "bun"
|
|
2
|
+
import { join } from "node:path"
|
|
3
|
+
import { mkdir } from "node:fs/promises"
|
|
4
|
+
import { formatShellError } from "./git.ts"
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Creates a git worktree for a run. The worktree is created inside
|
|
8
|
+
* .autoauto/worktrees/<runId>/ and checks out a new experiment branch.
|
|
9
|
+
*
|
|
10
|
+
* Returns the absolute worktree path.
|
|
11
|
+
*/
|
|
12
|
+
export async function createWorktree(
|
|
13
|
+
mainRoot: string,
|
|
14
|
+
runId: string,
|
|
15
|
+
programSlug: string,
|
|
16
|
+
): Promise<string> {
|
|
17
|
+
const worktreesDir = join(mainRoot, ".autoauto", "worktrees")
|
|
18
|
+
await mkdir(worktreesDir, { recursive: true })
|
|
19
|
+
|
|
20
|
+
const worktreePath = join(worktreesDir, runId)
|
|
21
|
+
const branchName = `autoauto-${programSlug}-${runId}`
|
|
22
|
+
|
|
23
|
+
try {
|
|
24
|
+
await $`git worktree add -b ${branchName} ${worktreePath}`.cwd(mainRoot).quiet()
|
|
25
|
+
} catch (err) {
|
|
26
|
+
throw new Error(formatShellError(err, `git worktree add (branch ${branchName})`), { cause: err })
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
return worktreePath
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
 * Removes a git worktree. Safe to call if the worktree doesn't exist.
 *
 * @param mainRoot - Repository root the `git worktree` command runs from.
 * @param worktreePath - Absolute path of the worktree to remove.
 */
export async function removeWorktree(
  mainRoot: string,
  worktreePath: string,
): Promise<void> {
  // --force discards uncommitted changes; nothrow() turns a missing or
  // already-removed worktree into a no-op instead of a thrown error.
  await $`git worktree remove --force ${worktreePath}`.cwd(mainRoot).nothrow().quiet()
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Full-screen panel shown when Anthropic API access is unavailable.
 * Tells the user which `claude` CLI commands to run, then to restart.
 *
 * @param error - Raw error message from the failed auth check; rendered
 *   dimmed and selectable below the instructions when non-empty.
 */
export function AuthErrorScreen({ error }: { error: string }) {
  return (
    <box
      flexDirection="column"
      flexGrow={1}
      border
      borderStyle="rounded"
      title="AutoAuto"
      justifyContent="center"
      alignItems="center"
    >
      <text>
        <span fg="#ff5555"><strong>Authentication required</strong></span>
      </text>
      <box height={1} />
      <text>AutoAuto needs access to the Anthropic API to run.</text>
      <box height={1} />
      <text>Run one of:</text>
      <text fg="#7aa2f7">{" claude login (recommended)"}</text>
      <text fg="#7aa2f7">{" claude setup-token (API key)"}</text>
      <box height={1} />
      <text>Then restart AutoAuto.</text>
      {/* Show the underlying error only when present (empty string hides it). */}
      {error && (
        <>
          <box height={1} />
          <text fg="#888888" selectable>Error: {error}</text>
        </>
      )}
    </box>
  )
}
|