npm - @spacek33z/autoauto - Versions diffs - 0.0.1 - Mend

@spacek33z/autoauto 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/README.md +197 -0
package/package.json +51 -0
package/src/App.tsx +224 -0
package/src/cli.ts +772 -0
package/src/components/AgentPanel.tsx +254 -0
package/src/components/Chat.test.tsx +71 -0
package/src/components/Chat.tsx +308 -0
package/src/components/CycleField.tsx +23 -0
package/src/components/ModelPicker.tsx +97 -0
package/src/components/PostUpdatePrompt.tsx +46 -0
package/src/components/ResultsTable.tsx +172 -0
package/src/components/RunCompletePrompt.tsx +90 -0
package/src/components/RunSettingsOverlay.tsx +49 -0
package/src/components/RunsTable.tsx +219 -0
package/src/components/StatsHeader.tsx +100 -0
package/src/daemon.ts +264 -0
package/src/index.tsx +8 -0
package/src/lib/agent/agent-provider.test.ts +133 -0
package/src/lib/agent/claude-provider.ts +277 -0
package/src/lib/agent/codex-provider.ts +413 -0
package/src/lib/agent/default-providers.ts +10 -0
package/src/lib/agent/index.ts +32 -0
package/src/lib/agent/mock-provider.ts +61 -0
package/src/lib/agent/opencode-provider.ts +424 -0
package/src/lib/agent/types.ts +73 -0
package/src/lib/auth.ts +11 -0
package/src/lib/config.ts +152 -0
package/src/lib/daemon-callbacks.ts +59 -0
package/src/lib/daemon-client.ts +16 -0
package/src/lib/daemon-lifecycle.ts +368 -0
package/src/lib/daemon-spawn.ts +122 -0
package/src/lib/daemon-status.ts +189 -0
package/src/lib/daemon-watcher.ts +192 -0
package/src/lib/experiment-loop.ts +679 -0
package/src/lib/experiment.ts +356 -0
package/src/lib/finalize.test.ts +143 -0
package/src/lib/finalize.ts +511 -0
package/src/lib/format.test.ts +32 -0
package/src/lib/format.ts +44 -0
package/src/lib/git.ts +176 -0
package/src/lib/ideas-backlog.test.ts +54 -0
package/src/lib/ideas-backlog.ts +109 -0
package/src/lib/measure.ts +472 -0
package/src/lib/model-options.ts +24 -0
package/src/lib/programs.ts +247 -0
package/src/lib/push-stream.ts +48 -0
package/src/lib/run-context.ts +112 -0
package/src/lib/run-setup.ts +34 -0
package/src/lib/run.ts +383 -0
package/src/lib/syntax-theme.ts +39 -0
package/src/lib/system-prompts/experiment.ts +77 -0
package/src/lib/system-prompts/finalize.ts +90 -0
package/src/lib/system-prompts/index.ts +7 -0
package/src/lib/system-prompts/setup.ts +516 -0
package/src/lib/system-prompts/update.ts +188 -0
package/src/lib/tool-events.ts +99 -0
package/src/lib/validate-measurement.ts +326 -0
package/src/lib/worktree.ts +40 -0
package/src/screens/AuthErrorScreen.tsx +31 -0
package/src/screens/ExecutionScreen.tsx +851 -0
package/src/screens/FirstSetupScreen.tsx +168 -0
package/src/screens/HomeScreen.tsx +406 -0
package/src/screens/PreRunScreen.tsx +206 -0
package/src/screens/SettingsScreen.tsx +189 -0
package/src/screens/SetupScreen.tsx +226 -0
package/src/tui.tsx +17 -0
package/tsconfig.json +17 -0

package/README.md ADDED Viewed

@@ -0,0 +1,197 @@
+# AutoAuto
+A TUI tool that makes the [autoresearch](https://github.com/karpathy/autoresearch) pattern easy to set up and run on any codebase. Define a metric, let an AI agent iteratively optimize your code, keep improvements, discard failures, loop overnight.
+While autoresearch originated in ML training, AutoAuto applies it to everything: software performance, test stability, prompt optimization, search ranking, marketing copy — anything where you have code and a measurable metric. No training loops, datasets, or GPUs required.
+Don't understand autoresearch or don't know what to apply it to? No problem! This tool will scan your codebase and guide you step by step through setting it up.
+This tool takes care of everything:
+* Finding autoresearch opportunities in your codebase
+* Defining metrics and creating measurement scripts
+* Defining the best settings
+* Running the experiments (supports Claude, Codex and OpenCode)
+* Using best practices
+## What it does
+AutoAuto wraps the full autoresearch workflow — from defining what to optimize, to running hundreds of autonomous experiments, to packaging the results — into a terminal UI that handles all the tricky parts for you.
+```
+┌─ Setup ────────────────────────────────────────────────────────────┐
+│ An AI agent inspects your repo, helps you define what to optimize, │
+│ generates a measurement script, and validates it's stable.         │
+└────────────────────────────────────────────────────────────────────┘
+                              ↓
+┌─ Execute ──────────────────────────────────────────────────────────┐
+│ Autonomous loop in a background daemon:                            │
+│   1. Spawn a fresh agent with context from previous experiments    │
+│   2. Agent makes one change and commits                            │
+│   3. Measure (median of N runs)                                    │
+│   4. Keep if improved beyond noise threshold, discard otherwise    │
+│   5. Repeat                                                        │
+│ Runs in a git worktree — your main checkout stays clean.           │
+│ Survives terminal close.                                           │
+└────────────────────────────────────────────────────────────────────┘
+                              ↓
+┌─ Finalize ─────────────────────────────────────────────────────────┐
+│ Review the accumulated diff, group changes into independent        │
+│ branches, and produce a summary.                                   │
+└────────────────────────────────────────────────────────────────────┘
+```
+## Why use this instead of a script
+Autoresearch looks simple — "just loop an agent and measure" — but real implementations fail in predictable ways. AutoAuto encodes lessons from 30+ real-world implementations so you don't have to learn the hard way:
+- **Metric gaming** — Agents optimize the measurement instead of the real goal (random seed manipulation, stripping untested features, benchmark-specific hacks). AutoAuto locks measurement files, enforces scope constraints, and supports quality gates.
+- **Variance** — A "3% improvement" means nothing if your measurement has 5% noise. AutoAuto validates measurement stability during setup, runs median-of-N measurements, and uses a noise threshold to filter false improvements.
+- **Agent drift** — Without constraints, agents rewrite your architecture or add dependencies you never wanted. AutoAuto's `program.md` defines exactly what's in scope and off-limits.
+- **Narrative momentum** — Long-running agents convince themselves their approach is working and resist changing direction. AutoAuto spawns a fresh agent per experiment with no memory except a structured context packet.
+- **Recovery** — One bad experiment shouldn't corrupt the run. AutoAuto uses `git reset --hard` for clean rollback after every failure, re-baselines to detect environment drift, and auto-stops after prolonged stagnation.
+## Quick start
+### Prerequisites
+- [Bun](https://bun.sh) runtime
+- One of the supported AI providers:
+  - **Claude** — [Claude CLI](https://docs.anthropic.com/en/docs/claude-code) installed and authenticated, or `ANTHROPIC_API_KEY` set
+  - **Codex** — [Codex CLI](https://github.com/openai/codex) installed
+  - **OpenCode** — [OpenCode](https://opencode.ai) installed
+### Install and run
+TODO: add easier way to install
+```bash
+git clone https://github.com/SpaceK33z/autoauto.git
+cd autoauto
+bun install
+bun dev
+```
+Or install globally:
+```bash
+bun link
+autoauto
+```
+### Headless CLI
+AutoAuto also has a headless CLI for coding agents, CI, or general scripting:
+```bash
+autoauto list                        # List programs
+autoauto runs <program>              # List runs for a program
+autoauto run <program>               # Start a run
+autoauto run <program> --max 50      # Run with max experiments
+autoauto stop <program>              # Stop after current experiment
+autoauto attach <program>            # Attach TUI to a running daemon
+```
+## How it works
+### 1. Setup — define what to optimize
+The Setup Agent inspects your repo and walks you through an interactive chat to configure an optimization **program**:
+- **Goal** — What are you optimizing? (e.g., "reduce homepage LCP", "fix flaky test suite", "improve prompt pass rate")
+- **Measurement script** — A `measure.sh` that outputs a JSON object with your metric. AutoAuto validates it runs cleanly and measures variance across multiple runs.
+- **Scope constraints** — Which files the agent can touch, what's off-limits, and rules it must follow.
+- **Quality gates** — Secondary metrics that must stay within bounds (e.g., "CLS must remain below 0.1" while optimizing LCP).
+The result is a reusable program stored in `.autoauto/programs/<name>/` — you can run it repeatedly.
+### 2. Execute — the autonomous loop
+Hit run and AutoAuto:
+1. Creates a git worktree (your main checkout stays untouched)
+2. Spawns a background daemon that survives terminal close
+3. Establishes a baseline measurement
+4. Loops: spawn agent → one change → commit → measure → keep or discard → repeat
+Each experiment agent gets a **context packet** — not the full chat history, but a structured summary: current baseline, recent results, git log of kept changes, diffs from recently discarded attempts, and an ideas backlog of what's been tried. This prevents repeating failed approaches while keeping context small.
+The live TUI dashboard shows:
+- **Stats header** — experiment count, keeps/discards, baseline vs best with improvement %, cost, and a sparkline
+- **Results table** — color-coded experiment outcomes (green = kept, red = discarded)
+- **Agent panel** — live streaming output from the current experiment
+### 3. Finalize — package the results
+After the loop completes (or you stop it), a Finalize Agent reviews the accumulated diff and groups changes into independent branches for clean review and merge. Falls back to a single squash commit if changes are too intertwined.
+## Key safeguards
+| Safeguard | What it prevents |
+|-----------|-----------------|
+| **Locked evaluator** — `measure.sh` + `config.json` are `chmod 444` during runs | Agent modifying the measurement to fake improvements |
+| **Scope constraints** — `program.md` defines allowed files and off-limits areas | Agent drifting into unrelated code or risky changes |
+| **Quality gates** — secondary metrics with hard thresholds | Agent improving one metric by degrading another |
+| **Noise threshold** — improvements must exceed measured variance | False positives from measurement noise |
+| **Median-of-N** — repeated measurements with median aggregation | Outlier measurements causing bad decisions |
+| **One agent per experiment** — fresh context each iteration | Narrative momentum and compounding errors |
+| **Git worktree isolation** — experiments run in a separate checkout | Corrupting your working directory |
+| **Lock violation detection** — discards any experiment that touches `.autoauto/` | Agent tampering with its own config |
+| **Re-baselining** — fresh baseline after keeps and after consecutive discards | Environment drift causing phantom improvements |
+| **Stagnation detection** — auto-stops after 10 consecutive non-improving experiments | Burning money when the agent is stuck |
+| **Simplicity criterion** — auto-keeps within-noise changes that reduce LOC | Discarding code simplifications just because they show no metric gain |
+## Data model
+```
+.autoauto/                            # All state, gitignored automatically
+  config.json                         # Project config (models, provider)
+  programs/
+    homepage-lcp/
+      program.md                      # Agent instructions + scope constraints
+      measure.sh                      # Measurement script
+      config.json                     # Metric, direction, noise, quality gates
+      build.sh                        # Optional build step before measurement
+      runs/
+        20260407-143022/
+          state.json                  # Run state checkpoint
+          results.tsv                 # Append-only experiment outcomes
+          ideas.md                    # Ideas backlog (optional)
+          stream-001.log              # Per-experiment agent output
+          ...
+  worktrees/
+    20260407-143022/                   # Git worktree for active run
+```
+## Configuration
+AutoAuto supports two model slots:
+- **Execution model** — powers the experiment agents (default: Sonnet)
+- **Support model** — powers setup, update, and finalize agents (default: Sonnet)
+Both are configurable per-provider with effort level (low/medium/high). Override per-run from the pre-run config screen.
+Supported providers: **Claude** (Agent SDK), **Codex** (CLI), **OpenCode**.
+## Stack
+- **Runtime:** [Bun](https://bun.sh)
+- **Language:** TypeScript (strict mode)
+- **TUI:** [OpenTUI](https://opentui.com) (React reconciler for the terminal)
+- **Agent:** [Claude Agent SDK](https://github.com/anthropics/claude-agent-sdk-typescript), with pluggable provider support
+## Documentation
+| Doc | Contents |
+|-----|----------|
+| [Architecture](docs/architecture.md) | System architecture, data model, daemon design |
+| [Orchestration Patterns](docs/orchestration-patterns.md) | Loop design, context packets, stopping criteria |
+| [Measurement Patterns](docs/measurement-patterns.md) | Metric design, scoring approaches, variance handling |
+| [Failure Patterns](docs/failure-patterns.md) | Documented failure modes and safeguards from real implementations |
+| [Autoresearch Ideas](docs/autoresearch-ideas.md) | Non-ML use cases across performance, prompts, marketing, and more |
+## License
+MIT

package/package.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "name": "@spacek33z/autoauto",
+  "version": "0.0.1",
+  "description": "TUI tool for autoresearch — autonomous experiment loops on any codebase",
+  "type": "module",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/SpaceK33z/autoauto.git"
+  },
+  "author": "Kees Kluskens",
+  "homepage": "https://github.com/SpaceK33z/autoauto",
+  "engines": {
+    "bun": ">=1.0.0"
+  },
+  "keywords": [
+    "autoresearch",
+    "tui",
+    "ai",
+    "automation",
+    "experiments",
+    "claude",
+    "codex"
+  ],
+  "files": [
+    "src/",
+    "tsconfig.json"
+  ],
+  "scripts": {
+    "dev": "bun run src/index.tsx",
+    "build": "bun build --compile src/index.tsx --outfile dist/autoauto",
+    "lint": "oxlint",
+    "typecheck": "bun tsc --noEmit"
+  },
+  "bin": {
+    "autoauto": "./src/index.tsx"
+  },
+  "dependencies": {
+    "@anthropic-ai/claude-agent-sdk": "^0.2.92",
+    "@openai/codex-sdk": "^0.118.0",
+    "@opencode-ai/sdk": "^1.3.17",
+    "@opentui/core": "^0.1.97",
+    "@opentui/react": "^0.1.97"
+  },
+  "devDependencies": {
+    "@types/bun": "^1.3.11",
+    "@types/react": "^19.2.14",
+    "oxlint": "^1.58.0",
+    "typescript": "^5.7.0"
+  }
+}

package/src/App.tsx ADDED Viewed

@@ -0,0 +1,224 @@
+import { useState, useEffect } from "react"
+import {
+  useKeyboard,
+  useRenderer,
+  useTerminalDimensions,
+} from "@opentui/react"
+import { HomeScreen } from "./screens/HomeScreen.tsx"
+import { SetupScreen } from "./screens/SetupScreen.tsx"
+import { SettingsScreen } from "./screens/SettingsScreen.tsx"
+import { ExecutionScreen } from "./screens/ExecutionScreen.tsx"
+import { PreRunScreen, type PreRunOverrides } from "./screens/PreRunScreen.tsx"
+import { FirstSetupScreen } from "./screens/FirstSetupScreen.tsx"
+import { PostUpdatePrompt } from "./components/PostUpdatePrompt.tsx"
+import { ensureAutoAutoDir, getProjectRoot, type Screen } from "./lib/programs.ts"
+import { loadProjectConfig, configExists, DEFAULT_CONFIG, type ProjectConfig } from "./lib/config.ts"
+import { isRunActive } from "./lib/run.ts"
+const cwd = process.cwd() // Working directory captured once at module load; passed to the config/program helpers and to several screens below.
+export function App() { // Root TUI component: owns navigation state and renders exactly one screen based on `screen`.
+  const renderer = useRenderer()
+  const { width, height } = useTerminalDimensions()
+  const [screen, setScreen] = useState<Screen | null>(null) // null until the initial screen has been decided
+  const [selectedProgram, setSelectedProgram] = useState<string | null>(null)
+  const [projectRoot, setProjectRoot] = useState(cwd) // starts as cwd, replaced once getProjectRoot resolves
+  const [projectConfig, setProjectConfig] = useState<ProjectConfig>(DEFAULT_CONFIG)
+  const [preRunOverrides, setPreRunOverrides] = useState<PreRunOverrides | null>(null)
+  const [attachRunId, setAttachRunId] = useState<string | null>(null)
+  const [attachReadOnly, setAttachReadOnly] = useState(false)
+  const [autoFinalize, setAutoFinalize] = useState(false)
+  const [updateProgramSlug, setUpdateProgramSlug] = useState<string | null>(null) // non-null while the setup screen is editing an existing program
+  const [showPostUpdatePrompt, setShowPostUpdatePrompt] = useState(false)
+  useEffect(() => { // One-time bootstrap (empty dependency list).
+    getProjectRoot(cwd).then(setProjectRoot).catch(() => {}) // best-effort: on failure projectRoot stays at cwd
+    ensureAutoAutoDir(cwd).catch(() => {}) // best-effort: failure is ignored
+    configExists(cwd).then((exists) => { // NOTE(review): no rejection handler — if this rejects, `screen` stays null and the UI remains on "Loading..."; confirm configExists cannot reject
+      setScreen(exists ? "home" : "first-setup")
+    })
+  }, [])
+  // Load project config + reload when returning to home
+  useEffect(() => {
+    if (screen === "home") {
+      loadProjectConfig(cwd).then(setProjectConfig) // NOTE(review): no rejection handler here either — confirm loadProjectConfig cannot reject
+    }
+  }, [screen])
+  useKeyboard((key) => { // Global key handler: Escape on the home screen tears down the renderer (footer labels this "Esc: quit").
+    if (key.name === "escape") {
+      if (screen === "home") {
+        renderer.destroy()
+      }
+      // execution screen handles its own Escape
+    }
+  })
+  if (!screen) { // Initial screen not decided yet: show a centered placeholder.
+    return (
+      <box flexDirection="column" width={width} height={height}>
+        <box flexGrow={1} justifyContent="center" alignItems="center">
+          <text fg="#888888">Loading...</text>
+        </box>
+      </box>
+    )
+  }
+  const footerText = // Key-hint line for the current screen; any screen not listed falls back to " Escape: back".
+    screen === "home"
+      ? " n: new | e: edit | d: delete | f: finalize | s: settings | Tab: switch | Enter: run | Esc: quit"
+      : screen === "execution"
+        ? " Escape: detach (daemon continues) | Tab: switch panel | s: settings | q: stop | Ctrl+C: abort"
+        : screen === "settings"
+          ? " ↑↓: navigate | ←→: change/open | Enter: open model picker | Escape: back"
+          : screen === "first-setup"
+            ? " ↑↓: navigate | ←→: cycle | Enter: select/continue"
+            : " Escape: back"
+  return (
+    <box flexDirection="column" width={width} height={height}>
+      {screen !== "execution" && screen !== "pre-run" && ( // title banner is omitted on the execution and pre-run screens
+        <box
+          height={3}
+          border
+          borderStyle="rounded"
+          justifyContent="center"
+          alignItems="center"
+        >
+          <text>
+            <strong>AutoAuto</strong>
+          </text>
+        </box>
+      )}
+      <box flexDirection="column" flexGrow={1} flexShrink={1}>
+        {screen === "first-setup" && (
+          <FirstSetupScreen
+            cwd={cwd}
+            navigate={setScreen}
+            onConfigChange={setProjectConfig}
+          />
+        )}
+        {screen === "home" && (
+          <HomeScreen
+            cwd={cwd}
+            navigate={setScreen}
+            onSelectProgram={(slug) => { // choosing a program clears any attach state and goes to the pre-run screen
+              setSelectedProgram(slug)
+              setAttachRunId(null)
+              setAttachReadOnly(false)
+              setScreen("pre-run")
+            }}
+            onSelectRun={(run) => { // choosing a run attaches the execution screen to it
+              if (!run.state) return // runs without state are ignored
+              setSelectedProgram(run.state.program_slug)
+              setPreRunOverrides(null)
+              setAttachRunId(run.run_id)
+              setAutoFinalize(false)
+              setAttachReadOnly(!isRunActive(run)) // read-only when isRunActive reports the run as not active
+              setScreen("execution")
+            }}
+            onFinalizeRun={(run) => { // same attach flow as onSelectRun, but with autoFinalize on and never read-only
+              if (!run.state) return
+              setSelectedProgram(run.state.program_slug)
+              setPreRunOverrides(null)
+              setAttachRunId(run.run_id)
+              setAutoFinalize(true)
+              setAttachReadOnly(false)
+              setScreen("execution")
+            }}
+            onUpdateProgram={(slug) => { // opens the setup screen in update mode for this program
+              setUpdateProgramSlug(slug)
+              setSelectedProgram(slug)
+              setScreen("setup")
+            }}
+          />
+        )}
+        {screen === "setup" && !showPostUpdatePrompt && (
+          <SetupScreen
+            cwd={projectRoot}
+            navigate={(s) => {
+              if (updateProgramSlug && s === "home") {
+                // Leaving update mode — show post-update prompt
+                setShowPostUpdatePrompt(true)
+              } else {
+                setUpdateProgramSlug(null)
+                setScreen(s)
+              }
+            }}
+            modelConfig={projectConfig.supportModel}
+            programSlug={updateProgramSlug ?? undefined}
+          />
+        )}
+        {screen === "setup" && showPostUpdatePrompt && selectedProgram && ( // replaces the setup screen while the post-update prompt is pending
+          <PostUpdatePrompt
+            programSlug={selectedProgram}
+            onStartRun={() => {
+              setShowPostUpdatePrompt(false)
+              setUpdateProgramSlug(null)
+              setScreen("pre-run")
+            }}
+            onGoHome={() => {
+              setShowPostUpdatePrompt(false)
+              setUpdateProgramSlug(null)
+              setScreen("home")
+            }}
+          />
+        )}
+        {screen === "settings" && (
+          <SettingsScreen
+            cwd={cwd}
+            navigate={setScreen}
+            config={projectConfig}
+            onConfigChange={setProjectConfig}
+          />
+        )}
+        {screen === "pre-run" && selectedProgram && (
+          <PreRunScreen
+            cwd={projectRoot}
+            programSlug={selectedProgram}
+            defaultModelConfig={projectConfig.executionModel}
+            navigate={setScreen}
+            onStart={(overrides) => { // stores the chosen overrides, clears attach state, and enters the execution screen
+              setPreRunOverrides(overrides)
+              setAttachRunId(null)
+              setAttachReadOnly(false)
+              setScreen("execution")
+            }}
+          />
+        )}
+        {screen === "execution" && selectedProgram && (preRunOverrides || attachRunId) && ( // rendered only with pre-run overrides or a run id to attach to
+          <ExecutionScreen
+            cwd={projectRoot}
+            programSlug={selectedProgram}
+            modelConfig={preRunOverrides?.modelConfig ?? projectConfig.executionModel}
+            supportModelConfig={projectConfig.supportModel}
+            ideasBacklogEnabled={projectConfig.ideasBacklogEnabled}
+            navigate={(s) => { setPreRunOverrides(null); setAttachRunId(null); setAttachReadOnly(false); setAutoFinalize(false); setScreen(s) }}
+            maxExperiments={preRunOverrides?.maxExperiments ?? 0}
+            useWorktree={preRunOverrides?.useWorktree ?? true}
+            attachRunId={attachRunId ?? undefined}
+            readOnly={attachReadOnly}
+            autoFinalize={autoFinalize}
+            onUpdateProgram={(slug) => { // resets all run/attach state, then opens the setup screen in update mode
+              setPreRunOverrides(null)
+              setAttachRunId(null)
+              setAttachReadOnly(false)
+              setAutoFinalize(false)
+              setUpdateProgramSlug(slug)
+              setSelectedProgram(slug)
+              setScreen("setup")
+            }}
+          />
+        )}
+      </box>
+      {screen !== "pre-run" && ( // footer hint line is omitted on the pre-run screen
+        <box height={1} flexShrink={0} paddingX={1}>
+          <text fg="#888888">{footerText}</text>
+        </box>
+      )}
+    </box>
+  )
+}