wf-gremlins 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wf_gremlins-0.1.0/.claude/scheduled_tasks.lock +1 -0
- wf_gremlins-0.1.0/.claude/settings.local.json +76 -0
- wf_gremlins-0.1.0/.github/workflows/ci.yml +26 -0
- wf_gremlins-0.1.0/.gitignore +10 -0
- wf_gremlins-0.1.0/.gremlins/env +3 -0
- wf_gremlins-0.1.0/DESIGN.md +546 -0
- wf_gremlins-0.1.0/Makefile +15 -0
- wf_gremlins-0.1.0/PKG-INFO +411 -0
- wf_gremlins-0.1.0/README.md +396 -0
- wf_gremlins-0.1.0/gremlins/AGENTS.md +137 -0
- wf_gremlins-0.1.0/gremlins/__init__.py +0 -0
- wf_gremlins-0.1.0/gremlins/__main__.py +4 -0
- wf_gremlins-0.1.0/gremlins/bail.py +52 -0
- wf_gremlins-0.1.0/gremlins/cli.py +320 -0
- wf_gremlins-0.1.0/gremlins/clients/__init__.py +25 -0
- wf_gremlins-0.1.0/gremlins/clients/claude.py +188 -0
- wf_gremlins-0.1.0/gremlins/clients/copilot.py +120 -0
- wf_gremlins-0.1.0/gremlins/clients/fake.py +111 -0
- wf_gremlins-0.1.0/gremlins/clients/protocol.py +33 -0
- wf_gremlins-0.1.0/gremlins/clients/resolve.py +119 -0
- wf_gremlins-0.1.0/gremlins/clients/stream.py +160 -0
- wf_gremlins-0.1.0/gremlins/env_file.py +50 -0
- wf_gremlins-0.1.0/gremlins/fleet/AGENTS.md +27 -0
- wf_gremlins-0.1.0/gremlins/fleet/__init__.py +114 -0
- wf_gremlins-0.1.0/gremlins/fleet/ack.py +44 -0
- wf_gremlins-0.1.0/gremlins/fleet/cli.py +334 -0
- wf_gremlins-0.1.0/gremlins/fleet/close.py +38 -0
- wf_gremlins-0.1.0/gremlins/fleet/constants.py +33 -0
- wf_gremlins-0.1.0/gremlins/fleet/duration.py +15 -0
- wf_gremlins-0.1.0/gremlins/fleet/land.py +881 -0
- wf_gremlins-0.1.0/gremlins/fleet/log.py +36 -0
- wf_gremlins-0.1.0/gremlins/fleet/render.py +88 -0
- wf_gremlins-0.1.0/gremlins/fleet/rescue.py +921 -0
- wf_gremlins-0.1.0/gremlins/fleet/resolve.py +45 -0
- wf_gremlins-0.1.0/gremlins/fleet/session_summary.py +302 -0
- wf_gremlins-0.1.0/gremlins/fleet/state.py +181 -0
- wf_gremlins-0.1.0/gremlins/fleet/stop.py +108 -0
- wf_gremlins-0.1.0/gremlins/fleet/views.py +248 -0
- wf_gremlins-0.1.0/gremlins/gh_utils.py +288 -0
- wf_gremlins-0.1.0/gremlins/git.py +546 -0
- wf_gremlins-0.1.0/gremlins/handoff.py +591 -0
- wf_gremlins-0.1.0/gremlins/init.py +232 -0
- wf_gremlins-0.1.0/gremlins/launcher.py +675 -0
- wf_gremlins-0.1.0/gremlins/logging_setup.py +28 -0
- wf_gremlins-0.1.0/gremlins/orchestrators/AGENTS.md +54 -0
- wf_gremlins-0.1.0/gremlins/orchestrators/__init__.py +0 -0
- wf_gremlins-0.1.0/gremlins/orchestrators/boss.py +1127 -0
- wf_gremlins-0.1.0/gremlins/orchestrators/gh.py +795 -0
- wf_gremlins-0.1.0/gremlins/orchestrators/local.py +669 -0
- wf_gremlins-0.1.0/gremlins/pipeline.py +208 -0
- wf_gremlins-0.1.0/gremlins/pipelines/AGENTS.md +42 -0
- wf_gremlins-0.1.0/gremlins/pipelines/__init__.py +0 -0
- wf_gremlins-0.1.0/gremlins/pipelines/gh.yaml +14 -0
- wf_gremlins-0.1.0/gremlins/pipelines/local.yaml +10 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/address_code.md +16 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/code_style.md +9 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/ghaddress.md +105 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/ghplan.md +49 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/ghreview.md +67 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/implement_gh.md +9 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/implement_local.md +7 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/plan.md +22 -0
- wf_gremlins-0.1.0/gremlins/pipelines/prompts/review/detail.md +13 -0
- wf_gremlins-0.1.0/gremlins/prompts/__init__.py +3 -0
- wf_gremlins-0.1.0/gremlins/prompts/loader.py +17 -0
- wf_gremlins-0.1.0/gremlins/run_pipeline.py +47 -0
- wf_gremlins-0.1.0/gremlins/runner.py +375 -0
- wf_gremlins-0.1.0/gremlins/session_summary.py +12 -0
- wf_gremlins-0.1.0/gremlins/stages/AGENTS.md +76 -0
- wf_gremlins-0.1.0/gremlins/stages/__init__.py +4 -0
- wf_gremlins-0.1.0/gremlins/stages/address_code.py +119 -0
- wf_gremlins-0.1.0/gremlins/stages/all.py +31 -0
- wf_gremlins-0.1.0/gremlins/stages/base.py +55 -0
- wf_gremlins-0.1.0/gremlins/stages/ci_fix.md +16 -0
- wf_gremlins-0.1.0/gremlins/stages/commit_pr.py +123 -0
- wf_gremlins-0.1.0/gremlins/stages/commit_pr_fresh.md +1 -0
- wf_gremlins-0.1.0/gremlins/stages/commit_pr_handoff_clean.md +1 -0
- wf_gremlins-0.1.0/gremlins/stages/commit_pr_handoff_dirty.md +1 -0
- wf_gremlins-0.1.0/gremlins/stages/context.py +27 -0
- wf_gremlins-0.1.0/gremlins/stages/ghaddress.py +55 -0
- wf_gremlins-0.1.0/gremlins/stages/ghplan.py +89 -0
- wf_gremlins-0.1.0/gremlins/stages/ghreview.py +60 -0
- wf_gremlins-0.1.0/gremlins/stages/implement.py +253 -0
- wf_gremlins-0.1.0/gremlins/stages/plan.py +45 -0
- wf_gremlins-0.1.0/gremlins/stages/registry.py +17 -0
- wf_gremlins-0.1.0/gremlins/stages/request_copilot.py +46 -0
- wf_gremlins-0.1.0/gremlins/stages/review_code.py +148 -0
- wf_gremlins-0.1.0/gremlins/stages/verify.py +152 -0
- wf_gremlins-0.1.0/gremlins/stages/verify_fix.md +29 -0
- wf_gremlins-0.1.0/gremlins/stages/wait_ci.py +285 -0
- wf_gremlins-0.1.0/gremlins/stages/wait_copilot.py +53 -0
- wf_gremlins-0.1.0/gremlins/state.py +258 -0
- wf_gremlins-0.1.0/pyproject.toml +44 -0
- wf_gremlins-0.1.0/requirements.dev.txt +3 -0
- wf_gremlins-0.1.0/requirements.txt +0 -0
- wf_gremlins-0.1.0/scripts/check-venv.sh +81 -0
- wf_gremlins-0.1.0/tests/__init__.py +0 -0
- wf_gremlins-0.1.0/tests/conftest.py +113 -0
- wf_gremlins-0.1.0/tests/fixtures/__init__.py +0 -0
- wf_gremlins-0.1.0/tests/fixtures/boss_state_sample.json +99 -0
- wf_gremlins-0.1.0/tests/fixtures/fake_claude.py +395 -0
- wf_gremlins-0.1.0/tests/fixtures/fake_gh.py +112 -0
- wf_gremlins-0.1.0/tests/fixtures/handoff_bad_chain_done.md +12 -0
- wf_gremlins-0.1.0/tests/fixtures/handoff_bad_next_plan.md +21 -0
- wf_gremlins-0.1.0/tests/fixtures/shell_env.py +202 -0
- wf_gremlins-0.1.0/tests/test_cli.py +413 -0
- wf_gremlins-0.1.0/tests/test_clients_claude.py +212 -0
- wf_gremlins-0.1.0/tests/test_clients_copilot.py +184 -0
- wf_gremlins-0.1.0/tests/test_clients_spec.py +84 -0
- wf_gremlins-0.1.0/tests/test_env_file.py +43 -0
- wf_gremlins-0.1.0/tests/test_fleet.py +1232 -0
- wf_gremlins-0.1.0/tests/test_gh_utils.py +120 -0
- wf_gremlins-0.1.0/tests/test_handoff.py +688 -0
- wf_gremlins-0.1.0/tests/test_init.py +292 -0
- wf_gremlins-0.1.0/tests/test_launcher.py +957 -0
- wf_gremlins-0.1.0/tests/test_logging_setup.py +45 -0
- wf_gremlins-0.1.0/tests/test_orchestrator_boss.py +2607 -0
- wf_gremlins-0.1.0/tests/test_orchestrator_gh.py +1922 -0
- wf_gremlins-0.1.0/tests/test_orchestrator_local.py +570 -0
- wf_gremlins-0.1.0/tests/test_parallel_bail_shards.py +664 -0
- wf_gremlins-0.1.0/tests/test_parallel_runner.py +320 -0
- wf_gremlins-0.1.0/tests/test_pipeline_loader.py +274 -0
- wf_gremlins-0.1.0/tests/test_prompts_loader.py +61 -0
- wf_gremlins-0.1.0/tests/test_registry_isolation.py +41 -0
- wf_gremlins-0.1.0/tests/test_rescue_phase_a.py +434 -0
- wf_gremlins-0.1.0/tests/test_runner.py +91 -0
- wf_gremlins-0.1.0/tests/test_session_summary.py +505 -0
- wf_gremlins-0.1.0/tests/test_skills_python_sh.py +117 -0
- wf_gremlins-0.1.0/tests/test_stage_ghaddress.py +93 -0
- wf_gremlins-0.1.0/tests/test_stage_ghreview.py +93 -0
- wf_gremlins-0.1.0/tests/test_stage_request_copilot.py +69 -0
- wf_gremlins-0.1.0/tests/test_stage_verify.py +281 -0
- wf_gremlins-0.1.0/tests/test_stage_wait_ci.py +376 -0
- wf_gremlins-0.1.0/tests/test_stages_gh.py +153 -0
- wf_gremlins-0.1.0/tests/test_stages_local.py +470 -0
- wf_gremlins-0.1.0/tests/test_state_isolation.py +343 -0
- wf_gremlins-0.1.0/tests/test_stream_events.py +198 -0
- wf_gremlins-0.1.0/uv.lock +199 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"sessionId":"bb2c6c43-252c-458f-bf72-53f151a1f3e6","pid":96974,"procStart":"Mon May 4 01:42:09 2026","acquiredAt":1777869723354}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(git add *)",
|
|
5
|
+
"Bash(git commit *)",
|
|
6
|
+
"Bash(git push *)",
|
|
7
|
+
"Bash(python -m pytest tests/test_fleet.py -x -q)",
|
|
8
|
+
"Bash(cat)",
|
|
9
|
+
"Bash(python /tmp/analyze_inline.py)",
|
|
10
|
+
"Skill(ghgremlin)",
|
|
11
|
+
"Bash(python -c \"import gremlins.cli; print\\('importable'\\)\")",
|
|
12
|
+
"Bash(python *)",
|
|
13
|
+
"Bash(git fetch *)",
|
|
14
|
+
"Bash(git worktree *)",
|
|
15
|
+
"Read(//private/tmp/fix-45b/**)",
|
|
16
|
+
"Bash(ruff format *)",
|
|
17
|
+
"Bash(ruff check *)",
|
|
18
|
+
"Bash(~/.claude/.venv/bin/gremlins launch *)",
|
|
19
|
+
"Read(//tmp/**)",
|
|
20
|
+
"Read(//Users/xath/.claude/plugins/**)",
|
|
21
|
+
"Read(//Users/xath/.claude/**)",
|
|
22
|
+
"Skill(ghplan)",
|
|
23
|
+
"Bash(gremlins gh *)",
|
|
24
|
+
"Skill(gremlins)",
|
|
25
|
+
"Bash(git rebase *)",
|
|
26
|
+
"Bash(git checkout *)",
|
|
27
|
+
"Bash(gremlins resume *)",
|
|
28
|
+
"Bash(pytest tests/test_clients_claude.py -v)",
|
|
29
|
+
"Bash(pytest -x -q)",
|
|
30
|
+
"Skill(bossgremlin)",
|
|
31
|
+
"Bash(gremlins launch *)",
|
|
32
|
+
"Bash(make check *)",
|
|
33
|
+
"Bash(make test *)",
|
|
34
|
+
"Bash(ps -p 27228 -o pid,etime,stat,command)",
|
|
35
|
+
"Bash(gremlins stop *)",
|
|
36
|
+
"Bash(ps -p 27228 -o pid,stat)",
|
|
37
|
+
"Bash(git -C /Users/xath/Desktop/amorphous-industries/gremlins fetch origin main)",
|
|
38
|
+
"Bash(git -C /Users/xath/Desktop/amorphous-industries/gremlins log --oneline origin/main -5)",
|
|
39
|
+
"Bash(gremlins rescue *)",
|
|
40
|
+
"Bash(git *)",
|
|
41
|
+
"Bash(copilot --help)",
|
|
42
|
+
"Bash(copilot -p \"hello\" --allow-all-tools)",
|
|
43
|
+
"Bash(copilot -p \"say exactly: pong\" --allow-all-tools)",
|
|
44
|
+
"Bash(copilot *)",
|
|
45
|
+
"Bash(uv pip *)",
|
|
46
|
+
"Bash(uv run *)",
|
|
47
|
+
"Bash(/private/tmp/ai_venvs/_Users_xath_Desktop_amorphous-industries_gremlins/bin/python *)",
|
|
48
|
+
"Bash(MODEL_RE)",
|
|
49
|
+
"Bash(pipeline_model)",
|
|
50
|
+
"Bash(gh.py)",
|
|
51
|
+
"Bash(state_model)",
|
|
52
|
+
"Bash(ghgremlin)",
|
|
53
|
+
"Bash(localgremlin)",
|
|
54
|
+
"Bash(_resolve_client_label)",
|
|
55
|
+
"Bash(gh_main)",
|
|
56
|
+
"Bash(bash *)",
|
|
57
|
+
"Bash(chmod +x /Users/xath/Desktop/amorphous-industries/gremlins/scripts/check-venv.sh)",
|
|
58
|
+
"Bash(/Users/xath/Desktop/amorphous-industries/gremlins/scripts/check-venv.sh)",
|
|
59
|
+
"Bash(echo \"exit=$?\")",
|
|
60
|
+
"Bash(./scripts/check-venv.sh)",
|
|
61
|
+
"Bash(awk '/boss_workdir, model$/ {print NR\": [\"$0\"]\"}' tests/test_orchestrator_boss.py)",
|
|
62
|
+
"Bash(pytest tests/test_handoff.py tests/test_orchestrator_boss.py -x)",
|
|
63
|
+
"Bash(pytest tests/test_handoff.py -x)",
|
|
64
|
+
"Bash(pytest tests/test_handoff.py)",
|
|
65
|
+
"Bash(pytest tests/test_orchestrator_boss.py)",
|
|
66
|
+
"Bash(pytest)",
|
|
67
|
+
"Bash(pytest tests/test_handoff.py tests/test_orchestrator_boss.py)",
|
|
68
|
+
"Bash(make typecheck *)",
|
|
69
|
+
"Bash(gremlins boss *)",
|
|
70
|
+
"Bash(uv build *)",
|
|
71
|
+
"Bash(curl -s \"https://pypi.org/simple/\")",
|
|
72
|
+
"Bash(curl -s -o /dev/null -w '%{http_code}\\\\n' https://pypi.org/pypi/__TRACKED_VAR__/json)",
|
|
73
|
+
"Bash(curl *)"
|
|
74
|
+
]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
check:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
timeout-minutes: 15
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.11"
|
|
20
|
+
- uses: astral-sh/setup-uv@v5
|
|
21
|
+
- name: Install
|
|
22
|
+
run: uv pip install --system -e ".[dev]"
|
|
23
|
+
- name: Lint, format, type-check
|
|
24
|
+
run: make check
|
|
25
|
+
- name: Test
|
|
26
|
+
run: make test
|
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
# Gremlins — System Design
|
|
2
|
+
|
|
3
|
+
This document describes how the gremlins workflow system is put together: where
|
|
4
|
+
we use deterministic code, where we delegate to a model, how context flows (and
|
|
5
|
+
is deliberately *not* shared) between stages, and the cost model that follows
|
|
6
|
+
from those choices.
|
|
7
|
+
|
|
8
|
+
It is not a reference for individual modules — see the per-package `AGENTS.md`
|
|
9
|
+
files for that. It is the rationale you need to evaluate proposed changes
|
|
10
|
+
without re-deriving the trade-offs each time.
|
|
11
|
+
|
|
12
|
+
## 1. The shape of a gremlin
|
|
13
|
+
|
|
14
|
+
A gremlin is a sequence of **stages** executed by a thin orchestrator. The
|
|
15
|
+
sequence is described in a YAML pipeline (`gremlins/pipelines/local.yaml`,
|
|
16
|
+
`gh.yaml`, optionally a project-scoped override at `.gremlins/pipelines/`).
|
|
17
|
+
|
|
18
|
+
A typical pipeline looks like this:
|
|
19
|
+
|
|
20
|
+
plan → implement → review-code → address-code → verify → commit-pr → ...
|
|
21
|
+
|
|
22
|
+
Each stage is one of two kinds:
|
|
23
|
+
|
|
24
|
+
- **Deterministic stages** are plain Python. They run shell commands, talk to
|
|
25
|
+
`gh`, manage worktrees, parse JSON, wait on CI. Apart from the fixer loops in `verify` and `wait-ci` (§2.2), they do not invoke a model.
|
|
26
|
+
Examples: `verify`, `request-copilot`, `wait-copilot`, `wait-ci`,
|
|
27
|
+
`commit-pr` (the mechanics; the message-writing is delegated).
|
|
28
|
+
- **Agentic stages** invoke `claude -p` via an injected `ClaudeClient`. They
|
|
29
|
+
receive a prompt assembled from pipeline-declared prompt files, run to
|
|
30
|
+
completion, and produce an artifact on disk (a markdown file, a commit, a
|
|
31
|
+
PR comment). Examples: `plan`, `implement`, `review-code`, `address-code`,
|
|
32
|
+
`ghreview`, `ghaddress`. Most agentic stages invoke the model exactly
|
|
33
|
+
once; the two self-healing stages described in §2.2 are the exception.
|
|
34
|
+
|
|
35
|
+
The orchestrator (`runner.run_stages`) is responsible for sequencing,
|
|
36
|
+
`--resume-from <stage>` semantics, and SIGINT/SIGTERM reaping of live `claude`
|
|
37
|
+
children. It is *not* responsible for any decision the model could plausibly
|
|
38
|
+
make better.
|
|
39
|
+
|
|
40
|
+
## 2. Determinism and agency
|
|
41
|
+
|
|
42
|
+
Gremlins run unattended — sometimes overnight, sometimes in chains of a
|
|
43
|
+
dozen. The system has to be robust against model misbehavior, transient
|
|
44
|
+
infrastructure failures, and operator interruption. That shapes where we
|
|
45
|
+
allow agency and where we refuse to.
|
|
46
|
+
|
|
47
|
+
**Use deterministic code for:**
|
|
48
|
+
|
|
49
|
+
- The sequence itself. The pipeline YAML is the contract; stages do not get
|
|
50
|
+
to decide what runs next. This is what makes resumption,
|
|
51
|
+
rescue-after-bail, and chained-boss workflows tractable — the operator
|
|
52
|
+
always knows what stage a gremlin is in, and what comes after, by reading
|
|
53
|
+
one file.
|
|
54
|
+
- Anything observed by another process. Stage names, bail classes, and the
|
|
55
|
+
marker-protocol bail reasons are byte-stable strings. They are written to
|
|
56
|
+
`state.json` and read by the launcher, the fleet manager, the rescue
|
|
57
|
+
protocol, and shell hooks. A model rewording any of these would silently
|
|
58
|
+
break cross-process consumers, so they live in Python constants and YAML
|
|
59
|
+
rather than in prompts.
|
|
60
|
+
- Filesystem and git mechanics. Worktree creation, branch handling, commit
|
|
61
|
+
authorship, PR opening — these are deterministic helpers. The model writes
|
|
62
|
+
*content* (a commit message, a plan, a code change) but does not run `git`
|
|
63
|
+
itself.
|
|
64
|
+
- Bookkeeping. `state.set_stage` and `state.emit_bail` write atomically and
|
|
65
|
+
never raise. A gremlin that crashes mid-stage must leave behind a state
|
|
66
|
+
file the rescue protocol can interpret.
|
|
67
|
+
|
|
68
|
+
**Delegate to a model for:**
|
|
69
|
+
|
|
70
|
+
- Reading code and forming a plan from an issue or a free-text prompt.
|
|
71
|
+
- Making the code change.
|
|
72
|
+
- Reviewing a diff against a lens.
|
|
73
|
+
- Deciding which review findings are worth addressing and editing the code
|
|
74
|
+
accordingly.
|
|
75
|
+
- Writing a commit message, a PR description, or a reply to a reviewer
|
|
76
|
+
comment.
|
|
77
|
+
- Chain-step decisions: the `handoff` agent decides whether a boss chain is
|
|
78
|
+
done, and if not, what the next child should plan.
|
|
79
|
+
|
|
80
|
+
The dividing line is consistent: **the model produces content; deterministic
|
|
81
|
+
code moves it around.** A stage that needed a model to decide *whether to run*
|
|
82
|
+
would be a sign the pipeline was modeled wrong.
|
|
83
|
+
|
|
84
|
+
### 2.1 Why a YAML pipeline rather than an agent loop
|
|
85
|
+
|
|
86
|
+
We could build this as a single long-lived agent that calls tools, makes
|
|
87
|
+
decisions, and produces a PR — and we considered it. We don't, for three
|
|
88
|
+
reasons:
|
|
89
|
+
|
|
90
|
+
1. **Resumability.** A pipeline with named stages and a `state.json` cursor
|
|
91
|
+
can be resumed from any stage by an operator or a rescue script. A
|
|
92
|
+
single agent loop has no equivalent — its "stage" is whatever its scratchpad
|
|
93
|
+
says it is.
|
|
94
|
+
2. **Observability.** Stage transitions are logged events that downstream
|
|
95
|
+
tools (`gremlins` status, `fleet/`, the session-summary hook) consume.
|
|
96
|
+
An agent loop's progress is opaque without parsing its transcript.
|
|
97
|
+
3. **Cost predictability.** Each stage has a bounded prompt and a bounded
|
|
98
|
+
workspace; per-stage cost is roughly stable for the one-shot stages
|
|
99
|
+
(and bounded by `--test-max-attempts` for the self-healing two). An
|
|
100
|
+
agent loop's cost is a function of how long it stays interested,
|
|
101
|
+
which is not a property we want to discover in production.
|
|
102
|
+
|
|
103
|
+
The pipeline is the deterministic skeleton; agency is intentionally confined
|
|
104
|
+
to one stage at a time.
|
|
105
|
+
|
|
106
|
+
### 2.2 Self-healing stages
|
|
107
|
+
|
|
108
|
+
Two stages — `verify` and `wait-ci` — embed an agent retry loop inside the
|
|
109
|
+
stage body. They run a deterministic check (a test command, a CI status
|
|
110
|
+
poll), and on failure invoke a fixer agent against the failure output, then
|
|
111
|
+
re-run the check. Up to `--test-max-attempts` iterations per stage call.
|
|
112
|
+
|
|
113
|
+
This is a deliberate exception to the "one agent invocation per stage"
|
|
114
|
+
shape implied above. The justification is that the artifact these stages
|
|
115
|
+
produce is *the green check itself*: the loop's exit condition is a
|
|
116
|
+
deterministic re-run, the number of fix attempts isn't known up front, and
|
|
117
|
+
from the outside the stage still either produces its artifact or bails.
|
|
118
|
+
Splitting the loop across pipeline stages would require either loop
|
|
119
|
+
semantics in the pipeline YAML (a much larger change) or unrolled stages
|
|
120
|
+
that decide whether to skip — which §2 explicitly forbids.
|
|
121
|
+
|
|
122
|
+
The cost of this exception:
|
|
123
|
+
|
|
124
|
+
- §5's "per-stage cost is roughly stable" does not hold for these two.
|
|
125
|
+
Cost scales with the number of fix attempts and the size of the
|
|
126
|
+
accumulated check output. Per-attempt streams are written to the
|
|
127
|
+
session directory (`stream-verify-N.jsonl`, `verify-attempt-N.log`)
|
|
128
|
+
so post-hoc cost analysis can resolve a loopy stage from a one-shot
|
|
129
|
+
one; the per-stage `total_cost_usd` rollup cannot.
|
|
130
|
+
- A self-healing stage is the one place where an agent invocation sees
|
|
131
|
+
content the *same stage* produced earlier — the failing check output
|
|
132
|
+
following its own fix. The isolation rules in §3 still hold across
|
|
133
|
+
stages; they bend within these two.
|
|
134
|
+
|
|
135
|
+
We accept the exception because the alternatives are worse and the set
|
|
136
|
+
is closed: exactly two stages, both deterministic-check-plus-fixer, and
|
|
137
|
+
we don't expect a third.
|
|
138
|
+
|
|
139
|
+
### 2.3 Bail as a control-flow channel
|
|
140
|
+
|
|
141
|
+
A stage can halt the pipeline two ways. The first is to raise:
|
|
142
|
+
`runner.run_stages` does not catch exceptions, so any unhandled exception ends the
|
|
143
|
+
run. The second is to call `state.emit_bail`, which writes a
|
|
144
|
+
`bail_class` (and optional `bail_detail`) to `state.json`.
|
|
145
|
+
|
|
146
|
+
The two routes do different jobs:
|
|
147
|
+
|
|
148
|
+
- **Raising** is for ordinary stage failure. The traceback goes to the
|
|
149
|
+
log; the operator reads it and decides what to do.
|
|
150
|
+
- **`emit_bail`** records a *structured*, *persistent* halt reason.
|
|
151
|
+
`bail_class` is one of a small set of byte-stable strings
|
|
152
|
+
(`reviewer_requested_changes`, `security`, `secrets`, `other`);
|
|
153
|
+
`bail_detail` is a one-line human note. Both live in `state.json`
|
|
154
|
+
after the process exits.
|
|
155
|
+
|
|
156
|
+
The persistence is the point. `bail_class` is read by the rescue
|
|
157
|
+
protocol (§4.3), the fleet manager, the boss recovery table, and shell
|
|
158
|
+
hooks — exactly the cross-process consumers §2 says we serve with
|
|
159
|
+
byte-stable strings rather than prose. A stage that only raises tells a
|
|
160
|
+
human; a stage that calls `emit_bail` first also tells a *script*.
|
|
161
|
+
|
|
162
|
+
`emit_bail` does not itself halt the pipeline. It writes the marker and
|
|
163
|
+
returns; the caller raises immediately afterward, or an in-stage agent
|
|
164
|
+
invokes `python -m gremlins.bail` and the stage's normal exit-code
|
|
165
|
+
handling raises on its behalf. The pairing — write the marker, then
|
|
166
|
+
raise — is the pattern. The marker outlives the raise.
|
|
167
|
+
|
|
168
|
+
`state.check_bail` is the read side. It raises `RuntimeError` if
|
|
169
|
+
`state.json` already has a `bail_class` recorded. It is called at the
|
|
170
|
+
*entry* of stages that follow a soft-failure point — `ghaddress` and
|
|
171
|
+
`ghreview` call it before posting anything to GitHub, and the
|
|
172
|
+
self-healing stages (§2.2) call it inside the retry loop after each
|
|
173
|
+
fixer agent runs, so an agent that bails via `python -m gremlins.bail`
|
|
174
|
+
halts the loop without the stage having to inspect agent output.
|
|
175
|
+
`run_stages` itself does not call `check_bail`; it doesn't need to,
|
|
176
|
+
because every emitter that wants the pipeline to stop also raises.
|
|
177
|
+
|
|
178
|
+
Why not replace this with a typed exception the orchestrator catches?
|
|
179
|
+
Because a typed exception is process-local, and by the time the rescue
|
|
180
|
+
protocol runs, the original process is gone. The bail class has to
|
|
181
|
+
survive the process boundary, and `state.json` is already the
|
|
182
|
+
cross-process contract for everything else about a gremlin. A typed
|
|
183
|
+
exception would duplicate that channel without replacing it.
|
|
184
|
+
|
|
185
|
+
## 3. Context management
|
|
186
|
+
|
|
187
|
+
Context is the central design constraint. The cheapest, most reliable, and
|
|
188
|
+
most reproducible run is the one where each agent gets exactly the
|
|
189
|
+
information it needs and nothing else. We push hard on this.
|
|
190
|
+
|
|
191
|
+
### 3.1 Stages do not share an in-memory context
|
|
192
|
+
|
|
193
|
+
Every agentic stage starts a **fresh `claude -p` subprocess** with a
|
|
194
|
+
**fresh model context**. There is no shared scratchpad, no in-memory history
|
|
195
|
+
threaded between stages, no rolling summary. When `address-code` runs after
|
|
196
|
+
`review-code`, it does not inherit anything from the review process — it gets
|
|
197
|
+
the same starting context any cold invocation would, plus the artifacts the
|
|
198
|
+
review wrote to disk.
|
|
199
|
+
|
|
200
|
+
This is a deliberate constraint, not an oversight:
|
|
201
|
+
|
|
202
|
+
- It bounds the per-stage prompt to something we can reason about.
|
|
203
|
+
- It makes stages independently testable — a stage's behavior is a function
|
|
204
|
+
of its prompt and the worktree, not of a hidden history.
|
|
205
|
+
- It forces communication between stages to go through **artifacts** —
|
|
206
|
+
files that are also useful to the operator: `plan.md`,
|
|
207
|
+
`review-code-*.md`, the commit itself, the PR description.
|
|
208
|
+
- It makes `--resume-from <stage>` semantically clean. Resuming from
|
|
209
|
+
`address-code` means re-running `address-code` with whatever artifacts
|
|
210
|
+
exist on disk; there is no "but the previous run's reviewer was thinking
|
|
211
|
+
about X" hidden state to recover.
|
|
212
|
+
|
|
213
|
+
### 3.2 Prompts are composed, not inherited
|
|
214
|
+
|
|
215
|
+
A stage's prompt is the concatenation of:
|
|
216
|
+
|
|
217
|
+
1. Prompt files declared in the pipeline YAML (`prompts/code_style.md`,
|
|
218
|
+
`prompts/implement_local.md`, etc.). These are pinned per-pipeline.
|
|
219
|
+
2. The artifacts produced by upstream stages, read from disk and embedded
|
|
220
|
+
into the prompt by the stage body.
|
|
221
|
+
3. The minimum task framing the stage needs to do its job.
|
|
222
|
+
|
|
223
|
+
A reviewer does not see the planner's prompt. The implementer does not see
|
|
224
|
+
the reviewer's lens. Each stage is given its own job in its own words, and
|
|
225
|
+
upstream output crosses the boundary as data, not as context.
|
|
226
|
+
|
|
227
|
+
### 3.3 The worktree is the workspace
|
|
228
|
+
|
|
229
|
+
Every gremlin runs in its own git worktree. The agent inside a stage uses
|
|
230
|
+
its own tools (Read, Edit, Bash) to navigate that worktree; the orchestrator
|
|
231
|
+
does not pre-load files into the prompt. This keeps the prompt small even
|
|
232
|
+
for large codebases — the agent only loads what it actually needs — and it
|
|
233
|
+
means the same prompt scales from a 100-file repo to a 10,000-file repo
|
|
234
|
+
without modification.
|
|
235
|
+
|
|
236
|
+
The cost of this is a bit of redundant exploration: `implement` re-reads
|
|
237
|
+
files that `plan` already read. We accept that cost (see §5) because the
|
|
238
|
+
alternative — pre-loading the union of files plan touched — would couple
|
|
239
|
+
the stages together, defeat resumption, and bloat the prompt.
|
|
240
|
+
|
|
241
|
+
### 3.4 Across-gremlin context isolation
|
|
242
|
+
|
|
243
|
+
Different gremlins share nothing. Different `gr_id`s have different
|
|
244
|
+
worktrees, different `state.json` files, different log directories. A boss
|
|
245
|
+
chain coordinates child gremlins by reading their `state.json` files, not
|
|
246
|
+
by sharing context with them. This is what lets a boss recover from a child
|
|
247
|
+
bail without inheriting any of the child's confusion.
|
|
248
|
+
|
|
249
|
+
### 3.5 Parallel stages
|
|
250
|
+
|
|
251
|
+
A `type: parallel` block in a pipeline YAML runs N children concurrently.
|
|
252
|
+
At runtime the block materialises as **three stages**, keeping §2's
|
|
253
|
+
deterministic-vs-agentic line intact:
|
|
254
|
+
|
|
255
|
+
- **`<group>-fanout`** (deterministic). Creates per-child artifact
|
|
256
|
+
subdirs and per-child git worktrees, each a detached checkout of the
|
|
257
|
+
current branch tip. Runs `git worktree prune` first to clear leftovers
|
|
258
|
+
from any previous interrupted run.
|
|
259
|
+
- **`<group>`** (agentic, N concurrent). Runs N `claude -p` invocations in
|
|
260
|
+
a thread pool, each in its own `StageContext` with its `child_key` and
|
|
261
|
+
the worktree path from fan-out. Children write `bail_class` and
|
|
262
|
+
`bail_detail` into `state.json` under `parallel_bails[child_key]`, never
|
|
263
|
+
into the top-level bail slot, so children cannot see each other's bails.
|
|
264
|
+
`check_bail` called with a `child_key` reads only that child's shard.
|
|
265
|
+
- **`<group>-fanin`** (deterministic). Reads `parallel_bails`, applies the
|
|
266
|
+
block's `bail_policy`, promotes a bail to the top-level `bail_class` if
|
|
267
|
+
warranted, clears `parallel_bails`, and tears down all per-child
|
|
268
|
+
worktrees with `git worktree remove --force` + `git worktree prune`.
|
|
269
|
+
Fan-in is also responsible for cleanup on crash — it runs teardown in a
|
|
270
|
+
`try/finally` so worktrees don't accumulate from aborted runs.
|
|
271
|
+
|
|
272
|
+
This decomposition fixes two latent bugs in the prior single-stage
|
|
273
|
+
parallel wrapper:
|
|
274
|
+
|
|
275
|
+
- **Lost bail.** `patch_state` did a read-modify-write without a lock.
|
|
276
|
+
Concurrent `emit_bail` calls raced; last writer won. The fix is twofold:
|
|
277
|
+
`patch_state` now holds an exclusive `fcntl.flock` on a per-`state.json`
|
|
278
|
+
lock file for the duration of each read-modify-write, and child bails go
|
|
279
|
+
into `parallel_bails[child_key]` rather than the shared top-level slot.
|
|
280
|
+
- **Bail cross-contamination.** `check_bail` read the global top-level
|
|
281
|
+
`bail_class`. A parallel child completing after a sibling bailed would
|
|
282
|
+
falsely report itself as bailed. `check_bail` is now parameterised by
|
|
283
|
+
`child_key` and reads only `parallel_bails[child_key]`.
|
|
284
|
+
|
|
285
|
+
Both fixes are backward-compatible: `child_key=None` (the default, used by
|
|
286
|
+
all sequential stages) preserves existing top-level bail semantics.
|
|
287
|
+
|
|
288
|
+
**Per-block knobs** (declared on the parallel block in the pipeline YAML):
|
|
289
|
+
|
|
290
|
+
- `cancel_on_bail: false` (default). All children run to completion even if
|
|
291
|
+
one bails. Right for review lenses where each lens is independent.
|
|
292
|
+
Set to `true` for parallel implementers where a structural bail by one
|
|
293
|
+
child makes the others irrelevant — on first bail a cancel flag is set
|
|
294
|
+
and children that have not yet started are skipped.
|
|
295
|
+
- `bail_policy: any` (default). Any bailing child causes the group to bail
|
|
296
|
+
after fan-in. Set to `all` to require every child to bail before the
|
|
297
|
+
group bails. The top-level `bail_class` is populated from the first
|
|
298
|
+
bailing child's shard.
|
|
299
|
+
|
|
300
|
+
**Worktrees are always-on.** Every parallel child gets its own worktree,
|
|
301
|
+
regardless of whether it mutates. The cost — one full working-tree checkout
|
|
302
|
+
per child, sharing the main repository's object store (`.git/worktrees/` holds only per-worktree metadata) — is small
|
|
303
|
+
relative to gremlin runtime. Unconditional worktrees remove a flag and a
|
|
304
|
+
code path: read-only and mutating parallel are architecturally identical;
|
|
305
|
+
the only difference is what the children write and what fan-in does with it.
|
|
306
|
+
|
|
307
|
+
**The merge problem is unsolved.** Fan-in for blocks whose children mutated
|
|
308
|
+
their worktrees raises `NotImplementedError`. Deciding what to do when N
|
|
309
|
+
agents each produced a different diff — pick the best, merge all,
|
|
310
|
+
cherry-pick — requires a concrete use case before the right shape is clear.
|
|
311
|
+
The current parallel use (review lenses) is read-only; it does not hit this
|
|
312
|
+
path.
|
|
313
|
+
|
|
314
|
+
**Resumability.** The three-stage decomposition makes resume targets
|
|
315
|
+
explicit:
|
|
316
|
+
|
|
317
|
+
- `--resume-from <group>-fanout`: re-create slots and run end-to-end.
|
|
318
|
+
- `--resume-from <group>`: rerun all children from cold worktrees (fan-out
|
|
319
|
+
must have already run). Do not try to skip "already-completed" children —
|
|
320
|
+
reusing partial state from a prior run is the kind of hidden-state carry-over §3 forbids.
|
|
321
|
+
- `--resume-from <group>-fanin`: re-aggregate whatever shards exist without
|
|
322
|
+
rerunning workers. The clean win when workers finished but fan-in crashed.
|
|
323
|
+
|
|
324
|
+
## 4. Boss gremlins and chained workflows
|
|
325
|
+
|
|
326
|
+
A single gremlin produces one PR from one plan. Many real tasks don't fit
|
|
327
|
+
that shape — they are sequences of related changes that have to land in
|
|
328
|
+
order, where each step's plan depends on what the previous step actually
|
|
329
|
+
did. The **boss gremlin** is the pattern for those.
|
|
330
|
+
|
|
331
|
+
A boss is itself a long-running process, but it is not a stage pipeline
|
|
332
|
+
in the §1 sense. It runs a loop:
|
|
333
|
+
|
|
334
|
+
1. Decide what the next child should do (handoff agent).
|
|
335
|
+
2. Launch a child gremlin with that plan.
|
|
336
|
+
3. Wait for the child to finish.
|
|
337
|
+
4. Land the child's PR (or recognize an externally-landed one).
|
|
338
|
+
5. Goto 1, until the handoff agent says the chain is done.
|
|
339
|
+
|
|
340
|
+
The boss's own state lives in `boss_state.json`, separate from any child's
|
|
341
|
+
`state.json`. Children are ordinary `local` or `gh` gremlins — the boss
|
|
342
|
+
doesn't run them in-process; it spawns them through the same launcher an
|
|
343
|
+
operator would, with their own worktrees, their own logs, their own
|
|
344
|
+
lifecycles. From a child's perspective there is no boss; it just has a
|
|
345
|
+
plan and runs the pipeline.
|
|
346
|
+
|
|
347
|
+
Boss resumption is keyed off `boss_state.json`, not the pipeline stage
|
|
348
|
+
vocabulary. The shared launcher resume path still tracks `state.json.stage`
|
|
349
|
+
for fleet status, but it does not pass `--resume-from` when re-spawning a
|
|
350
|
+
boss. If a caller does provide `--resume-from`, `boss_main` logs that the
|
|
351
|
+
flag is being ignored and resumes from the chain cursor in `boss_state.json`.
|
|
352
|
+
|
|
353
|
+
### 4.1 Where the agency lives
|
|
354
|
+
|
|
355
|
+
The boss reuses the §2 dividing line, applied at a different scale:
|
|
356
|
+
|
|
357
|
+
- **Deterministic:** spawning children, polling for completion, landing
|
|
358
|
+
PRs, writing `boss_state.json`, parsing children's `state.json`,
|
|
359
|
+
deciding when the chain has structurally stalled.
|
|
360
|
+
- **Agentic, exactly once per step:** the **handoff agent**
|
|
361
|
+
(`gremlins/handoff.py`). It reads the rolling plan, the chain spec, and
|
|
362
|
+
the diff accumulated on the branch, and produces one of three
|
|
363
|
+
decisions: `next-plan` (here is the plan for child N+1), `chain-done`
|
|
364
|
+
(we are finished), or `bail` (something is structurally wrong, stop and
|
|
365
|
+
ask the operator).
|
|
366
|
+
|
|
367
|
+
Everything else in the boss loop is plain Python. Notably, the boss does
|
|
368
|
+
*not* use a model to decide whether a child succeeded — it reads the
|
|
369
|
+
child's `state.json`. This is the same byte-stable-strings discipline
|
|
370
|
+
from §2 applied across the parent/child boundary.
|
|
371
|
+
|
|
372
|
+
### 4.2 Context isolation across the chain
|
|
373
|
+
|
|
374
|
+
Children inherit nothing from each other in-memory. The chain accumulates
|
|
375
|
+
context the same way stages within a single gremlin do (§3): through
|
|
376
|
+
**artifacts** — landed commits on the shared branch, an updated rolling
|
|
377
|
+
plan file, the chain spec. The handoff agent reads those artifacts to
|
|
378
|
+
decide step N+1; child N+1 then runs cold against the post-step-N
|
|
379
|
+
worktree.
|
|
380
|
+
|
|
381
|
+
This is what makes the chain resumable. A boss can be killed and rescued
|
|
382
|
+
mid-chain. The operator can stop a child, edit its PR by hand, and tell
|
|
383
|
+
the boss to continue. None of that requires reconstructing in-memory
|
|
384
|
+
context, because there isn't any — every decision is a function of files
|
|
385
|
+
on disk and the cursors recorded in `boss_state.json` and each child's `state.json`.
|
|
386
|
+
|
|
387
|
+
### 4.3 The child-bail recovery protocol
|
|
388
|
+
|
|
389
|
+
When a child bails, the boss halts and the operator decides what
|
|
390
|
+
happened. There are three operator commands, each writing one
|
|
391
|
+
unambiguous fact to the child's `state.json`:
|
|
392
|
+
|
|
393
|
+
- `gremlins resume <child-id>` — re-spawn the bailed child at its bail
|
|
394
|
+
point. The child's work is still in flight; the operator pushed a fix
|
|
395
|
+
or edited the worktree.
|
|
396
|
+
- `gremlins ack <child-id>` — assert the child's work is already in
|
|
397
|
+
main. Writes `external_outcome=landed`. Used after a manual merge.
|
|
398
|
+
- `gremlins skip <child-id>` — give up on the child's plan. Writes
|
|
399
|
+
`external_outcome=abandoned`. The handoff agent will plan something
|
|
400
|
+
different.
|
|
401
|
+
|
|
402
|
+
The boss's rescue logic is then a deterministic table lookup on the
|
|
403
|
+
child's recorded state. If the operator hasn't recorded a decision, the
|
|
404
|
+
boss prints the three options and exits non-zero — it never silently
|
|
405
|
+
re-runs the handoff and spawns a near-duplicate child. This is a deliberate
|
|
406
|
+
design choice: ambiguity at the chain level is surfaced to the operator
|
|
407
|
+
rather than papered over by another model call.
|
|
408
|
+
|
|
409
|
+
### 4.4 Why a boss isn't just a longer pipeline
|
|
410
|
+
|
|
411
|
+
We could express boss workflows as a single longer YAML pipeline with
|
|
412
|
+
many `plan → implement → review-code → ...` repetitions. We don't,
|
|
413
|
+
because:
|
|
414
|
+
|
|
415
|
+
- The number of steps isn't known up front. A real chain ends when the
|
|
416
|
+
feature is done, not at a step count we picked yesterday.
|
|
417
|
+
- Each step's plan is a function of the previous step's diff. That's an
|
|
418
|
+
agentic decision (the handoff agent), and putting it inside a stage
|
|
419
|
+
pipeline would mean a stage that decides whether the next stage runs
|
|
420
|
+
— which §2 forbids.
|
|
421
|
+
- Children need to be independently rescuable, landable, and abandonable
|
|
422
|
+
by an operator. That works because each child is a separately
|
|
423
|
+
launched gremlin with its own state file. A flattened pipeline would
|
|
424
|
+
collapse them into one process and lose the granularity.
|
|
425
|
+
|
|
426
|
+
The boss is the right abstraction precisely because it stays out of the
|
|
427
|
+
child's pipeline and confines its own agency to one decision per step.
|
|
428
|
+
|
|
429
|
+
## 5. Cost model
|
|
430
|
+
|
|
431
|
+
Per-gremlin cost is dominated by two things:
|
|
432
|
+
|
|
433
|
+
- **Token volume per stage.** Driven by prompt size + how much the agent
|
|
434
|
+
reads from the worktree. Bounded by §3 — small prompts, scoped agents.
|
|
435
|
+
- **Number of stages.** Total cost is stage count × per-stage volume. Bounded by the pipeline YAML.
|
|
436
|
+
|
|
437
|
+
We measure cost per run via `CompletedRun.cost_usd`, summed across stages
|
|
438
|
+
into `SubprocessClaudeClient.total_cost_usd`. That number is the unit we
|
|
439
|
+
optimize against.
|
|
440
|
+
|
|
441
|
+
The cost knobs we *do* use:
|
|
442
|
+
|
|
443
|
+
- **Model selection per stage.** The pipeline's `clients` block lets a
|
|
444
|
+
stage pick a smaller model. We default everything to Sonnet and would
|
|
445
|
+
drop individual stages to Haiku only with a measured reason.
|
|
446
|
+
- **Prompt size discipline.** Prompt files are reviewed for length the same
|
|
447
|
+
way code is. A bloated lens file is a regression.
|
|
448
|
+
- **Pipeline length.** Adding a stage is adding a fixed cost to every
|
|
449
|
+
gremlin forever. We resist it.
|
|
450
|
+
|
|
451
|
+
The cost knobs we have *considered and are not using today*:
|
|
452
|
+
|
|
453
|
+
### 5.1 Why session-resumption caching is out for now
|
|
454
|
+
|
|
455
|
+
We did try this.
|
|
456
|
+
|
|
457
|
+
`CompletedRun.session_id` used to be captured from the `claude -p`
|
|
458
|
+
stream-json output so a later stage could call
|
|
459
|
+
`claude --resume <session_id>`. The idea was straightforward: if two
|
|
460
|
+
stages ran back-to-back, Anthropic's prompt cache might make the second
|
|
461
|
+
stage cheaper because the first stage had already paid to build context.
|
|
462
|
+
|
|
463
|
+
What we learned is not "session continuation is fundamentally wrong."
|
|
464
|
+
What we learned is that this implementation sat in an awkward spot in
|
|
465
|
+
this design.
|
|
466
|
+
|
|
467
|
+
1. **Too few stage edges benefited.** The strongest candidate edge was
|
|
468
|
+
`review-code → address-code`, where address could plausibly reuse
|
|
469
|
+
review's reads and findings. Most other edges either do not run
|
|
470
|
+
back-to-back (`verify`, `wait-copilot`, `wait-ci`, CI gates) or do
|
|
471
|
+
not preserve enough useful context to matter (`implement → review-code`
|
|
472
|
+
rereads a tree that implement just changed).
|
|
473
|
+
|
|
474
|
+
2. **The best-looking edge in theory is weak in practice.** The
|
|
475
|
+
`plan → implement` handoff often happens minutes or hours later, with
|
|
476
|
+
the plan authored interactively and then handed to the gremlin via
|
|
477
|
+
`--plan` or an issue. That is usually outside the cache window, so the
|
|
478
|
+
main tempting edge often does not cash out.
|
|
479
|
+
|
|
480
|
+
3. **The plumbing cost was permanent.** Supporting continuation meant
|
|
481
|
+
carrying `session_id` through the client protocol, stage context, and
|
|
482
|
+
resume semantics while still keeping the cold-start path. That added
|
|
483
|
+
ongoing complexity to infrastructure we want to stay boring.
|
|
484
|
+
|
|
485
|
+
4. **It weakened the stage boundary in exactly the way §3 tries to
|
|
486
|
+
avoid.** A resumed session imports the prior stage's full message
|
|
487
|
+
history. That is in-memory context sharing through a side door.
|
|
488
|
+
Once a stage depends on inherited history, `--resume-from <stage>`
|
|
489
|
+
becomes less clean, prompts stop being as bounded, and individual
|
|
490
|
+
stages are less independently testable.
|
|
491
|
+
|
|
492
|
+
5. **It did not compose with the whole pipeline.** Sessions are linear.
|
|
493
|
+
Some of our pipelines are not. Parallel `review-code` stages and any
|
|
494
|
+
future fan-out stages still need a cold-start path, because one
|
|
495
|
+
session cannot be resumed into multiple concurrent children.
|
|
496
|
+
|
|
497
|
+
So the current position is: we removed `CompletedRun.session_id` and are
|
|
498
|
+
not pursuing session-resumption caching in the current implementation.
|
|
499
|
+
That is a "not now" decision, not a permanent design taboo.
|
|
500
|
+
|
|
501
|
+
If we ever reopen it, the bar should be concrete:
|
|
502
|
+
|
|
503
|
+
1. Measured evidence that a specific stage edge is a real cost hot spot.
|
|
504
|
+
2. A continuation model that preserves the clean cold-start and
|
|
505
|
+
`--resume-from` paths.
|
|
506
|
+
3. A clear answer for which edges may share history and which must stay
|
|
507
|
+
isolated.
|
|
508
|
+
4. A story for parallel stages, or an explicit decision that the feature
|
|
509
|
+
only applies to a narrow sequential subset.
|
|
510
|
+
|
|
511
|
+
Until then, the simpler rule wins: stages communicate through explicit
|
|
512
|
+
artifacts, not inherited session history.
|
|
513
|
+
|
|
514
|
+
### 5.2 What we'd do instead, if cost became a problem
|
|
515
|
+
|
|
516
|
+
Before reaching for session caching we would:
|
|
517
|
+
|
|
518
|
+
1. Profile per-stage cost on real gremlins and find the actual hot stage.
|
|
519
|
+
It is almost always `implement`, occasionally `review-code` on large
|
|
520
|
+
diffs.
|
|
521
|
+
2. Trim that stage's prompt or split it. Prompt size is the cheapest
|
|
522
|
+
variable to move.
|
|
523
|
+
3. Drop non-critical stages to a smaller model. `commit-pr`,
|
|
524
|
+
`ghaddress`, and the chain-step `handoff` agent are good candidates;
|
|
525
|
+
they're constrained tasks that don't need Sonnet's headroom.
|
|
526
|
+
4. Only then consider structural changes to context flow.
|
|
527
|
+
|
|
528
|
+
The discipline is: cost work follows measurement, not intuition.
|
|
529
|
+
|
|
530
|
+
## 6. What this design is not good at
|
|
531
|
+
|
|
532
|
+
Worth stating, so future contributors don't try to bend the system into
|
|
533
|
+
shapes it resists:
|
|
534
|
+
|
|
535
|
+
- **Tight feedback loops between stages.** If you find yourself wanting
|
|
536
|
+
`implement` to ask `plan` a clarifying question, the answer is to make
|
|
537
|
+
the plan better, not to wire a back-channel.
|
|
538
|
+
- **Cross-gremlin learning.** Each gremlin is independent. If two
|
|
539
|
+
gremlins are duplicating work, the fix is at the planning layer (one
|
|
540
|
+
bigger plan, or a boss chain), not at the runtime layer.
|
|
541
|
+
- **Streaming partial output to operators.** Stages run to completion and
|
|
542
|
+
produce artifacts. The log is tail-able, but no stage commits to
|
|
543
|
+
emitting structured progress mid-flight.
|
|
544
|
+
|
|
545
|
+
These are non-goals on purpose. The system is designed to be boring,
|
|
546
|
+
resumable, and cheap to reason about, in roughly that order.
|