tilth 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. tilth-0.1.0/.claude/settings.json +14 -0
  2. tilth-0.1.0/.env.example +43 -0
  3. tilth-0.1.0/.github/workflows/docs.yml +23 -0
  4. tilth-0.1.0/.github/workflows/release.yml +36 -0
  5. tilth-0.1.0/.gitignore +47 -0
  6. tilth-0.1.0/CLAUDE.md +218 -0
  7. tilth-0.1.0/LICENSE +21 -0
  8. tilth-0.1.0/PKG-INFO +143 -0
  9. tilth-0.1.0/README.md +112 -0
  10. tilth-0.1.0/docs/architecture/agent-visibility.md +56 -0
  11. tilth-0.1.0/docs/architecture/anatomy-of-a-run.md +79 -0
  12. tilth-0.1.0/docs/architecture/memory-channels.md +93 -0
  13. tilth-0.1.0/docs/architecture/overview.md +78 -0
  14. tilth-0.1.0/docs/assets/IMAGE_STYLE.md +92 -0
  15. tilth-0.1.0/docs/assets/SITE_STYLE.md +116 -0
  16. tilth-0.1.0/docs/assets/anatomy-of-a-run.png +0 -0
  17. tilth-0.1.0/docs/assets/brain-hands-session.png +0 -0
  18. tilth-0.1.0/docs/assets/harness-loop.png +0 -0
  19. tilth-0.1.0/docs/assets/iter-cap-and-summary.png +0 -0
  20. tilth-0.1.0/docs/assets/per-task-lifecycle.png +0 -0
  21. tilth-0.1.0/docs/assets/resume-after-iter-cap.png +0 -0
  22. tilth-0.1.0/docs/assets/session-end.png +0 -0
  23. tilth-0.1.0/docs/assets/session-layout.png +0 -0
  24. tilth-0.1.0/docs/assets/session-render.png +0 -0
  25. tilth-0.1.0/docs/deep-dives/hyper-observability.md +134 -0
  26. tilth-0.1.0/docs/deep-dives/index.md +14 -0
  27. tilth-0.1.0/docs/deep-dives/session-layout.md +56 -0
  28. tilth-0.1.0/docs/deep-dives/task-format.md +110 -0
  29. tilth-0.1.0/docs/deep-dives/token-recording.md +88 -0
  30. tilth-0.1.0/docs/deep-dives/two-loops.md +136 -0
  31. tilth-0.1.0/docs/deep-dives/worker-evaluator-dialogue.md +134 -0
  32. tilth-0.1.0/docs/getting-started/installation.md +102 -0
  33. tilth-0.1.0/docs/getting-started/resuming-and-resetting.md +83 -0
  34. tilth-0.1.0/docs/getting-started/running-the-demo.md +105 -0
  35. tilth-0.1.0/docs/getting-started/visualizing.md +46 -0
  36. tilth-0.1.0/docs/getting-started/your-own-project.md +81 -0
  37. tilth-0.1.0/docs/index.md +59 -0
  38. tilth-0.1.0/docs/reference/releasing.md +99 -0
  39. tilth-0.1.0/docs/reference/safety-guards.md +47 -0
  40. tilth-0.1.0/docs/stylesheets/extra.css +378 -0
  41. tilth-0.1.0/documents/session-render.png +0 -0
  42. tilth-0.1.0/mkdocs.yml +203 -0
  43. tilth-0.1.0/proposals/completed/frictions-2026-05-26.md +214 -0
  44. tilth-0.1.0/proposals/completed/prep-feature.md +296 -0
  45. tilth-0.1.0/proposals/completed/v1-implementation-plan.md +389 -0
  46. tilth-0.1.0/proposals/completed/v1-worker-evaluator-dialogue.md +198 -0
  47. tilth-0.1.0/proposals/observability-dashboard/README.md +61 -0
  48. tilth-0.1.0/proposals/observability-dashboard/mockup-a-ledger.html +642 -0
  49. tilth-0.1.0/proposals/observability-dashboard/mockup-b-flight-recorder.html +638 -0
  50. tilth-0.1.0/proposals/probes/phase1_verdict_tool_call_probe.py +319 -0
  51. tilth-0.1.0/pyproject.toml +93 -0
  52. tilth-0.1.0/tests/test_assistant_history_message.py +73 -0
  53. tilth-0.1.0/tests/test_case_parsing.py +185 -0
  54. tilth-0.1.0/tests/test_cli_router.py +156 -0
  55. tilth-0.1.0/tests/test_client_routing.py +172 -0
  56. tilth-0.1.0/tests/test_config.py +110 -0
  57. tilth-0.1.0/tests/test_context_files.py +142 -0
  58. tilth-0.1.0/tests/test_dispatch_hook_runs.py +59 -0
  59. tilth-0.1.0/tests/test_env_template_no_drift.py +20 -0
  60. tilth-0.1.0/tests/test_evaluator_raw_capture.py +60 -0
  61. tilth-0.1.0/tests/test_evaluator_sees_ledger.py +69 -0
  62. tilth-0.1.0/tests/test_evaluator_verdict_parsing.py +255 -0
  63. tilth-0.1.0/tests/test_feature_dir.py +88 -0
  64. tilth-0.1.0/tests/test_files_path_escape.py +61 -0
  65. tilth-0.1.0/tests/test_finish_reason.py +50 -0
  66. tilth-0.1.0/tests/test_info_config.py +216 -0
  67. tilth-0.1.0/tests/test_init_cmd.py +56 -0
  68. tilth-0.1.0/tests/test_iter_events.py +60 -0
  69. tilth-0.1.0/tests/test_ledger_includes_case.py +36 -0
  70. tilth-0.1.0/tests/test_ledger_io.py +80 -0
  71. tilth-0.1.0/tests/test_ledger_resume.py +33 -0
  72. tilth-0.1.0/tests/test_limit_utilization.py +125 -0
  73. tilth-0.1.0/tests/test_loop_provider_health.py +255 -0
  74. tilth-0.1.0/tests/test_loop_structured_feedback.py +92 -0
  75. tilth-0.1.0/tests/test_memory_manifest.py +105 -0
  76. tilth-0.1.0/tests/test_missing_config.py +67 -0
  77. tilth-0.1.0/tests/test_no_harness_filenames_in_agent_text.py +149 -0
  78. tilth-0.1.0/tests/test_overview_injection.py +55 -0
  79. tilth-0.1.0/tests/test_parse_json_lenient.py +53 -0
  80. tilth-0.1.0/tests/test_paths.py +136 -0
  81. tilth-0.1.0/tests/test_reasoning_request.py +75 -0
  82. tilth-0.1.0/tests/test_response_health.py +138 -0
  83. tilth-0.1.0/tests/test_run_end_to_end.py +157 -0
  84. tilth-0.1.0/tests/test_run_requires_tasks.py +40 -0
  85. tilth-0.1.0/tests/test_session_source_field.py +58 -0
  86. tilth-0.1.0/tests/test_session_status.py +62 -0
  87. tilth-0.1.0/tests/test_session_usage.py +116 -0
  88. tilth-0.1.0/tests/test_summary.py +313 -0
  89. tilth-0.1.0/tests/test_summary_rejection_category_counts.py +120 -0
  90. tilth-0.1.0/tests/test_task_status.py +62 -0
  91. tilth-0.1.0/tests/test_tasks_loading.py +211 -0
  92. tilth-0.1.0/tests/test_trace_span_ids.py +24 -0
  93. tilth-0.1.0/tests/test_usage.py +140 -0
  94. tilth-0.1.0/tests/test_visualize_dashboard.py +252 -0
  95. tilth-0.1.0/tests/test_visualize_server.py +367 -0
  96. tilth-0.1.0/tests/test_worker_case_loop.py +51 -0
  97. tilth-0.1.0/tests/test_worker_prompt_contains_full_prd.py +120 -0
  98. tilth-0.1.0/tests/test_worker_prompt_contains_own_ledger.py +85 -0
  99. tilth-0.1.0/tests/test_workspace.py +73 -0
  100. tilth-0.1.0/tilth/__init__.py +5 -0
  101. tilth-0.1.0/tilth/case.py +271 -0
  102. tilth-0.1.0/tilth/cli.py +242 -0
  103. tilth-0.1.0/tilth/client.py +309 -0
  104. tilth-0.1.0/tilth/data/env.example +43 -0
  105. tilth-0.1.0/tilth/hooks/__init__.py +17 -0
  106. tilth-0.1.0/tilth/hooks/pre_tool.py +39 -0
  107. tilth-0.1.0/tilth/loop.py +1833 -0
  108. tilth-0.1.0/tilth/memory.py +338 -0
  109. tilth-0.1.0/tilth/paths.py +60 -0
  110. tilth-0.1.0/tilth/prompts/evaluator.md +70 -0
  111. tilth-0.1.0/tilth/prompts/system.md +33 -0
  112. tilth-0.1.0/tilth/session.py +404 -0
  113. tilth-0.1.0/tilth/summary.py +216 -0
  114. tilth-0.1.0/tilth/tasks.py +291 -0
  115. tilth-0.1.0/tilth/tools/__init__.py +78 -0
  116. tilth-0.1.0/tilth/tools/bash.py +64 -0
  117. tilth-0.1.0/tilth/tools/files.py +139 -0
  118. tilth-0.1.0/tilth/tools/search.py +106 -0
  119. tilth-0.1.0/tilth/usage.py +143 -0
  120. tilth-0.1.0/tilth/verdict.py +311 -0
  121. tilth-0.1.0/tilth/visualize/__init__.py +14 -0
  122. tilth-0.1.0/tilth/visualize/app.js +736 -0
  123. tilth-0.1.0/tilth/visualize/render.py +425 -0
  124. tilth-0.1.0/tilth/visualize/server.py +332 -0
  125. tilth-0.1.0/tilth/visualize/theme.css +807 -0
  126. tilth-0.1.0/tilth/visualize/theme.py +179 -0
  127. tilth-0.1.0/tilth/workspace.py +259 -0
  128. tilth-0.1.0/uv.lock +987 -0
@@ -0,0 +1,14 @@
1
+ {
2
+ "hooks": {
3
+ "InstructionsLoaded": [
4
+ {
5
+ "hooks": [
6
+ {
7
+ "type": "command",
8
+ "command": "jq -r '({\"User\":\"👤\",\"Project\":\"📁\",\"Local\":\"📍\",\"Plugin\":\"🔌\"}[.memory_type] // \"📄\") as $e | \"\\($e) \\(.memory_type) \\(.file_path) \\(.load_reason)\"' >> instructions-loaded.log"
9
+ }
10
+ ]
11
+ }
12
+ ]
13
+ }
14
+ }
@@ -0,0 +1,43 @@
1
+ # Provider configuration. Tilth talks to an OpenAI-compatible endpoint.
2
+ # All three of these are REQUIRED — Tilth refuses to start without them.
3
+ # Tilth is tested against OpenRouter today; other OpenAI-flavour gateways
4
+ # should work via the OpenAI SDK but haven't been validated yet.
5
+
6
+ TILTH_BASE_URL=https://openrouter.ai/api/v1
7
+ TILTH_API_KEY=sk-or-your_api_key_here
8
+ TILTH_WORKER_MODEL=deepseek/deepseek-v4-flash
9
+
10
+ # Evaluator model. Defaults to the worker if TILTH_EVALUATOR_MODEL is unset.
11
+ #
12
+ # Routing the evaluator to a *different provider* (set TILTH_EVALUATOR_BASE_URL +
13
+ # TILTH_EVALUATOR_API_KEY) is mostly about INDEPENDENCE, not cost. Cross-family
14
+ # judging (e.g. open-source worker, frontier closed evaluator) catches failure
15
+ # modes a same-family evaluator will miss.
16
+ #
17
+ # As a rule, the evaluator should be AT LEAST as capable as the worker — for code
18
+ # diff review, a weaker evaluator silently accepts bad work because it doesn't
19
+ # notice the problem. "Cheap evaluator" only works for shallow checks (binary,
20
+ # regex, policy gates), not for correctness gating in a Ralph loop.
21
+ TILTH_EVALUATOR_MODEL=deepseek/deepseek-v4-pro
22
+
23
+ # Project-context files. The worker and evaluator read these from the workspace
24
+ # root (in order, concatenated) as the project-conventions channel — Tilth never
25
+ # writes them. Comma-separated; defaults to AGENTS.md,CLAUDE.md so both the
26
+ # emerging AGENTS.md standard and existing Claude Code repos work out of the box.
27
+ # TILTH_CONTEXT_FILES=AGENTS.md,CLAUDE.md
28
+
29
+ # Safety caps.
30
+ TILTH_MAX_ITERATIONS_PER_TASK=32
31
+ TILTH_MAX_WALL_CLOCK_MINUTES=120
32
+ # Cumulative USD spend cap for the whole session (worker + evaluator), checked
33
+ # between tasks. Read from the provider's own per-call `cost` figure — verified
34
+ # against OpenRouter; gateways that don't report cost leave this at $0 spent, so
35
+ # the dollar cap never trips for them and wall-clock is the backstop.
36
+ TILTH_MAX_TOKEN_DOLLAR_SPEND=10.00
37
+
38
+ # Optional cap on evaluator calls per task. The evaluator runs every time the
39
+ # worker submits a case ("done"); a stuck task can ping-pong worker↔evaluator and
40
+ # burn the whole spend budget on one task. Set this to halt a task after N
41
+ # evaluator rejections (the task is marked failed, the run stops). Unset or
42
+ # 0 = unlimited.
43
+ # MAX_EVALUATOR_CALLS_PER_TASK=3
@@ -0,0 +1,23 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ permissions:
6
+ contents: write
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - name: Configure Git Credentials
13
+ run: |
14
+ git config user.name "github-actions[bot]"
15
+ git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
16
+ - uses: astral-sh/setup-uv@v5
17
+ - run: uv sync --extra docs
18
+ # Strict gate: broken nav refs / dead relative links fail CI here rather
19
+ # than publishing silently. Mirrors the local `mkdocs build --strict`.
20
+ - name: Strict build (gate)
21
+ run: uv run mkdocs build --strict --site-dir /tmp/tilth-site
22
+ - name: Deploy to gh-pages
23
+ run: uv run mkdocs gh-deploy --force
@@ -0,0 +1,36 @@
1
+ name: Release to PyPI
2
+
3
+ # Publishes the package to PyPI when a GitHub Release is published.
4
+ # Tag the release `vX.Y.Z` and match it to `version` in pyproject.toml.
5
+ #
6
+ # Authentication is PyPI Trusted Publishing (OIDC) — no API token is stored
7
+ # anywhere. One-time setup: add this repo + workflow as a "pending publisher"
8
+ # on PyPI before the first release. See docs/reference/releasing.md.
9
+
10
+ on:
11
+ release:
12
+ types: [published]
13
+
14
+ permissions:
15
+ id-token: write # required for OIDC trusted publishing
16
+ contents: read
17
+
18
+ jobs:
19
+ publish:
20
+ runs-on: ubuntu-latest
21
+ environment: pypi # configure this environment on the repo (optional gate)
22
+ steps:
23
+ - name: Checkout
24
+ uses: actions/checkout@v4
25
+
26
+ - name: Install uv
27
+ uses: astral-sh/setup-uv@v6
28
+
29
+ - name: Build sdist and wheel
30
+ run: uv build
31
+
32
+ - name: Verify the wheel exposes the tilth entry point
33
+ run: uv run --isolated --no-project --with dist/*.whl tilth --help
34
+
35
+ - name: Publish to PyPI
36
+ run: uv publish
tilth-0.1.0/.gitignore ADDED
@@ -0,0 +1,47 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ .venv/
11
+ venv/
12
+
13
+ # Environments
14
+ .env
15
+ .env.local
16
+
17
+ # Sessions (per-run state, large jsonl files, worktrees)
18
+ sessions/
19
+ !sessions/.gitkeep
20
+
21
+ # `examples/seed-reference/` is tracked — frozen seeds captured as teaching
22
+ # artifacts (see examples/seed-reference/todo-cli/README.md). Anything else
23
+ # under examples/ is treated as a safety net for stray clones, since the
24
+ # canonical demo path is {{your projects folder}}/tilth-demo (sibling of
25
+ # Tilth). Demo source: https://github.com/AlteredCraft/tilth-demo-todo-cli
26
+ examples/*
27
+ !examples/seed-reference/
28
+
29
+ # Editor
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+ *.swo
34
+ .DS_Store
35
+
36
+ # Tooling
37
+ .mypy_cache/
38
+ .ruff_cache/
39
+ .pytest_cache/
40
+ .coverage
41
+ htmlcov/
42
+
43
+ # MkDocs build output
44
+ site/
45
+
46
+ # Claude Code (per-clone observability artefacts)
47
+ instructions-loaded.log
tilth-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,218 @@
1
+ # CLAUDE.md
2
+
3
+ Guidance for Claude Code (claude.ai/code) when working in this repo.
4
+
5
+ ## What this is
6
+
7
+ A minimal long-running agent harness against any OpenAI-compatible LLM endpoint. It implements the Brain / Hands / Session split, the Ralph loop, and the four memory channels from Addy Osmani's posts on long-running agents (plus a fifth Tilth adds — the per-task evaluator ledger). Built as both a working tool and the practical centerpiece of an Altered Craft article.
8
+
9
+ The ultimate goal is a minimal, productive agent harness with **hyper-observability**: every prompt the harness sends is accessible and adaptable, and every run is fully inspectable after the fact. The [Visualizing a session](docs/getting-started/visualizing.md) page is an early example of that extended observability — every run browsable as a chat-style web app, tailing an active loop in near-realtime or replaying a finished one end-to-end from its `events.jsonl`.
10
+
11
+ ## Where to look first
12
+
13
+ - **`mkdocs.yml`** — **the canonical map of the documentation set**, and your primary entry point when looking for docs by topic. The `nav:` block has a one- to four-line comment above each leaf entry summarising what the linked `.md` covers and when you'd reach for it; skim those comments first, then open the page that fits. Everything that matters for users and contributors lives under `docs/`; `README.md` is the GitHub landing page and points into `docs/` for anything beyond the elevator pitch.
14
+ - **`README.md`** — terse GitHub landing page: product elevator pitch (with the Brain/Hands/Session image), a minimal quickstart, and the working-with-the-codebase commands (lint, tests, docs). **Not a mirror** of `docs/index.md` — for any product detail beyond the pitch, README points readers into `docs/`. Edit the two files independently.
15
+ - **`docs/getting-started/your-own-project.md`** — the "honest version" of using Tilth on a non-demo codebase: what prep your repo actually needs (a clean git repo, optionally an `AGENTS.md`), authoring the feature as markdown in a named feature dir under `.tilth/<feature>/` (you write it by hand, or have a model fill the templates — there's no interview/seed step), picking an evaluator model, the caveats that aren't obvious from a demo run, and when it's the wrong tool. (Successor to the old root-level `USAGE.md`.)
16
+ - **`docs/deep-dives/`** — code-level walk-throughs of the two loops, the worker↔evaluator dialogue (case / verdict / ledger), iteration accounting, token recording/enforcement, and the agent-visibility boundary. Read this before changing any of those mechanics. (Successor to the old root-level `deep-dives.md`.)
17
+ - **The demo workspace** — lives in its own repo at [`AlteredCraft/tilth-demo-todo-cli`](https://github.com/AlteredCraft/tilth-demo-todo-cli). The docs use `~/projects/tilth-demo` as an illustrative path, but Tilth treats the path as just an argument so any layout works.
18
+ - **`docs/assets/IMAGE_STYLE.md`** — the prompt scaffold for generating new docs *images*, anchored to the canonical `brain-hands-session.png`. Use this whenever you generate a new diagram or illustration so the visual voice stays consistent across pages. Not in the published nav (excluded via `not_in_nav` in `mkdocs.yml`).
19
+ - **`docs/assets/SITE_STYLE.md`** — the visual identity for the rendered docs *site* (Material for MkDocs + custom CSS). Documents the provenance of the theme (Hex, from [refero.design](https://refero.design)), the load-bearing tokens, and the do's-and-don'ts to follow when editing `docs/stylesheets/extra.css` or `mkdocs.yml`. Companion to `IMAGE_STYLE.md`; also excluded from the published nav.
20
+
21
+ ## Don't confuse the three "agent instruction" files
22
+
23
+ There are *three* files that look like agent instructions but speak to different audiences (two live in this repo; the third lives in the separate demo workspace):
24
+
25
+ | File | Audience | Purpose |
26
+ |---|---|---|
27
+ | `CLAUDE.md` (this file) | Claude Code working on the harness itself | Conventions for editing this codebase |
28
+ | `tilth/prompts/system.md` | The worker agent inside the harness loop | Role, tool guidance, "done" criteria |
29
+ | `<demo-workspace>/AGENTS.md` | The worker agent operating on the demo workspace | Project conventions for the toy todo-cli |
30
+
31
+ When the user says "update the agent's instructions," ask which one — they're not the same thing.
32
+
33
+ ## Repo layout
34
+
35
+ ```
36
+ tilth/
37
+ ├── README.md, CLAUDE.md, mkdocs.yml
38
+ ├── docs/ # MkDocs source (annotated nav in mkdocs.yml is the topic index)
39
+ ├── pyproject.toml, .env.example, .gitignore
40
+ ├── tilth/
41
+ │ ├── cli.py # verb-routed entry: init / run / resume / reset / visualize
42
+ │ ├── paths.py # ~/.tilth resolution: sessions dir + .env search order
43
+ │ ├── loop.py # Ralph loop + inner tool-use loop + subcommand handlers
44
+ │ ├── client.py # OpenAI-compat wrapper, dual-client routing (worker / evaluator)
45
+ │ ├── session.py # events.jsonl + checkpoint.json + per-task ledger + wake()
46
+ │ ├── summary.py # roll events.jsonl into summary.json (denormalised view)
47
+ │ ├── memory.py # AGENTS.md / progress.txt / overview / full-plan injection
48
+ │ ├── tasks.py # load + validate <repo>/.tilth/<feature>/ (overview + T-NNN files)
49
+ │ ├── workspace.py # git worktree create / commit / diff
50
+ │ ├── case.py # worker submit_case schema / parse / render
51
+ │ ├── verdict.py # evaluator submit_verdict schema / parse / ledger format
52
+ │ ├── tools/ # bash, files, search — registered in __init__.py (worker)
53
+ │ ├── hooks/ # pre_tool
54
+ │ ├── prompts/ # system.md, evaluator.md
55
+ │ ├── data/ # env.example template (shipped in the wheel for `tilth init`)
56
+ │ └── visualize/ # tilth visualize: live web viewer over ~/.tilth/sessions/ (stdlib http server)
57
+ ├── examples/seed-reference/ # frozen pre-prompt-driven example (historical teaching artifact)
58
+ └── tests/ # the harness's own pytest suite (run state now lives in ~/.tilth/)
59
+ ```
60
+
61
+ The demo workspace is a separate repo (`AlteredCraft/tilth-demo-todo-cli`) — not part of the Tilth repo. Clone it wherever you keep code; the docs use `~/projects/tilth-demo` as an illustrative path, but the location is arbitrary.
62
+
63
+ ## Conventions
64
+
65
+ - **Python 3.12.** `from __future__ import annotations` everywhere.
66
+ - **`uv` for env management.** `uv sync`
67
+ - **`ruff` for lint.** Config in `pyproject.toml`. Run `ruff check tilth/` before declaring work done.
68
+ - **Type hints on public functions.** Internal helpers can skip them.
69
+ - **No comments unless the WHY is non-obvious.** Don't narrate WHAT the code does.
70
+ - **Standard library first.** Third-party deps live in `pyproject.toml`; resist adding more.
71
+ - **External interfaces — verify, don't guess.** For provider APIs (OpenRouter, OpenAI SDK, Ollama, etc.), library specs, or any third-party wire format: consult the official docs first (use Context7, WebFetch, or the provider's sitemap to find them), and probe the live response shape with a tiny one-shot script before writing the fix. Don't infer field names from error messages — providers often surface their *upstream* internal field names in errors (e.g. SiliconFlow says `reasoning_content` but the OpenRouter wire field is `reasoning_details`). Don't infer from training data — these surfaces churn. A synthetic unit test built on a guessed shape gives false confidence; the test passes against your made-up contract while the real bug stays. Probe → write the test against the real shape → fix.
72
+
73
+ ## Architecture invariants worth preserving
74
+
75
+ These are load-bearing. Read the relevant page under `docs/deep-dives/` before breaking any of them.
76
+
77
+ 1. **Brain / Hands / Session split.** Don't blur the three. New code goes in the module whose job it is — model calls in `client.py`, sandbox/tool ops in `workspace.py` and `tools/`, durable state in `session.py`.
78
+ 2. **The agent doesn't see harness mechanics.** No `task-status.json` or status fields, no `events.jsonl`, no `summary.json`, no token counts, no checkpoints, no cross-task evaluator. Hiding these prevents gaming, shortcutting, and self-managed state. (The visibility expansion deliberately softened this: the worker sees the feature overview and the whole task list *as prose context* — not the mutable status store — plus the evaluator's prior verdicts on its *current* task, so it can act on review feedback. The harness files, token counts, checkpoints, and the wider evaluation machinery stay hidden.) New features should preserve this boundary unless the user explicitly asks otherwise.
79
+
80
+ **Honest scope.** This is a *design goal*, not an enforcement guarantee in default mode. The worker has `bash` and the worktree is mounted at `~/.tilth/sessions/<id>/workspace/`, so a determined model can reach harness state via relative paths — `events.jsonl`, `summary.json`, `checkpoint.json`, `task-status.json` all live one directory up (`../`). The invariant's near-term purpose is to keep new code from making harness state *more* obviously surfaced to the worker; real enforcement is opt-in process isolation, planned in [#13](https://github.com/AlteredCraft/tilth/issues/13).
81
+ 3. **Tool registry is the canonical source for "what tools exist".** `tilth/tools/__init__.py` defines the registry; system.md should *not* enumerate tools (it gets stale).
82
+ 4. **Hook contract: "success silent, failures verbose" — to the *agent*.** Pass states inject nothing into the loop's message history; failures inject a feedback message that the next worker iteration sees. **Telemetry is separate.** Every hook invocation should emit a `hook_run` event regardless of outcome — observability is for the developer reading `events.jsonl`, not the agent. "Silent to the agent" must not mean "invisible in the log".
83
+ 5. **The worktree branch is never auto-merged.** `commit_task` commits to the session branch; humans review and merge. Don't add an "auto-merge on success" feature without an explicit ask.
84
+ 6. **Token cap enforcement is between tasks, not mid-task.** The "always finish the current task cleanly" property matters; preserve it.
85
+
86
+ ## Where to file new things
87
+
88
+ | Adding... | Lives in... | Don't forget... |
89
+ |---|---|---|
90
+ | A tool | `tilth/tools/{name}.py` | Register in `tools/__init__.py:_registry()` |
91
+ | A hook | `tilth/hooks/{name}.py` | Wire into `tools/__init__.py:dispatch()` |
92
+ | A prompt | `tilth/prompts/{name}.md` | Add a loader in `loop.py` |
93
+ | A session event type | Use it in `session.log("...", {...})` | Document the type in `session.py`'s module docstring |
94
+ | A summary metric | `tilth/summary.py:build_from_events()` | Update the schema in the module docstring; bump `SUMMARY_VERSION` if shape breaks |
95
+
96
+ ## Common commands
97
+
98
+ ```bash
99
+ # Setup (contributor flow: run from the clone with `uv run tilth …`, no install)
100
+ uv sync
101
+ uv run tilth init # writes ~/.tilth/.env (or `cp .env.example .env` for a
102
+ # clone-local one — Tilth's .env search falls back to CWD)
103
+ # edit the .env: TILTH_BASE_URL, TILTH_API_KEY, TILTH_WORKER_MODEL
104
+
105
+ # Lint
106
+ .venv/bin/python -m ruff check tilth/
107
+
108
+ # Docs — strict build (catches broken nav refs, missing files, dead relative
109
+ # links). Run after editing mkdocs.yml or anything under docs/. This is the
110
+ # command CI runs as the strict gate in .github/workflows/docs.yml; keep it green.
111
+ uv run --extra docs mkdocs build --strict --site-dir /tmp/tilth-site
112
+
113
+ # Docs — live preview at http://127.0.0.1:8000
114
+ uv run --extra docs mkdocs serve
115
+
116
+ # Demo (needs provider config from Setup, and a local clone of the demo repo
117
+ # at AlteredCraft/tilth-demo-todo-cli — clone it wherever; path below is illustrative)
118
+ git clone git@github.com:AlteredCraft/tilth-demo-todo-cli.git ~/projects/tilth-demo
119
+ # author ~/projects/tilth-demo/.tilth/<feature>/ (overview.md + T-NNN-*.md; run prints
120
+ # ready-to-fill templates if the dir is missing) — there is no prep/interview step
121
+ uv run tilth run ~/projects/tilth-demo/.tilth/<feature>
122
+
123
+ # Resume an interrupted session (latest in ~/.tilth/sessions/, or by id)
124
+ uv run tilth resume
125
+ uv run tilth resume <session_id>
126
+
127
+ # Reset a session — removes the worktree, deletes the session/<id> branch, drops ~/.tilth/sessions/<id>/
128
+ uv run tilth reset
129
+ uv run tilth reset <session_id>
130
+ uv run tilth reset --yes # skip the confirmation prompt
131
+
132
+ # Serve the live session viewer (read-only, 127.0.0.1:8765; --port to change).
133
+ # Index of all sessions + per-session chat view that tails an active run.
134
+ uv run tilth visualize
135
+ uv run tilth visualize <session_id> # deep-link this session on startup
136
+
137
+ # Inspect a session log
138
+ jq -c . ~/.tilth/sessions/<session_id>/events.jsonl | head -40
139
+ ```
140
+
141
+ ## Working with GitHub milestones
142
+
143
+ There is no `gh milestone` command. Manage the milestone *object* through the REST API (`gh api`); assign issues to it with the built-in `gh issue` flags — which reference the milestone by **title**, so it must exist first.
144
+
145
+ ```bash
146
+ # Manage the milestone object (use the milestone NUMBER, not an issue id, for edit/delete)
147
+ gh api repos/{owner}/{repo}/milestones -f title="…" -f description="…" -f due_on="2026-07-01T00:00:00Z"
148
+ gh api repos/{owner}/{repo}/milestones --jq '.[] | "#\(.number) \(.state) \(.title)"'
149
+ gh api -X PATCH repos/{owner}/{repo}/milestones/<n> -f state=closed
150
+ gh api -X DELETE repos/{owner}/{repo}/milestones/<n>
151
+
152
+ # Assign / filter by milestone title
153
+ gh issue edit <n>... --milestone "…" # accepts multiple issues; --remove-milestone detaches
154
+ gh issue list --milestone "…"
155
+ ```
156
+
157
+ Creating issues or milestones is network-side — per *Things not to do without asking*, only do it on an explicit request.
158
+
159
+ ## Working with the demo
160
+
161
+ The demo lives in its own repo at [`AlteredCraft/tilth-demo-todo-cli`](https://github.com/AlteredCraft/tilth-demo-todo-cli). Clone it wherever you keep code before running it. The path is just an argument to `uv run tilth`, so any layout works; the docs use `~/projects/tilth-demo` as an illustrative example.
162
+
163
+ The demo has to be a git repo because Tilth's worktree machinery requires it. To tear down a session's artifacts (worktree, `session/<id>` branch, `~/.tilth/sessions/<id>/`), use `tilth reset` rather than the manual recipe:
164
+
165
+ ```bash
166
+ uv run tilth reset # most recent session
167
+ uv run tilth reset <session_id> # explicit
168
+ ```
169
+
170
+ `tilth reset` reads the session's checkpoint and `session_start` event to recover the source repo + worktree path + branch, runs `git worktree remove --force` and `git branch -D` against the source repo, and deletes `~/.tilth/sessions/<id>/`. It force-removes a dirty worktree by design — the command's whole purpose is to discard a session's work; the `[y/N]` prompt is the safety gate.
171
+
172
+ If `tilth reset` itself can't run (e.g., session metadata missing), the manual fallback is:
173
+
174
+ ```bash
175
+ cd <demo-clone-path> # e.g. ~/projects/tilth-demo
176
+ git worktree prune
177
+ git branch -D session/<id> # if it still exists
178
+ rm -rf ~/.tilth/sessions/<id>/ # or $TILTH_SESSIONS_DIR/<id>/ if overridden
179
+ ```
180
+
181
+ Don't commit changes the agent made on `session/*` branches into the demo clone's `main`. Those are run artefacts; the demo's `main` should stay clean of them. The session worktree and all harness state live under `~/.tilth/`, never in the demo repo — the only thing you'd intentionally add to the demo is the `.tilth/<feature>/` directory you author.
182
+
183
+ ## Things not to do without asking
184
+
185
+ - Commit changes (per the user's standing instruction — only commit when explicitly asked).
186
+ - Push to a remote, create PRs, or do anything network-side beyond running the harness itself.
187
+ - Change the architecture invariants above.
188
+ - Add a new dependency to `pyproject.toml` for convenience — justify the addition.
189
+ - Rewrite the system prompts to be more verbose. They are short on purpose; every character ships every turn.
190
+ - Auto-fix the demo workspace to pass tests yourself if a demo run fails — that defeats the point of the demo. Investigate why the harness didn't.
191
+
192
+ ## Article context
193
+
194
+ This codebase is the practical centerpiece of an article in the user's PKM vault at:
195
+
196
+ ```
197
+ ~/_PRIMARY_VAULT/AlteredCraft/Altered Craft Publications/Notes/Long running agents/
198
+ ```
199
+
200
+ That folder has `research-findings.md`, `research-links.md`, `mvp-spec.md`, and `draft.md`. When changes here are likely to be article-worthy (e.g. surprising findings from a demo run, new lessons from extending a slice), surface them so the user can update the draft. Don't edit those files unless asked.
201
+
202
+ ### Session-start sweep for article-worthy learnings
203
+
204
+ A separate running notes file lives at:
205
+
206
+ ```
207
+ ~/_PRIMARY_VAULT/AlteredCraft/Altered Craft Publications/Notes/tilth-learnings/notes.md
208
+ ```
209
+
210
+ It's a bulleted, themed corpus of transferable lessons from Tilth development (provider quirks, robustness patterns, multi-agent failure modes, observability wins) with commit-SHA links into [`AlteredCraft/tilth`](https://github.com/AlteredCraft/tilth). At the start of a new session, spawn a general-purpose subagent to:
211
+
212
+ 1. `git log --since="3 days ago"` — see what's landed.
213
+ 2. Read the notes file above.
214
+ 3. For any commit surfacing a non-obvious lesson that *isn't* already represented, append a bullet under the right themed section (or open a new section) with the commit SHA linked and the bullet's date tagged on the section's `*Observed:*` line.
215
+ 4. Skip docs-sync / lint / surface-polish commits — the file is for transferable lessons, not changelog.
216
+ 5. Report back a short summary of what was added (and what was already there).
217
+
218
+ Match the existing voice: terse bullets, themed sections (not chronological), date range under each section header, lesson framed as a transferable principle plus a concrete commit anchor.
tilth-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sam Keen / AlteredCraft
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tilth-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: tilth
3
+ Version: 0.1.0
4
+ Summary: Tilth — minimal long-running agent harness against any OpenAI-compatible endpoint. Brain/Hands/Session split, Ralph loop, four memory channels.
5
+ Project-URL: Homepage, https://github.com/AlteredCraft/tilth
6
+ Project-URL: Documentation, https://alteredcraft.github.io/tilth/
7
+ Project-URL: Repository, https://github.com/AlteredCraft/tilth
8
+ Project-URL: Issues, https://github.com/AlteredCraft/tilth/issues
9
+ Author-email: Sam Keen <sam@alteredcraft.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: agent,agent-harness,autonomous-agents,llm,long-running-agents,openai-compatible,openrouter,ralph-loop
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Code Generators
21
+ Requires-Python: >=3.12
22
+ Requires-Dist: openai>=1.40
23
+ Requires-Dist: pydantic>=2.6
24
+ Requires-Dist: python-dotenv>=1.0
25
+ Requires-Dist: rich>=13.7
26
+ Provides-Extra: docs
27
+ Requires-Dist: mkdocs-glightbox>=0.4; extra == 'docs'
28
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
29
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # Tilth
33
+
34
+ > *Prepare the ground, let the agent grow the work.*
35
+
36
+ A minimal long-running agent harness against an **OpenAI-compatible** LLM endpoint. Tested today against [OpenRouter](https://openrouter.ai); the OpenAI SDK underneath means other OpenAI-flavour gateways should work, but support for them is on the roadmap rather than validated. Built to learn (and demonstrate) the Brain/Hands/Session split, the Ralph loop, and the file-backed memory channels described in Addy Osmani's [long-running agents](https://addyosmani.com/blog/long-running-agents/) and [agent harness engineering](https://addyosmani.com/blog/agent-harness-engineering/) posts.
37
+
38
+ ![Brain / Hands / Session split — three boxes connected by flow arrows, with the files that implement each piece](https://raw.githubusercontent.com/AlteredCraft/tilth/main/docs/assets/brain-hands-session.png)
39
+
40
+ **Audience:** This is an active research project for my work at [Altered Craft](https://alteredcraft.com), and I actively use it for real work. I'd suggest it for single-dev / few-dev teams who want to *understand* what a long-running agent harness actually does. That's where it stands today (June 2026); the future, we shall see.
41
+
42
+ **Target run:** 10–60 minutes of autonomous work against an open model (default `deepseek/deepseek-v4-flash` on OpenRouter for the worker; the evaluator defaults to `deepseek/deepseek-v4-pro`), completing a short task list against a small project on a per-session git worktree.
43
+
44
+ > **Status — prompt-driven core.** Tilth is deliberately small and currently being driven *down* to its essentials: a worker and an independent evaluator, the base file/search/bash tools, and full observability. There is **no codified test/lint gate** — the evaluator is the only gate — and **no interview step**: you author the work as markdown and run it. Capabilities get added back only as testing shows they're needed.
45
+
46
+ ## How Tilth differs
47
+
48
+ Many minimal coding agents are *interactive* — a developer watches the output and course-corrects, kills a bad run, or re-prompts. Tilth runs *autonomously* for the length of a run, with no one watching mid-task. That single difference is why it carries machinery a pair-programming agent can skip: an **evaluator** — a second model that judges whether a change is a *proper* solution against the task's acceptance criteria, not just whether the code runs; **between-task caps** that stand in for the budget ceiling a human would otherwise impose; a per-task **evaluator ledger** so a retried task sees the reviewer's prior verdicts; **state kept out of the model's context**; and **offline-first observability** (detailed just below). None of this is a knock on interactive agents; it's a different shape for a different job.
49
+
50
+ ### Hyper-observability
51
+
52
+ If no one is watching a run mid-flight, the recording *is* the supervision. Tilth's standing goal is **hyper-observability** — *every prompt the harness sends is accessible, and every run is fully inspectable after the fact.* Every assembled prompt, memory load, model call, and evaluator verdict lands in an append-only `events.jsonl`, and `tilth visualize` serves the whole thing as a local chat-style web app — tail an active run in near-realtime or replay a finished one end-to-end, with no state hidden out of reach.
53
+
54
+ ![A finished Tilth run rendered by `tilth visualize`: the dashboard band — a header with an `all_done` status chip plus token-cost and event-count chips, a "Limit utilization" row of cost-budget, wall-clock, and per-task iteration meters, a stat band (tokens, cost, model and tool calls, verdicts), a session timeline, and a context-pressure chart](https://raw.githubusercontent.com/AlteredCraft/tilth/main/docs/assets/session-render.png)
55
+
56
+ *A finished run, rendered by `tilth visualize`.*
57
+
58
+ It's an early example of the goal, not a finished product. For the full product story — the Brain/Hands/Session split in detail, the memory channels, the two loops, and the worker↔evaluator dialogue — see the **[docs site](https://alteredcraft.github.io/tilth/)**. (The docs are mid-revision for the prompt-driven core; the README is the current source of truth for the run flow.)
59
+
60
+ ## Quickstart
61
+
62
+ ```bash
63
+ # Install the CLI from PyPI (puts `tilth` on your PATH, runnable from anywhere)
64
+ uv tool install tilth
65
+ # …or run it without installing, npx-style: uvx tilth --help
66
+ # …or with pipx: pipx install tilth
67
+
68
+ tilth init # scaffolds ~/.tilth/.env
69
+ # edit ~/.tilth/.env — TILTH_BASE_URL, TILTH_API_KEY, TILTH_WORKER_MODEL are all
70
+ # required (Tilth refuses to start without them so a misconfigured run can't
71
+ # silently fall back to a provider/model your account doesn't have)
72
+ ```
73
+
74
+ > **Working *on* Tilth?** Install from a clone instead: `uv tool install --editable .` puts `tilth` on your PATH while tracking your working copy — see [Working with the codebase](#working-with-the-codebase).
75
+
76
+ Tilth keeps all per-user state under `~/.tilth/` — the `.env` above and every run's `sessions/<id>/`. Relocate it with `$TILTH_HOME` (whole tree) or `$TILTH_SESSIONS_DIR` (just the runs).
77
+
78
+ You author the feature as markdown in the target repo, then run it — there's no interview step. The work lives in a feature directory you name under `<repo>/.tilth/<feature>/` (one repo can hold several features):
79
+
80
+ ```
81
+ .tilth/todo-cli/
82
+ ├── overview.md # the feature's goal + scope boundaries (required)
83
+ ├── T-001-<slug>.md # one file per task, ordered by id
84
+ ├── T-002-<slug>.md
85
+ └── ...
86
+ ```
87
+
88
+ Each task file is small frontmatter plus two sections:
89
+
90
+ ```markdown
91
+ ---
92
+ id: T-001
93
+ title: Add the `add` subcommand
94
+ ---
95
+
96
+ ## Description
97
+ What to build, in the worker's voice. Real paths/symbols
98
+ (todo_cli/__main__.py:main()), not "the entrypoint".
99
+
100
+ ## Acceptance criteria
101
+ - An externally checkable behaviour
102
+ - Another one
103
+ ```
104
+
105
+ Then point Tilth at the feature directory:
106
+
107
+ ```bash
108
+ git clone git@github.com:AlteredCraft/tilth-demo-todo-cli.git tilth-demo
109
+ # author tilth-demo/.tilth/todo-cli/ (`tilth run` prints ready-to-fill templates if it's missing)
110
+ tilth run ./tilth-demo/.tilth/todo-cli
111
+ ```
112
+
113
+ For each pending task, Tilth resets context from disk, lets the worker work with the file/search/bash tools until it calls `submit_case`, hands the case + diff to the evaluator in a fresh context, and on `accept` commits the change to the `session/<id>` branch, one commit per task (humans review and merge — Tilth never auto-merges). A run stops when all tasks are done or a cap is hit (iterations / wall-clock / dollar spend / evaluator calls). Interrupt with Ctrl-C; resume with `tilth resume`.
114
+
115
+ ```bash
116
+ tilth resume # continue the latest session
117
+ tilth reset # tear down a session's worktree + branch + dir
118
+ tilth visualize # serve the live session viewer (127.0.0.1:8765)
119
+ tilth info # list sessions (status, progress, tokens); `tilth info <id>` for one session's detail
120
+ tilth config # show resolved provider config + run caps (API keys masked)
121
+ ```
122
+
123
+ The `TILTH_*` env-var table (caps, evaluator routing, context-file selection) is documented in the generated `~/.tilth/.env` (copied from `.env.example`).
124
+
125
+ ## Working with the codebase
126
+
127
+ Working *on* Tilth itself rather than using it? `uv sync` for the dev env, then run the CLI straight from the clone with `uv run tilth …` (no install needed — sessions still land in `~/.tilth/` unless you set `$TILTH_HOME`).
128
+
129
+ ```bash
130
+ # Lint
131
+ .venv/bin/python -m ruff check tilth/
132
+
133
+ # Tests
134
+ .venv/bin/python -m pytest
135
+
136
+ # Docs — live preview at http://127.0.0.1:8000
137
+ uv run --extra docs mkdocs serve
138
+
139
+ # Docs — strict build (the CI gate; catches broken nav refs, missing files, dead links)
140
+ uv run --extra docs mkdocs build --strict --site-dir /tmp/tilth-site
141
+ ```
142
+
143
+ See [`CLAUDE.md`](./CLAUDE.md) for repo conventions and the architecture invariants worth preserving when editing the harness itself.