agentkernel-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. agentkernel/__init__.py +7 -0
  2. agentkernel/__main__.py +5 -0
  3. agentkernel/agent.py +311 -0
  4. agentkernel/approval/__init__.py +23 -0
  5. agentkernel/approval/base.py +34 -0
  6. agentkernel/approval/cli.py +129 -0
  7. agentkernel/approval/policy.py +58 -0
  8. agentkernel/approval/risk.py +91 -0
  9. agentkernel/approval/sandbox.py +201 -0
  10. agentkernel/budget.py +64 -0
  11. agentkernel/checkpoint.py +50 -0
  12. agentkernel/cli.py +1482 -0
  13. agentkernel/config.py +224 -0
  14. agentkernel/context/__init__.py +17 -0
  15. agentkernel/context/manager.py +216 -0
  16. agentkernel/context/truncate.py +35 -0
  17. agentkernel/cron.py +146 -0
  18. agentkernel/curation.py +183 -0
  19. agentkernel/doctor.py +141 -0
  20. agentkernel/embeddings.py +132 -0
  21. agentkernel/evaluation.py +186 -0
  22. agentkernel/improvement.py +133 -0
  23. agentkernel/insights.py +141 -0
  24. agentkernel/kanban.py +114 -0
  25. agentkernel/knowledge.py +383 -0
  26. agentkernel/loops.py +145 -0
  27. agentkernel/mcp/__init__.py +23 -0
  28. agentkernel/mcp/client.py +181 -0
  29. agentkernel/mcp/config.py +59 -0
  30. agentkernel/mcp/tools.py +96 -0
  31. agentkernel/memory.py +1208 -0
  32. agentkernel/paths.py +73 -0
  33. agentkernel/plugins.py +76 -0
  34. agentkernel/profiles.py +70 -0
  35. agentkernel/progress.py +89 -0
  36. agentkernel/providers/__init__.py +35 -0
  37. agentkernel/providers/_http.py +157 -0
  38. agentkernel/providers/anthropic.py +282 -0
  39. agentkernel/providers/base.py +38 -0
  40. agentkernel/providers/credentials.py +65 -0
  41. agentkernel/providers/local.py +34 -0
  42. agentkernel/providers/openai.py +260 -0
  43. agentkernel/redaction.py +77 -0
  44. agentkernel/semantic_index.py +139 -0
  45. agentkernel/semantic_memory.py +253 -0
  46. agentkernel/skills.py +268 -0
  47. agentkernel/subagent.py +161 -0
  48. agentkernel/telemetry.py +199 -0
  49. agentkernel/templates/README.md +35 -0
  50. agentkernel/templates/SKILL.md +28 -0
  51. agentkernel/templates/eval-suite.toml +22 -0
  52. agentkernel/templates/loop.toml +29 -0
  53. agentkernel/templates/mcp-servers.toml +22 -0
  54. agentkernel/templates/profile.toml +29 -0
  55. agentkernel/templates/tool_module.py +64 -0
  56. agentkernel/tools/__init__.py +5 -0
  57. agentkernel/tools/base.py +100 -0
  58. agentkernel/tools/builtin/__init__.py +37 -0
  59. agentkernel/tools/builtin/checkpoint_tool.py +33 -0
  60. agentkernel/tools/builtin/clarify.py +60 -0
  61. agentkernel/tools/builtin/files.py +221 -0
  62. agentkernel/tools/builtin/kanban_tool.py +100 -0
  63. agentkernel/tools/builtin/search.py +225 -0
  64. agentkernel/tools/builtin/shell.py +67 -0
  65. agentkernel/tools/builtin/todo.py +106 -0
  66. agentkernel/tui/__init__.py +50 -0
  67. agentkernel/tui/app.py +594 -0
  68. agentkernel/types.py +127 -0
  69. agentkernel/worktree.py +64 -0
  70. agentkernel_cli-0.1.0.dist-info/METADATA +426 -0
  71. agentkernel_cli-0.1.0.dist-info/RECORD +74 -0
  72. agentkernel_cli-0.1.0.dist-info/WHEEL +4 -0
  73. agentkernel_cli-0.1.0.dist-info/entry_points.txt +2 -0
  74. agentkernel_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
agentkernel/types.py ADDED
@@ -0,0 +1,127 @@
1
+ """Canonical, provider-independent data types (design §4).
2
+
3
+ These types are the lingua franca of the kernel. Nothing outside a provider
4
+ adapter speaks a provider's native format: Anthropic content blocks and OpenAI
5
+ ``tool_calls`` arrays are translated to and from these types inside
6
+ ``agentkernel/providers/*`` and never appear in the loop, registry, or context.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Literal
13
+
14
# The closed set of speaker roles a canonical ``Message`` may carry.
Role = Literal["system", "user", "assistant", "tool"]
15
+
16
+
17
@dataclass
class ToolCall:
    """A model request to invoke a tool. ``id`` is unique within a run."""

    id: str
    name: str
    arguments: dict[str, Any]  # already parsed from JSON by the adapter

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict with ``id``, ``name`` and ``arguments`` keys.

        The ``arguments`` mapping is returned by reference, not copied.
        """
        return {"id": self.id, "name": self.name, "arguments": self.arguments}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ToolCall:
        """Rebuild a ``ToolCall`` from ``to_dict()`` output.

        A missing or explicitly null ``arguments`` entry becomes an empty dict,
        so ``arguments`` is always a mapping as annotated.

        Raises:
            KeyError: if ``id`` or ``name`` is absent from ``data``.
        """
        # ``or {}`` covers both "key absent" and "key present but null"; a plain
        # ``.get(key, {})`` would let a stored null through as ``None``.
        arguments = data.get("arguments") or {}
        return cls(id=data["id"], name=data["name"], arguments=arguments)
31
+
32
+
33
@dataclass
class ToolResult:
    """The outcome of a tool call. ``call_id`` pairs back to ``ToolCall.id``.

    A failure (validation error, approval denial, handler exception, or the
    tool's own error) is reported with ``is_error=True`` rather than raised, so
    the loop continues and the model can recover (design §8.3).
    """

    call_id: str
    content: str  # text shown to the model
    is_error: bool = False
    data: dict | None = None  # structured payload for kernel use; not model-visible

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict mirroring the four dataclass fields."""
        return {
            "call_id": self.call_id,
            "content": self.content,
            "is_error": self.is_error,
            "data": self.data,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ToolResult:
        """Rebuild a ``ToolResult`` from ``to_dict()`` output.

        Missing or null ``content`` becomes ``""`` and missing or null
        ``is_error`` becomes ``False``, so the fields keep their annotated
        types even when the stored record is partial.

        Raises:
            KeyError: if ``call_id`` is absent from ``data``.
        """
        # ``or`` handles an explicit null as well as an absent key.
        return cls(
            call_id=data["call_id"],
            content=data.get("content") or "",
            is_error=data.get("is_error") or False,
            data=data.get("data"),
        )
63
+
64
+
65
@dataclass
class Message:
    """One conversational turn in canonical form.

    A single assistant turn may carry both ``content`` text and one or more
    ``tool_calls``. A tool-role turn carries ``tool_results`` (one per call from
    the preceding assistant turn).

    Equality is the field-wise comparison generated by ``@dataclass``, while
    hashing is identity-based (see ``__hash__``). Two equal messages therefore
    usually have different hashes; do not rely on value-based lookup in sets
    or dict keys.
    """

    role: Role
    content: str = ""
    tool_calls: list[ToolCall] = field(default_factory=list)  # assistant turns only
    tool_results: list[ToolResult] = field(default_factory=list)  # tool turns only
    # Bookkeeping:
    cacheable: bool = False  # marks a stable prefix boundary (design §9.3)
    token_estimate: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict (e.g., for persistence in memory stores)."""
        return {
            "role": self.role,
            "content": self.content,
            "tool_calls": [tc.to_dict() for tc in self.tool_calls],
            "tool_results": [tr.to_dict() for tr in self.tool_results],
            "cacheable": self.cacheable,
            "token_estimate": self.token_estimate,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Message:
        """Reconstruct a Message from `to_dict()` output.

        Optional keys may be absent or explicitly null; both fall back to the
        field default (``""``, empty list, ``False``) instead of failing.

        Raises:
            KeyError: if ``role`` is absent, or a nested call/result record
                lacks one of its required keys.
        """
        # ``or []`` matters here: iterating a stored null would raise TypeError.
        raw_calls = data.get("tool_calls") or []
        raw_results = data.get("tool_results") or []
        return cls(
            role=data["role"],
            content=data.get("content") or "",
            tool_calls=[ToolCall.from_dict(item) for item in raw_calls],
            tool_results=[ToolResult.from_dict(item) for item in raw_results],
            cacheable=data.get("cacheable") or False,
            token_estimate=data.get("token_estimate"),
        )

    def __hash__(self) -> int:
        # Messages are mutable, but a stable hash is useful for in-memory store keys.
        # Identity-based: the value never changes while the object is alive, even
        # if its fields are mutated.
        return id(self)
108
+
109
+
110
@dataclass
class Usage:
    """Token counters for a single completion, cache reads and writes included.

    Every counter defaults to zero.
    """

    # Field order is part of the positional constructor signature.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0
118
+
119
+
120
@dataclass
class CompletionResponse:
    """Normalized reply from a provider adapter.

    ``raw`` exists purely as a debugging aid.
    """

    # The assistant message (text and/or tool_calls).
    message: Message
    usage: Usage
    # "end_turn" | "tool_use" | "max_tokens" | provider-specific
    stop_reason: str
    # Untouched provider response; never inspected by the loop.
    raw: Any = None
@@ -0,0 +1,64 @@
1
+ """Git worktree isolation for sub-agents (design §18.3).
2
+
3
+ When a spawned child edits code, running it in a throwaway ``git worktree`` keeps
4
+ parallel children from colliding on the same files. This is a thin wrapper over
5
+ the ``git worktree`` CLI: create a worktree on a fresh branch, ask whether it has
6
+ changes, and remove it. No git library dependency — just the CLI behind an
7
+ injectable runner so the command construction is testable offline.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import shutil
13
+ import subprocess
14
+ import tempfile
15
+ import uuid
16
+ from collections.abc import Callable
17
+ from pathlib import Path
18
+
19
# A runner receives git arguments (without the leading "git") and returns
# ``(returncode, stdout, stderr)``.
Runner = Callable[[list[str]], "tuple[int, str, str]"]
20
+
21
+
22
class WorktreeError(RuntimeError):
    """Signals that a ``git worktree`` operation did not succeed."""
24
+
25
+
26
def git_available() -> bool:
    """Report whether a ``git`` executable can be located via ``shutil.which``."""
    located = shutil.which("git")
    return located is not None
28
+
29
+
30
class WorktreeManager:
    """Create and clean up git worktrees for a repository.

    Every git invocation goes through a *runner*: a callable that receives the
    git arguments (without the leading ``git``) and returns
    ``(returncode, stdout, stderr)``. Passing a custom runner lets the command
    construction be exercised without a real git binary.
    """

    def __init__(self, repo_dir: str = ".", *, runner: Runner | None = None) -> None:
        """Bind the manager to ``repo_dir``.

        Args:
            repo_dir: Repository directory; resolved to an absolute path and
                used as the working directory of the default runner.
            runner: Optional replacement for the default subprocess runner.
        """
        self._repo = Path(repo_dir).resolve()
        self._run = runner or self._default_runner

    def _default_runner(self, args: list[str]) -> tuple[int, str, str]:
        """Run ``git <args>`` in the repository directory and capture its output.

        A failure to launch the process at all (git not installed, repository
        directory missing) is reported as a non-zero result rather than raised,
        so callers only ever have to inspect the return code.
        """
        try:
            proc = subprocess.run(
                ["git", *args], cwd=self._repo, capture_output=True, text=True
            )
        except OSError as exc:
            # 127 follows the shell convention for "command not found".
            return 127, "", str(exc)
        return proc.returncode, proc.stdout, proc.stderr

    def is_git_repo(self) -> bool:
        """True if git reports that the runner's directory is inside a work tree."""
        code, out, _ = self._run(["rev-parse", "--is-inside-work-tree"])
        return code == 0 and out.strip() == "true"

    def create(self, *, prefix: str = "ak") -> tuple[Path, str]:
        """Add a worktree on a new branch at a temp path. Returns (path, branch).

        The directory name is ``<prefix>-<8 hex chars>`` under the system temp
        directory, and the branch is ``agentkernel/<that name>``.

        Raises:
            WorktreeError: if ``git worktree add`` exits non-zero.
        """
        name = f"{prefix}-{uuid.uuid4().hex[:8]}"
        path = Path(tempfile.gettempdir()) / name
        branch = f"agentkernel/{name}"
        code, _out, err = self._run(["worktree", "add", "-b", branch, str(path)])
        if code != 0:
            raise WorktreeError(f"git worktree add failed: {err.strip() or 'unknown error'}")
        return path, branch

    def has_changes(self, path: Path | str) -> bool:
        """True if the worktree at ``path`` has uncommitted changes.

        Also returns False when ``git status`` itself fails, so a False result
        does not by itself prove the worktree is clean.
        """
        code, out, _ = self._run(["-C", str(path), "status", "--porcelain"])
        return code == 0 and bool(out.strip())

    def remove(self, path: Path | str) -> None:
        """Remove a worktree (force, since it may have untracked files).

        Best-effort: the command's exit status is ignored, and the branch made
        by ``create`` is not deleted here.
        """
        self._run(["worktree", "remove", "--force", str(path)])
@@ -0,0 +1,426 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentkernel-cli
3
+ Version: 0.1.0
4
+ Summary: A minimal, dependency-light kernel for a general-purpose AI agent — runs anywhere, brings its own brain.
5
+ Project-URL: Homepage, https://github.com/sebbyrule/agentkernal
6
+ Project-URL: Repository, https://github.com/sebbyrule/agentkernal
7
+ Project-URL: Issues, https://github.com/sebbyrule/agentkernal/issues
8
+ Author-email: Sebastian Manczak <manczak.sebastian@gmail.com>
9
+ License-Expression: Apache-2.0
10
+ License-File: LICENSE
11
+ Keywords: agent,agents,ai,anthropic,cli,llm,mcp,openai,tool-use
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.11
24
+ Requires-Dist: httpx>=0.27
25
+ Requires-Dist: jsonschema>=4.21
26
+ Requires-Dist: windows-curses>=2.4.2; sys_platform == 'win32'
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == 'dev'
29
+ Requires-Dist: ruff>=0.6; extra == 'dev'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # agentkernel
33
+
34
+ A minimal, dependency-light **kernel for a general-purpose AI agent**. The kernel runs the agent loop — send a conversation plus tool definitions to a language model, parse out any tool calls, execute them through a registry, feed the results back, repeat until the model produces a final answer — and nothing more.
35
+
36
+ It is **provider-agnostic**, **tool-agnostic**, and **fully testable without network access**. Everything a user might call a "feature" (web search, file editing, project memory, profiles) is built *on top of* this kernel as a tool, a piece of injected context, or a run parameter — never inside it.
37
+
38
+ > Dependencies: `jsonschema` + `httpx`. No agent frameworks (no LangChain, LlamaIndex, CrewAI, …). The loop is the product.
39
+
40
+ ---
41
+
42
+ ## Why this exists
43
+
44
+ Most "agent frameworks" hide the one thing that actually matters — the loop — behind layers of abstraction. agentkernel keeps the loop small, explicit, and readable, and pushes everything else to the edges:
45
+
46
+ - **Everything is a tool, a context injection, or a run parameter.** If a proposed addition isn't one of those three, it doesn't belong in the kernel.
47
+ - **One canonical message format.** Provider quirks (Anthropic content blocks vs. OpenAI `tool_calls` arrays) are normalized inside adapters and never leak into the loop or the registry.
48
+ - **Errors become tool results, not exceptions.** A failing tool returns a `ToolResult(is_error=True)`; the loop keeps going so the model can recover. Only unrecoverable kernel faults raise.
49
+ - **The cacheable prefix is stable.** System prompt + tool definitions are assembled once per run and never reordered, so prompt caching actually hits — treated as a correctness property, not an optimization.
50
+ - **Every mutation is gated.** Tools that write files or run shell commands pass through an approver, and code runs inside a sandbox confined to the working directory.
51
+ - **Telemetry from turn one.** Every turn records tokens (including cache read/write), tool calls, stop reason, and cost — redacted by default.
52
+
53
+ ---
54
+
55
+ ## Installation
56
+
57
+ Requires **Python 3.11+**.
58
+
59
+ **As a CLI tool** (recommended for everyday use — puts `agentkernel` on your `PATH`):
60
+
61
+ ```bash
62
+ uv tool install agentkernel-cli # or: pipx install agentkernel-cli / pip install agentkernel-cli
63
+ agentkernel init # scaffold ./agentkernel.toml (or: agentkernel init --global)
64
+ agentkernel run "summarize the failing tests"
65
+ ```
66
+
67
+ **For development** (working on agentkernel itself), with [uv](https://docs.astral.sh/uv/):
68
+
69
+ ```bash
70
+ uv sync --extra dev # install runtime + dev (pytest, ruff) dependencies
71
+ uv run pytest
72
+ ```
73
+
74
+ API keys are read **only** from the environment — never from config files or traces:
75
+
76
+ ```bash
77
+ export ANTHROPIC_API_KEY=*** # for provider = "anthropic"
78
+ export OPENAI_API_KEY=*** # for provider = "openai" / embeddings
79
+ # local/OpenAI-compatible endpoints (Ollama, vLLM) usually need no key
80
+ # Credential pool: give several keys and the provider rotates on rate limits —
81
+ # comma-separate (ANTHROPIC_API_KEY="k1,k2") or number them (ANTHROPIC_API_KEY_1, _2).
82
+ ```
83
+
84
+ ### Running anywhere
85
+
86
+ `agentkernel` works from any directory, not just its own repo:
87
+
88
+ ```bash
89
+ agentkernel -C ~/code/my-app run "summarize the failing tests"
90
+ ```
91
+
92
+ Config is discovered in layers — **explicit `--config`** overrides everything;
93
+ otherwise the user-global **`~/.agentkernel/config.toml`** is the base and the
94
+ nearest project **`agentkernel.toml`** (found by walking up from the target
95
+ directory) overrides it; `AGENTKERNEL_*` env vars override both files, and built-in defaults apply last. Set
96
+ `AGENTKERNEL_HOME` to relocate the global home.
97
+
98
+ State follows a **global brain, project sessions** policy:
99
+
100
+ | Lives in `~/.agentkernel/` (global) | Lives in `<project>/.agentkernel/` (per-project) |
101
+ |---|---|
102
+ | memory notebook, knowledge graph, skills, profiles, improvements, cron jobs | session traces, kanban board, checkpoints, the session memory store |
103
+
104
+ So your skills, long-term memory, and scheduled jobs are shared across every
105
+ project, while each project keeps its own transcripts and work board. `-C PATH`
106
+ (like `git -C`) points the agent at a project from elsewhere; an absolute path in
107
+ config is always honored as-is, and a path customized in a project's
108
+ `agentkernel.toml` stays project-local.
109
+
110
+ ## Quick start
111
+
112
+ ```bash
113
+ uv run agentkernel # interactive REPL (default)
114
+ uv run agentkernel tui # full-screen curses terminal UI
115
+ uv run agentkernel run "your prompt" # single non-interactive run, prints the answer
116
+ uv run agentkernel run --file task.md # single run from a prompt file
117
+ uv run agentkernel run --background "..." # detached run; output goes to a file
118
+ uv run agentkernel improve # reflect on the latest trace, write a rule note
119
+ uv run agentkernel eval --suite s.toml # run an eval suite, score answers with a judge
120
+ uv run agentkernel eval --suite s.toml -o report.json # ...and write a JSON report
121
+ uv run agentkernel loop --file l.toml # run a workflow loop until its stopping condition
122
+ uv run agentkernel insights --days 30 # aggregate session traces into a usage/cost report
123
+ uv run agentkernel doctor # check config, deps, credentials, sandbox
124
+ uv run agentkernel sessions list # list saved sessions (needs a memory store)
125
+ uv run agentkernel --resume <id> run "..." # resume a saved session by id
126
+ uv run agentkernel cron add 1h "check CI" # schedule a job; `cron tick` runs what's due
127
+ uv run agentkernel cron tick # run all due jobs once (drive from OS scheduler)
128
+ uv run agentkernel kanban add "ship release" # file a task on the shared work board
129
+ uv run agentkernel kanban list # inspect the work board
130
+ uv run agentkernel new skill my-skill # scaffold a skill/profile/loop/eval from a template
131
+ uv run agentkernel --profile reviewer run "review src/" # run with a bundled profile
132
+ uv run agentkernel --skill code-review repl # start REPL with a skill pinned
133
+ uv run agentkernel --model o3-mini run "hi" # override the model for one run
134
+ uv run agentkernel --help # options
135
+ uv run pytest # full test suite, offline
136
+ ```
137
+
138
+ The REPL keeps conversational context across messages, prints a one-line progress
139
+ status per turn, and writes a per-session JSONL trace. It supports slash commands:
140
+
141
+ ```
142
+ $ uv run agentkernel
143
+ [session trace: .agentkernel/traces/<session-id>.jsonl]
144
+ agentkernel REPL - type your message and press enter. Commands: /exit, /clear,
145
+ /system, /profile, /skills, /skill, /tools, /trace, /cost, /memory, /improve.
146
+ > summarize the files in this directory
147
+ ```
148
+
149
+ | Command | Effect |
150
+ |---|---|
151
+ | `/clear` | reset the conversation context |
152
+ | `/system [text]` | set (or clear) the system prompt for following turns |
153
+ | `/profile [name]` | show, or load, a profile from `profile_dir` |
154
+ | `/skills` | list discovered skills (`*` = active) |
155
+ | `/skill <name>` | toggle a skill on/off |
156
+ | `/tools` | list registered tools (builtin + MCP + graph) |
157
+ | `/trace` / `/cost` | show the trace path / cumulative session cost |
158
+ | `/memory [list [limit] \| delete <note_id> \| export [path] \| reindex]` | manage the notebook |
159
+ | `/improve [trace-path]` | reflect on the current (or chosen) trace and write an improvement |
160
+ | `/exit` | leave |
161
+
162
+ ### Terminal UI
163
+
164
+ `uv run agentkernel tui` launches a full-screen [curses](agentkernel/tui) interface over the same runtime: a scrollable, color-coded chat history, a multi-line input area, and a status bar, with the agent running on a background thread so the UI stays responsive. Type and press **Enter** to send, **PgUp/PgDn** (or arrows) to scroll, **Esc**/**q** to quit. It reads the same `agentkernel.toml`, so any configured memory, skills, and MCP servers are active. On Windows the `windows-curses` backend is installed automatically; on Unix `curses` ships with Python.
165
+
166
+ ### Using the kernel as a library
167
+
168
+ ```python
169
+ from agentkernel.config import Config
170
+ from agentkernel.cli import build_runtime
171
+
172
+ config = Config(provider="anthropic", model="claude-sonnet-4-6")
173
+ agent, telemetry, mcp_clients = build_runtime(config)
174
+ try:
175
+ print(agent.run("List the Python files here and count the lines in each."))
176
+ finally:
177
+ telemetry.close()
178
+ for client in mcp_clients:
179
+ client.close()
180
+ ```
181
+
182
+ `build_runtime` wires a provider, the builtin tools inside a `LocalSandbox`, a `CliApprover`, JSONL telemetry, and any configured MCP servers / skills / knowledge-graph / memory tools into an `Agent`. You can also assemble these yourself — every collaborator is injected, nothing is global.
183
+
184
+ ---
185
+
186
+ ## Configuration
187
+
188
+ Configuration loads from `agentkernel.toml` (see [`agentkernel.toml.example`](agentkernel.toml.example)) with this precedence:
189
+
190
+ > explicit constructor args **>** `AGENTKERNEL_*` environment variables **>** `agentkernel.toml` **>** defaults
191
+ > CLI flags (`--model`, `--profile`, `--skill`, `--memory`) override the file.
192
+
193
+ | Key | Default | Meaning |
194
+ |---|---|---|
195
+ | `provider` | `anthropic` | `anthropic` \| `openai` \| `local` |
196
+ | `model` | `claude-sonnet-4-6` | model id for the selected provider |
197
+ | `base_url` | `None` | endpoint for `provider = "local"` |
198
+ | `max_output_tokens` | `4096` | reply token cap |
199
+ | `output_reserve` | `8192` | budget headroom reserved for the reply |
200
+ | `max_iterations` | `25` | loop guard against runaway sessions |
201
+ | `keep_recent_turns` | `6` | turns kept verbatim during compaction |
202
+ | `max_tool_result_tokens` | `4096` | per-result truncation cap |
203
+ | `approval_policy` | `always_ask` | `always_ask` \| `auto_allow` \| `deny_mutations` \| `smart` |
204
+ | `approval_allowlist` | `[]` | patterns that skip the approval prompt |
205
+ | `approval_judge_model` | `None` | model that judges call risk under `smart` (defaults to `summarizer_model`, then `model`) |
206
+ | `redact_tool_output` | `True` | scrub secret-looking strings from tool results before they enter context/traces |
207
+ | `checkpoints` | `False` | back up files before edits and register a `rollback` tool to undo them |
208
+ | `enable_todo` / `enable_clarify` | `False` | register the in-session `todo` planning tool / the `clarify` ask-the-user tool |
209
+ | `enable_plugins` / `plugins_dir` | `False` / `plugins` | auto-load user tool modules from `plugins_dir` (executes their code) |
210
+ | `enable_kanban` / `kanban_path` | `False` / `.agentkernel/kanban.json` | register the `kanban` shared work-board tool for multi-agent coordination |
211
+ | `working_dir` | `.` | root that file/shell tools are confined to |
212
+ | `summarizer_model` | `None` | cheap model for compaction (`None` → structural fallback) |
213
+ | `log_dir` | `.agentkernel/traces` | where session traces are written |
214
+ | `max_cost_usd` | `None` | per-run cost ceiling; the run stops when exceeded |
215
+ | `max_input_tokens_per_run` | `None` | per-run input-token ceiling |
216
+ | `profile` / `profile_dir` | `None` / `profiles` | active profile name and where profiles are loaded from |
217
+ | `memory_store` / `memory_notes_path` | `None` / `.agentkernel/memory/notes.jsonl` | `file` \| `memory` \| `sqlite`; notebook directory/path |
218
+ | `enable_memory_tools` | `False` | register `remember`/`recall`/`forget` tools |
219
+ | `memory_auto_context` / `memory_auto_context_limit` | `False` / `3` | auto-inject recalled notes before each user message |
220
+ | `memory_store_budget` | `None` | summarize older turns before persisting memory |
221
+ | `memory_curator_model` | `None` | cheap model for `memory extract`/`consolidate` (falls back to `summarizer_model`/`model`) |
222
+ | `semantic_search` | `False` | rank note recall with dense embeddings (SQLite only) |
223
+ | `semantic_search_lsh_bits` | `None` | approximate vector index bits; omit for brute force |
224
+ | `embedding_model` | `text-embedding-3-small` | OpenAI-compatible embedding model |
225
+ | `embedding_dimensions` | `None` | optional truncation (OpenAI only) |
226
+ | `embedding_base_url` | `None` | OpenAI-compatible embedding endpoint |
227
+ | `embedding_api_key_env` | `OPENAI_API_KEY` | env var holding the embedding API key |
228
+ | `skills_dir` / `skills` | `skills` / `[]` | skill source directory and the initially-active skill names |
229
+ | `enable_graph` / `graph_path` | `False` / `.agentkernel/graph.jsonl` | register `graph_*` tools backed by this file |
230
+ | `mcp_log_dir` | `mcp_logs/` | one stderr log file per configured MCP server |
231
+ | `improvements_dir` | `.agentkernel/improvements` | where `improve` writes reflection notes |
232
+ | `sandbox` / `sandbox_image` / `sandbox_network` | `local` / `python:3.12-slim` / `none` | execution boundary: `local` or `docker`, plus the container image and network |
233
+ | `enable_spawn` / `spawn_max_depth` | `False` / `2` | register the `spawn` sub-agent tool and bound its recursion |
234
+ | `judge_model` / `eval_threshold` / `eval_rubric` | `None` / `0.6` / `None` | model that scores evals (defaults to `model`), the pass cutoff, and a default rubric |
235
+
236
+ MCP servers are declared separately as `[[mcp_servers]]` tables (see [MCP](#mcp-mcp) below). Each server supports an optional `timeout` (request seconds) and emits its stderr to `mcp_log_dir/<name>.log`.
237
+
238
+ ---
239
+
240
+ ## Architecture
241
+
242
+ ```
243
+ ┌─────────────────────────────────────────────┐
244
+ │ Agent │
245
+ │ (the loop; orchestrates everything below) │
246
+ └───┬───────────┬───────────┬──────────┬───────┘
247
+ │ │ │ │
248
+ ┌─────────▼──┐ ┌─────▼─────┐ ┌───▼──────┐ ┌─▼──────────┐
249
+ │ Provider │ │ Tool │ │ Context │ │ Approver │
250
+ │ (adapter) │ │ Registry │ │ Manager │ │ + Sandbox │
251
+ └─────┬──────┘ └─────┬─────┘ └──────────┘ └────────────┘
252
+ │ │
253
+ Anthropic/OpenAI/ builtin tools
254
+ local endpoint (files, shell)
255
+
256
+ Cross-cutting: Config (injected), Telemetry (records every turn)
257
+ ```
258
+
259
+ **One turn:**
260
+
261
+ 1. The `ContextManager` returns the message window within budget (compacting if needed).
262
+ 2. The provider adapter translates canonical → wire, calls the API, and translates the reply back to a canonical `CompletionResponse`.
263
+ 3. The assistant message is appended. No tool calls → the run ends and returns the final text.
264
+ 4. For each tool call: validate args → check approval → execute → produce a `ToolResult`.
265
+ 5. All results are appended as one tool-role message, paired to their call ids.
266
+ 6. Telemetry records the turn. Loop.
267
+
268
+ ### Canonical types ([`types.py`](agentkernel/types.py))
269
+
270
+ `Message`, `ToolCall`, `ToolResult`, `Usage`, `CompletionResponse` — stdlib dataclasses that are the lingua franca of the kernel. Nothing outside a provider adapter speaks a provider's native format.
271
+
272
+ ### Providers ([`providers/`](agentkernel/providers))
273
+
274
+ Hand-written `httpx` adapters for **Anthropic** (Messages API), **OpenAI** (Chat Completions), and **local** (OpenAI-compatible: Ollama, vLLM, LM Studio). Each adapter:
275
+
276
+ - translates canonical messages/tools to the provider's exact wire shape and back,
277
+ - handles the **tool-result pairing** fan-out (Anthropic: all results in one `user` message of `tool_result` blocks; OpenAI: one `role:"tool"` message per result),
278
+ - reports cache read/write token counts where available,
279
+ - applies cache markers on the stable prefix (Anthropic `cache_control: ephemeral`).
280
+
281
+ Translation is implemented as **pure functions** separate from the HTTP call, which is what makes adapter behavior testable offline. The adapters share one `httpx` transport ([`providers/_http.py`](agentkernel/providers/_http.py)) that retries transient failures (timeouts and `429`/`5xx`), honoring a server `Retry-After` header (bounded) when present, and raises `ProviderError` only once retries are exhausted.
282
+
283
+ ### Tool system ([`tools/`](agentkernel/tools))
284
+
285
+ A `ToolSpec` carries a JSON-Schema parameter definition, a handler, and flags (`requires_approval`, `mutates`, `runs_code`). The `ToolRegistry` validates arguments against the schema (validation failures become error results, not executions) and dispatches. Builtin tools:
286
+
287
+ | Tool | Flags |
288
+ |---|---|
289
+ | `read_file(path)` | read-only |
290
+ | `list_dir(path)` | read-only |
291
+ | `find_files(pattern, path?)` | read-only — glob search (`**/*.py`), skips noise dirs |
292
+ | `search_text(pattern, glob?, …)` | read-only — regex grep → `path:line: text` |
293
+ | `file_info(path)` | read-only — size / type / mtime / line count |
294
+ | `write_file(path, content)` | mutates, requires approval |
295
+ | `edit_file(path, old, new, replace_all?)` | mutates, requires approval — exact-substring replace |
296
+ | `rollback()` | restores files to their pre-edit state (only when `checkpoints = true`) |
297
+ | `bash(command)` | runs code, mutates, requires approval |
298
+
299
+ File and search tools confine paths to the working directory (rejecting `..` escapes and absolute paths outside the root); `bash` runs inside the sandbox boundary. See [`examples/`](examples) for a playground project, ready-to-paste prompts, and a scored eval suite that exercise these tools.
300
+
301
+ ### Context management ([`context/`](agentkernel/context))
302
+
303
+ Per-message token accounting, a budget (`provider.context_window − output_reserve`), and **compaction**: when the budget is exceeded, the oldest completed turns collapse into one synthetic summary while the most recent turns are kept verbatim. Compaction never splits an open tool-call/result pair, and the system prompt can never be lost (it lives in the cacheable prefix, not the message list). The summarizer is pluggable; the default is a deterministic structural summary.
304
+
305
+ ### Approval & sandbox ([`approval/`](agentkernel/approval))
306
+
307
+ `Approver` implementations (`CliApprover`, `AutoApprover`) apply a shared policy. Two execution boundaries sit behind the `Sandbox` protocol:
308
+
309
+ - **`LocalSandbox`** (default) — a subprocess confined to the working directory, with a scrubbed environment and a **real** timeout that kills the whole process tree. Convenient, but cwd-scoped, not a security jail (a command using absolute paths can still reach the host).
310
+ - **`DockerSandbox`** (`sandbox = "docker"`) — one long-lived container per project. The working directory is bind-mounted; by default there is **no network**, a separate filesystem, and bounded memory/CPU/PIDs, so a command can't reach the host or the network. The Docker CLI is driven through an injectable runner, so the argv/lifecycle are unit-tested without a daemon. Use this to run untrusted tasks.
311
+
312
+ ### Telemetry ([`telemetry.py`](agentkernel/telemetry.py))
313
+
314
+ One JSONL file per session. Each turn records tokens (input/output/cache), estimated cost (from a per-model price table; unknown models log tokens with `null` cost), tool-call outcomes, stop reason, and any compaction event. **Redaction is the default** — tool arguments are logged as a hash + length, never raw; file contents never enter a record. `--verbose-trace` opts into raw arguments for local debugging.
315
+
316
+ ### MCP ([`mcp/`](agentkernel/mcp))
317
+
318
+ A hand-written [Model Context Protocol](https://modelcontextprotocol.io) client (JSON-RPC 2.0 over stdio — no SDK dependency) connects to MCP servers, discovers their tools, and registers each as an ordinary `ToolSpec`. The registry and loop are **completely unchanged** — an MCP-backed tool and a native builtin register identically. Read-only tools (advertising `readOnlyHint`) skip the approval gate; everything else is gated by default. A transport or protocol fault becomes an error result, never a raised exception.
319
+
320
+ Each server gets its own stderr log file under `mcp_log_dir` for easy debugging, and an optional `timeout` controls per-request patience:
321
+
322
+ ```toml
323
+ [[mcp_servers]]
324
+ name = "filesystem"
325
+ command = "npx"
326
+ args = ["-y", "@modelcontextprotocol/server-filesystem", "."]
327
+ timeout = 30
328
+ ```
329
+
330
+ On Windows, point `command` at the actual executable (e.g. `npx.cmd`) since the client launches the process directly without a shell.
331
+
332
+ ### Higher-level capabilities (built on the kernel)
333
+
334
+ These are implemented on top of the kernel using the three primitives — a tool, a context injection, or a run parameter — never by changing the loop:
335
+
336
+ - **Profiles** ([`profiles.py`](agentkernel/profiles.py)) — a run parameter `(system_prompt, tool_filter, model_override, rubric)` loaded from `profiles/<name>.toml`. The loop honors `system_prompt` and `tool_filter`; CLI `--profile` sets the active profile, and a profile's `model_override` or `rubric` overrides the corresponding default.
337
+ - **Skills** ([`skills.py`](agentkernel/skills.py)) — [Anthropic-style](https://github.com/anthropics/skills) `SKILL.md` folders (YAML frontmatter `name`/`description` + body + bundled files) discovered from `skills_dir`, with **progressive disclosure**: only a name+description catalog sits in the (stable, assembled-once) prefix; the model loads a skill's full body + file listing on demand via the `use_skill` tool. A skill can also be *pinned* (`skills = [...]`, `--skill <name>`, or `/skill <name>`) to force its body into the prefix. Loose `.md`/`.toml` skills still work.
338
+ - **Memory** ([`memory.py`](agentkernel/memory.py), [`semantic_memory.py`](agentkernel/semantic_memory.py)) — a `MemoryStore` loaded before a run and saved after; ships with in-memory, JSONL, and SQLite/FTS5 stores. Enable with `memory_store`. The SQLite notebook supports optional `semantic_search` via an OpenAI-compatible embedding endpoint, cosine-ranked recall, and a standard-library-only approximate LSH index (`semantic_search_lsh_bits`) for large notebooks. The `reindex_memory` tool backfills embeddings when a notebook is promoted to semantic recall.
339
+ - **Knowledge graph** ([`knowledge.py`](agentkernel/knowledge.py)) — a file-backed triple store exposed purely as `graph_add`, `graph_query`, `graph_neighbors`, `graph_path`, and `graph_stats` tools (`enable_graph = true`). The kernel keeps no graph state.
340
+ - **Loops** ([`loops.py`](agentkernel/loops.py)) — [loop-engineering](https://signals.forwardfuture.ai/loop-library/) workflows: `agentkernel loop` re-runs the agent on a loop's prompt until a stopping condition (a success shell-check and/or an N-in-a-row streak), following **action → check → iterate → stop**. Loops load from TOML or from a skill body (`--skill`), and the success check runs in the sandbox so a loop can verify its own work (e.g. "fix until `pytest` is green").
341
+ - **Self-improvement** ([`improvement.py`](agentkernel/improvement.py)) — `agentkernel improve` or the REPL's `/improve` reads a session trace and asks the model for one concrete rule, written to `improvements_dir`. This is why telemetry exists from turn one.
342
+ - **Sub-agents** ([`subagent.py`](agentkernel/subagent.py)) — `enable_spawn = true` registers a `spawn` tool so the model can delegate a self-contained subtask to a focused child `Agent` (own context, optional system prompt and tool subset), depth-limited by `spawn_max_depth`. Built on the loop's re-entrancy; no loop change.
343
+ - **Evaluators** ([`evaluation.py`](agentkernel/evaluation.py)) — `agentkernel eval --suite suite.toml` runs each case through the agent, then a judge model scores the answer against a rubric (0–1, pass/fail). Use `--case <glob>` to filter cases and `--output/-o report.json` to write a machine-readable report. Aggregates to pass-rate and mean score; exits non-zero unless every case passes, so it doubles as a CI gate and a way to compare models.
344
+ - **Budget guard** ([`budget.py`](agentkernel/budget.py)) — per-run cost/token ceilings (`max_cost_usd`, `max_input_tokens_per_run`) that stop a run cleanly.
345
+
346
+ ---
347
+
348
+ ## Project layout
349
+
350
+ ```
351
+ agentkernel/
352
+ types.py # canonical data types
353
+ config.py # configuration loading + layered discovery
354
+ paths.py # agent home / project root resolution (run anywhere)
355
+ telemetry.py # JSONL traces + cost table
356
+ agent.py # the loop
357
+ providers/ # base protocol + anthropic / openai / local adapters
358
+ tools/ # ToolSpec, ToolRegistry, builtin file & shell tools
359
+ context/ # accounting, compaction, shared truncation
360
+ approval/ # Approver, Sandbox, policies
361
+ mcp/ # MCP stdio client; registers remote tools as ToolSpecs
362
+ budget.py # per-run cost/token guard
363
+ progress.py # per-turn REPL status lines
364
+ profiles.py # run-parameter profiles (Phase 5)
365
+ skills.py # Anthropic-style SKILL.md skills (progressive disclosure)
366
+ memory.py # pre/post-run MemoryStore and notebook backends (Phase 3)
367
+ semantic_memory.py # dense embeddings + cosine-ranked recall over SQLite
368
+ semantic_index.py # standard-library LSH approximate vector index
369
+ embeddings.py # OpenAI-compatible embedding provider protocol
370
+ knowledge.py # triple store exposed as tools (Phase 6)
371
+ improvement.py # trace -> improvement rule (Phase 7)
372
+ subagent.py # spawn tool: delegate to a child Agent
373
+ evaluation.py # eval harness: judge-scored runs
374
+ loops.py # loop-engineering runner (run-until-condition)
375
+ cli.py # REPL + run/improve/eval/loop/tui/new entry points
376
+ tui/ # curses interactive terminal UI (agentkernel tui)
377
+ skills/ # bundled starter skills (auto-discovered)
378
+ profiles/ # bundled run profiles (reviewer, coder, researcher, …)
379
+ loops/ # bundled loop workflows (until-tests-pass, …)
380
+ templates/ # annotated skeletons + `agentkernel new` scaffolding
381
+ examples/ # playground project, eval suite, sample skill/loop
382
+ tests/ # offline suite (FakeProvider-driven)
383
+ ```
384
+
385
+ ### Bundled content
386
+
387
+ The skill, profile, and loop machinery ships with a starter library so it's
388
+ useful out of the box, plus templates to author your own:
389
+
390
+ - **[`skills/`](skills)** — `code-review`, `debug-triage`, `write-tests`, `refactor`, `commit-and-pr`, `security-review`. Discovered automatically; pin one with `--skill <name>` or load on demand via `use_skill`.
391
+ - **[`profiles/`](profiles)** — `reviewer` (read-only + rubric), `coder`, `researcher` (mutations denied), `planner` (plan-only), `safe` (minimal). Run with `--profile <name>`.
392
+ - **[`loops/`](loops)** — `until-tests-pass`, `until-lint-clean`, `until-typecheck-clean`, `until-build-succeeds`, `review-and-fix`. Run with `loop --file loops/<name>.toml`.
393
+ - **[`templates/`](templates)** — annotated skeletons for each, plus `agentkernel new skill|profile|loop|eval <name>` to scaffold a fresh one with the name filled in.
394
+
395
+ ---
396
+
397
+ ## Testing
398
+
399
+ ```bash
400
+ uv run pytest
401
+ ```
402
+
403
+ The suite is **fully offline** — a `FakeProvider` returns scripted responses, so the loop, registry, compaction, approval path, and adapter translation are all exercised with zero network calls. No test makes a network request.
404
+
405
+ ---
406
+
407
+ ## The seam principle
408
+
409
+ The kernel proves its design by adding every capability through one of three primitives — a tool, a context injection, or a run parameter — **without changing the loop or the registry**:
410
+
411
+ - **MCP** — an MCP client registers each remote tool as a `ToolSpec` (a tool).
412
+ - **Knowledge graph** — `graph_add`/`graph_query` are ordinary registered tools.
413
+ - **Skills** — a `ContextSource` contributes system-prompt text (a context injection).
414
+ - **Memory** — pre-run load and post-run save hooks around `run`, plus optional recall injected before each user message (context injection). It also covers a model-controlled notebook (`remember`/`recall`/…) and **self-curation** ([`curation.py`](agentkernel/curation.py)): `agentkernel memory extract` distils a session transcript into durable facts (deduped against existing notes), and `agentkernel memory consolidate` has the model merge related notes and supersede outdated ones. Schedule curation via `cron` to keep memory tidy automatically.
415
+ - **Profiles** — `run()` accepts a `profile` parameter (a run parameter).
416
+ - **Sub-agents** — the `spawn` tool builds a child `Agent` from inside a handler (a tool, on top of re-entrancy).
417
+ - **Self-improvement** — reads the telemetry the kernel has emitted since turn one.
418
+ - **Evaluators** — a harness that runs the agent and judge-scores the output; no kernel change.
419
+
420
+ Each lands at the edge. The loop in [`agent.py`](agentkernel/agent.py) still reads like the design's pseudocode.
421
+
422
+ ---
423
+
424
+ ## Scope & contributing
425
+
426
+ This repository is the **kernel only**. Contributions should preserve the design principles above: keep the kernel small, keep provider details inside adapters, return errors as results, keep the cacheable prefix stable, gate mutations, and never log secrets or raw file contents. New features belong on top of the kernel as tools, context injections, or run parameters — not inside the loop.