agentkernel-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. agentkernel/__init__.py +7 -0
  2. agentkernel/__main__.py +5 -0
  3. agentkernel/agent.py +311 -0
  4. agentkernel/approval/__init__.py +23 -0
  5. agentkernel/approval/base.py +34 -0
  6. agentkernel/approval/cli.py +129 -0
  7. agentkernel/approval/policy.py +58 -0
  8. agentkernel/approval/risk.py +91 -0
  9. agentkernel/approval/sandbox.py +201 -0
  10. agentkernel/budget.py +64 -0
  11. agentkernel/checkpoint.py +50 -0
  12. agentkernel/cli.py +1482 -0
  13. agentkernel/config.py +224 -0
  14. agentkernel/context/__init__.py +17 -0
  15. agentkernel/context/manager.py +216 -0
  16. agentkernel/context/truncate.py +35 -0
  17. agentkernel/cron.py +146 -0
  18. agentkernel/curation.py +183 -0
  19. agentkernel/doctor.py +141 -0
  20. agentkernel/embeddings.py +132 -0
  21. agentkernel/evaluation.py +186 -0
  22. agentkernel/improvement.py +133 -0
  23. agentkernel/insights.py +141 -0
  24. agentkernel/kanban.py +114 -0
  25. agentkernel/knowledge.py +383 -0
  26. agentkernel/loops.py +145 -0
  27. agentkernel/mcp/__init__.py +23 -0
  28. agentkernel/mcp/client.py +181 -0
  29. agentkernel/mcp/config.py +59 -0
  30. agentkernel/mcp/tools.py +96 -0
  31. agentkernel/memory.py +1208 -0
  32. agentkernel/paths.py +73 -0
  33. agentkernel/plugins.py +76 -0
  34. agentkernel/profiles.py +70 -0
  35. agentkernel/progress.py +89 -0
  36. agentkernel/providers/__init__.py +35 -0
  37. agentkernel/providers/_http.py +157 -0
  38. agentkernel/providers/anthropic.py +282 -0
  39. agentkernel/providers/base.py +38 -0
  40. agentkernel/providers/credentials.py +65 -0
  41. agentkernel/providers/local.py +34 -0
  42. agentkernel/providers/openai.py +260 -0
  43. agentkernel/redaction.py +77 -0
  44. agentkernel/semantic_index.py +139 -0
  45. agentkernel/semantic_memory.py +253 -0
  46. agentkernel/skills.py +268 -0
  47. agentkernel/subagent.py +161 -0
  48. agentkernel/telemetry.py +199 -0
  49. agentkernel/templates/README.md +35 -0
  50. agentkernel/templates/SKILL.md +28 -0
  51. agentkernel/templates/eval-suite.toml +22 -0
  52. agentkernel/templates/loop.toml +29 -0
  53. agentkernel/templates/mcp-servers.toml +22 -0
  54. agentkernel/templates/profile.toml +29 -0
  55. agentkernel/templates/tool_module.py +64 -0
  56. agentkernel/tools/__init__.py +5 -0
  57. agentkernel/tools/base.py +100 -0
  58. agentkernel/tools/builtin/__init__.py +37 -0
  59. agentkernel/tools/builtin/checkpoint_tool.py +33 -0
  60. agentkernel/tools/builtin/clarify.py +60 -0
  61. agentkernel/tools/builtin/files.py +221 -0
  62. agentkernel/tools/builtin/kanban_tool.py +100 -0
  63. agentkernel/tools/builtin/search.py +225 -0
  64. agentkernel/tools/builtin/shell.py +67 -0
  65. agentkernel/tools/builtin/todo.py +106 -0
  66. agentkernel/tui/__init__.py +50 -0
  67. agentkernel/tui/app.py +594 -0
  68. agentkernel/types.py +127 -0
  69. agentkernel/worktree.py +64 -0
  70. agentkernel_cli-0.1.0.dist-info/METADATA +426 -0
  71. agentkernel_cli-0.1.0.dist-info/RECORD +74 -0
  72. agentkernel_cli-0.1.0.dist-info/WHEEL +4 -0
  73. agentkernel_cli-0.1.0.dist-info/entry_points.txt +2 -0
  74. agentkernel_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
agentkernel/curation.py ADDED
@@ -0,0 +1,183 @@
1
+ """Self-curating memory: extract durable facts from a transcript, and
2
+ consolidate the notebook with an LLM (design §13, Phase 3).
3
+
4
+ The storage/recall machinery (``NoteStore``, semantic search, dedup) is mature;
5
+ what was missing is *populating and cleaning* it without the model having to call
6
+ ``remember`` in the moment. This module adds two harness operations on top of the
7
+ kernel (an Agent is not required — just a ``NoteStore`` and a ``Provider``):
8
+
9
+ * ``extract(messages)`` — distil a finished conversation into candidate facts,
10
+ skipping ones that duplicate (by token overlap) what is already stored.
11
+ * ``consolidate()`` — ask the model to merge related notes and supersede
12
+ outdated ones, then rebuild the notebook from the cleaned set.
13
+
14
+ Both are best-effort: an unparseable model reply leaves memory unchanged rather
15
+ than raising or destroying notes.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import re
22
+ from dataclasses import dataclass, field
23
+ from typing import TYPE_CHECKING
24
+
25
+ from agentkernel.memory import MemoryNote, _tokens
26
+ from agentkernel.types import Message
27
+
28
+ if TYPE_CHECKING:
29
+ from agentkernel.memory import NoteStore
30
+ from agentkernel.providers import Provider
31
+
32
# System prompt used by ``MemoryCurator.extract``. It instructs the model to
# answer with nothing but a JSON array of {"text", "tags"} objects (or ``[]``),
# which is the shape ``_parse_json_array`` looks for in the reply.
_EXTRACT_SYSTEM = (
    "You curate an AI agent's long-term memory. From a conversation, extract only "
    "DURABLE facts worth remembering across future sessions: stable user "
    "preferences, project facts and conventions, decisions, and constraints. "
    "Exclude transient task details, pleasantries, and anything ephemeral. "
    'Respond with ONLY a JSON array of objects {"text": "...", "tags": ["..."]}. '
    "Return [] if nothing is worth remembering."
)

# System prompt used by ``MemoryCurator.consolidate``. Same reply format as the
# extraction prompt; the model is asked to merge and de-conflict notes without
# dropping distinct information.
_CONSOLIDATE_SYSTEM = (
    "You consolidate an AI agent's long-term memory notes. Produce a cleaner set: "
    "merge notes that say the same or closely related things, remove redundancy, "
    "and when two notes conflict keep only the most recent/true statement. Do not "
    "drop any distinct information. "
    'Respond with ONLY a JSON array of objects {"text": "...", "tags": ["..."]}.'
)
48
+
49
+
50
@dataclass
class ExtractionResult:
    """Outcome of one ``MemoryCurator.extract`` call."""

    # Notes that were newly written to the store during this call.
    added: list[MemoryNote] = field(default_factory=list)
    # Count of candidate facts dropped because they matched an existing note.
    skipped_duplicates: int = 0
54
+
55
+
56
@dataclass
class ConsolidationResult:
    """Outcome of a consolidation pass: note counts before/after plus the notes kept."""

    before: int
    after: int
    notes: list[MemoryNote] = field(default_factory=list)

    @property
    def removed(self) -> int:
        """Net number of notes eliminated by the pass; never negative."""
        shrinkage = self.before - self.after
        return shrinkage if shrinkage > 0 else 0
65
+
66
+
67
def _parse_json_array(text: str) -> list[dict]:
    """Pull a JSON array of objects out of a model reply that may contain prose.

    The span from the first ``[`` to the last ``]`` is decoded as JSON. When no
    such span exists or it is not valid JSON, the result is an empty list.
    Array elements that are not JSON objects are discarded.
    """
    found = re.search(r"\[.*\]", text, re.DOTALL)
    if found is None:
        return []
    try:
        decoded = json.loads(found.group(0))
    except json.JSONDecodeError:
        return []
    objects: list[dict] = []
    for element in decoded:
        if isinstance(element, dict):
            objects.append(element)
    return objects
77
+
78
+
79
def _render_transcript(messages: list[Message], max_chars: int) -> str:
    """Flatten a conversation into a plain-text transcript for the curator model.

    Produces, in message order:

    * ``"<role>: <content>"`` for user/assistant messages with non-empty content;
    * ``"assistant called <tool name>"`` for every tool call on a message;
    * ``"tool[ok|error]: <snippet>"`` for every tool result, with the result
      content capped at 200 characters.

    The joined text is cut to ``max_chars`` characters and a truncation marker
    is appended when anything was cut.
    """
    snippet_limit = 200
    lines: list[str] = []
    for m in messages:
        if m.content and m.role in ("user", "assistant"):
            lines.append(f"{m.role}: {m.content}")
        for tc in m.tool_calls:
            lines.append(f"assistant called {tc.name}")
        for r in m.tool_results:
            content = r.content
            # Only add the ellipsis when characters were actually dropped: a
            # result of exactly ``snippet_limit`` characters is shown verbatim.
            if len(content) <= snippet_limit:
                snippet = content
            else:
                snippet = content[:snippet_limit] + "…"
            lines.append(f"tool[{'error' if r.is_error else 'ok'}]: {snippet}")
    text = "\n".join(lines)
    if len(text) > max_chars:
        text = text[:max_chars] + "\n… [transcript truncated]"
    return text
93
+
94
+
95
class MemoryCurator:
    """Populates and cleans a ``NoteStore`` with the help of a model."""

    def __init__(
        self,
        notes: NoteStore,
        provider: Provider,
        *,
        max_tokens: int = 1024,
        dedup_threshold: float = 0.8,
        transcript_chars: int = 16000,
    ) -> None:
        """Bind the curator to a note store and a model provider.

        Args:
            notes: Store that is read via ``all()`` and written via
                ``add()`` / ``forget()``.
            provider: Model backend whose ``complete()`` is called for both
                operations.
            max_tokens: Completion budget passed to the provider.
            dedup_threshold: Jaccard similarity at or above which a candidate
                fact counts as a duplicate of an existing note.
            transcript_chars: Maximum transcript length sent for extraction.
        """
        self._notes = notes
        self._provider = provider
        self._max_tokens = max_tokens
        self._dedup_threshold = dedup_threshold
        self._transcript_chars = transcript_chars

    def extract(self, messages: list[Message]) -> ExtractionResult:
        """Distil ``messages`` into new notes, skipping near-duplicates.

        Returns an empty result without calling the model when the rendered
        transcript is blank. Candidates with empty text are ignored.
        """
        transcript = _render_transcript(messages, self._transcript_chars)
        if not transcript.strip():
            return ExtractionResult()
        candidates = self._ask(
            _EXTRACT_SYSTEM,
            f"Conversation:\n{transcript}\n\nExtract the durable facts.",
        )
        # Work on a private copy: this list is appended to below, and the
        # object returned by the store may not be ours to mutate.
        existing = list(self._notes.all())
        result = ExtractionResult()
        for cand in candidates:
            text = str(cand.get("text", "")).strip()
            if not text:
                continue
            if self._is_duplicate(text, existing):
                result.skipped_duplicates += 1
                continue
            note = self._notes.add(text, tags=self._clean_tags(cand.get("tags")))
            existing.append(note)  # dedup later candidates against it too
            result.added.append(note)
        return result

    def consolidate(self) -> ConsolidationResult:
        """Ask the model for a cleaned note set and rebuild the store from it.

        A no-op (store untouched) when there are fewer than two notes or when
        the model reply yields no usable entries.

        NOTE(review): the rebuild forgets every old note before adding the new
        ones, so an exception raised by the store part-way through would leave
        the notebook partially rebuilt.
        """
        # Snapshot first: the loop below removes notes from the store while
        # walking this list.
        existing = list(self._notes.all())
        if len(existing) < 2:
            return ConsolidationResult(len(existing), len(existing), existing)
        listing = "\n".join(
            f"{n.note_id}. {n.text}"
            + (f" [tags: {', '.join(n.tags)}]" if n.tags else "")
            for n in existing
        )
        cleaned = [
            c
            for c in self._ask(_CONSOLIDATE_SYSTEM, f"Current memory notes:\n{listing}")
            if str(c.get("text", "")).strip()
        ]
        if not cleaned:
            return ConsolidationResult(len(existing), len(existing), existing)  # no-op
        # Rebuild from the consolidated set using the store's public API so all
        # backends (JSONL / SQLite / semantic) stay consistent.
        for note in existing:
            self._notes.forget(note_id=note.note_id)
        new_notes = [
            self._notes.add(str(c["text"]).strip(), tags=self._clean_tags(c.get("tags")))
            for c in cleaned
        ]
        return ConsolidationResult(len(existing), len(new_notes), new_notes)

    # --- internals ---------------------------------------------------------

    def _ask(self, system: str, user: str) -> list[dict]:
        """Send one user message under ``system`` and parse the JSON-array reply."""
        resp = self._provider.complete(
            [Message(role="user", content=user)],
            [],
            max_tokens=self._max_tokens,
            temperature=0.0,
            system=system,
        )
        # A reply without text content parses to "no candidates" instead of
        # raising inside the regex search.
        return _parse_json_array(resp.message.content or "")

    @staticmethod
    def _clean_tags(raw: object) -> list[str]:
        """Coerce a model-supplied ``tags`` value into a list of non-empty strings.

        A bare string becomes a single tag; anything that is not a string, list
        or tuple yields no tags; ``None`` items and blank items are dropped.
        """
        if isinstance(raw, str):
            raw = [raw]
        if not isinstance(raw, (list, tuple)):
            return []
        return [str(tag).strip() for tag in raw if tag is not None and str(tag).strip()]

    def _is_duplicate(self, text: str, existing: list[MemoryNote]) -> bool:
        """Return True when ``text`` overlaps an existing note at/above the threshold.

        Similarity is the Jaccard ratio (intersection over union) of the values
        ``_tokens`` returns for the two texts. Texts that produce no tokens are
        never treated as duplicates.
        """
        terms = _tokens(text)
        if not terms:
            return False
        for note in existing:
            other = _tokens(note.text)
            if not other:
                continue
            jaccard = len(terms & other) / len(terms | other)
            if jaccard >= self._dedup_threshold:
                return True
        return False
agentkernel/doctor.py ADDED
@@ -0,0 +1,141 @@
1
+ """Environment health check (design §18.7).
2
+
3
+ `agentkernel doctor` runs a set of fast, network-free checks — Python version,
4
+ required and optional dependencies, provider credentials, sandbox availability,
5
+ and writable paths — and prints a checklist. It exits non-zero if any check
6
+ fails, so it doubles as a setup smoke test. Checks are pure functions of the
7
+ config + environment, so they're deterministic and offline-testable.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import importlib.util
13
+ import os
14
+ import shutil
15
+ import sys
16
+ from dataclasses import dataclass
17
+
18
+ from agentkernel.config import Config
19
+
20
# ASCII marks: plain print() to a Windows cp1252 console can't encode unicode glyphs.
# Keyed by the three ``Check.status`` values; ``format_checks`` indexes this
# mapping directly, so every status it is given must appear here.
_MARK = {"ok": "[ OK ]", "warn": "[WARN]", "fail": "[FAIL]"}

# provider -> the env var that holds its key (local needs no key).
# A provider missing from this mapping is reported by ``run_checks`` as a
# warning with the detail "unknown provider".
_PROVIDER_KEY_ENV = {"anthropic": "ANTHROPIC_API_KEY", "openai": "OPENAI_API_KEY"}
25
+
26
+
27
@dataclass
class Check:
    """One line of the doctor report: what was checked and how it went."""

    # Label shown in the report, e.g. "python" or "dependency: httpx".
    name: str
    status: str  # "ok" | "warn" | "fail"
    # Optional explanation appended to the report line after " - ".
    detail: str = ""
32
+
33
+
34
def _have_module(name: str) -> bool:
    """Return True when module ``name`` can be located without importing it.

    ``importlib.util.find_spec`` raises rather than returning ``None`` in some
    cases (a dotted name whose parent package is missing, an empty name, or an
    already-imported module whose ``__spec__`` is unset). A health check should
    report those as "not available" instead of crashing, so they map to False.
    """
    try:
        return importlib.util.find_spec(name) is not None
    except (ImportError, ValueError):
        return False
36
+
37
+
38
def run_checks(config: Config, *, env: dict[str, str] | None = None) -> list[Check]:
    """Return the list of health checks for ``config`` and ``env``.

    Checks, in report order: Python version (>= 3.11), required dependencies,
    provider credentials, sandbox availability, the embedding key (only when
    semantic search is enabled), the curses backend (only on Windows), and the
    log directory.

    Args:
        config: Settings to validate. Reads ``provider``, ``base_url``,
            ``sandbox``, ``semantic_search``, ``embedding_api_key_env`` and
            ``log_dir``.
        env: Environment mapping used for credential lookups; defaults to
            ``os.environ``.

    Side effects:
        Creates ``config.log_dir`` (and parents) if it does not exist.
    """
    from pathlib import Path

    env = os.environ if env is None else env
    checks: list[Check] = []

    # Python version.
    v = sys.version_info
    python_ok = v >= (3, 11)
    checks.append(
        Check(
            "python",
            "ok" if python_ok else "fail",
            f"{v.major}.{v.minor}.{v.micro}" + ("" if python_ok else " (need >= 3.11)"),
        )
    )

    # Required dependencies.
    for dep in ("jsonschema", "httpx"):
        checks.append(
            Check(f"dependency: {dep}", "ok" if _have_module(dep) else "fail")
        )

    # Provider credentials.
    provider = config.provider
    if provider == "local":
        checks.append(
            Check(
                "provider: local",
                "ok" if config.base_url else "warn",
                config.base_url or "no base_url set - local endpoint won't be reachable",
            )
        )
    else:
        key_env = _PROVIDER_KEY_ENV.get(provider)
        if key_env is None:
            checks.append(Check(f"provider: {provider}", "warn", "unknown provider"))
        else:
            present = bool(env.get(key_env))
            checks.append(
                Check(
                    f"provider: {provider}",
                    "ok" if present else "fail",
                    f"{key_env} is set" if present else f"{key_env} is not set",
                )
            )

    # Sandbox.
    if config.sandbox == "docker":
        have = shutil.which("docker") is not None
        checks.append(
            Check(
                "sandbox: docker",
                "ok" if have else "fail",
                "docker CLI found" if have else "docker CLI not on PATH",
            )
        )
    else:
        checks.append(Check("sandbox: local", "ok"))

    # Semantic search needs an embedding key.
    if config.semantic_search:
        present = bool(env.get(config.embedding_api_key_env))
        checks.append(
            Check(
                "semantic search",
                "ok" if present else "warn",
                f"{config.embedding_api_key_env} "
                + ("is set" if present else "is not set - recall will error"),
            )
        )

    # TUI backend on Windows.
    if sys.platform == "win32":
        have_curses = _have_module("curses")  # probe once, reuse for both fields
        checks.append(
            Check(
                "tui: curses",
                "ok" if have_curses else "warn",
                "available" if have_curses
                else "windows-curses not installed - `agentkernel tui` won't run",
            )
        )

    # Log dir writable.
    try:
        Path(config.log_dir).mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        checks.append(Check("log dir writable", "fail", f"{config.log_dir}: {exc}"))
    else:
        # mkdir succeeding only proves the directory exists; a pre-existing
        # read-only directory must still be reported as a failure.
        if os.access(config.log_dir, os.W_OK | os.X_OK):
            checks.append(Check("log dir writable", "ok", config.log_dir))
        else:
            checks.append(
                Check("log dir writable", "fail", f"{config.log_dir}: not writable")
            )

    return checks
126
+
127
+
128
def format_checks(checks: list[Check]) -> str:
    """Render ``checks`` as the multi-line report shown by ``agentkernel doctor``.

    The report is a title, a blank line, one line per check (status mark, name
    and optional detail), another blank line, and a totals line.
    """
    report = ["agentkernel doctor", ""]
    failed = 0
    warned = 0
    for check in checks:
        suffix = f" - {check.detail}" if check.detail else ""
        report.append(f" {_MARK[check.status]} {check.name}{suffix}")
        if check.status == "fail":
            failed += 1
        elif check.status == "warn":
            warned += 1
    report.append("")
    report.append(f"{len(checks)} checks: {failed} failed, {warned} warnings.")
    return "\n".join(report)
138
+
139
+
140
def has_failures(checks: list[Check]) -> bool:
    """Tell whether at least one check ended with status ``"fail"``."""
    for check in checks:
        if check.status == "fail":
            return True
    return False
agentkernel/embeddings.py ADDED
@@ -0,0 +1,132 @@
1
+ """Embedding providers for semantic note search.
2
+
3
+ This module intentionally stays outside of ``agentkernel.memory`` so the default
4
+ SQLite/JSONL notebook does not depend on an embedding endpoint. The provider is
5
+ only instantiated when ``Config.semantic_search`` is enabled.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import urllib.error
13
+ import urllib.request
14
+ from dataclasses import dataclass
15
+ from typing import Protocol
16
+
17
+ from agentkernel.config import Config
18
+
19
+
20
class EmbeddingProvider(Protocol):
    """A tiny embedding seam. Anything matching this can back semantic search.

    This is a structural ``Protocol``: an object qualifies by having a
    compatible ``embed`` method; it does not need to subclass this class.
    """

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one dense vector per non-empty text input."""
        ...
26
+
27
+
28
class EmbeddingError(RuntimeError):
    """Raised when an embedding request cannot be completed.

    Subclasses ``RuntimeError``, so handlers for ``RuntimeError`` also catch it.
    """
30
+
31
+
32
@dataclass
class OpenAIEmbeddingProvider:
    """OpenAI-compatible embedding endpoint using only stdlib ``urllib``.

    Works with OpenAI, any OpenAI-compatible local server, or cloud providers
    that expose the ``/embeddings`` route. The API key is read from the
    environment (``OPENAI_API_KEY`` by default) and never persisted.
    """

    model: str = "text-embedding-3-small"
    base_url: str = "https://api.openai.com/v1"
    # Sent as the request's "dimensions" field only when truthy.
    dimensions: int | None = None
    # Name of the environment variable holding the bearer token.
    api_key_env: str = "OPENAI_API_KEY"
    # Seconds passed to ``urlopen`` as its timeout.
    timeout: float = 60.0

    @classmethod
    def from_config(
        cls, config: Config, *, api_key_env: str = "OPENAI_API_KEY"
    ) -> OpenAIEmbeddingProvider:
        """Convenience constructor from ``Config``.

        Infers a sensible endpoint from Config ``provider``/``base_url`` when
        ``embedding_base_url`` is not set:
        - provider "openai" → https://api.openai.com/v1
        - provider "local" → config.base_url (fallback to ollama-style default)
        - provider "anthropic" or other → requires explicit ``embedding_base_url``.

        Raises:
            EmbeddingError: if no endpoint is configured and none can be inferred.
        """
        base_url = config.embedding_base_url
        if base_url is None:
            if config.provider == "openai":
                base_url = "https://api.openai.com/v1"
            elif config.provider == "local":
                base_url = config.base_url or "http://localhost:11434/v1"
            else:
                raise EmbeddingError(
                    f"Cannot infer embedding endpoint for provider={config.provider!r}. "
                    "Set `embedding_base_url` in agentkernel.toml."
                )
        return cls(
            model=config.embedding_model,
            base_url=base_url,
            dimensions=config.embedding_dimensions,
            api_key_env=api_key_env,
        )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Call the embeddings endpoint and return vectors in input order.

        Inputs are stripped first. Blank inputs are never sent to the endpoint;
        their slot in the result is an empty vector. When every input is blank
        (or ``texts`` is empty) no request is made and no API key is needed.

        Raises:
            EmbeddingError: if the API key variable is unset, the request
                fails, or the response is malformed or has the wrong length.
        """
        cleaned = [t.strip() for t in texts]
        # Indices of the inputs that actually need an embedding.
        wanted = [i for i, t in enumerate(cleaned) if t]
        if not wanted:
            return [[] for _ in cleaned]
        key = os.environ.get(self.api_key_env)
        if not key:
            raise EmbeddingError(f"Environment variable {self.api_key_env} is not set")
        payload: dict[str, object] = {
            "model": self.model,
            "input": [cleaned[i] for i in wanted],
        }
        if self.dimensions:
            payload["dimensions"] = self.dimensions
        url = self.base_url.rstrip("/") + "/embeddings"
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Authorization": f"Bearer {key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            body = exc.read().decode(errors="ignore")
            raise EmbeddingError(f"Embedding request failed ({exc.code}): {body}") from exc
        except Exception as exc:
            # Boundary handler: any transport/decoding problem surfaces as the
            # module's own error type, with the cause chained.
            raise EmbeddingError(f"Embedding request failed: {exc}") from exc

        # Validate the shape before touching it so a malformed body is reported
        # as EmbeddingError rather than leaking an AttributeError.
        if not isinstance(result, dict):
            raise EmbeddingError("Embedding response is not a JSON object")
        data = result.get("data", [])
        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            raise EmbeddingError("Embedding response has a malformed 'data' field")

        ordered = sorted(data, key=lambda x: x.get("index", 0))
        vectors = [item.get("embedding", []) for item in ordered]
        if len(vectors) != len(wanted):
            raise EmbeddingError(
                f"Embedding response length mismatch: expected {len(wanted)}, got {len(vectors)}"
            )
        # Put each vector back at the position of the text it belongs to.
        embeddings: list[list[float]] = [[] for _ in cleaned]
        for position, vector in zip(wanted, vectors, strict=True):
            embeddings[position] = vector
        return embeddings
117
+
118
+
119
+ def cosine_similarity(a: list[float], b: list[float]) -> float:
120
+ """Cosine similarity between two equal-length vectors (0.0 if invalid)."""
121
+ if not a or not b or len(a) != len(b):
122
+ return 0.0
123
+ dot = sum(x * y for x, y in zip(a, b, strict=True))
124
+ norm_a = sum(x * x for x in a) ** 0.5
125
+ norm_b = sum(x * x for x in b) ** 0.5
126
+ if norm_a == 0.0 or norm_b == 0.0:
127
+ return 0.0
128
+ return dot / (norm_a * norm_b)
129
+
130
+
131
+ # Forward import for type annotations that need a runtime reference to Any.
132
+ from typing import Any # noqa: E402
agentkernel/evaluation.py ADDED
@@ -0,0 +1,186 @@
1
+ """Evaluator / eval harness (design §13, Phase 5).
2
+
3
+ "An evaluator is a profile whose final output is a structured score." This runs
4
+ the agent on each case, then asks a judge model to score the answer against a
5
+ rubric, producing a structured ``EvalResult``. A suite aggregates into pass-rate
6
+ and mean score — useful for regression tests and model comparison, and as signal
7
+ for the self-improvement loop.
8
+
9
+ Built entirely on the kernel (an Agent + a provider); the loop is untouched.
10
+ Judging is best-effort: an unparseable judge reply scores 0 rather than raising.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import re
17
+ import tomllib
18
+ from collections.abc import Callable
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING
22
+
23
+ from agentkernel.types import Message
24
+
25
+ if TYPE_CHECKING:
26
+ from agentkernel.agent import Agent
27
+ from agentkernel.providers import Provider
28
+
29
# A factory that returns a FRESH agent per case (independent context).
# ``Evaluator.evaluate_case`` calls it once per case and runs the prompt on
# the returned agent.
AgentFactory = Callable[[], "Agent"]

# System prompt for the judge model. The requested reply is a single JSON
# object with "score" (0-100), "pass" and "reasoning", which ``_parse_score``
# reads and normalizes.
_JUDGE_SYSTEM = (
    "You are a strict evaluator. Score how well an agent's answer satisfies the "
    "rubric for the given task. Respond with ONLY a JSON object: "
    '{"score": <0-100 integer>, "pass": <true|false>, "reasoning": "<one sentence>"}.'
)
# Rubric used when neither the case nor the suite supplies one.
_DEFAULT_RUBRIC = "The answer is correct, complete, and directly addresses the task."
38
+
39
+
40
@dataclass
class EvalCase:
    """A single evaluation task: a named prompt with an optional rubric."""

    # Identifier copied into the matching ``EvalResult``.
    name: str
    # Text handed to the agent's ``run`` method.
    prompt: str
    rubric: str | None = None  # overrides the suite/default rubric
45
+
46
+
47
@dataclass
class EvalResult:
    """Structured score for one evaluated case."""

    # Name of the case this result belongs to.
    name: str
    # What the agent returned for the case prompt.
    answer: str
    score: float  # normalized 0.0–1.0
    # Whether the case counts as passed.
    passed: bool
    # The judge's explanation, or a parse-failure message.
    reasoning: str
    # Unparsed judge reply, kept for debugging.
    raw_judge: str = ""
55
+
56
+
57
@dataclass
class EvalSummary:
    """Aggregate statistics over the results of a suite run."""

    results: list[EvalResult] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Number of evaluated cases."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of cases whose result is marked as passed."""
        return len([result for result in self.results if result.passed])

    @property
    def pass_rate(self) -> float:
        """Fraction of cases that passed; 0.0 for an empty summary."""
        count = self.total
        if not count:
            return 0.0
        return self.passed / count

    @property
    def mean_score(self) -> float:
        """Average score across cases; 0.0 for an empty summary."""
        count = self.total
        if not count:
            return 0.0
        return sum(result.score for result in self.results) / count

    def to_dict(self) -> dict[str, object]:
        """Return the summary and every per-case result as plain dicts."""
        rows = []
        for result in self.results:
            rows.append(
                {
                    "name": result.name,
                    "answer": result.answer,
                    "score": result.score,
                    "passed": result.passed,
                    "reasoning": result.reasoning,
                    "raw_judge": result.raw_judge,
                }
            )
        return {
            "total": self.total,
            "passed": self.passed,
            "pass_rate": self.pass_rate,
            "mean_score": self.mean_score,
            "results": rows,
        }
95
+
96
+
97
def _parse_score(text: str, pass_threshold: float) -> tuple[float, bool, str]:
    """Extract ``(score 0-1, passed, reasoning)`` from a judge reply.

    Tolerant of prose around the JSON; if nothing parseable is found, the case
    scores 0 and fails (a non-answer should never silently pass).

    Score normalization:

    * integers are read on the 0-100 scale the judge is asked for, so ``1``
      means 1% rather than a perfect score;
    * other numeric values above 1 are also treated as 0-100, values in
      ``[0, 1]`` as already normalized;
    * booleans, NaN, infinities and non-numeric values score 0;
    * the result is clamped to ``[0.0, 1.0]``.

    ``passed`` is the judge's own ``"pass"`` flag when it is a boolean,
    otherwise ``score >= pass_threshold``.
    """
    import math

    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        return 0.0, False, "could not parse judge output"
    try:
        data = json.loads(match.group(0))
    except json.JSONDecodeError:
        return 0.0, False, "could not parse judge output"

    raw = data.get("score", 0)
    if isinstance(raw, bool):
        # ``float(True)`` is 1.0; a bare true/false is not a score.
        score = 0.0
    else:
        try:
            score = float(raw)
        except (TypeError, ValueError, OverflowError):
            score = 0.0
    if not math.isfinite(score):
        # NaN would otherwise slip through min()/max() and come out as 1.0.
        score = 0.0
    elif isinstance(raw, int) or score > 1.0:  # judges return 0-100; normalize to 0-1
        score = score / 100.0
    score = max(0.0, min(1.0, score))

    passed = data.get("pass")
    if not isinstance(passed, bool):
        passed = score >= pass_threshold
    return score, passed, str(data.get("reasoning", ""))
123
+
124
+
125
class Evaluator:
    """Runs cases through an agent and scores answers with a judge provider."""

    def __init__(
        self,
        agent_factory: AgentFactory,
        judge: Provider,
        *,
        default_rubric: str = _DEFAULT_RUBRIC,
        pass_threshold: float = 0.6,
        judge_max_tokens: int = 512,
    ) -> None:
        """Configure the evaluator.

        Args:
            agent_factory: Called once per case to obtain the agent to run.
            judge: Provider whose ``complete()`` scores each answer.
            default_rubric: Rubric used for cases that define none.
            pass_threshold: Normalized score at or above which a case passes
                when the judge gives no boolean verdict.
            judge_max_tokens: Completion budget for the judge reply.
        """
        self._agent_factory = agent_factory
        self._judge = judge
        self._default_rubric = default_rubric
        self._pass_threshold = pass_threshold
        self._judge_max_tokens = judge_max_tokens

    def evaluate_case(self, case: EvalCase) -> EvalResult:
        """Run ``case`` on a newly built agent and have the judge score the answer."""
        answer = self._agent_factory().run(case.prompt)
        rubric = case.rubric or self._default_rubric
        raw = self._score(case.prompt, rubric, answer)
        score, passed, reasoning = _parse_score(raw, self._pass_threshold)
        return EvalResult(case.name, answer, score, passed, reasoning, raw)

    def run_suite(self, cases: list[EvalCase]) -> EvalSummary:
        """Evaluate every case in order and collect the results in a summary."""
        return EvalSummary([self.evaluate_case(c) for c in cases])

    def _score(self, prompt: str, rubric: str, answer: str) -> str:
        """Ask the judge to grade ``answer`` and return its stripped reply text."""
        judge_prompt = (
            f"Task:\n{prompt}\n\nRubric:\n{rubric}\n\nAgent answer:\n{answer}\n\n"
            "Score the answer against the rubric."
        )
        resp = self._judge.complete(
            [Message(role="user", content=judge_prompt)],
            [],
            max_tokens=self._judge_max_tokens,
            temperature=0.0,
            system=_JUDGE_SYSTEM,
        )
        # Judging is best-effort: a reply with no text content becomes an empty
        # string (which parses as "could not parse judge output") rather than
        # raising on ``None.strip()``.
        return (resp.message.content or "").strip()
166
+
167
+
168
def load_eval_suite(path: str | Path) -> tuple[str, list[EvalCase]]:
    """Read a TOML suite file and return ``(default_rubric, cases)``.

    Expected layout::

        rubric = "default rubric for all cases"   # optional
        [[cases]]
        name = "..."
        prompt = "..."
        rubric = "..."                            # optional per-case override

    A missing top-level ``rubric`` falls back to the module default, and a
    missing ``cases`` table yields an empty case list. Each case must define
    ``name`` and ``prompt``.
    """
    suite_file = Path(path)
    with suite_file.open("rb") as handle:
        parsed = tomllib.load(handle)
    suite_rubric = parsed.get("rubric", _DEFAULT_RUBRIC)
    loaded: list[EvalCase] = []
    for entry in parsed.get("cases", []):
        loaded.append(
            EvalCase(
                name=entry["name"],
                prompt=entry["prompt"],
                rubric=entry.get("rubric"),
            )
        )
    return suite_rubric, loaded