agentkernel-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentkernel/__init__.py +7 -0
- agentkernel/__main__.py +5 -0
- agentkernel/agent.py +311 -0
- agentkernel/approval/__init__.py +23 -0
- agentkernel/approval/base.py +34 -0
- agentkernel/approval/cli.py +129 -0
- agentkernel/approval/policy.py +58 -0
- agentkernel/approval/risk.py +91 -0
- agentkernel/approval/sandbox.py +201 -0
- agentkernel/budget.py +64 -0
- agentkernel/checkpoint.py +50 -0
- agentkernel/cli.py +1482 -0
- agentkernel/config.py +224 -0
- agentkernel/context/__init__.py +17 -0
- agentkernel/context/manager.py +216 -0
- agentkernel/context/truncate.py +35 -0
- agentkernel/cron.py +146 -0
- agentkernel/curation.py +183 -0
- agentkernel/doctor.py +141 -0
- agentkernel/embeddings.py +132 -0
- agentkernel/evaluation.py +186 -0
- agentkernel/improvement.py +133 -0
- agentkernel/insights.py +141 -0
- agentkernel/kanban.py +114 -0
- agentkernel/knowledge.py +383 -0
- agentkernel/loops.py +145 -0
- agentkernel/mcp/__init__.py +23 -0
- agentkernel/mcp/client.py +181 -0
- agentkernel/mcp/config.py +59 -0
- agentkernel/mcp/tools.py +96 -0
- agentkernel/memory.py +1208 -0
- agentkernel/paths.py +73 -0
- agentkernel/plugins.py +76 -0
- agentkernel/profiles.py +70 -0
- agentkernel/progress.py +89 -0
- agentkernel/providers/__init__.py +35 -0
- agentkernel/providers/_http.py +157 -0
- agentkernel/providers/anthropic.py +282 -0
- agentkernel/providers/base.py +38 -0
- agentkernel/providers/credentials.py +65 -0
- agentkernel/providers/local.py +34 -0
- agentkernel/providers/openai.py +260 -0
- agentkernel/redaction.py +77 -0
- agentkernel/semantic_index.py +139 -0
- agentkernel/semantic_memory.py +253 -0
- agentkernel/skills.py +268 -0
- agentkernel/subagent.py +161 -0
- agentkernel/telemetry.py +199 -0
- agentkernel/templates/README.md +35 -0
- agentkernel/templates/SKILL.md +28 -0
- agentkernel/templates/eval-suite.toml +22 -0
- agentkernel/templates/loop.toml +29 -0
- agentkernel/templates/mcp-servers.toml +22 -0
- agentkernel/templates/profile.toml +29 -0
- agentkernel/templates/tool_module.py +64 -0
- agentkernel/tools/__init__.py +5 -0
- agentkernel/tools/base.py +100 -0
- agentkernel/tools/builtin/__init__.py +37 -0
- agentkernel/tools/builtin/checkpoint_tool.py +33 -0
- agentkernel/tools/builtin/clarify.py +60 -0
- agentkernel/tools/builtin/files.py +221 -0
- agentkernel/tools/builtin/kanban_tool.py +100 -0
- agentkernel/tools/builtin/search.py +225 -0
- agentkernel/tools/builtin/shell.py +67 -0
- agentkernel/tools/builtin/todo.py +106 -0
- agentkernel/tui/__init__.py +50 -0
- agentkernel/tui/app.py +594 -0
- agentkernel/types.py +127 -0
- agentkernel/worktree.py +64 -0
- agentkernel_cli-0.1.0.dist-info/METADATA +426 -0
- agentkernel_cli-0.1.0.dist-info/RECORD +74 -0
- agentkernel_cli-0.1.0.dist-info/WHEEL +4 -0
- agentkernel_cli-0.1.0.dist-info/entry_points.txt +2 -0
- agentkernel_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
agentkernel/curation.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Self-curating memory: extract durable facts from a transcript, and
|
|
2
|
+
consolidate the notebook with an LLM (design §13, Phase 3).
|
|
3
|
+
|
|
4
|
+
The storage/recall machinery (``NoteStore``, semantic search, dedup) is mature;
|
|
5
|
+
what was missing is *populating and cleaning* it without the model having to call
|
|
6
|
+
``remember`` in the moment. This module adds two harness operations on top of the
|
|
7
|
+
kernel (an Agent is not required — just a ``NoteStore`` and a ``Provider``):
|
|
8
|
+
|
|
9
|
+
* ``extract(messages)`` — distil a finished conversation into candidate facts,
|
|
10
|
+
skipping ones that duplicate (by token overlap) what is already stored.
|
|
11
|
+
* ``consolidate()`` — ask the model to merge related notes and supersede
|
|
12
|
+
outdated ones, then rebuild the notebook from the cleaned set.
|
|
13
|
+
|
|
14
|
+
Both are best-effort: an unparseable model reply leaves memory unchanged rather
|
|
15
|
+
than raising or destroying notes.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import re
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from typing import TYPE_CHECKING
|
|
24
|
+
|
|
25
|
+
from agentkernel.memory import MemoryNote, _tokens
|
|
26
|
+
from agentkernel.types import Message
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from agentkernel.memory import NoteStore
|
|
30
|
+
from agentkernel.providers import Provider
|
|
31
|
+
|
|
32
|
+
_EXTRACT_SYSTEM = (
|
|
33
|
+
"You curate an AI agent's long-term memory. From a conversation, extract only "
|
|
34
|
+
"DURABLE facts worth remembering across future sessions: stable user "
|
|
35
|
+
"preferences, project facts and conventions, decisions, and constraints. "
|
|
36
|
+
"Exclude transient task details, pleasantries, and anything ephemeral. "
|
|
37
|
+
'Respond with ONLY a JSON array of objects {"text": "...", "tags": ["..."]}. '
|
|
38
|
+
"Return [] if nothing is worth remembering."
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
_CONSOLIDATE_SYSTEM = (
|
|
42
|
+
"You consolidate an AI agent's long-term memory notes. Produce a cleaner set: "
|
|
43
|
+
"merge notes that say the same or closely related things, remove redundancy, "
|
|
44
|
+
"and when two notes conflict keep only the most recent/true statement. Do not "
|
|
45
|
+
"drop any distinct information. "
|
|
46
|
+
'Respond with ONLY a JSON array of objects {"text": "...", "tags": ["..."]}.'
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class ExtractionResult:
    """Outcome of one ``MemoryCurator.extract`` call."""

    # Notes that were written to the store during this extraction.
    added: list[MemoryNote] = field(default_factory=list)
    # How many candidate facts were dropped as duplicates of stored notes.
    skipped_duplicates: int = 0
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class ConsolidationResult:
    """Before/after summary of one ``MemoryCurator.consolidate`` call."""

    before: int  # note count when consolidation started
    after: int  # note count when it finished
    notes: list[MemoryNote] = field(default_factory=list)  # resulting notes

    @property
    def removed(self) -> int:
        """Net number of notes eliminated; never negative."""
        shrinkage = self.before - self.after
        return shrinkage if shrinkage > 0 else 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _parse_json_array(text: str) -> list[dict]:
|
|
68
|
+
"""Extract a JSON array of objects from a model reply, tolerating prose."""
|
|
69
|
+
match = re.search(r"\[.*\]", text, re.DOTALL)
|
|
70
|
+
if not match:
|
|
71
|
+
return []
|
|
72
|
+
try:
|
|
73
|
+
data = json.loads(match.group(0))
|
|
74
|
+
except json.JSONDecodeError:
|
|
75
|
+
return []
|
|
76
|
+
return [item for item in data if isinstance(item, dict)]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _render_transcript(messages: list[Message], max_chars: int) -> str:
|
|
80
|
+
lines: list[str] = []
|
|
81
|
+
for m in messages:
|
|
82
|
+
if m.content and m.role in ("user", "assistant"):
|
|
83
|
+
lines.append(f"{m.role}: {m.content}")
|
|
84
|
+
for tc in m.tool_calls:
|
|
85
|
+
lines.append(f"assistant called {tc.name}")
|
|
86
|
+
for r in m.tool_results:
|
|
87
|
+
snippet = r.content if len(r.content) < 200 else r.content[:200] + "…"
|
|
88
|
+
lines.append(f"tool[{'error' if r.is_error else 'ok'}]: {snippet}")
|
|
89
|
+
text = "\n".join(lines)
|
|
90
|
+
if len(text) > max_chars:
|
|
91
|
+
text = text[:max_chars] + "\n… [transcript truncated]"
|
|
92
|
+
return text
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class MemoryCurator:
    """Populates and cleans a ``NoteStore`` with the help of a model.

    Both public operations send one prompt to the provider and expect a JSON
    array of ``{"text": ..., "tags": [...]}`` objects back. A reply that
    yields no usable objects leaves the store untouched.
    """

    def __init__(
        self,
        notes: NoteStore,
        provider: Provider,
        *,
        max_tokens: int = 1024,
        dedup_threshold: float = 0.8,
        transcript_chars: int = 16000,
    ) -> None:
        """Bind the curator to a store and a provider.

        Args:
            notes: Store read through ``all()`` and changed through
                ``add()`` / ``forget()``.
            provider: Model backend; only ``complete()`` is called.
            max_tokens: Reply budget passed to every ``complete()`` call.
            dedup_threshold: Jaccard token overlap at or above which an
                extracted fact counts as a duplicate of a stored note.
            transcript_chars: Maximum transcript length sent for extraction.
        """
        self._notes = notes
        self._provider = provider
        self._max_tokens = max_tokens
        self._dedup_threshold = dedup_threshold
        self._transcript_chars = transcript_chars

    def extract(self, messages: list[Message]) -> ExtractionResult:
        """Distil ``messages`` into new notes, skipping near-duplicates.

        Returns:
            The notes that were added plus the number of candidates skipped
            as duplicates. Empty when the transcript renders to nothing or
            the model reply has no usable candidates.
        """
        transcript = _render_transcript(messages, self._transcript_chars)
        if not transcript.strip():
            return ExtractionResult()
        candidates = self._ask(
            _EXTRACT_SYSTEM,
            f"Conversation:\n{transcript}\n\nExtract the durable facts.",
        )
        # Copy: this list is appended to below and must not alias whatever
        # list object the store handed out.
        existing = list(self._notes.all())
        result = ExtractionResult()
        for cand in candidates:
            text = str(cand.get("text", "")).strip()
            if not text:
                continue
            if self._is_duplicate(text, existing):
                result.skipped_duplicates += 1
                continue
            note = self._notes.add(text, tags=self._clean_tags(cand.get("tags")))
            existing.append(note)  # dedup later candidates against it too
            result.added.append(note)
        return result

    def consolidate(self) -> ConsolidationResult:
        """Ask the model for a merged note set and rebuild the store from it.

        With fewer than two notes, or when the reply contains no object with
        non-empty text, nothing is changed and the existing notes are
        returned.

        NOTE: the rebuild is not atomic. Every existing note is forgotten
        before the replacements are added, so a failure inside ``add()``
        leaves the store partially rebuilt.
        """
        existing = self._notes.all()
        if len(existing) < 2:
            return ConsolidationResult(len(existing), len(existing), existing)
        listing = "\n".join(
            f"{n.note_id}. {n.text}"
            + (f" [tags: {', '.join(n.tags)}]" if n.tags else "")
            for n in existing
        )
        cleaned = [
            c
            for c in self._ask(_CONSOLIDATE_SYSTEM, f"Current memory notes:\n{listing}")
            if str(c.get("text", "")).strip()
        ]
        if not cleaned:
            return ConsolidationResult(len(existing), len(existing), existing)  # no-op
        # Rebuild from the consolidated set using the store's public API so all
        # backends (JSONL / SQLite / semantic) stay consistent.
        for note in existing:
            self._notes.forget(note_id=note.note_id)
        new_notes = [
            self._notes.add(str(c["text"]).strip(), tags=self._clean_tags(c.get("tags")))
            for c in cleaned
        ]
        return ConsolidationResult(len(existing), len(new_notes), new_notes)

    # --- internals ---------------------------------------------------------

    def _ask(self, system: str, user: str) -> list[dict]:
        """Send one deterministic (temperature 0) prompt and parse the reply."""
        resp = self._provider.complete(
            [Message(role="user", content=user)],
            [],
            max_tokens=self._max_tokens,
            temperature=0.0,
            system=system,
        )
        # Treat a missing reply body as "nothing parseable" instead of raising.
        # NOTE(review): assumes content may be None/empty - confirm against Message.
        return _parse_json_array(resp.message.content or "")

    @staticmethod
    def _clean_tags(raw: object) -> list[str]:
        """Coerce model-supplied tags into a list of non-empty strings.

        A bare string becomes a one-element list; anything that is not a
        string, list or tuple yields ``[]``. ``None`` items are dropped and
        other items are stringified, so ``', '.join(tags)`` is always safe.
        """
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            return []
        stripped = (str(tag).strip() for tag in raw if tag is not None)
        return [tag for tag in stripped if tag]

    def _is_duplicate(self, text: str, existing: list[MemoryNote]) -> bool:
        """Return True when ``text`` overlaps any existing note enough.

        Overlap is the Jaccard index of the two ``_tokens`` results, compared
        against ``dedup_threshold``. Texts or notes that yield no tokens never
        match.
        """
        terms = _tokens(text)
        if not terms:
            return False
        for note in existing:
            other = _tokens(note.text)
            if not other:
                continue
            jaccard = len(terms & other) / len(terms | other)
            if jaccard >= self._dedup_threshold:
                return True
        return False
|
agentkernel/doctor.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Environment health check (design §18.7).
|
|
2
|
+
|
|
3
|
+
`agentkernel doctor` runs a set of fast, network-free checks — Python version,
|
|
4
|
+
required and optional dependencies, provider credentials, sandbox availability,
|
|
5
|
+
and writable paths — and prints a checklist. It exits non-zero if any check
|
|
6
|
+
fails, so it doubles as a setup smoke test. Checks are pure functions of the
|
|
7
|
+
config + environment, so they're deterministic and offline-testable.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import importlib.util
|
|
13
|
+
import os
|
|
14
|
+
import shutil
|
|
15
|
+
import sys
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
from agentkernel.config import Config
|
|
19
|
+
|
|
20
|
+
# ASCII marks: plain print() to a Windows cp1252 console can't encode unicode glyphs.
|
|
21
|
+
_MARK = {"ok": "[ OK ]", "warn": "[WARN]", "fail": "[FAIL]"}
|
|
22
|
+
|
|
23
|
+
# provider -> the env var that holds its key (local needs no key).
|
|
24
|
+
_PROVIDER_KEY_ENV = {"anthropic": "ANTHROPIC_API_KEY", "openai": "OPENAI_API_KEY"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Check:
    """One row of the doctor checklist."""

    name: str  # label shown in the report
    status: str  # "ok" | "warn" | "fail"
    detail: str = ""  # optional explanatory text appended after the name
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _have_module(name: str) -> bool:
|
|
35
|
+
return importlib.util.find_spec(name) is not None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_checks(config: Config, *, env: dict[str, str] | None = None) -> list[Check]:
    """Return the list of health checks for ``config`` and ``env``.

    Args:
        config: Settings to inspect. Reads ``provider``, ``base_url``,
            ``sandbox``, ``semantic_search``, ``embedding_api_key_env`` and
            ``log_dir``.
        env: Mapping consulted for credential variables; defaults to
            ``os.environ``.

    Returns:
        Checks in a fixed order: python, required dependencies, provider,
        sandbox, semantic search (only when enabled), curses (Windows only),
        log dir.

    Side effect: creates ``config.log_dir`` (with parents) if it is missing.
    """
    from pathlib import Path

    env = os.environ if env is None else env
    checks: list[Check] = []

    # Python version.
    v = sys.version_info
    py_ok = v >= (3, 11)
    checks.append(
        Check(
            "python",
            "ok" if py_ok else "fail",
            f"{v.major}.{v.minor}.{v.micro}" + ("" if py_ok else " (need >= 3.11)"),
        )
    )

    # Required dependencies.
    for dep in ("jsonschema", "httpx"):
        checks.append(
            Check(f"dependency: {dep}", "ok" if _have_module(dep) else "fail")
        )

    # Provider credentials.
    provider = config.provider
    if provider == "local":
        checks.append(
            Check(
                "provider: local",
                "ok" if config.base_url else "warn",
                config.base_url or "no base_url set - local endpoint won't be reachable",
            )
        )
    else:
        key_env = _PROVIDER_KEY_ENV.get(provider)
        if key_env is None:
            checks.append(Check(f"provider: {provider}", "warn", "unknown provider"))
        else:
            present = bool(env.get(key_env))
            checks.append(
                Check(
                    f"provider: {provider}",
                    "ok" if present else "fail",
                    f"{key_env} is set" if present else f"{key_env} is not set",
                )
            )

    # Sandbox.
    if config.sandbox == "docker":
        have = shutil.which("docker") is not None
        checks.append(
            Check(
                "sandbox: docker",
                "ok" if have else "fail",
                "docker CLI found" if have else "docker CLI not on PATH",
            )
        )
    else:
        checks.append(Check("sandbox: local", "ok"))

    # Semantic search needs an embedding key.
    if config.semantic_search:
        present = bool(env.get(config.embedding_api_key_env))
        checks.append(
            Check(
                "semantic search",
                "ok" if present else "warn",
                f"{config.embedding_api_key_env} "
                + ("is set" if present else "is not set - recall will error"),
            )
        )

    # TUI backend on Windows.
    if sys.platform == "win32":
        have_curses = _have_module("curses")  # looked up once, used twice
        checks.append(
            Check(
                "tui: curses",
                "ok" if have_curses else "warn",
                "available"
                if have_curses
                else "windows-curses not installed - `agentkernel tui` won't run",
            )
        )

    # Log dir writable.
    try:
        log_dir = Path(config.log_dir)
        log_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        checks.append(Check("log dir writable", "fail", f"{config.log_dir}: {exc}"))
    else:
        # mkdir(exist_ok=True) also succeeds on an existing read-only
        # directory, so creating it is not enough: check that entries can
        # actually be created inside it.
        if os.access(log_dir, os.W_OK | os.X_OK):
            checks.append(Check("log dir writable", "ok", config.log_dir))
        else:
            checks.append(
                Check(
                    "log dir writable",
                    "fail",
                    f"{config.log_dir}: directory is not writable",
                )
            )

    return checks
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def format_checks(checks: list[Check]) -> str:
    """Render ``checks`` as the plain-text doctor report.

    The report is a title line, a blank line, one row per check (mark, name
    and, when present, the detail), another blank line and a summary with the
    total, failed and warning counts.
    """
    rows = [
        f" {_MARK[c.status]} {c.name}" + (f" - {c.detail}" if c.detail else "")
        for c in checks
    ]
    statuses = [c.status for c in checks]
    summary = (
        f"{len(checks)} checks: {statuses.count('fail')} failed, "
        f"{statuses.count('warn')} warnings."
    )
    return "\n".join(["agentkernel doctor", "", *rows, "", summary])
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def has_failures(checks: list[Check]) -> bool:
    """Report whether at least one check has status ``"fail"``."""
    for check in checks:
        if check.status == "fail":
            return True
    return False
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Embedding providers for semantic note search.
|
|
2
|
+
|
|
3
|
+
This module intentionally stays outside of ``agentkernel.memory`` so the default
|
|
4
|
+
SQLite/JSONL notebook does not depend on an embedding endpoint. The provider is
|
|
5
|
+
only instantiated when ``Config.semantic_search`` is enabled.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import urllib.error
|
|
13
|
+
import urllib.request
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Protocol
|
|
16
|
+
|
|
17
|
+
from agentkernel.config import Config
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EmbeddingProvider(Protocol):
    """Structural interface for embedding backends.

    Any object with a matching ``embed`` method can back semantic search;
    no inheritance from this class is needed.
    """

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one dense vector per non-empty text input."""
        ...
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class EmbeddingError(RuntimeError):
    """Signals that an embedding request could not be completed."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class OpenAIEmbeddingProvider:
    """OpenAI-compatible embedding endpoint using only stdlib ``urllib``.

    Works with OpenAI, any OpenAI-compatible local server, or cloud providers
    that expose the ``/embeddings`` route. The API key is read from the
    environment (``OPENAI_API_KEY`` by default) and never persisted.
    """

    model: str = "text-embedding-3-small"
    base_url: str = "https://api.openai.com/v1"
    dimensions: int | None = None  # only sent to the endpoint when truthy
    api_key_env: str = "OPENAI_API_KEY"  # name of the env var holding the key
    timeout: float = 60.0  # passed to ``urlopen``

    @classmethod
    def from_config(
        cls, config: Config, *, api_key_env: str = "OPENAI_API_KEY"
    ) -> OpenAIEmbeddingProvider:
        """Convenience constructor from ``Config``.

        Infers a sensible endpoint from Config ``provider``/``base_url`` when
        ``embedding_base_url`` is not set:
        - provider "openai" → https://api.openai.com/v1
        - provider "local" → config.base_url (fallback to ollama-style default)
        - provider "anthropic" or other → requires explicit ``embedding_base_url``.

        Args:
            config: Source of ``embedding_base_url``, ``provider``,
                ``base_url``, ``embedding_model`` and ``embedding_dimensions``.
            api_key_env: Name of the environment variable holding the key.

        Raises:
            EmbeddingError: If no endpoint is configured and none can be
                inferred from the provider.
        """
        base_url = config.embedding_base_url
        if base_url is None:
            if config.provider == "openai":
                base_url = "https://api.openai.com/v1"
            elif config.provider == "local":
                base_url = config.base_url or "http://localhost:11434/v1"
            else:
                raise EmbeddingError(
                    f"Cannot infer embedding endpoint for provider={config.provider!r}. "
                    "Set `embedding_base_url` in agentkernel.toml."
                )
        return cls(
            model=config.embedding_model,
            base_url=base_url,
            dimensions=config.embedding_dimensions,
            api_key_env=api_key_env,
        )

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Call the embeddings endpoint and return vectors in input order.

        Inputs are stripped first. Blank inputs are never sent to the
        endpoint; their slot in the result is an empty vector ``[]``. If every
        input is blank (or ``texts`` is empty) no request is made at all.

        Raises:
            EmbeddingError: If the key variable is unset, the request fails,
                the response is not a JSON object, or the number of returned
                vectors does not match the number of non-blank inputs.
        """
        texts = [t.strip() for t in texts]
        wanted = [i for i, t in enumerate(texts) if t]
        if not wanted:
            return [[] for _ in texts]
        key = os.environ.get(self.api_key_env)
        if not key:
            raise EmbeddingError(f"Environment variable {self.api_key_env} is not set")
        payload: dict[str, Any] = {
            "model": self.model,
            "input": [texts[i] for i in wanted],
        }
        if self.dimensions:
            payload["dimensions"] = self.dimensions
        url = self.base_url.rstrip("/") + "/embeddings"
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Authorization": f"Bearer {key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            body = exc.read().decode(errors="ignore")
            raise EmbeddingError(f"Embedding request failed ({exc.code}): {body}") from exc
        except Exception as exc:
            # Deliberately broad: network and decoding failures alike are
            # reported to callers as EmbeddingError.
            raise EmbeddingError(f"Embedding request failed: {exc}") from exc

        if not isinstance(result, dict):
            raise EmbeddingError(
                f"Embedding request failed: unexpected response type {type(result).__name__}"
            )
        # Order by the item's own ``index`` field rather than trusting the
        # order in which the items arrived.
        ordered = sorted(result.get("data", []), key=lambda x: x.get("index", 0))
        vectors = [item.get("embedding", []) for item in ordered]
        if len(vectors) != len(wanted):
            raise EmbeddingError(
                f"Embedding response length mismatch: expected {len(wanted)}, got {len(vectors)}"
            )
        # Put each vector back at the position of the text it belongs to.
        embeddings: list[list[float]] = [[] for _ in texts]
        for position, vector in zip(wanted, vectors, strict=True):
            embeddings[position] = vector
        return embeddings
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
120
|
+
"""Cosine similarity between two equal-length vectors (0.0 if invalid)."""
|
|
121
|
+
if not a or not b or len(a) != len(b):
|
|
122
|
+
return 0.0
|
|
123
|
+
dot = sum(x * y for x, y in zip(a, b, strict=True))
|
|
124
|
+
norm_a = sum(x * x for x in a) ** 0.5
|
|
125
|
+
norm_b = sum(x * x for x in b) ** 0.5
|
|
126
|
+
if norm_a == 0.0 or norm_b == 0.0:
|
|
127
|
+
return 0.0
|
|
128
|
+
return dot / (norm_a * norm_b)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Forward import for type annotations that need a runtime reference to Any.
|
|
132
|
+
from typing import Any # noqa: E402
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Evaluator / eval harness (design §13, Phase 5).
|
|
2
|
+
|
|
3
|
+
"An evaluator is a profile whose final output is a structured score." This runs
|
|
4
|
+
the agent on each case, then asks a judge model to score the answer against a
|
|
5
|
+
rubric, producing a structured ``EvalResult``. A suite aggregates into pass-rate
|
|
6
|
+
and mean score — useful for regression tests and model comparison, and as signal
|
|
7
|
+
for the self-improvement loop.
|
|
8
|
+
|
|
9
|
+
Built entirely on the kernel (an Agent + a provider); the loop is untouched.
|
|
10
|
+
Judging is best-effort: an unparseable judge reply scores 0 rather than raising.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import re
|
|
17
|
+
import tomllib
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import TYPE_CHECKING
|
|
22
|
+
|
|
23
|
+
from agentkernel.types import Message
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from agentkernel.agent import Agent
|
|
27
|
+
from agentkernel.providers import Provider
|
|
28
|
+
|
|
29
|
+
# A factory that returns a FRESH agent per case (independent context).
|
|
30
|
+
AgentFactory = Callable[[], "Agent"]
|
|
31
|
+
|
|
32
|
+
_JUDGE_SYSTEM = (
|
|
33
|
+
"You are a strict evaluator. Score how well an agent's answer satisfies the "
|
|
34
|
+
"rubric for the given task. Respond with ONLY a JSON object: "
|
|
35
|
+
'{"score": <0-100 integer>, "pass": <true|false>, "reasoning": "<one sentence>"}.'
|
|
36
|
+
)
|
|
37
|
+
_DEFAULT_RUBRIC = "The answer is correct, complete, and directly addresses the task."
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class EvalCase:
    """A single task to run through the agent and judge."""

    name: str  # identifier carried over into the result
    prompt: str  # text handed to the agent
    rubric: str | None = None  # overrides the suite/default rubric
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class EvalResult:
    """Structured verdict for one evaluated case."""

    name: str  # name of the case this result belongs to
    answer: str  # what the agent produced
    score: float  # normalized 0.0–1.0
    passed: bool  # pass/fail verdict for the case
    reasoning: str  # judge's explanation, or a parse-failure note
    raw_judge: str = ""  # unparsed judge reply
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class EvalSummary:
    """Aggregate view over the results of a suite run."""

    results: list[EvalResult] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Number of evaluated cases."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of cases whose result is marked as passed."""
        return len([r for r in self.results if r.passed])

    @property
    def pass_rate(self) -> float:
        """Fraction of cases that passed; ``0.0`` for an empty suite."""
        count = self.total
        if not count:
            return 0.0
        return self.passed / count

    @property
    def mean_score(self) -> float:
        """Arithmetic mean of the scores; ``0.0`` for an empty suite."""
        count = self.total
        if not count:
            return 0.0
        return sum(r.score for r in self.results) / count

    def to_dict(self) -> dict[str, object]:
        """Return the aggregates plus one plain dict per result."""
        rows = []
        for r in self.results:
            rows.append(
                {
                    "name": r.name,
                    "answer": r.answer,
                    "score": r.score,
                    "passed": r.passed,
                    "reasoning": r.reasoning,
                    "raw_judge": r.raw_judge,
                }
            )
        return {
            "total": self.total,
            "passed": self.passed,
            "pass_rate": self.pass_rate,
            "mean_score": self.mean_score,
            "results": rows,
        }
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _parse_score(text: str, pass_threshold: float) -> tuple[float, bool, str]:
|
|
98
|
+
"""Extract ``(score 0-1, passed, reasoning)`` from a judge reply.
|
|
99
|
+
|
|
100
|
+
Tolerant of prose around the JSON; if nothing parseable is found, the case
|
|
101
|
+
scores 0 and fails (a non-answer should never silently pass)."""
|
|
102
|
+
match = re.search(r"\{.*\}", text, re.DOTALL)
|
|
103
|
+
if not match:
|
|
104
|
+
return 0.0, False, "could not parse judge output"
|
|
105
|
+
try:
|
|
106
|
+
data = json.loads(match.group(0))
|
|
107
|
+
except json.JSONDecodeError:
|
|
108
|
+
return 0.0, False, "could not parse judge output"
|
|
109
|
+
|
|
110
|
+
raw = data.get("score", 0)
|
|
111
|
+
try:
|
|
112
|
+
score = float(raw)
|
|
113
|
+
except (TypeError, ValueError):
|
|
114
|
+
score = 0.0
|
|
115
|
+
if score > 1.0: # judges return 0-100; normalize to 0-1
|
|
116
|
+
score = score / 100.0
|
|
117
|
+
score = max(0.0, min(1.0, score))
|
|
118
|
+
|
|
119
|
+
passed = data.get("pass")
|
|
120
|
+
if not isinstance(passed, bool):
|
|
121
|
+
passed = score >= pass_threshold
|
|
122
|
+
return score, passed, str(data.get("reasoning", ""))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class Evaluator:
    """Runs cases through an agent and scores answers with a judge provider."""

    def __init__(
        self,
        agent_factory: AgentFactory,
        judge: Provider,
        *,
        default_rubric: str = _DEFAULT_RUBRIC,
        pass_threshold: float = 0.6,
        judge_max_tokens: int = 512,
    ) -> None:
        """Configure the evaluator.

        Args:
            agent_factory: Called once per case to obtain the agent to run.
            judge: Provider whose ``complete()`` produces the verdict.
            default_rubric: Rubric used when a case does not supply its own.
            pass_threshold: Normalized score that counts as a pass when the
                judge reply has no boolean ``"pass"`` field.
            judge_max_tokens: Reply budget for each judge call.
        """
        self._agent_factory = agent_factory
        self._judge = judge
        self._default_rubric = default_rubric
        self._pass_threshold = pass_threshold
        self._judge_max_tokens = judge_max_tokens

    def evaluate_case(self, case: EvalCase) -> EvalResult:
        """Run one case through a new agent and have the judge score it."""
        answer = self._agent_factory().run(case.prompt)
        rubric = case.rubric or self._default_rubric
        raw = self._score(case.prompt, rubric, answer)
        score, passed, reasoning = _parse_score(raw, self._pass_threshold)
        return EvalResult(case.name, answer, score, passed, reasoning, raw)

    def run_suite(self, cases: list[EvalCase]) -> EvalSummary:
        """Evaluate ``cases`` in order and collect the results."""
        return EvalSummary([self.evaluate_case(c) for c in cases])

    def _score(self, prompt: str, rubric: str, answer: str) -> str:
        """Ask the judge for a verdict and return its stripped reply text."""
        judge_prompt = (
            f"Task:\n{prompt}\n\nRubric:\n{rubric}\n\nAgent answer:\n{answer}\n\n"
            "Score the answer against the rubric."
        )
        resp = self._judge.complete(
            [Message(role="user", content=judge_prompt)],
            [],
            max_tokens=self._judge_max_tokens,
            temperature=0.0,
            system=_JUDGE_SYSTEM,
        )
        # A missing reply body is treated as an empty (unparseable) verdict so
        # the case scores 0 instead of raising.
        # NOTE(review): assumes content may be None - confirm against Message.
        return (resp.message.content or "").strip()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def load_eval_suite(path: str | Path) -> tuple[str, list[EvalCase]]:
    """Read a TOML suite file and return ``(default_rubric, cases)``.

    Expected layout::

        rubric = "default rubric for all cases"   # optional
        [[cases]]
        name = "..."
        prompt = "..."
        rubric = "..."                            # optional per-case override

    A missing top-level ``rubric`` falls back to the module default. Every
    case table must provide ``name`` and ``prompt``.
    """
    with Path(path).open("rb") as fh:
        data = tomllib.load(fh)
    cases: list[EvalCase] = []
    for entry in data.get("cases", []):
        cases.append(
            EvalCase(
                name=entry["name"],
                prompt=entry["prompt"],
                rubric=entry.get("rubric"),
            )
        )
    return data.get("rubric", _DEFAULT_RUBRIC), cases
|