RouteKitAI 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- routekitai/__init__.py +53 -0
- routekitai/cli/__init__.py +18 -0
- routekitai/cli/main.py +40 -0
- routekitai/cli/replay.py +80 -0
- routekitai/cli/run.py +95 -0
- routekitai/cli/serve.py +966 -0
- routekitai/cli/test_agent.py +178 -0
- routekitai/cli/trace.py +209 -0
- routekitai/cli/trace_analyze.py +120 -0
- routekitai/cli/trace_search.py +126 -0
- routekitai/core/__init__.py +58 -0
- routekitai/core/agent.py +325 -0
- routekitai/core/errors.py +49 -0
- routekitai/core/hooks.py +174 -0
- routekitai/core/memory.py +54 -0
- routekitai/core/message.py +132 -0
- routekitai/core/model.py +91 -0
- routekitai/core/policies.py +373 -0
- routekitai/core/policy.py +85 -0
- routekitai/core/policy_adapter.py +133 -0
- routekitai/core/runtime.py +1403 -0
- routekitai/core/tool.py +148 -0
- routekitai/core/tools.py +180 -0
- routekitai/evals/__init__.py +13 -0
- routekitai/evals/dataset.py +75 -0
- routekitai/evals/metrics.py +101 -0
- routekitai/evals/runner.py +184 -0
- routekitai/graphs/__init__.py +12 -0
- routekitai/graphs/executors.py +457 -0
- routekitai/graphs/graph.py +164 -0
- routekitai/memory/__init__.py +13 -0
- routekitai/memory/episodic.py +242 -0
- routekitai/memory/kv.py +34 -0
- routekitai/memory/retrieval.py +192 -0
- routekitai/memory/vector.py +700 -0
- routekitai/memory/working.py +66 -0
- routekitai/message.py +29 -0
- routekitai/model.py +48 -0
- routekitai/observability/__init__.py +21 -0
- routekitai/observability/analyzer.py +314 -0
- routekitai/observability/exporters/__init__.py +10 -0
- routekitai/observability/exporters/base.py +30 -0
- routekitai/observability/exporters/jsonl.py +81 -0
- routekitai/observability/exporters/otel.py +119 -0
- routekitai/observability/spans.py +111 -0
- routekitai/observability/streaming.py +117 -0
- routekitai/observability/trace.py +144 -0
- routekitai/providers/__init__.py +9 -0
- routekitai/providers/anthropic.py +227 -0
- routekitai/providers/azure_openai.py +243 -0
- routekitai/providers/local.py +196 -0
- routekitai/providers/openai.py +321 -0
- routekitai/py.typed +0 -0
- routekitai/sandbox/__init__.py +12 -0
- routekitai/sandbox/filesystem.py +131 -0
- routekitai/sandbox/network.py +142 -0
- routekitai/sandbox/permissions.py +70 -0
- routekitai/tool.py +33 -0
- routekitai-0.1.0.dist-info/METADATA +328 -0
- routekitai-0.1.0.dist-info/RECORD +64 -0
- routekitai-0.1.0.dist-info/WHEEL +5 -0
- routekitai-0.1.0.dist-info/entry_points.txt +2 -0
- routekitai-0.1.0.dist-info/licenses/LICENSE +21 -0
- routekitai-0.1.0.dist-info/top_level.txt +1 -0
routekitai/core/tool.py
ADDED
@@ -0,0 +1,148 @@
"""Tool primitive for RouteKit."""

from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, TypeVar

from pydantic import BaseModel, Field, create_model

from routekitai.core.errors import ToolError

TInput = TypeVar("TInput", bound=BaseModel)
TOutput = TypeVar("TOutput", bound=BaseModel)


class ToolPermission(str, Enum):
    """Tool permission types."""

    NETWORK = "network"
    FILESYSTEM = "filesystem"
    DATABASE = "database"
    NONE = "none"


class Tool(BaseModel, ABC):
    """Base class for tools with pydantic input/output models."""

    model_config = {"arbitrary_types_allowed": True}

    name: str = Field(..., description="Tool name")
    description: str = Field(..., description="Tool description")
    input_model: type[BaseModel] | None = Field(
        default=None, description="Pydantic model for input validation"
    )
    output_model: type[BaseModel] | None = Field(
        default=None, description="Pydantic model for output validation"
    )
    permissions: list[ToolPermission] = Field(
        default_factory=list, description="Required permissions"
    )
    rate_limit: int | None = Field(default=None, description="Rate limit (calls per second)")
    timeout: float | None = Field(default=None, description="Timeout in seconds")
    redact_fields: list[str] = Field(
        default_factory=list,
        description="Field names to redact in traces (e.g., ['api_key', 'password'])",
    )

    @property
    def parameters(self) -> dict[str, Any]:
        """Generate JSON Schema from input_model.

        Returns:
            JSON Schema dictionary
        """
        if self.input_model is None:
            return {"type": "object", "properties": {}}
        return self.input_model.model_json_schema()

    def redact_data(self, data: dict[str, Any]) -> dict[str, Any]:
        """Redact sensitive fields from data (handles nested dicts).

        Args:
            data: Data dictionary to redact

        Returns:
            Data with redacted fields
        """
        if not self.redact_fields:
            return data

        def _redact_recursive(obj: Any) -> Any:
            """Recursively redact fields in nested structures."""
            if isinstance(obj, dict):
                redacted: dict[str, Any] = {}
                for key, value in obj.items():
                    if key in self.redact_fields:
                        redacted[key] = "[REDACTED]"
                    elif isinstance(value, (dict, list)):
                        redacted[key] = _redact_recursive(value)
                    else:
                        redacted[key] = value
                return redacted
            elif isinstance(obj, list):
                return [_redact_recursive(item) for item in obj]
            else:
                return obj

        result = _redact_recursive(data)
        assert isinstance(result, dict)
        return result

    @abstractmethod
    async def run(self, input: BaseModel) -> BaseModel:
        """Execute the tool with validated input.

        Args:
            input: Validated input model instance

        Returns:
            Validated output model instance

        Raises:
            ToolError: If tool execution fails
        """
        raise NotImplementedError("Subclasses must implement run")

    async def execute(self, **kwargs: Any) -> Any:
        """Execute tool with raw kwargs (validates input/output).

        Args:
            **kwargs: Raw tool arguments

        Returns:
            Tool output (validated if output_model is set)

        Raises:
            ToolError: If validation or execution fails
        """
        try:
            # Validate input
            if self.input_model is not None:
                input_instance = self.input_model(**kwargs)
            else:
                # Create a minimal model if no input_model
                if not kwargs:
                    # Empty kwargs - create empty model
                    InputModel = create_model("InputModel")
                    input_instance = InputModel()
                else:
                    # Dynamically create model from kwargs
                    field_definitions: dict[str, Any] = {}
                    for k, v in kwargs.items():
                        field_definitions[k] = (type(v), ...)
                    InputModel = create_model("InputModel", **field_definitions)
                    input_instance = InputModel(**kwargs)

            # Execute
            output = await self.run(input_instance)

            # Validate output
            if self.output_model is not None and not isinstance(output, self.output_model):
                raise ToolError(f"Tool {self.name} returned invalid output type")

            return output

        except Exception as e:
            if isinstance(e, ToolError):
                raise
            raise ToolError(f"Tool {self.name} execution failed: {e}") from e
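A minimal usage sketch for the Tool base class above (not part of the published wheel; the WordCountTool name and its fields are illustrative): a subclass supplies pydantic input/output models and implements run(), and execute() validates raw kwargs before and after the call.

import asyncio

from pydantic import BaseModel, Field

from routekitai.core.tool import Tool


class WordCountInput(BaseModel):
    text: str = Field(..., description="Text to count words in")


class WordCountOutput(BaseModel):
    count: int = Field(..., description="Number of words")


class WordCountTool(Tool):
    def __init__(self) -> None:
        super().__init__(
            name="word_count",
            description="Count words in a text",
            input_model=WordCountInput,
            output_model=WordCountOutput,
        )

    async def run(self, input: BaseModel) -> BaseModel:
        assert isinstance(input, WordCountInput)
        return WordCountOutput(count=len(input.text.split()))


async def main() -> None:
    tool = WordCountTool()
    # execute() builds WordCountInput from kwargs, calls run(), and checks
    # the result against output_model before returning it.
    result = await tool.execute(text="hello agentic world")
    print(result.count)  # 3


if __name__ == "__main__":
    asyncio.run(main())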
routekitai/core/tools.py
ADDED
@@ -0,0 +1,180 @@
"""Built-in tools for RouteKit."""

from pathlib import Path

from pydantic import BaseModel, Field

from routekitai.core.errors import ToolError
from routekitai.core.tool import Tool, ToolPermission


class EchoInput(BaseModel):
    """Input for EchoTool."""

    message: str = Field(..., description="Message to echo")


class EchoOutput(BaseModel):
    """Output for EchoTool."""

    echoed: str = Field(..., description="Echoed message")


class EchoTool(Tool):
    """Echo tool for testing.

    Simply echoes back the input message. Useful for testing and debugging.
    """

    model_config = {"arbitrary_types_allowed": True}

    def __init__(self) -> None:
        super().__init__(
            name="echo",
            description="Echo back a message (useful for testing)",
            input_model=EchoInput,
            output_model=EchoOutput,
        )

    async def run(self, input: BaseModel) -> BaseModel:
        """Echo the input message.

        Args:
            input: EchoInput instance

        Returns:
            EchoOutput instance
        """
        if not isinstance(input, EchoInput):
            raise ToolError("Invalid input type for EchoTool")
        return EchoOutput(echoed=input.message)


class HttpGetInput(BaseModel):
    """Input for HttpGetTool."""

    url: str = Field(..., description="URL to fetch")
    headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers")
    timeout: float = Field(default=30.0, description="Request timeout in seconds")


class HttpGetOutput(BaseModel):
    """Output for HttpGetTool."""

    status_code: int = Field(..., description="HTTP status code")
    headers: dict[str, str] = Field(..., description="Response headers")
    body: str = Field(..., description="Response body")


class HttpGetTool(Tool):
    """HTTP GET tool.

    Requires NETWORK permission. Redacts 'api_key' and 'authorization' from headers.
    """

    model_config = {"arbitrary_types_allowed": True}

    def __init__(self) -> None:
        super().__init__(
            name="http_get",
            description="Perform HTTP GET request",
            input_model=HttpGetInput,
            output_model=HttpGetOutput,
            permissions=[ToolPermission.NETWORK],
            redact_fields=["api_key", "authorization"],
        )

    async def run(self, input: BaseModel) -> BaseModel:
        """Execute HTTP GET request.

        Args:
            input: HttpGetInput instance

        Returns:
            HttpGetOutput instance

        Raises:
            ToolError: If request fails
        """
        if not isinstance(input, HttpGetInput):
            raise ToolError("Invalid input type for HttpGetTool")

        try:
            # Try to import httpx (optional dependency)
            try:
                import httpx
            except ImportError:
                raise ToolError(
                    "httpx is required for HttpGetTool. Install with: pip install httpx"
                ) from None

            async with httpx.AsyncClient(timeout=input.timeout) as client:
                response = await client.get(input.url, headers=input.headers)
                return HttpGetOutput(
                    status_code=response.status_code,
                    headers=dict(response.headers),
                    body=response.text,
                )
        except Exception as e:
            raise ToolError(f"HTTP GET failed: {e}") from e


class FileReadInput(BaseModel):
    """Input for FileReadTool."""

    path: str = Field(..., description="File path to read")
    encoding: str = Field(default="utf-8", description="File encoding")


class FileReadOutput(BaseModel):
    """Output for FileReadTool."""

    content: str = Field(..., description="File content")
    size: int = Field(..., description="File size in bytes")


class FileReadTool(Tool):
    """File read tool.

    Requires FILESYSTEM permission. Reads file content.
    """

    model_config = {"arbitrary_types_allowed": True}

    def __init__(self) -> None:
        super().__init__(
            name="file_read",
            description="Read content from a file",
            input_model=FileReadInput,
            output_model=FileReadOutput,
            permissions=[ToolPermission.FILESYSTEM],
        )

    async def run(self, input: BaseModel) -> BaseModel:
        """Read file content.

        Args:
            input: FileReadInput instance

        Returns:
            FileReadOutput instance

        Raises:
            ToolError: If file read fails
        """
        if not isinstance(input, FileReadInput):
            raise ToolError("Invalid input type for FileReadTool")

        try:
            file_path = Path(input.path)
            if not file_path.exists():
                raise ToolError(f"File not found: {input.path}")
            if not file_path.is_file():
                raise ToolError(f"Path is not a file: {input.path}")

            content = file_path.read_text(encoding=input.encoding)
            size = file_path.stat().st_size

            return FileReadOutput(content=content, size=size)
        except Exception as e:
            raise ToolError(f"File read failed: {e}") from e
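A short usage sketch for the built-in tools above (illustrative, not from the wheel): EchoTool exercises the execute() path end to end, and redact_data() shows how HttpGetTool's redact_fields mask sensitive keys, including in nested dicts, before data reaches a trace.

import asyncio

from routekitai.core.tools import EchoTool, HttpGetTool


async def main() -> None:
    echo = EchoTool()
    out = await echo.execute(message="ping")
    print(out.echoed)  # "ping"

    http = HttpGetTool()
    # redact_fields=["api_key", "authorization"] is applied recursively.
    print(http.redact_data({"headers": {"authorization": "Bearer secret"}}))
    # {'headers': {'authorization': '[REDACTED]'}}


if __name__ == "__main__":
    asyncio.run(main())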
routekitai/evals/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""Evaluation harness for routekitai agents."""

from routekitai.evals.dataset import Dataset
from routekitai.evals.metrics import ContainsMetric, ExactMatchMetric, RegexMetric
from routekitai.evals.runner import EvalRunner

__all__ = [
    "Dataset",
    "EvalRunner",
    "ExactMatchMetric",
    "ContainsMetric",
    "RegexMetric",
]
routekitai/evals/dataset.py
ADDED
@@ -0,0 +1,75 @@
"""Dataset format for evaluations."""

import json
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field


class EvalExample(BaseModel):
    """A single evaluation example."""

    id: str = Field(..., description="Example ID")
    input: str = Field(..., description="Input prompt")
    expected_output: str | None = Field(default=None, description="Expected output")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")


class Dataset(BaseModel):
    """Evaluation dataset."""

    name: str = Field(..., description="Dataset name")
    examples: list[EvalExample] = Field(default_factory=list, description="Evaluation examples")

    @classmethod
    def from_jsonl(cls, file_path: Path | str) -> "Dataset":
        """Load dataset from JSONL file.

        Args:
            file_path: Path to JSONL file

        Returns:
            Dataset instance
        """
        file_path = Path(file_path)
        examples = []

        with open(file_path, encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                    example = EvalExample(
                        id=data.get("id", f"example_{line_num}"),
                        input=data.get("input", data.get("prompt", "")),
                        expected_output=data.get("expected_output", data.get("output")),
                        metadata=data.get("metadata", {}),
                    )
                    examples.append(example)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e

        return cls(name=file_path.stem, examples=examples)

    def to_jsonl(self, file_path: Path | str) -> None:
        """Save dataset to JSONL file.

        Args:
            file_path: Path to save JSONL file
        """
        file_path = Path(file_path)
        file_path.parent.mkdir(parents=True, exist_ok=True)

        with open(file_path, "w", encoding="utf-8") as f:
            for example in self.examples:
                data = {
                    "id": example.id,
                    "input": example.input,
                    "expected_output": example.expected_output,
                    "metadata": example.metadata,
                }
                f.write(json.dumps(data) + "\n")
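A usage sketch for the dataset helpers above (illustrative; the file name smoke.jsonl and example contents are made up): to_jsonl() writes one JSON object per line, and from_jsonl() reads them back, accepting "prompt"/"output" as fallbacks and generating ids like "example_<line>" when missing.

from routekitai.evals.dataset import Dataset, EvalExample

ds = Dataset(
    name="smoke",
    examples=[
        EvalExample(id="ex_1", input="What is 2 + 2?", expected_output="4"),
        EvalExample(id="ex_2", input="Say hello", expected_output="hello"),
    ],
)
ds.to_jsonl("smoke.jsonl")

reloaded = Dataset.from_jsonl("smoke.jsonl")
print(reloaded.name, len(reloaded.examples))  # smoke 2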
routekitai/evals/metrics.py
ADDED
@@ -0,0 +1,101 @@
"""Evaluation metrics for agent outputs."""

import re
from abc import ABC, abstractmethod
from typing import Any

from pydantic import BaseModel, Field


class Metric(BaseModel, ABC):
    """Base class for evaluation metrics."""

    name: str = Field(..., description="Metric name")

    @abstractmethod
    def score(self, expected: str | None, actual: str) -> float:
        """Score an output against expected output.

        Args:
            expected: Expected output (can be None)
            actual: Actual output

        Returns:
            Score between 0.0 and 1.0
        """
        raise NotImplementedError("Subclasses must implement score")


class ExactMatchMetric(Metric):
    """Exact match metric (case-insensitive)."""

    name: str = Field(default="exact_match", description="Metric name")
    case_sensitive: bool = Field(default=False, description="Whether to be case-sensitive")

    def score(self, expected: str | None, actual: str) -> float:
        """Score based on exact match.

        Args:
            expected: Expected output
            actual: Actual output

        Returns:
            1.0 if exact match, 0.0 otherwise
        """
        if expected is None:
            return 0.0

        if self.case_sensitive:
            return 1.0 if expected.strip() == actual.strip() else 0.0
        else:
            return 1.0 if expected.strip().lower() == actual.strip().lower() else 0.0


class ContainsMetric(Metric):
    """Contains metric (checks if expected is contained in actual)."""

    name: str = Field(default="contains", description="Metric name")
    case_sensitive: bool = Field(default=False, description="Whether to be case-sensitive")

    def score(self, expected: str | None, actual: str) -> float:
        """Score based on substring match.

        Args:
            expected: Expected output (substring to find)
            actual: Actual output

        Returns:
            1.0 if expected is contained in actual, 0.0 otherwise
        """
        if expected is None:
            return 0.0

        if self.case_sensitive:
            return 1.0 if expected.strip() in actual else 0.0
        else:
            return 1.0 if expected.strip().lower() in actual.lower() else 0.0


class RegexMetric(Metric):
    """Regex pattern matching metric."""

    name: str = Field(default="regex", description="Metric name")
    pattern: str = Field(..., description="Regex pattern to match")
    flags: int = Field(default=0, description="Regex flags")

    def __init__(self, **data: Any) -> None:
        """Initialize regex metric."""
        super().__init__(**data)
        self._compiled_pattern = re.compile(self.pattern, self.flags)

    def score(self, expected: str | None, actual: str) -> float:
        """Score based on regex match.

        Args:
            expected: Expected output (not used, pattern is used instead)
            actual: Actual output

        Returns:
            1.0 if pattern matches, 0.0 otherwise
        """
        return 1.0 if self._compiled_pattern.search(actual) else 0.0
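A quick sketch of the metrics above (illustrative values): each one maps an (expected, actual) pair to 0.0 or 1.0, with RegexMetric ignoring the expected string and matching only its compiled pattern.

from routekitai.evals.metrics import ContainsMetric, ExactMatchMetric, RegexMetric

actual = "The answer is 4."

print(ExactMatchMetric().score("the answer is 4.", actual))  # 1.0 (case-insensitive by default)
print(ContainsMetric().score("answer", actual))              # 1.0 (substring match)
print(RegexMetric(pattern=r"\d+").score(None, actual))       # 1.0 (pattern match only)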
routekitai/evals/runner.py
ADDED
@@ -0,0 +1,184 @@
"""Evaluation runner for agent testing."""

import json
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field

from routekitai.core.runtime import Runtime
from routekitai.evals.dataset import Dataset
from routekitai.evals.metrics import Metric


class EvalResult(BaseModel):
    """Result for a single evaluation example."""

    example_id: str = Field(..., description="Example ID")
    input: str = Field(..., description="Input prompt")
    expected_output: str | None = Field(default=None, description="Expected output")
    actual_output: str = Field(..., description="Actual output")
    scores: dict[str, float] = Field(default_factory=dict, description="Metric scores")
    trace_id: str | None = Field(default=None, description="Trace ID")
    passed: bool = Field(default=False, description="Whether evaluation passed")
    error: str | None = Field(default=None, description="Error if evaluation failed")


class EvalReport(BaseModel):
    """Evaluation report with aggregated results."""

    dataset_name: str = Field(..., description="Dataset name")
    agent_name: str = Field(..., description="Agent name")
    total_examples: int = Field(..., description="Total number of examples")
    passed: int = Field(..., description="Number of passed examples")
    failed: int = Field(..., description="Number of failed examples")
    errors: int = Field(..., description="Number of errors")
    average_scores: dict[str, float] = Field(
        default_factory=dict, description="Average scores per metric"
    )
    results: list[EvalResult] = Field(default_factory=list, description="Individual results")


class EvalRunner(BaseModel):
    """Runner for evaluating agents on datasets."""

    runtime: Runtime = Field(..., description="Runtime for agent execution")
    metrics: list[Metric] = Field(default_factory=list, description="Metrics to compute")
    regression_mode: bool = Field(
        default=False, description="Enable regression mode (compare to baseline)"
    )
    baseline_dir: Path | None = Field(default=None, description="Directory with baseline traces")

    async def run(self, agent_name: str, dataset: Dataset, **kwargs: Any) -> EvalReport:
        """Run evaluation on a dataset.

        Args:
            agent_name: Name of agent to evaluate
            dataset: Evaluation dataset
            **kwargs: Additional parameters for agent execution

        Returns:
            EvalReport with results
        """
        if agent_name not in self.runtime.agents:
            raise ValueError(f"Agent '{agent_name}' not found in runtime")

        results: list[EvalResult] = []
        passed = 0
        failed = 0
        errors = 0

        for example in dataset.examples:
            try:
                # Execute agent
                result = await self.runtime.run(agent_name, example.input, **kwargs)
                actual_output = result.output.content

                # Compute scores
                scores: dict[str, float] = {}
                for metric in self.metrics:
                    score = metric.score(example.expected_output, actual_output)
                    scores[metric.name] = score

                # Check if passed (at least one metric must score > 0)
                passed_example = any(score > 0 for score in scores.values()) if scores else False

                # Regression mode: compare to baseline
                if self.regression_mode and self.baseline_dir:
                    baseline_result = await self._load_baseline_result(example.id)
                    if baseline_result:
                        if baseline_result.actual_output != actual_output:
                            # Output changed - mark as regression
                            passed_example = False
                            scores["regression"] = 0.0
                        else:
                            scores["regression"] = 1.0

                if passed_example:
                    passed += 1
                else:
                    failed += 1

                eval_result = EvalResult(
                    example_id=example.id,
                    input=example.input,
                    expected_output=example.expected_output,
                    actual_output=actual_output,
                    scores=scores,
                    trace_id=result.trace_id,
                    passed=passed_example,
                )
                results.append(eval_result)

            except Exception as e:
                errors += 1
                eval_result = EvalResult(
                    example_id=example.id,
                    input=example.input,
                    expected_output=example.expected_output,
                    actual_output="",
                    error=str(e),
                    passed=False,
                )
                results.append(eval_result)

        # Compute average scores
        average_scores: dict[str, float] = {}
        if results:
            metric_names: set[str] = set()
            for eval_result in results:
                metric_names.update(eval_result.scores.keys())

            for metric_name in metric_names:
                scores_list = [r.scores.get(metric_name, 0.0) for r in results if r.error is None]
                if scores_list:
                    average_scores[metric_name] = sum(scores_list) / len(scores_list)

        return EvalReport(
            dataset_name=dataset.name,
            agent_name=agent_name,
            total_examples=len(dataset.examples),
            passed=passed,
            failed=failed,
            errors=errors,
            average_scores=average_scores,
            results=results,
        )

    async def _load_baseline_result(self, example_id: str) -> EvalResult | None:
        """Load baseline result for regression comparison.

        Args:
            example_id: Example ID

        Returns:
            Baseline result or None
        """
        if not self.baseline_dir:
            return None

        baseline_file = self.baseline_dir / f"{example_id}.json"
        if not baseline_file.exists():
            return None

        try:
            with open(baseline_file, encoding="utf-8") as f:
                data = json.load(f)
            return EvalResult(**data)
        except Exception:
            return None

    def save_baseline(self, report: EvalReport, output_dir: Path) -> None:
        """Save evaluation results as baseline for regression testing.

        Args:
            report: Evaluation report
            output_dir: Directory to save baseline results
        """
        output_dir.mkdir(parents=True, exist_ok=True)

        for result in report.results:
            if result.error is None:
                baseline_file = output_dir / f"{result.example_id}.json"
                with open(baseline_file, "w", encoding="utf-8") as f:
                    json.dump(result.model_dump(), f, indent=2)
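A usage sketch for EvalRunner (illustrative only): how a Runtime is constructed and how the agent "my_agent" gets registered is defined in routekitai/core/runtime.py, which is not shown in this diff, so treat the runtime argument and the agent name as assumptions. The sketch runs an eval, prints the aggregated report, and saves a baseline that a later run with regression_mode=True can compare against.

from pathlib import Path

from routekitai.evals import ContainsMetric, Dataset, EvalRunner


async def evaluate(runtime) -> None:  # `runtime` is an already-configured Runtime
    dataset = Dataset.from_jsonl("smoke.jsonl")
    runner = EvalRunner(runtime=runtime, metrics=[ContainsMetric()])

    report = await runner.run("my_agent", dataset)
    print(report.passed, report.failed, report.errors, report.average_scores)

    # Persist per-example outputs; a later EvalRunner(regression_mode=True,
    # baseline_dir=Path("baselines"), ...) will flag any changed outputs.
    runner.save_baseline(report, Path("baselines"))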