RouteKitAI 0.1.0 (routekitai-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. routekitai/__init__.py +53 -0
  2. routekitai/cli/__init__.py +18 -0
  3. routekitai/cli/main.py +40 -0
  4. routekitai/cli/replay.py +80 -0
  5. routekitai/cli/run.py +95 -0
  6. routekitai/cli/serve.py +966 -0
  7. routekitai/cli/test_agent.py +178 -0
  8. routekitai/cli/trace.py +209 -0
  9. routekitai/cli/trace_analyze.py +120 -0
  10. routekitai/cli/trace_search.py +126 -0
  11. routekitai/core/__init__.py +58 -0
  12. routekitai/core/agent.py +325 -0
  13. routekitai/core/errors.py +49 -0
  14. routekitai/core/hooks.py +174 -0
  15. routekitai/core/memory.py +54 -0
  16. routekitai/core/message.py +132 -0
  17. routekitai/core/model.py +91 -0
  18. routekitai/core/policies.py +373 -0
  19. routekitai/core/policy.py +85 -0
  20. routekitai/core/policy_adapter.py +133 -0
  21. routekitai/core/runtime.py +1403 -0
  22. routekitai/core/tool.py +148 -0
  23. routekitai/core/tools.py +180 -0
  24. routekitai/evals/__init__.py +13 -0
  25. routekitai/evals/dataset.py +75 -0
  26. routekitai/evals/metrics.py +101 -0
  27. routekitai/evals/runner.py +184 -0
  28. routekitai/graphs/__init__.py +12 -0
  29. routekitai/graphs/executors.py +457 -0
  30. routekitai/graphs/graph.py +164 -0
  31. routekitai/memory/__init__.py +13 -0
  32. routekitai/memory/episodic.py +242 -0
  33. routekitai/memory/kv.py +34 -0
  34. routekitai/memory/retrieval.py +192 -0
  35. routekitai/memory/vector.py +700 -0
  36. routekitai/memory/working.py +66 -0
  37. routekitai/message.py +29 -0
  38. routekitai/model.py +48 -0
  39. routekitai/observability/__init__.py +21 -0
  40. routekitai/observability/analyzer.py +314 -0
  41. routekitai/observability/exporters/__init__.py +10 -0
  42. routekitai/observability/exporters/base.py +30 -0
  43. routekitai/observability/exporters/jsonl.py +81 -0
  44. routekitai/observability/exporters/otel.py +119 -0
  45. routekitai/observability/spans.py +111 -0
  46. routekitai/observability/streaming.py +117 -0
  47. routekitai/observability/trace.py +144 -0
  48. routekitai/providers/__init__.py +9 -0
  49. routekitai/providers/anthropic.py +227 -0
  50. routekitai/providers/azure_openai.py +243 -0
  51. routekitai/providers/local.py +196 -0
  52. routekitai/providers/openai.py +321 -0
  53. routekitai/py.typed +0 -0
  54. routekitai/sandbox/__init__.py +12 -0
  55. routekitai/sandbox/filesystem.py +131 -0
  56. routekitai/sandbox/network.py +142 -0
  57. routekitai/sandbox/permissions.py +70 -0
  58. routekitai/tool.py +33 -0
  59. routekitai-0.1.0.dist-info/METADATA +328 -0
  60. routekitai-0.1.0.dist-info/RECORD +64 -0
  61. routekitai-0.1.0.dist-info/WHEEL +5 -0
  62. routekitai-0.1.0.dist-info/entry_points.txt +2 -0
  63. routekitai-0.1.0.dist-info/licenses/LICENSE +21 -0
  64. routekitai-0.1.0.dist-info/top_level.txt +1 -0
routekitai/core/tool.py
@@ -0,0 +1,148 @@
+ """Tool primitive for RouteKit."""
+
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ from typing import Any, TypeVar
+
+ from pydantic import BaseModel, Field, create_model
+
+ from routekitai.core.errors import ToolError
+
+ TInput = TypeVar("TInput", bound=BaseModel)
+ TOutput = TypeVar("TOutput", bound=BaseModel)
+
+
+ class ToolPermission(str, Enum):
+     """Tool permission types."""
+
+     NETWORK = "network"
+     FILESYSTEM = "filesystem"
+     DATABASE = "database"
+     NONE = "none"
+
+
+ class Tool(BaseModel, ABC):
+     """Base class for tools with pydantic input/output models."""
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     name: str = Field(..., description="Tool name")
+     description: str = Field(..., description="Tool description")
+     input_model: type[BaseModel] | None = Field(
+         default=None, description="Pydantic model for input validation"
+     )
+     output_model: type[BaseModel] | None = Field(
+         default=None, description="Pydantic model for output validation"
+     )
+     permissions: list[ToolPermission] = Field(
+         default_factory=list, description="Required permissions"
+     )
+     rate_limit: int | None = Field(default=None, description="Rate limit (calls per second)")
+     timeout: float | None = Field(default=None, description="Timeout in seconds")
+     redact_fields: list[str] = Field(
+         default_factory=list,
+         description="Field names to redact in traces (e.g., ['api_key', 'password'])",
+     )
+
+     @property
+     def parameters(self) -> dict[str, Any]:
+         """Generate JSON Schema from input_model.
+
+         Returns:
+             JSON Schema dictionary
+         """
+         if self.input_model is None:
+             return {"type": "object", "properties": {}}
+         return self.input_model.model_json_schema()
+
+     def redact_data(self, data: dict[str, Any]) -> dict[str, Any]:
+         """Redact sensitive fields from data (handles nested dicts).
+
+         Args:
+             data: Data dictionary to redact
+
+         Returns:
+             Data with redacted fields
+         """
+         if not self.redact_fields:
+             return data
+
+         def _redact_recursive(obj: Any) -> Any:
+             """Recursively redact fields in nested structures."""
+             if isinstance(obj, dict):
+                 redacted: dict[str, Any] = {}
+                 for key, value in obj.items():
+                     if key in self.redact_fields:
+                         redacted[key] = "[REDACTED]"
+                     elif isinstance(value, (dict, list)):
+                         redacted[key] = _redact_recursive(value)
+                     else:
+                         redacted[key] = value
+                 return redacted
+             elif isinstance(obj, list):
+                 return [_redact_recursive(item) for item in obj]
+             else:
+                 return obj
+
+         result = _redact_recursive(data)
+         assert isinstance(result, dict)
+         return result
+
+     @abstractmethod
+     async def run(self, input: BaseModel) -> BaseModel:
+         """Execute the tool with validated input.
+
+         Args:
+             input: Validated input model instance
+
+         Returns:
+             Validated output model instance
+
+         Raises:
+             ToolError: If tool execution fails
+         """
+         raise NotImplementedError("Subclasses must implement run")
+
+     async def execute(self, **kwargs: Any) -> Any:
+         """Execute tool with raw kwargs (validates input/output).
+
+         Args:
+             **kwargs: Raw tool arguments
+
+         Returns:
+             Tool output (validated if output_model is set)
+
+         Raises:
+             ToolError: If validation or execution fails
+         """
+         try:
+             # Validate input
+             if self.input_model is not None:
+                 input_instance = self.input_model(**kwargs)
+             else:
+                 # Create a minimal model if no input_model
+                 if not kwargs:
+                     # Empty kwargs - create empty model
+                     InputModel = create_model("InputModel")
+                     input_instance = InputModel()
+                 else:
+                     # Dynamically create model from kwargs
+                     field_definitions: dict[str, Any] = {}
+                     for k, v in kwargs.items():
+                         field_definitions[k] = (type(v), ...)
+                     InputModel = create_model("InputModel", **field_definitions)
+                     input_instance = InputModel(**kwargs)
+
+             # Execute
+             output = await self.run(input_instance)
+
+             # Validate output
+             if self.output_model is not None and not isinstance(output, self.output_model):
+                 raise ToolError(f"Tool {self.name} returned invalid output type")
+
+             return output
+
+         except Exception as e:
+             if isinstance(e, ToolError):
+                 raise
+             raise ToolError(f"Tool {self.name} execution failed: {e}") from e
routekitai/core/tools.py
@@ -0,0 +1,180 @@
+ """Built-in tools for RouteKit."""
+
+ from pathlib import Path
+
+ from pydantic import BaseModel, Field
+
+ from routekitai.core.errors import ToolError
+ from routekitai.core.tool import Tool, ToolPermission
+
+
+ class EchoInput(BaseModel):
+     """Input for EchoTool."""
+
+     message: str = Field(..., description="Message to echo")
+
+
+ class EchoOutput(BaseModel):
+     """Output for EchoTool."""
+
+     echoed: str = Field(..., description="Echoed message")
+
+
+ class EchoTool(Tool):
+     """Echo tool for testing.
+
+     Simply echoes back the input message. Useful for testing and debugging.
+     """
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     def __init__(self) -> None:
+         super().__init__(
+             name="echo",
+             description="Echo back a message (useful for testing)",
+             input_model=EchoInput,
+             output_model=EchoOutput,
+         )
+
+     async def run(self, input: BaseModel) -> BaseModel:
+         """Echo the input message.
+
+         Args:
+             input: EchoInput instance
+
+         Returns:
+             EchoOutput instance
+         """
+         if not isinstance(input, EchoInput):
+             raise ToolError("Invalid input type for EchoTool")
+         return EchoOutput(echoed=input.message)
+
+
+ class HttpGetInput(BaseModel):
+     """Input for HttpGetTool."""
+
+     url: str = Field(..., description="URL to fetch")
+     headers: dict[str, str] = Field(default_factory=dict, description="HTTP headers")
+     timeout: float = Field(default=30.0, description="Request timeout in seconds")
+
+
+ class HttpGetOutput(BaseModel):
+     """Output for HttpGetTool."""
+
+     status_code: int = Field(..., description="HTTP status code")
+     headers: dict[str, str] = Field(..., description="Response headers")
+     body: str = Field(..., description="Response body")
+
+
+ class HttpGetTool(Tool):
+     """HTTP GET tool.
+
+     Requires NETWORK permission. Redacts 'api_key' and 'authorization' from headers.
+     """
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     def __init__(self) -> None:
+         super().__init__(
+             name="http_get",
+             description="Perform HTTP GET request",
+             input_model=HttpGetInput,
+             output_model=HttpGetOutput,
+             permissions=[ToolPermission.NETWORK],
+             redact_fields=["api_key", "authorization"],
+         )
+
+     async def run(self, input: BaseModel) -> BaseModel:
+         """Execute HTTP GET request.
+
+         Args:
+             input: HttpGetInput instance
+
+         Returns:
+             HttpGetOutput instance
+
+         Raises:
+             ToolError: If request fails
+         """
+         if not isinstance(input, HttpGetInput):
+             raise ToolError("Invalid input type for HttpGetTool")
+
+         try:
+             # Try to import httpx (optional dependency)
+             try:
+                 import httpx
+             except ImportError:
+                 raise ToolError(
+                     "httpx is required for HttpGetTool. Install with: pip install httpx"
+                 ) from None
+
+             async with httpx.AsyncClient(timeout=input.timeout) as client:
+                 response = await client.get(input.url, headers=input.headers)
+                 return HttpGetOutput(
+                     status_code=response.status_code,
+                     headers=dict(response.headers),
+                     body=response.text,
+                 )
+         except Exception as e:
+             raise ToolError(f"HTTP GET failed: {e}") from e
+
+
+ class FileReadInput(BaseModel):
+     """Input for FileReadTool."""
+
+     path: str = Field(..., description="File path to read")
+     encoding: str = Field(default="utf-8", description="File encoding")
+
+
+ class FileReadOutput(BaseModel):
+     """Output for FileReadTool."""
+
+     content: str = Field(..., description="File content")
+     size: int = Field(..., description="File size in bytes")
+
+
+ class FileReadTool(Tool):
+     """File read tool.
+
+     Requires FILESYSTEM permission. Reads file content.
+     """
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     def __init__(self) -> None:
+         super().__init__(
+             name="file_read",
+             description="Read content from a file",
+             input_model=FileReadInput,
+             output_model=FileReadOutput,
+             permissions=[ToolPermission.FILESYSTEM],
+         )
+
+     async def run(self, input: BaseModel) -> BaseModel:
+         """Read file content.
+
+         Args:
+             input: FileReadInput instance
+
+         Returns:
+             FileReadOutput instance
+
+         Raises:
+             ToolError: If file read fails
+         """
+         if not isinstance(input, FileReadInput):
+             raise ToolError("Invalid input type for FileReadTool")
+
+         try:
+             file_path = Path(input.path)
+             if not file_path.exists():
+                 raise ToolError(f"File not found: {input.path}")
+             if not file_path.is_file():
+                 raise ToolError(f"Path is not a file: {input.path}")
+
+             content = file_path.read_text(encoding=input.encoding)
+             size = file_path.stat().st_size
+
+             return FileReadOutput(content=content, size=size)
+         except Exception as e:
+             raise ToolError(f"File read failed: {e}") from e
routekitai/evals/__init__.py
@@ -0,0 +1,13 @@
+ """Evaluation harness for routekitai agents."""
+
+ from routekitai.evals.dataset import Dataset
+ from routekitai.evals.metrics import ContainsMetric, ExactMatchMetric, RegexMetric
+ from routekitai.evals.runner import EvalRunner
+
+ __all__ = [
+     "Dataset",
+     "EvalRunner",
+     "ExactMatchMetric",
+     "ContainsMetric",
+     "RegexMetric",
+ ]
routekitai/evals/dataset.py
@@ -0,0 +1,75 @@
+ """Dataset format for evaluations."""
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class EvalExample(BaseModel):
+     """A single evaluation example."""
+
+     id: str = Field(..., description="Example ID")
+     input: str = Field(..., description="Input prompt")
+     expected_output: str | None = Field(default=None, description="Expected output")
+     metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+
+ class Dataset(BaseModel):
+     """Evaluation dataset."""
+
+     name: str = Field(..., description="Dataset name")
+     examples: list[EvalExample] = Field(default_factory=list, description="Evaluation examples")
+
+     @classmethod
+     def from_jsonl(cls, file_path: Path | str) -> "Dataset":
+         """Load dataset from JSONL file.
+
+         Args:
+             file_path: Path to JSONL file
+
+         Returns:
+             Dataset instance
+         """
+         file_path = Path(file_path)
+         examples = []
+
+         with open(file_path, encoding="utf-8") as f:
+             for line_num, line in enumerate(f, 1):
+                 line = line.strip()
+                 if not line:
+                     continue
+
+                 try:
+                     data = json.loads(line)
+                     example = EvalExample(
+                         id=data.get("id", f"example_{line_num}"),
+                         input=data.get("input", data.get("prompt", "")),
+                         expected_output=data.get("expected_output", data.get("output")),
+                         metadata=data.get("metadata", {}),
+                     )
+                     examples.append(example)
+                 except json.JSONDecodeError as e:
+                     raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
+
+         return cls(name=file_path.stem, examples=examples)
+
+     def to_jsonl(self, file_path: Path | str) -> None:
+         """Save dataset to JSONL file.
+
+         Args:
+             file_path: Path to save JSONL file
+         """
+         file_path = Path(file_path)
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(file_path, "w", encoding="utf-8") as f:
+             for example in self.examples:
+                 data = {
+                     "id": example.id,
+                     "input": example.input,
+                     "expected_output": example.expected_output,
+                     "metadata": example.metadata,
+                 }
+                 f.write(json.dumps(data) + "\n")
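
An illustrative round trip through Dataset.from_jsonl()/to_jsonl(). The smoke.jsonl file name and its two records are invented sample data, but the accepted keys ("input"/"prompt", "expected_output"/"output", optional "id") follow the loader above.

from pathlib import Path

from routekitai.evals.dataset import Dataset, EvalExample

# Write a tiny JSONL dataset; records without "id" fall back to "example_<line>".
Path("smoke.jsonl").write_text(
    '{"id": "greet", "input": "Say hi", "expected_output": "hi"}\n'
    '{"prompt": "Name the capital of France", "output": "Paris"}\n',
    encoding="utf-8",
)

ds = Dataset.from_jsonl("smoke.jsonl")
print(ds.name, len(ds.examples))  # smoke 2

ds.examples.append(EvalExample(id="extra", input="Count to three"))
ds.to_jsonl("smoke_copy.jsonl")
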
routekitai/evals/metrics.py
@@ -0,0 +1,101 @@
+ """Evaluation metrics for agent outputs."""
+
+ import re
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class Metric(BaseModel, ABC):
+     """Base class for evaluation metrics."""
+
+     name: str = Field(..., description="Metric name")
+
+     @abstractmethod
+     def score(self, expected: str | None, actual: str) -> float:
+         """Score an output against expected output.
+
+         Args:
+             expected: Expected output (can be None)
+             actual: Actual output
+
+         Returns:
+             Score between 0.0 and 1.0
+         """
+         raise NotImplementedError("Subclasses must implement score")
+
+
+ class ExactMatchMetric(Metric):
+     """Exact match metric (case-insensitive)."""
+
+     name: str = Field(default="exact_match", description="Metric name")
+     case_sensitive: bool = Field(default=False, description="Whether to be case-sensitive")
+
+     def score(self, expected: str | None, actual: str) -> float:
+         """Score based on exact match.
+
+         Args:
+             expected: Expected output
+             actual: Actual output
+
+         Returns:
+             1.0 if exact match, 0.0 otherwise
+         """
+         if expected is None:
+             return 0.0
+
+         if self.case_sensitive:
+             return 1.0 if expected.strip() == actual.strip() else 0.0
+         else:
+             return 1.0 if expected.strip().lower() == actual.strip().lower() else 0.0
+
+
+ class ContainsMetric(Metric):
+     """Contains metric (checks if expected is contained in actual)."""
+
+     name: str = Field(default="contains", description="Metric name")
+     case_sensitive: bool = Field(default=False, description="Whether to be case-sensitive")
+
+     def score(self, expected: str | None, actual: str) -> float:
+         """Score based on substring match.
+
+         Args:
+             expected: Expected output (substring to find)
+             actual: Actual output
+
+         Returns:
+             1.0 if expected is contained in actual, 0.0 otherwise
+         """
+         if expected is None:
+             return 0.0
+
+         if self.case_sensitive:
+             return 1.0 if expected.strip() in actual else 0.0
+         else:
+             return 1.0 if expected.strip().lower() in actual.lower() else 0.0
+
+
+ class RegexMetric(Metric):
+     """Regex pattern matching metric."""
+
+     name: str = Field(default="regex", description="Metric name")
+     pattern: str = Field(..., description="Regex pattern to match")
+     flags: int = Field(default=0, description="Regex flags")
+
+     def __init__(self, **data: Any) -> None:
+         """Initialize regex metric."""
+         super().__init__(**data)
+         self._compiled_pattern = re.compile(self.pattern, self.flags)
+
+     def score(self, expected: str | None, actual: str) -> float:
+         """Score based on regex match.
+
+         Args:
+             expected: Expected output (not used, pattern is used instead)
+             actual: Actual output
+
+         Returns:
+             1.0 if pattern matches, 0.0 otherwise
+         """
+         return 1.0 if self._compiled_pattern.search(actual) else 0.0
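
A quick illustration of how the three metrics above score an output; the strings are made-up sample data.

from routekitai.evals.metrics import ContainsMetric, ExactMatchMetric, RegexMetric

actual = "The capital of France is Paris."

# Both metrics default to case_sensitive=False.
print(ExactMatchMetric().score("the capital of france is paris.", actual))  # 1.0
print(ContainsMetric().score("Paris", actual))                              # 1.0

# RegexMetric ignores `expected` and matches its own pattern against `actual`.
print(RegexMetric(pattern=r"\bParis\b").score(None, actual))                # 1.0
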
routekitai/evals/runner.py
@@ -0,0 +1,184 @@
+ """Evaluation runner for agent testing."""
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+ from routekitai.core.runtime import Runtime
+ from routekitai.evals.dataset import Dataset
+ from routekitai.evals.metrics import Metric
+
+
+ class EvalResult(BaseModel):
+     """Result for a single evaluation example."""
+
+     example_id: str = Field(..., description="Example ID")
+     input: str = Field(..., description="Input prompt")
+     expected_output: str | None = Field(default=None, description="Expected output")
+     actual_output: str = Field(..., description="Actual output")
+     scores: dict[str, float] = Field(default_factory=dict, description="Metric scores")
+     trace_id: str | None = Field(default=None, description="Trace ID")
+     passed: bool = Field(default=False, description="Whether evaluation passed")
+     error: str | None = Field(default=None, description="Error if evaluation failed")
+
+
+ class EvalReport(BaseModel):
+     """Evaluation report with aggregated results."""
+
+     dataset_name: str = Field(..., description="Dataset name")
+     agent_name: str = Field(..., description="Agent name")
+     total_examples: int = Field(..., description="Total number of examples")
+     passed: int = Field(..., description="Number of passed examples")
+     failed: int = Field(..., description="Number of failed examples")
+     errors: int = Field(..., description="Number of errors")
+     average_scores: dict[str, float] = Field(
+         default_factory=dict, description="Average scores per metric"
+     )
+     results: list[EvalResult] = Field(default_factory=list, description="Individual results")
+
+
+ class EvalRunner(BaseModel):
+     """Runner for evaluating agents on datasets."""
+
+     runtime: Runtime = Field(..., description="Runtime for agent execution")
+     metrics: list[Metric] = Field(default_factory=list, description="Metrics to compute")
+     regression_mode: bool = Field(
+         default=False, description="Enable regression mode (compare to baseline)"
+     )
+     baseline_dir: Path | None = Field(default=None, description="Directory with baseline traces")
+
+     async def run(self, agent_name: str, dataset: Dataset, **kwargs: Any) -> EvalReport:
+         """Run evaluation on a dataset.
+
+         Args:
+             agent_name: Name of agent to evaluate
+             dataset: Evaluation dataset
+             **kwargs: Additional parameters for agent execution
+
+         Returns:
+             EvalReport with results
+         """
+         if agent_name not in self.runtime.agents:
+             raise ValueError(f"Agent '{agent_name}' not found in runtime")
+
+         results: list[EvalResult] = []
+         passed = 0
+         failed = 0
+         errors = 0
+
+         for example in dataset.examples:
+             try:
+                 # Execute agent
+                 result = await self.runtime.run(agent_name, example.input, **kwargs)
+                 actual_output = result.output.content
+
+                 # Compute scores
+                 scores: dict[str, float] = {}
+                 for metric in self.metrics:
+                     score = metric.score(example.expected_output, actual_output)
+                     scores[metric.name] = score
+
+                 # Check if passed (at least one metric must score > 0)
+                 passed_example = any(score > 0 for score in scores.values()) if scores else False
+
+                 # Regression mode: compare to baseline
+                 if self.regression_mode and self.baseline_dir:
+                     baseline_result = await self._load_baseline_result(example.id)
+                     if baseline_result:
+                         if baseline_result.actual_output != actual_output:
+                             # Output changed - mark as regression
+                             passed_example = False
+                             scores["regression"] = 0.0
+                         else:
+                             scores["regression"] = 1.0
+
+                 if passed_example:
+                     passed += 1
+                 else:
+                     failed += 1
+
+                 eval_result = EvalResult(
+                     example_id=example.id,
+                     input=example.input,
+                     expected_output=example.expected_output,
+                     actual_output=actual_output,
+                     scores=scores,
+                     trace_id=result.trace_id,
+                     passed=passed_example,
+                 )
+                 results.append(eval_result)
+
+             except Exception as e:
+                 errors += 1
+                 eval_result = EvalResult(
+                     example_id=example.id,
+                     input=example.input,
+                     expected_output=example.expected_output,
+                     actual_output="",
+                     error=str(e),
+                     passed=False,
+                 )
+                 results.append(eval_result)
+
+         # Compute average scores
+         average_scores: dict[str, float] = {}
+         if results:
+             metric_names: set[str] = set()
+             for eval_result in results:
+                 metric_names.update(eval_result.scores.keys())
+
+             for metric_name in metric_names:
+                 scores_list = [r.scores.get(metric_name, 0.0) for r in results if r.error is None]
+                 if scores_list:
+                     average_scores[metric_name] = sum(scores_list) / len(scores_list)
+
+         return EvalReport(
+             dataset_name=dataset.name,
+             agent_name=agent_name,
+             total_examples=len(dataset.examples),
+             passed=passed,
+             failed=failed,
+             errors=errors,
+             average_scores=average_scores,
+             results=results,
+         )
+
+     async def _load_baseline_result(self, example_id: str) -> EvalResult | None:
+         """Load baseline result for regression comparison.
+
+         Args:
+             example_id: Example ID
+
+         Returns:
+             Baseline result or None
+         """
+         if not self.baseline_dir:
+             return None
+
+         baseline_file = self.baseline_dir / f"{example_id}.json"
+         if not baseline_file.exists():
+             return None
+
+         try:
+             with open(baseline_file, encoding="utf-8") as f:
+                 data = json.load(f)
+             return EvalResult(**data)
+         except Exception:
+             return None
+
+     def save_baseline(self, report: EvalReport, output_dir: Path) -> None:
+         """Save evaluation results as baseline for regression testing.
+
+         Args:
+             report: Evaluation report
+             output_dir: Directory to save baseline results
+         """
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         for result in report.results:
+             if result.error is None:
+                 baseline_file = output_dir / f"{result.example_id}.json"
+                 with open(baseline_file, "w", encoding="utf-8") as f:
+                     json.dump(result.model_dump(), f, indent=2)
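
Finally, a hypothetical wiring of EvalRunner. The Runtime() construction and the "my_agent" registration are assumptions, since routekitai/core/runtime.py is not shown in this excerpt; only the runner, dataset, and metric APIs come from the files above.

import asyncio
from pathlib import Path

from routekitai.core.runtime import Runtime
from routekitai.evals.dataset import Dataset
from routekitai.evals.metrics import ContainsMetric
from routekitai.evals.runner import EvalRunner


async def main() -> None:
    runtime = Runtime()  # assumed constructor; an agent named "my_agent" must be registered
    runner = EvalRunner(runtime=runtime, metrics=[ContainsMetric()])

    dataset = Dataset.from_jsonl("smoke.jsonl")
    report = await runner.run("my_agent", dataset)
    print(report.passed, report.failed, report.errors, report.average_scores)

    # Persist per-example outputs so later runs can be compared in regression mode.
    runner.save_baseline(report, Path("baselines"))


asyncio.run(main())
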