ase-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. ase/__init__.py +21 -0
  2. ase/adapters/__init__.py +14 -0
  3. ase/adapters/contract.py +28 -0
  4. ase/adapters/frameworks/__init__.py +17 -0
  5. ase/adapters/frameworks/base.py +259 -0
  6. ase/adapters/frameworks/langgraph.py +19 -0
  7. ase/adapters/frameworks/mcp.py +68 -0
  8. ase/adapters/frameworks/openai_agents.py +19 -0
  9. ase/adapters/frameworks/pydantic_ai.py +19 -0
  10. ase/adapters/io.py +50 -0
  11. ase/adapters/model.py +89 -0
  12. ase/adapters/protocol.py +72 -0
  13. ase/adapters/replay.py +261 -0
  14. ase/cli/__init__.py +7 -0
  15. ase/cli/_trace_outputs.py +40 -0
  16. ase/cli/adapter_cmd.py +38 -0
  17. ase/cli/certify_cmd.py +74 -0
  18. ase/cli/compare.py +145 -0
  19. ase/cli/doctor_cmd.py +45 -0
  20. ase/cli/examples_cmd.py +27 -0
  21. ase/cli/history_cmd.py +126 -0
  22. ase/cli/import_cmd.py +34 -0
  23. ase/cli/main.py +134 -0
  24. ase/cli/replay_cmd.py +48 -0
  25. ase/cli/report.py +115 -0
  26. ase/cli/spec_cmd.py +53 -0
  27. ase/cli/test_cmd.py +121 -0
  28. ase/config/env_loader.py +71 -0
  29. ase/config/loader.py +82 -0
  30. ase/config/model.py +51 -0
  31. ase/conformance/__init__.py +7 -0
  32. ase/conformance/matrix.py +111 -0
  33. ase/conformance/model.py +91 -0
  34. ase/conformance/schema.py +37 -0
  35. ase/conformance/service.py +194 -0
  36. ase/core/engine.py +348 -0
  37. ase/errors.py +59 -0
  38. ase/evaluation/__init__.py +7 -0
  39. ase/evaluation/base.py +63 -0
  40. ase/evaluation/consistency.py +79 -0
  41. ase/evaluation/correctness.py +117 -0
  42. ase/evaluation/efficiency.py +145 -0
  43. ase/evaluation/engine.py +182 -0
  44. ase/evaluation/policy.py +134 -0
  45. ase/evaluation/scoring.py +64 -0
  46. ase/evaluation/trace_summary.py +36 -0
  47. ase/examples_matrix.py +118 -0
  48. ase/reporting/__init__.py +7 -0
  49. ase/reporting/json_report.py +45 -0
  50. ase/reporting/junit.py +38 -0
  51. ase/reporting/markdown.py +32 -0
  52. ase/reporting/terminal.py +66 -0
  53. ase/scenario/__init__.py +7 -0
  54. ase/scenario/model.py +294 -0
  55. ase/scenario/parser.py +40 -0
  56. ase/storage/__init__.py +7 -0
  57. ase/storage/trace_store.py +136 -0
  58. ase/trace/__init__.py +7 -0
  59. ase/trace/builder.py +175 -0
  60. ase/trace/model.py +264 -0
  61. ase/trace/otel_export.py +75 -0
  62. ase/trace/otel_import.py +96 -0
  63. ase/trace/redaction.py +10 -0
  64. ase/trace/serializer.py +50 -0
  65. ase_python-0.1.0.dist-info/METADATA +184 -0
  66. ase_python-0.1.0.dist-info/RECORD +69 -0
  67. ase_python-0.1.0.dist-info/WHEEL +4 -0
  68. ase_python-0.1.0.dist-info/entry_points.txt +2 -0
  69. ase_python-0.1.0.dist-info/licenses/LICENSE +105 -0
ase/core/engine.py ADDED
@@ -0,0 +1,348 @@
1
+ """SimulationEngine — the main orchestration loop for a scenario run.
2
+
3
+ Ties together: environments, proxy, recorder, and evaluation.
4
+ The engine is the only component that knows how all the pieces connect.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ import os
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ import structlog
15
+ from ase.core.recorder import Recorder
16
+ from ase.core.resolver import Resolver
17
+ from ase.core.runtime_modes import run_direct_runtime
18
+
19
+ from ase.errors import ASEError
20
+ from ase.scenario.model import (
21
+ AgentRuntimeMode,
22
+ APISeed,
23
+ EnvironmentKind,
24
+ ScenarioConfig,
25
+ )
26
+ from ase.trace.model import Trace, TraceStatus
27
+
28
+ if TYPE_CHECKING:
29
+ pass
30
+
31
log = structlog.get_logger(__name__)

# Polling cadence in seconds. NOTE(review): not referenced anywhere in this
# module's visible code — confirm it is used elsewhere before removing.
_POLL_INTERVAL_S = 0.1
34
+
35
+ # Hosts that must bypass the ASE proxy — LLM providers and auth endpoints.
36
+ # ASE intercepts tool calls (database, APIs, email), never model inference calls.
37
+ _PROXY_BYPASS_HOSTS = [
38
+ "api.openai.com",
39
+ "api.anthropic.com",
40
+ "generativelanguage.googleapis.com", # Gemini
41
+ "api.mistral.ai",
42
+ "api.cohere.com",
43
+ "bedrock-runtime.amazonaws.com",
44
+ "oauth2.googleapis.com",
45
+ "auth.openai.com",
46
+ ]
47
+
48
+
49
+ def _build_no_proxy(existing: str) -> str:
50
+ """Merge existing NO_PROXY with ASE's LLM bypass list, deduplicating."""
51
+ current = {h.strip() for h in existing.split(",") if h.strip()}
52
+ merged = sorted(current | set(_PROXY_BYPASS_HOSTS))
53
+ return ",".join(merged)
54
+
55
+
56
class RunResult:
    """Bundle the trace and post-run environment handles for one scenario run."""

    def __init__(self, trace: Trace, environments: dict[str, Any]) -> None:
        self.trace = trace
        # Evaluators may inspect environment state after the run, so each
        # known environment is surfaced as an attribute (None when absent).
        lookup = environments.get
        self.database: Any | None = lookup("database")
        self.api: Any | None = lookup("api")
        self.email: Any | None = lookup("email")
        self.filesystem: Any | None = lookup("filesystem")
        self.queue: Any | None = lookup("queue")
67
+
68
+
69
class SimulationEngine:
    """Orchestrates a full scenario run.

    For each scenario:
    1. Build and set up environments from scenario config
    2. Start the HTTP proxy (if using HTTP interception)
    3. Launch the agent subprocess
    4. Wait for the agent to finish (or timeout)
    5. Tear down environments
    6. Return a RunResult containing the Trace
    """

    def __init__(self, proxy_host: str = "127.0.0.1", proxy_port: int = 0) -> None:
        """Configure where per-run proxies bind.

        proxy_host: interface the HTTP proxy listens on.
        proxy_port: fixed port, or 0 to auto-allocate a fresh port per run.
        """
        self._proxy_host = proxy_host
        self._proxy_port = proxy_port  # 0 = auto-allocate per run

    async def run(
        self, scenario: ScenarioConfig, debug: bool = False
    ) -> RunResult:
        """Execute a scenario and return the RunResult.

        Never raises — errors are recorded in the trace with status=ERROR.
        Each call creates its own proxy with a fresh port, so concurrent
        calls to run() on the same engine instance are safe.
        """
        recorder = Recorder(
            scenario_id=scenario.scenario_id,
            scenario_name=scenario.name,
            tags=scenario.tags,
        )
        self._seed_trace(recorder, scenario)
        if scenario.runtime_mode != AgentRuntimeMode.PROXY:
            # Direct (non-proxy) runtimes produce the trace themselves; no
            # environments or proxy are involved on this path.
            try:
                trace = await run_direct_runtime(scenario, debug=debug)
            except ASEError as exc:
                log.error("engine_run_failed", scenario=scenario.scenario_id, error=str(exc))
                trace = recorder.finish(status=TraceStatus.ERROR, error_message=str(exc))
            except Exception as exc:
                log.error("engine_run_unexpected", scenario=scenario.scenario_id, error=str(exc))
                trace = recorder.finish(
                    status=TraceStatus.ERROR,
                    error_message=f"unexpected error: {exc}",
                )
            return RunResult(trace=trace, environments={})

        resolver = Resolver()
        environments: dict[str, Any] = {}

        # Proxy is created per-run so each concurrent scenario gets its own port
        from ase.core.proxy import HTTPProxy

        proxy = HTTPProxy(
            resolver=resolver,
            recorder=recorder,
            host=self._proxy_host,
            port=self._proxy_port,
        )
        try:
            environments = await self._setup_environments(scenario, resolver)
            await proxy.start()
            trace = await self._execute_agent(
                scenario, recorder, resolver, proxy.address, debug=debug
            )
        except ASEError as exc:
            log.error("engine_run_failed", scenario=scenario.scenario_id, error=str(exc))
            trace = recorder.finish(status=TraceStatus.ERROR, error_message=str(exc))
        except Exception as exc:
            log.error("engine_run_unexpected", scenario=scenario.scenario_id, error=str(exc))
            trace = recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"unexpected error: {exc}",
            )
        finally:
            # Always release run-scoped resources, even on failure, so they
            # cannot leak into subsequent runs.
            await proxy.stop()
            await self._teardown_environments(environments)

        return RunResult(trace=trace, environments=environments)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _seed_trace(self, recorder: Recorder, scenario: ScenarioConfig) -> None:
        """Persist approvals and replay metadata before the agent launches."""
        runtime = scenario.agent_runtime
        recorder.set_runtime_provenance(
            mode=scenario.runtime_mode.value,
            framework=runtime.framework if runtime else None,
            framework_version=runtime.version if runtime else None,
            adapter_name=runtime.adapter_name if runtime else None,
            event_source=runtime.event_source if runtime else None,
            metadata=dict(runtime.metadata) if runtime else None,
        )
        for approval in scenario.fixtures.approvals:
            recorder.record_approval(
                approval_id=approval.approval_id,
                actor=approval.actor,
                granted=approval.granted,
            )
        recorder.set_determinism_metadata(
            # Round-trip through JSON so the stored payload is plain data.
            fixture_payload=json.loads(scenario.fixtures.model_dump_json()),
            replay_key=f"{scenario.scenario_id}:spec-v{scenario.spec_version}",
        )

    async def _setup_environments(
        self, scenario: ScenarioConfig, resolver: Resolver
    ) -> dict[str, Any]:
        """Instantiate, seed, and register all scenario environments.

        Returns a name -> environment mapping; registered environments are
        also wired into the resolver keyed by tool-call kind.
        """
        from ase.environments.api import APIEnvironment
        from ase.environments.database import DatabaseEnvironment
        from ase.environments.email import EmailEnvironment
        from ase.environments.filesystem import FilesystemEnvironment
        from ase.environments.queue import QueueEnvironment

        from ase.trace.model import ToolCallKind

        environments: dict[str, Any] = {}
        env_config = scenario.environment

        if env_config.kind == EnvironmentKind.REAL:
            # No simulation — pass through to real backends
            return environments

        if env_config.database is not None:
            db = DatabaseEnvironment(seed=env_config.database)
            await db.setup()
            resolver.register(ToolCallKind.DATABASE, db)
            environments["database"] = db

        api_seed = _merged_api_seed(scenario)
        if api_seed.recordings:
            api = APIEnvironment(seed=api_seed)
            await api.setup()
            resolver.register(ToolCallKind.HTTP_API, api)
            environments["api"] = api

        # Email is always available in simulated mode
        if env_config.kind == EnvironmentKind.SIMULATED:
            email = EmailEnvironment()
            await email.setup()
            resolver.register(ToolCallKind.EMAIL, email)
            environments["email"] = email
        if scenario.fixtures.filesystem:
            filesystem = FilesystemEnvironment(scenario.fixtures.filesystem)
            await filesystem.setup()
            environments["filesystem"] = filesystem
        if scenario.fixtures.queue_messages or scenario.fixtures.webhook_events:
            queue = QueueEnvironment(
                queue_messages=scenario.fixtures.queue_messages,
                webhook_events=scenario.fixtures.webhook_events,
            )
            await queue.setup()
            environments["queue"] = queue

        return environments

    @staticmethod
    async def _kill_agent(process: asyncio.subprocess.Process) -> None:
        """Forcibly stop a timed-out agent process and reap it.

        asyncio.wait_for cancels the awaited coroutine but does NOT stop the
        underlying subprocess — without this, a timed-out agent would keep
        running after the proxy and environments are torn down.
        """
        try:
            process.kill()
        except ProcessLookupError:
            # Process exited between the timeout firing and the kill.
            pass
        await process.wait()

    async def _execute_agent(
        self,
        scenario: ScenarioConfig,
        recorder: Recorder,
        resolver: Resolver,
        proxy_address: str,
        debug: bool = False,
    ) -> Trace:
        """Launch the agent subprocess and wait for it to complete.

        proxy_address: the actual address of the proxy started for this run.
        debug: when True, inherit terminal stdio so agent output streams live.
        (resolver is currently unused here but kept for interface stability.)
        """
        agent_cfg = scenario.agent
        proxy_env = {
            **os.environ,  # inherit PATH, API keys, etc. from the parent process
            "HTTP_PROXY": proxy_address,
            "HTTPS_PROXY": proxy_address,
            "ASE_TRACE_ID": recorder.trace_id,
            # LLM provider hosts must bypass the proxy — ASE intercepts tool calls,
            # not the agent's model calls. Append to any existing NO_PROXY value.
            "NO_PROXY": _build_no_proxy(os.environ.get("NO_PROXY", "")),
            **agent_cfg.env,  # scenario-level overrides win
        }

        log.info(
            "agent_launching",
            scenario=scenario.scenario_id,
            command=agent_cfg.command,
        )

        if debug:
            return await self._execute_agent_debug(scenario, agent_cfg, proxy_env, recorder)

        try:
            process = await asyncio.create_subprocess_exec(
                *agent_cfg.command,
                env=proxy_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=agent_cfg.timeout_seconds,
            )
        except TimeoutError:  # asyncio.TimeoutError is an alias on 3.11+
            log.warning("agent_timeout", scenario=scenario.scenario_id)
            # Bug fix: kill the still-running agent so it cannot outlive the
            # run (wait_for only cancels communicate(), not the process).
            await self._kill_agent(process)
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"agent timed out after {agent_cfg.timeout_seconds}s",
            )
        except Exception as exc:
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"failed to launch agent: {exc}",
            )

        exit_code = process.returncode or 0
        log.info("agent_exited", scenario=scenario.scenario_id, exit_code=exit_code)

        stderr_text = stderr.decode("utf-8", errors="replace").strip()
        status = TraceStatus.PASSED if exit_code == 0 else TraceStatus.FAILED
        # Always capture stderr; only treat it as error_message on failure
        return recorder.finish(
            status=status,
            error_message=stderr_text if exit_code != 0 else None,
            stderr_output=stderr_text or None,
        )

    async def _execute_agent_debug(
        self,
        scenario: ScenarioConfig,
        agent_cfg: object,
        proxy_env: dict[str, str],
        recorder: Recorder,
    ) -> Trace:
        """Debug variant — inherits terminal stdio so output streams live."""
        from ase.scenario.model import AgentConfig
        assert isinstance(agent_cfg, AgentConfig)
        try:
            process = await asyncio.create_subprocess_exec(
                *agent_cfg.command,
                env=proxy_env,
                # no PIPE — output goes directly to the terminal
            )
            await asyncio.wait_for(process.wait(), timeout=agent_cfg.timeout_seconds)
        except TimeoutError:
            log.warning("agent_timeout", scenario=scenario.scenario_id)
            # Bug fix: same subprocess leak as the non-debug path — kill and
            # reap the agent before reporting the timeout.
            await self._kill_agent(process)
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"agent timed out after {agent_cfg.timeout_seconds}s",
            )
        except Exception as exc:
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"failed to launch agent: {exc}",
            )
        exit_code = process.returncode or 0
        log.info("agent_exited", scenario=scenario.scenario_id, exit_code=exit_code)
        status = TraceStatus.PASSED if exit_code == 0 else TraceStatus.FAILED
        return recorder.finish(status=status)

    @staticmethod
    async def _teardown_environments(environments: dict[str, Any]) -> None:
        """Tear down all environments, logging but not re-raising errors."""
        from ase.environments.base import EnvironmentProvider

        for name, env in environments.items():
            if isinstance(env, EnvironmentProvider):
                try:
                    await env.teardown()
                except Exception as exc:
                    log.warning("teardown_error", environment=name, error=str(exc))
338
+
339
+
340
def _merged_api_seed(scenario: ScenarioConfig) -> APISeed:
    """Merge API recordings declared in environment and fixtures.

    Environment-level recordings come first, followed by fixture-level
    recordings serialized via model_dump(); declaration order is preserved.
    """
    api_env = scenario.environment.api
    merged = list(api_env.recordings) if api_env else []
    merged.extend(
        fixture.model_dump() for fixture in scenario.fixtures.http_recordings
    )
    return APISeed(recordings=merged)
ase/errors.py ADDED
@@ -0,0 +1,59 @@
1
+ """Shared ASE exception hierarchy.
2
+
3
+ Keeping one root error type makes CLI and engine layers propagate contextual
4
+ failures consistently without depending on framework-specific exceptions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class ASEError(Exception):
    """Root of ASE's user-facing and internal error hierarchy.

    Every error below subclasses this directly (a flat hierarchy), so
    catching ASEError handles any failure raised by ASE itself.
    """


# --- CLI and configuration -------------------------------------------------


class CLIError(ASEError):
    """Raised when a CLI workflow cannot be completed as requested."""


class ConfigError(ASEError):
    """Raised when ASE configuration or environment loading fails."""


# --- Adapters and runtime modes --------------------------------------------


class AdapterError(ASEError):
    """Raised when adapter SDKs cannot emit or persist framework events."""


class AdapterProtocolError(ASEError):
    """Raised when adapter event streams are missing or malformed."""


class RuntimeModeError(ASEError):
    """Raised when direct runtime execution cannot produce a valid trace."""


# --- Traces -----------------------------------------------------------------


class TraceSerializationError(ASEError):
    """Raised when native ASE traces cannot be parsed or written safely."""


class TraceError(ASEError):
    """Raised when trace construction or mutation violates ASE invariants."""


class TraceSchemaMigrationError(ASEError):
    """Raised when a stored trace cannot be interpreted by this schema version."""


# --- Certification, evaluation, caching, and import ------------------------


class ConformanceError(ASEError):
    """Raised when certification inputs or outputs violate ASE contracts."""


class EvaluatorNotFoundError(ASEError):
    """Raised when a scenario references an unknown evaluator."""


class CacheError(ASEError):
    """Raised when the content-addressed response cache cannot be maintained."""


class OTelImportError(ASEError):
    """Raised when OTEL-like trace input cannot be converted into ASE format."""
ase/evaluation/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Source-backed evaluation package that composes with recovery overlays."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pkgutil import extend_path
6
+
7
# Extend this package's __path__ so same-named packages installed in other
# sys.path directories can contribute modules (pkgutil-style namespace pkg).
__path__ = extend_path(__path__, __name__)
ase/evaluation/base.py ADDED
@@ -0,0 +1,63 @@
1
+ """Shared evaluator protocol and result models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from enum import StrEnum
7
+ from typing import Any
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
class Pillar(StrEnum):
    """Define ASE's top-level scoring buckets.

    Each evaluator declares a default pillar (see ``Evaluator.pillar``);
    per-pillar scores are aggregated in ``EvaluationSummary.pillar_scores``.
    """

    CORRECTNESS = "correctness"  # did the agent take the expected actions
    SAFETY = "safety"
    EFFICIENCY = "efficiency"
    CONSISTENCY = "consistency"  # baseline-style comparisons between runs
    CUSTOM = "custom"  # catch-all for user-defined evaluators
20
+
21
+
22
class AssertionResult(BaseModel):
    """Represent one evaluator outcome within a scenario run."""

    evaluator: str  # public evaluator name (matches Evaluator.name)
    pillar: Pillar  # scoring bucket this result counts toward
    passed: bool  # hard pass/fail verdict
    score: float  # numeric score; evaluators in this package emit 0.0 or 1.0
    message: str  # operator-facing summary of the outcome
    details: dict[str, Any] = Field(default_factory=dict)  # structured extras for reports
31
+
32
+
33
class EvaluationSummary(BaseModel):
    """Summarize all evaluator outcomes for one trace."""

    trace_id: str
    scenario_id: str
    passed: bool  # overall verdict for the trace
    ase_score: float  # aggregate score (presumably computed in ase.evaluation.scoring — verify)
    total: int  # number of evaluator results
    passed_count: int
    failed_count: int
    results: list[AssertionResult] = Field(default_factory=list)  # individual outcomes
    pillar_scores: dict[str, float] = Field(default_factory=dict)  # pillar name -> score
    failing_evaluators: list[str] = Field(default_factory=list)  # names of evaluators that failed
46
+
47
+
48
class Evaluator(ABC):
    """Define the stable extension point for ASE assertions.

    Implementations are pure checks: they read a trace (plus optional
    context keyword arguments) and return a structured AssertionResult.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the public evaluator name referenced by scenarios."""

    @property
    @abstractmethod
    def pillar(self) -> Pillar:
        """Return the default scoring pillar for this evaluator."""

    @abstractmethod
    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        """Evaluate one trace and return a structured assertion result.

        params: scenario-supplied configuration for this assertion.
        context: optional extras (e.g. consistency evaluators read a
        ``baseline_trace`` entry when present).
        """
ase/evaluation/consistency.py ADDED
@@ -0,0 +1,79 @@
1
+ """Consistency evaluators used for baseline-style comparisons."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from ase.evaluation.base import AssertionResult, Evaluator, Pillar
8
+
9
+
10
class SameToolCallsEvaluator(Evaluator):
    """Check that a run used the same number of tool calls as its baseline."""

    @property
    def name(self) -> str:
        return "same_tool_calls"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CONSISTENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        reference = context.get("baseline_trace")
        if reference is None:
            # Nothing to compare against — treat as trivially consistent.
            return _passing(self.name, self.pillar, "no baseline provided")

        def tool_call_count(candidate: object) -> int:
            # Missing metrics (or missing attribute) counts as zero calls.
            return getattr(getattr(candidate, "metrics", None), "total_tool_calls", 0)

        observed = tool_call_count(trace)
        expected = tool_call_count(reference)
        matched = observed == expected
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=matched,
            score=1.0 if matched else 0.0,
            message=f"tool calls {observed} vs baseline {expected}",
            details={"current": observed, "baseline": expected},
        )
36
+
37
+
38
class SameMetricsEvaluator(Evaluator):
    """Check that key run metrics are unchanged relative to a baseline trace."""

    @property
    def name(self) -> str:
        return "same_metrics"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CONSISTENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        reference = context.get("baseline_trace")
        if reference is None:
            return _passing(self.name, self.pillar, "no baseline provided")
        ours = getattr(trace, "metrics", None)
        theirs = getattr(reference, "metrics", None)
        # Per-metric differences; a missing metric counts as zero.
        deltas: dict[str, Any] = {}
        for key in ("total_tool_calls", "total_llm_calls", "total_tokens_used"):
            deltas[key] = getattr(ours, key, 0) - getattr(theirs, key, 0)
        matched = not any(deltas.values())
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=matched,
            score=1.0 if matched else 0.0,
            message="metrics match baseline" if matched else "metrics differ from baseline",
            details=deltas,
        )
69
+
70
+
71
def _passing(evaluator: str, pillar: Pillar, message: str) -> AssertionResult:
    """Build the neutral all-clear result used when baseline context is absent."""
    return AssertionResult(
        evaluator=evaluator, pillar=pillar, passed=True, score=1.0, message=message
    )
ase/evaluation/correctness.py ADDED
@@ -0,0 +1,117 @@
1
+ """Correctness evaluators for observable agent behavior.
2
+
3
+ These evaluators answer the core ASE question: did the agent take the
4
+ expected actions against its environment, regardless of the exact text it
5
+ produced for a user.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+ from ase.evaluation.base import AssertionResult, Evaluator, Pillar
13
+ from ase.trace.model import ToolCallEvent, Trace, TraceEventKind
14
+
15
+
16
class ToolCalledEvaluator(Evaluator):
    """Verify that a scenario exercised the expected tool path."""

    @property
    def name(self) -> str:
        return "tool_called"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CORRECTNESS

    def evaluate(self, trace: Trace, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # unused; accepted for interface uniformity
        # Normalize the scenario-supplied filters before matching.
        kind = str(params.get("kind", "")).strip().lower()
        minimum = max(0, int(params.get("minimum", 1)))
        method = _optional_upper(params.get("method"))
        target_contains = _optional_lower(params.get("target_contains"))
        observed = len(_matching_calls(trace, kind, method, target_contains))
        succeeded = observed >= minimum
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=succeeded,
            score=1.0 if succeeded else 0.0,
            message=_tool_called_message(kind or "tool", minimum, observed, succeeded),
            details={
                "kind": kind or None,
                "minimum": minimum,
                "actual": observed,
                "method": method,
                "target_contains": target_contains,
            },
        )
50
+
51
+
52
class APICalledEvaluator(Evaluator):
    """Keep API-call assertions as a neutral alias over the shared tool model."""

    @property
    def name(self) -> str:
        return "api_called"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CORRECTNESS

    def evaluate(self, trace: Trace, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Delegate to the generic tool evaluator with kind pinned to http_api,
        # then rebrand the result under this evaluator's name.
        forwarded = {**params, "kind": "http_api"}
        result = ToolCalledEvaluator().evaluate(trace, forwarded, **context)
        return result.model_copy(update={"evaluator": self.name})
69
+
70
+
71
def _matching_calls(
    trace: Trace,
    kind: str,
    method: str | None,
    target_contains: str | None,
) -> list[ToolCallEvent]:
    """Filter tool calls using only generic trace-level properties.

    Empty/None filters are skipped; the remaining filters must all match.
    """
    selected: list[ToolCallEvent] = []
    for event in trace.events:
        call = event.tool_call
        if event.kind != TraceEventKind.TOOL_CALL or call is None:
            continue
        if kind and call.kind.value != kind:
            continue
        if method and call.method.upper() != method:
            continue
        if target_contains and target_contains not in call.target.lower():
            continue
        selected.append(call)
    return selected
90
+
91
+
92
+ def _optional_upper(value: object) -> str | None:
93
+ """Normalize optional HTTP or tool verbs for equality checks."""
94
+ if value is None:
95
+ return None
96
+ text = str(value).strip()
97
+ return text.upper() if text else None
98
+
99
+
100
+ def _optional_lower(value: object) -> str | None:
101
+ """Normalize optional contains-style filters for case-insensitive matching."""
102
+ if value is None:
103
+ return None
104
+ text = str(value).strip()
105
+ return text.lower() if text else None
106
+
107
+
108
+ def _tool_called_message(
109
+ kind: str,
110
+ minimum: int,
111
+ actual: int,
112
+ passed: bool,
113
+ ) -> str:
114
+ """Render stable operator-facing messages for correctness assertions."""
115
+ if passed:
116
+ return f"observed {actual} '{kind}' call(s)"
117
+ return f"expected >={minimum} '{kind}' call(s), got {actual}"