ase-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ase/__init__.py +21 -0
- ase/adapters/__init__.py +14 -0
- ase/adapters/contract.py +28 -0
- ase/adapters/frameworks/__init__.py +17 -0
- ase/adapters/frameworks/base.py +259 -0
- ase/adapters/frameworks/langgraph.py +19 -0
- ase/adapters/frameworks/mcp.py +68 -0
- ase/adapters/frameworks/openai_agents.py +19 -0
- ase/adapters/frameworks/pydantic_ai.py +19 -0
- ase/adapters/io.py +50 -0
- ase/adapters/model.py +89 -0
- ase/adapters/protocol.py +72 -0
- ase/adapters/replay.py +261 -0
- ase/cli/__init__.py +7 -0
- ase/cli/_trace_outputs.py +40 -0
- ase/cli/adapter_cmd.py +38 -0
- ase/cli/certify_cmd.py +74 -0
- ase/cli/compare.py +145 -0
- ase/cli/doctor_cmd.py +45 -0
- ase/cli/examples_cmd.py +27 -0
- ase/cli/history_cmd.py +126 -0
- ase/cli/import_cmd.py +34 -0
- ase/cli/main.py +134 -0
- ase/cli/replay_cmd.py +48 -0
- ase/cli/report.py +115 -0
- ase/cli/spec_cmd.py +53 -0
- ase/cli/test_cmd.py +121 -0
- ase/config/env_loader.py +71 -0
- ase/config/loader.py +82 -0
- ase/config/model.py +51 -0
- ase/conformance/__init__.py +7 -0
- ase/conformance/matrix.py +111 -0
- ase/conformance/model.py +91 -0
- ase/conformance/schema.py +37 -0
- ase/conformance/service.py +194 -0
- ase/core/engine.py +348 -0
- ase/errors.py +59 -0
- ase/evaluation/__init__.py +7 -0
- ase/evaluation/base.py +63 -0
- ase/evaluation/consistency.py +79 -0
- ase/evaluation/correctness.py +117 -0
- ase/evaluation/efficiency.py +145 -0
- ase/evaluation/engine.py +182 -0
- ase/evaluation/policy.py +134 -0
- ase/evaluation/scoring.py +64 -0
- ase/evaluation/trace_summary.py +36 -0
- ase/examples_matrix.py +118 -0
- ase/reporting/__init__.py +7 -0
- ase/reporting/json_report.py +45 -0
- ase/reporting/junit.py +38 -0
- ase/reporting/markdown.py +32 -0
- ase/reporting/terminal.py +66 -0
- ase/scenario/__init__.py +7 -0
- ase/scenario/model.py +294 -0
- ase/scenario/parser.py +40 -0
- ase/storage/__init__.py +7 -0
- ase/storage/trace_store.py +136 -0
- ase/trace/__init__.py +7 -0
- ase/trace/builder.py +175 -0
- ase/trace/model.py +264 -0
- ase/trace/otel_export.py +75 -0
- ase/trace/otel_import.py +96 -0
- ase/trace/redaction.py +10 -0
- ase/trace/serializer.py +50 -0
- ase_python-0.1.0.dist-info/METADATA +184 -0
- ase_python-0.1.0.dist-info/RECORD +69 -0
- ase_python-0.1.0.dist-info/WHEEL +4 -0
- ase_python-0.1.0.dist-info/entry_points.txt +2 -0
- ase_python-0.1.0.dist-info/licenses/LICENSE +105 -0
ase/core/engine.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""SimulationEngine — the main orchestration loop for a scenario run.
|
|
2
|
+
|
|
3
|
+
Ties together: environments, proxy, recorder, and evaluation.
|
|
4
|
+
The engine is the only component that knows how all the pieces connect.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
import structlog
|
|
15
|
+
from ase.core.recorder import Recorder
|
|
16
|
+
from ase.core.resolver import Resolver
|
|
17
|
+
from ase.core.runtime_modes import run_direct_runtime
|
|
18
|
+
|
|
19
|
+
from ase.errors import ASEError
|
|
20
|
+
from ase.scenario.model import (
|
|
21
|
+
AgentRuntimeMode,
|
|
22
|
+
APISeed,
|
|
23
|
+
EnvironmentKind,
|
|
24
|
+
ScenarioConfig,
|
|
25
|
+
)
|
|
26
|
+
from ase.trace.model import Trace, TraceStatus
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
pass
|
|
30
|
+
|
|
31
|
+
# Module-level structured logger (structlog), bound to this module's name.
log = structlog.get_logger(__name__)

# Polling interval in seconds. Not referenced anywhere in this module —
# presumably imported and used by callers; TODO confirm before removing.
_POLL_INTERVAL_S = 0.1

# Hosts that must bypass the ASE proxy — LLM providers and auth endpoints.
# ASE intercepts tool calls (database, APIs, email), never model inference calls.
_PROXY_BYPASS_HOSTS = [
    "api.openai.com",
    "api.anthropic.com",
    "generativelanguage.googleapis.com",  # Gemini
    "api.mistral.ai",
    "api.cohere.com",
    "bedrock-runtime.amazonaws.com",
    "oauth2.googleapis.com",
    "auth.openai.com",
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _build_no_proxy(existing: str) -> str:
    """Merge existing NO_PROXY with ASE's LLM bypass list, deduplicating.

    The result is sorted so the value is stable across runs regardless of
    the order hosts appear in the caller's environment.
    """
    hosts = set(_PROXY_BYPASS_HOSTS)
    for entry in existing.split(","):
        entry = entry.strip()
        if entry:
            hosts.add(entry)
    return ",".join(sorted(hosts))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class RunResult:
    """The outcome of a single scenario execution."""

    def __init__(self, trace: Trace, environments: dict[str, Any]) -> None:
        self.trace = trace
        # Expose each environment so evaluators can inspect post-run state;
        # environments absent from the mapping surface as None.
        for slot in ("database", "api", "email", "filesystem", "queue"):
            setattr(self, slot, environments.get(slot))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SimulationEngine:
    """Orchestrates a full scenario run.

    For each scenario:
    1. Build and set up environments from scenario config
    2. Start the HTTP proxy (if using HTTP interception)
    3. Launch the agent subprocess
    4. Wait for the agent to finish (or timeout)
    5. Tear down environments
    6. Return a RunResult containing the Trace
    """

    def __init__(self, proxy_host: str = "127.0.0.1", proxy_port: int = 0) -> None:
        self._proxy_host = proxy_host
        self._proxy_port = proxy_port  # 0 = auto-allocate per run

    async def run(
        self, scenario: ScenarioConfig, debug: bool = False
    ) -> RunResult:
        """Execute a scenario and return the RunResult.

        Never raises — errors are recorded in the trace with status=ERROR.
        Each call creates its own proxy with a fresh port, so concurrent
        calls to run() on the same engine instance are safe.
        """
        recorder = Recorder(
            scenario_id=scenario.scenario_id,
            scenario_name=scenario.name,
            tags=scenario.tags,
        )
        self._seed_trace(recorder, scenario)
        if scenario.runtime_mode != AgentRuntimeMode.PROXY:
            # Direct (non-proxy) runtimes produce their own trace; no
            # environments or proxy are set up on this path.
            try:
                trace = await run_direct_runtime(scenario, debug=debug)
            except ASEError as exc:
                log.error("engine_run_failed", scenario=scenario.scenario_id, error=str(exc))
                trace = recorder.finish(status=TraceStatus.ERROR, error_message=str(exc))
            except Exception as exc:
                log.error("engine_run_unexpected", scenario=scenario.scenario_id, error=str(exc))
                trace = recorder.finish(
                    status=TraceStatus.ERROR,
                    error_message=f"unexpected error: {exc}",
                )
            return RunResult(trace=trace, environments={})

        resolver = Resolver()
        environments: dict[str, Any] = {}

        # Proxy is created per-run so each concurrent scenario gets its own port
        from ase.core.proxy import HTTPProxy

        proxy = HTTPProxy(
            resolver=resolver,
            recorder=recorder,
            host=self._proxy_host,
            port=self._proxy_port,
        )
        try:
            environments = await self._setup_environments(scenario, resolver)
            await proxy.start()
            trace = await self._execute_agent(
                scenario, recorder, resolver, proxy.address, debug=debug
            )
        except ASEError as exc:
            log.error("engine_run_failed", scenario=scenario.scenario_id, error=str(exc))
            trace = recorder.finish(status=TraceStatus.ERROR, error_message=str(exc))
        except Exception as exc:
            log.error("engine_run_unexpected", scenario=scenario.scenario_id, error=str(exc))
            trace = recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"unexpected error: {exc}",
            )
        finally:
            # Teardown must run even on error so the proxy port and any
            # environment state are always released.
            await proxy.stop()
            await self._teardown_environments(environments)

        return RunResult(trace=trace, environments=environments)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _seed_trace(self, recorder: Recorder, scenario: ScenarioConfig) -> None:
        """Persist approvals and replay metadata before the agent launches."""
        runtime = scenario.agent_runtime
        recorder.set_runtime_provenance(
            mode=scenario.runtime_mode.value,
            framework=runtime.framework if runtime else None,
            framework_version=runtime.version if runtime else None,
            adapter_name=runtime.adapter_name if runtime else None,
            event_source=runtime.event_source if runtime else None,
            metadata=dict(runtime.metadata) if runtime else None,
        )
        for approval in scenario.fixtures.approvals:
            recorder.record_approval(
                approval_id=approval.approval_id,
                actor=approval.actor,
                granted=approval.granted,
            )
        recorder.set_determinism_metadata(
            fixture_payload=json.loads(scenario.fixtures.model_dump_json()),
            replay_key=f"{scenario.scenario_id}:spec-v{scenario.spec_version}",
        )

    async def _setup_environments(
        self, scenario: ScenarioConfig, resolver: Resolver
    ) -> dict[str, Any]:
        """Instantiate, seed, and register all scenario environments."""
        from ase.environments.api import APIEnvironment
        from ase.environments.database import DatabaseEnvironment
        from ase.environments.email import EmailEnvironment
        from ase.environments.filesystem import FilesystemEnvironment
        from ase.environments.queue import QueueEnvironment

        from ase.trace.model import ToolCallKind

        environments: dict[str, Any] = {}
        env_config = scenario.environment

        if env_config.kind == EnvironmentKind.REAL:
            # No simulation — pass through to real backends
            return environments

        if env_config.database is not None:
            db = DatabaseEnvironment(seed=env_config.database)
            await db.setup()
            resolver.register(ToolCallKind.DATABASE, db)
            environments["database"] = db

        api_seed = _merged_api_seed(scenario)
        if api_seed.recordings:
            api = APIEnvironment(seed=api_seed)
            await api.setup()
            resolver.register(ToolCallKind.HTTP_API, api)
            environments["api"] = api

        # Email is always available in simulated mode
        if env_config.kind == EnvironmentKind.SIMULATED:
            email = EmailEnvironment()
            await email.setup()
            resolver.register(ToolCallKind.EMAIL, email)
            environments["email"] = email
        # NOTE(review): filesystem and queue environments are set up but never
        # registered on the resolver — presumably agents reach them directly
        # rather than via intercepted tool calls; confirm this is intentional.
        if scenario.fixtures.filesystem:
            filesystem = FilesystemEnvironment(scenario.fixtures.filesystem)
            await filesystem.setup()
            environments["filesystem"] = filesystem
        if scenario.fixtures.queue_messages or scenario.fixtures.webhook_events:
            queue = QueueEnvironment(
                queue_messages=scenario.fixtures.queue_messages,
                webhook_events=scenario.fixtures.webhook_events,
            )
            await queue.setup()
            environments["queue"] = queue

        return environments

    async def _execute_agent(
        self,
        scenario: ScenarioConfig,
        recorder: Recorder,
        resolver: Resolver,
        proxy_address: str,
        debug: bool = False,
    ) -> Trace:
        """Launch the agent subprocess and wait for it to complete.

        proxy_address: the actual address of the proxy started for this run.
        debug: when True, inherit terminal stdio so agent output streams live.
        """
        agent_cfg = scenario.agent
        proxy_env = {
            **os.environ,  # inherit PATH, API keys, etc. from the parent process
            "HTTP_PROXY": proxy_address,
            "HTTPS_PROXY": proxy_address,
            "ASE_TRACE_ID": recorder.trace_id,
            # LLM provider hosts must bypass the proxy — ASE intercepts tool calls,
            # not the agent's model calls. Append to any existing NO_PROXY value.
            "NO_PROXY": _build_no_proxy(os.environ.get("NO_PROXY", "")),
            **agent_cfg.env,  # scenario-level overrides win
        }

        log.info(
            "agent_launching",
            scenario=scenario.scenario_id,
            command=agent_cfg.command,
        )

        if debug:
            return await self._execute_agent_debug(scenario, agent_cfg, proxy_env, recorder)

        try:
            process = await asyncio.create_subprocess_exec(
                *agent_cfg.command,
                env=proxy_env,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=agent_cfg.timeout_seconds,
            )
        except TimeoutError:
            log.warning("agent_timeout", scenario=scenario.scenario_id)
            # Bug fix: previously the agent process was left running after a
            # timeout, leaking an orphan that outlived the run (and the proxy).
            # Kill it and reap the exit status before finishing the trace.
            self._kill_quietly(process)
            await process.wait()
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"agent timed out after {agent_cfg.timeout_seconds}s",
            )
        except Exception as exc:
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"failed to launch agent: {exc}",
            )

        exit_code = process.returncode or 0
        log.info("agent_exited", scenario=scenario.scenario_id, exit_code=exit_code)

        stderr_text = stderr.decode("utf-8", errors="replace").strip()
        status = TraceStatus.PASSED if exit_code == 0 else TraceStatus.FAILED
        # Always capture stderr; only treat it as error_message on failure
        return recorder.finish(
            status=status,
            error_message=stderr_text if exit_code != 0 else None,
            stderr_output=stderr_text or None,
        )

    async def _execute_agent_debug(
        self,
        scenario: ScenarioConfig,
        agent_cfg: object,
        proxy_env: dict[str, str],
        recorder: Recorder,
    ) -> Trace:
        """Debug variant — inherits terminal stdio so output streams live."""
        from ase.scenario.model import AgentConfig
        assert isinstance(agent_cfg, AgentConfig)
        try:
            process = await asyncio.create_subprocess_exec(
                *agent_cfg.command,
                env=proxy_env,
                # no PIPE — output goes directly to the terminal
            )
            await asyncio.wait_for(process.wait(), timeout=agent_cfg.timeout_seconds)
        except TimeoutError:
            log.warning("agent_timeout", scenario=scenario.scenario_id)
            # Same orphan-process fix as the non-debug path.
            self._kill_quietly(process)
            await process.wait()
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"agent timed out after {agent_cfg.timeout_seconds}s",
            )
        except Exception as exc:
            return recorder.finish(
                status=TraceStatus.ERROR,
                error_message=f"failed to launch agent: {exc}",
            )
        exit_code = process.returncode or 0
        log.info("agent_exited", scenario=scenario.scenario_id, exit_code=exit_code)
        status = TraceStatus.PASSED if exit_code == 0 else TraceStatus.FAILED
        return recorder.finish(status=status)

    @staticmethod
    def _kill_quietly(process: asyncio.subprocess.Process) -> None:
        """Kill a subprocess, tolerating the race where it already exited."""
        try:
            process.kill()
        except ProcessLookupError:
            # Process exited between the timeout firing and the kill.
            pass

    @staticmethod
    async def _teardown_environments(environments: dict[str, Any]) -> None:
        """Tear down all environments, logging but not re-raising errors."""
        from ase.environments.base import EnvironmentProvider

        for name, env in environments.items():
            if isinstance(env, EnvironmentProvider):
                try:
                    await env.teardown()
                except Exception as exc:
                    log.warning("teardown_error", environment=name, error=str(exc))
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _merged_api_seed(scenario: ScenarioConfig) -> APISeed:
    """Merge API recordings declared in environment and fixtures.

    Environment-level recordings come first, followed by the dumped
    fixture recordings, preserving declaration order within each source.
    """
    recordings: list[Any] = []
    if scenario.environment.api:
        recordings.extend(scenario.environment.api.recordings)
    recordings.extend(
        fixture.model_dump() for fixture in scenario.fixtures.http_recordings
    )
    return APISeed(recordings=recordings)
|
ase/errors.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Shared ASE exception hierarchy.
|
|
2
|
+
|
|
3
|
+
Keeping one root error type makes CLI and engine layers propagate contextual
|
|
4
|
+
failures consistently without depending on framework-specific exceptions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ASEError(Exception):
    """Root of ASE's user-facing and internal error hierarchy."""


# NOTE(review): the hierarchy is intentionally flat — every concrete error
# below subclasses ASEError directly, so `except ASEError` is the single
# catch-all boundary (the engine relies on this when recording failed runs).


class CLIError(ASEError):
    """Raised when a CLI workflow cannot be completed as requested."""


class ConfigError(ASEError):
    """Raised when ASE configuration or environment loading fails."""


class AdapterError(ASEError):
    """Raised when adapter SDKs cannot emit or persist framework events."""


class AdapterProtocolError(ASEError):
    """Raised when adapter event streams are missing or malformed."""


class RuntimeModeError(ASEError):
    """Raised when direct runtime execution cannot produce a valid trace."""


class TraceSerializationError(ASEError):
    """Raised when native ASE traces cannot be parsed or written safely."""


class TraceError(ASEError):
    """Raised when trace construction or mutation violates ASE invariants."""


class TraceSchemaMigrationError(ASEError):
    """Raised when a stored trace cannot be interpreted by this schema version."""


class ConformanceError(ASEError):
    """Raised when certification inputs or outputs violate ASE contracts."""


class EvaluatorNotFoundError(ASEError):
    """Raised when a scenario references an unknown evaluator."""


class CacheError(ASEError):
    """Raised when the content-addressed response cache cannot be maintained."""


class OTelImportError(ASEError):
    """Raised when OTEL-like trace input cannot be converted into ASE format."""
|
ase/evaluation/base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Shared evaluator protocol and result models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Pillar(StrEnum):
    """Define ASE's top-level scoring buckets.

    StrEnum so members compare and serialize as plain strings.
    """

    # Did the agent take the expected observable actions? (correctness evaluators)
    CORRECTNESS = "correctness"
    SAFETY = "safety"
    EFFICIENCY = "efficiency"
    # Baseline-style run-to-run comparisons (consistency evaluators).
    CONSISTENCY = "consistency"
    # Presumably for user-defined evaluators outside the built-in pillars — confirm.
    CUSTOM = "custom"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class AssertionResult(BaseModel):
    """Represent one evaluator outcome within a scenario run."""

    # Public evaluator name (matches Evaluator.name of the producer).
    evaluator: str
    # Scoring bucket this result contributes to.
    pillar: Pillar
    # Hard pass/fail verdict for the assertion.
    passed: bool
    # Numeric score; the built-in evaluators emit 0.0 or 1.0.
    score: float
    # Operator-facing, human-readable summary of the outcome.
    message: str
    # Evaluator-specific context (counts, deltas, matched filters, ...).
    details: dict[str, Any] = Field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class EvaluationSummary(BaseModel):
    """Summarize all evaluator outcomes for one trace."""

    # Identifiers linking this summary back to the evaluated run.
    trace_id: str
    scenario_id: str
    # Overall verdict and aggregate score for the trace.
    passed: bool
    ase_score: float
    # Evaluator counts — presumably total == passed_count + failed_count;
    # confirm against the evaluation engine that populates this model.
    total: int
    passed_count: int
    failed_count: int
    # Individual assertion results for the trace.
    results: list[AssertionResult] = Field(default_factory=list)
    # Aggregate scores — presumably keyed by Pillar string value; confirm.
    pillar_scores: dict[str, float] = Field(default_factory=dict)
    # Names of evaluators whose assertions did not pass.
    failing_evaluators: list[str] = Field(default_factory=list)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Evaluator(ABC):
    """Define the stable extension point for ASE assertions.

    The implementations in this package hold no per-instance state:
    ``evaluate`` receives everything it needs — the trace, the
    scenario-declared ``params``, and optional keyword ``context``
    (e.g. a ``baseline_trace`` for consistency evaluators) — and
    returns exactly one AssertionResult.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the public evaluator name referenced by scenarios."""

    @property
    @abstractmethod
    def pillar(self) -> Pillar:
        """Return the default scoring pillar for this evaluator."""

    @abstractmethod
    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        """Evaluate one trace and return a structured assertion result."""
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Consistency evaluators used for baseline-style comparisons."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ase.evaluation.base import AssertionResult, Evaluator, Pillar
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SameToolCallsEvaluator(Evaluator):
    """Compare tool-call counts against a provided baseline trace."""

    @property
    def name(self) -> str:
        return "same_tool_calls"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CONSISTENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        reference = context.get("baseline_trace")
        if reference is None:
            # No baseline to compare against — treat as a neutral pass.
            return _passing(self.name, self.pillar, "no baseline provided")

        def _count(candidate: object) -> int:
            # Tolerate traces without a metrics object by defaulting to 0.
            return getattr(getattr(candidate, "metrics", None), "total_tool_calls", 0)

        observed = _count(trace)
        expected = _count(reference)
        matched = observed == expected
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=matched,
            score=1.0 if matched else 0.0,
            message=f"tool calls {observed} vs baseline {expected}",
            details={"current": observed, "baseline": expected},
        )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SameMetricsEvaluator(Evaluator):
    """Compare stable metrics against a provided baseline trace."""

    @property
    def name(self) -> str:
        return "same_metrics"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CONSISTENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        reference = context.get("baseline_trace")
        if reference is None:
            # No baseline to compare against — treat as a neutral pass.
            return _passing(self.name, self.pillar, "no baseline provided")
        observed = getattr(trace, "metrics", None)
        expected = getattr(reference, "metrics", None)
        # Per-metric differences; a missing attribute counts as 0.
        deltas: dict[str, int] = {}
        for key in ("total_tool_calls", "total_llm_calls", "total_tokens_used"):
            deltas[key] = getattr(observed, key, 0) - getattr(expected, key, 0)
        matched = all(value == 0 for value in deltas.values())
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=matched,
            score=1.0 if matched else 0.0,
            message="metrics match baseline" if matched else "metrics differ from baseline",
            details=deltas,
        )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _passing(evaluator: str, pillar: Pillar, message: str) -> AssertionResult:
    """Return a neutral passing result when no baseline context exists."""
    fields: dict[str, Any] = {
        "evaluator": evaluator,
        "pillar": pillar,
        "passed": True,
        "score": 1.0,
        "message": message,
    }
    return AssertionResult(**fields)
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Correctness evaluators for observable agent behavior.
|
|
2
|
+
|
|
3
|
+
These evaluators answer the core ASE question: did the agent take the
|
|
4
|
+
expected actions against its environment, regardless of the exact text it
|
|
5
|
+
produced for a user.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from ase.evaluation.base import AssertionResult, Evaluator, Pillar
|
|
13
|
+
from ase.trace.model import ToolCallEvent, Trace, TraceEventKind
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ToolCalledEvaluator(Evaluator):
    """Verify that a scenario exercised the expected tool path."""

    @property
    def name(self) -> str:
        return "tool_called"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CORRECTNESS

    def evaluate(self, trace: Trace, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # unused; accepted for interface compatibility
        # Normalize the scenario-declared filters before matching.
        kind = str(params.get("kind", "")).strip().lower()
        minimum = max(0, int(params.get("minimum", 1)))
        method = _optional_upper(params.get("method"))
        target_contains = _optional_lower(params.get("target_contains"))
        hits = _matching_calls(trace, kind, method, target_contains)
        ok = len(hits) >= minimum
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=ok,
            score=1.0 if ok else 0.0,
            message=_tool_called_message(kind or "tool", minimum, len(hits), ok),
            details={
                "kind": kind or None,
                "minimum": minimum,
                "actual": len(hits),
                "method": method,
                "target_contains": target_contains,
            },
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class APICalledEvaluator(Evaluator):
    """Keep API-call assertions as a neutral alias over the shared tool model."""

    @property
    def name(self) -> str:
        return "api_called"

    @property
    def pillar(self) -> Pillar:
        return Pillar.CORRECTNESS

    def evaluate(self, trace: Trace, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Pin the tool kind, delegate to the shared evaluator, then relabel
        # the result so operators see it as "api_called".
        forwarded = {**params, "kind": "http_api"}
        outcome = ToolCalledEvaluator().evaluate(trace, forwarded, **context)
        return outcome.model_copy(update={"evaluator": self.name})
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _matching_calls(
    trace: Trace,
    kind: str,
    method: str | None,
    target_contains: str | None,
) -> list[ToolCallEvent]:
    """Filter tool calls using only generic trace-level properties."""

    def _accepts(call: ToolCallEvent) -> bool:
        # Every filter is optional: an empty/None filter matches all calls.
        if kind and call.kind.value != kind:
            return False
        if method and call.method.upper() != method:
            return False
        return not (target_contains and target_contains not in call.target.lower())

    return [
        event.tool_call
        for event in trace.events
        if event.kind == TraceEventKind.TOOL_CALL
        and event.tool_call is not None
        and _accepts(event.tool_call)
    ]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _optional_upper(value: object) -> str | None:
|
|
93
|
+
"""Normalize optional HTTP or tool verbs for equality checks."""
|
|
94
|
+
if value is None:
|
|
95
|
+
return None
|
|
96
|
+
text = str(value).strip()
|
|
97
|
+
return text.upper() if text else None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _optional_lower(value: object) -> str | None:
|
|
101
|
+
"""Normalize optional contains-style filters for case-insensitive matching."""
|
|
102
|
+
if value is None:
|
|
103
|
+
return None
|
|
104
|
+
text = str(value).strip()
|
|
105
|
+
return text.lower() if text else None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _tool_called_message(
|
|
109
|
+
kind: str,
|
|
110
|
+
minimum: int,
|
|
111
|
+
actual: int,
|
|
112
|
+
passed: bool,
|
|
113
|
+
) -> str:
|
|
114
|
+
"""Render stable operator-facing messages for correctness assertions."""
|
|
115
|
+
if passed:
|
|
116
|
+
return f"observed {actual} '{kind}' call(s)"
|
|
117
|
+
return f"expected >={minimum} '{kind}' call(s), got {actual}"
|