ase-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ase/__init__.py +21 -0
- ase/adapters/__init__.py +14 -0
- ase/adapters/contract.py +28 -0
- ase/adapters/frameworks/__init__.py +17 -0
- ase/adapters/frameworks/base.py +259 -0
- ase/adapters/frameworks/langgraph.py +19 -0
- ase/adapters/frameworks/mcp.py +68 -0
- ase/adapters/frameworks/openai_agents.py +19 -0
- ase/adapters/frameworks/pydantic_ai.py +19 -0
- ase/adapters/io.py +50 -0
- ase/adapters/model.py +89 -0
- ase/adapters/protocol.py +72 -0
- ase/adapters/replay.py +261 -0
- ase/cli/__init__.py +7 -0
- ase/cli/_trace_outputs.py +40 -0
- ase/cli/adapter_cmd.py +38 -0
- ase/cli/certify_cmd.py +74 -0
- ase/cli/compare.py +145 -0
- ase/cli/doctor_cmd.py +45 -0
- ase/cli/examples_cmd.py +27 -0
- ase/cli/history_cmd.py +126 -0
- ase/cli/import_cmd.py +34 -0
- ase/cli/main.py +134 -0
- ase/cli/replay_cmd.py +48 -0
- ase/cli/report.py +115 -0
- ase/cli/spec_cmd.py +53 -0
- ase/cli/test_cmd.py +121 -0
- ase/config/env_loader.py +71 -0
- ase/config/loader.py +82 -0
- ase/config/model.py +51 -0
- ase/conformance/__init__.py +7 -0
- ase/conformance/matrix.py +111 -0
- ase/conformance/model.py +91 -0
- ase/conformance/schema.py +37 -0
- ase/conformance/service.py +194 -0
- ase/core/engine.py +348 -0
- ase/errors.py +59 -0
- ase/evaluation/__init__.py +7 -0
- ase/evaluation/base.py +63 -0
- ase/evaluation/consistency.py +79 -0
- ase/evaluation/correctness.py +117 -0
- ase/evaluation/efficiency.py +145 -0
- ase/evaluation/engine.py +182 -0
- ase/evaluation/policy.py +134 -0
- ase/evaluation/scoring.py +64 -0
- ase/evaluation/trace_summary.py +36 -0
- ase/examples_matrix.py +118 -0
- ase/reporting/__init__.py +7 -0
- ase/reporting/json_report.py +45 -0
- ase/reporting/junit.py +38 -0
- ase/reporting/markdown.py +32 -0
- ase/reporting/terminal.py +66 -0
- ase/scenario/__init__.py +7 -0
- ase/scenario/model.py +294 -0
- ase/scenario/parser.py +40 -0
- ase/storage/__init__.py +7 -0
- ase/storage/trace_store.py +136 -0
- ase/trace/__init__.py +7 -0
- ase/trace/builder.py +175 -0
- ase/trace/model.py +264 -0
- ase/trace/otel_export.py +75 -0
- ase/trace/otel_import.py +96 -0
- ase/trace/redaction.py +10 -0
- ase/trace/serializer.py +50 -0
- ase_python-0.1.0.dist-info/METADATA +184 -0
- ase_python-0.1.0.dist-info/RECORD +69 -0
- ase_python-0.1.0.dist-info/WHEEL +4 -0
- ase_python-0.1.0.dist-info/entry_points.txt +2 -0
- ase_python-0.1.0.dist-info/licenses/LICENSE +105 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""JSON report helpers for ASE evaluation summaries and traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ase.errors import TraceSerializationError
|
|
10
|
+
from ase.evaluation.base import EvaluationSummary
|
|
11
|
+
from ase.trace.model import Trace
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def summary_dict(summary: EvaluationSummary) -> dict[str, Any]:
    """Convert one evaluation summary into a stable JSON payload.

    Args:
        summary: The evaluation summary model to serialize.

    Returns:
        A dict containing only JSON-serializable primitives (pydantic's
        ``mode="json"`` converts enums, datetimes, etc. up front).
    """
    return summary.model_dump(mode="json")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def trace_dict(trace: Trace) -> dict[str, Any]:
    """Convert one trace into a stable JSON payload.

    Args:
        trace: The trace model to serialize.

    Returns:
        A dict containing only JSON-serializable primitives (pydantic's
        ``mode="json"`` converts enums, datetimes, etc. up front).
    """
    return trace.model_dump(mode="json")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def to_string(
    summary: EvaluationSummary | None = None,
    trace: Trace | None = None,
) -> str:
    """Render a summary or trace as pretty JSON.

    A trace takes precedence when both arguments are supplied.

    Raises:
        TraceSerializationError: if neither a summary nor a trace is given.
    """
    if trace is not None:
        payload = trace_dict(trace)
    elif summary is not None:
        payload = summary_dict(summary)
    else:
        raise TraceSerializationError("json report requires a summary or trace")
    return json.dumps(payload, indent=2)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def write_to_file(
    path: Path,
    summary: EvaluationSummary | None = None,
    trace: Trace | None = None,
) -> None:
    """Write one JSON report artifact to disk.

    Raises:
        TraceSerializationError: if serialization fails upstream or the file
            cannot be written (wrapping the underlying ``OSError``).
    """
    # Serialize first so only the filesystem write sits inside the try.
    content = to_string(summary=summary, trace=trace) + "\n"
    try:
        path.write_text(content, encoding="utf-8")
    except OSError as exc:
        raise TraceSerializationError(f"failed to write JSON report {path}: {exc}") from exc
|
ase/reporting/junit.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""JUnit XML output for ASE evaluation summaries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import xml.etree.ElementTree as ET
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ase.errors import TraceSerializationError
|
|
9
|
+
from ase.evaluation.base import EvaluationSummary
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def to_string(summary: EvaluationSummary) -> str:
    """Render one evaluation summary as JUnit XML.

    Produces a single ``<testsuite>`` whose ``<testcase>`` children are the
    evaluator results; failed results carry a ``<failure>`` child with the
    result message as both attribute and text.
    """
    root = ET.Element(
        "testsuite",
        name=summary.scenario_id,
        tests=str(summary.total),
        failures=str(summary.failed_count),
    )
    for item in summary.results:
        testcase = ET.SubElement(
            root,
            "testcase",
            classname=summary.scenario_id,
            name=item.evaluator,
        )
        if item.passed:
            continue
        failure_node = ET.SubElement(testcase, "failure", message=item.message)
        failure_node.text = item.message
    return ET.tostring(root, encoding="unicode")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_to_file(summary: EvaluationSummary, path: Path) -> None:
    """Write JUnit XML to disk.

    Raises:
        TraceSerializationError: if the file cannot be written (wrapping the
            underlying ``OSError``).
    """
    # Render first so only the filesystem write sits inside the try.
    report = to_string(summary) + "\n"
    try:
        path.write_text(report, encoding="utf-8")
    except OSError as exc:
        raise TraceSerializationError(f"failed to write JUnit report {path}: {exc}") from exc
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Markdown report helpers for ASE summaries and traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ase.evaluation.base import EvaluationSummary
|
|
6
|
+
from ase.trace.model import Trace
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def to_string(summary: EvaluationSummary | None = None, trace: Trace | None = None) -> str:
    """Render a compact Markdown report.

    A trace takes precedence when both arguments are supplied.

    Raises:
        ValueError: if neither a summary nor a trace is provided.  The
            original guarded this with ``assert``, which is stripped under
            ``python -O`` and would then surface as an ``AttributeError``
            on ``None``; an explicit raise is always active.
    """
    if trace is not None:
        return "\n".join(
            [
                "# ASE Trace Report",
                "",
                f"- Trace ID: `{trace.trace_id}`",
                f"- Scenario: `{trace.scenario_id}`",
                f"- Status: `{trace.status.value}`",
                f"- Tool calls: `{trace.metrics.total_tool_calls}`",
            ]
        )
    if summary is None:
        raise ValueError("markdown report requires a summary or trace")
    return "\n".join(
        [
            "# ASE Evaluation Summary",
            "",
            f"- Scenario: `{summary.scenario_id}`",
            f"- Passed: `{summary.passed}`",
            f"- ASE score: `{summary.ase_score:.2f}`",
            f"- Assertions: `{summary.passed_count}` passed / `{summary.failed_count}` failed",
        ]
    )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Rich terminal renderers for ASE summaries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from ase.evaluation.base import EvaluationSummary, Pillar
|
|
10
|
+
from ase.trace.model import Trace
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def render(
    summary: EvaluationSummary,
    console: Console | None = None,
    trace: Trace | None = None,
) -> None:
    """Render a full terminal report with optional trace context.

    Prints the summary panel, the assertion table, and — when the trace
    captured agent stderr — a panel with that output.
    """
    out = console or Console()
    out.print(_summary_panel(summary))
    out.print(_results_table(summary))
    if trace is None:
        return
    if trace.stderr_output:
        out.print(Panel(trace.stderr_output, title="Agent stderr"))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def render_compact(summary: EvaluationSummary, console: Console | None = None) -> None:
    """Render a compact one-panel summary for short workflows.

    Args:
        summary: The evaluation summary to display.
        console: Optional Rich console; a fresh default one is created
            when the caller does not inject its own.
    """
    target = console or Console()
    target.print(_summary_panel(summary))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _summary_panel(summary: EvaluationSummary) -> Panel:
    """Build the high-level summary panel (scenario, trace id, score, counts)."""
    verdict = "PASS" if summary.passed else "FAIL"
    lines = [
        f"Scenario: {summary.scenario_id}",
        f"Trace ID: {summary.trace_id}",
        f"ASE Score: {summary.ase_score:.4f}",
        (
            "Assertions: "
            f"{summary.passed_count} passed / "
            f"{summary.failed_count} failed / "
            f"{summary.total} total"
        ),
    ]
    return Panel("\n".join(lines), title=f"ASE Result — {verdict}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _results_table(summary: EvaluationSummary) -> Table:
    """Build the evaluator result table, one row per assertion result."""
    table = Table(title="Assertions")
    for heading in ("Evaluator", "Pillar", "Score", "Message"):
        table.add_column(heading)
    for item in summary.results:
        table.add_row(
            item.evaluator,
            item.pillar.value,
            f"{item.score:.2f}",
            item.message,
        )
    return table
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Public API of this module; the model types are re-exported for convenience
# alongside the render helpers.
__all__ = ["EvaluationSummary", "Pillar", "Trace", "render", "render_compact"]
|
ase/scenario/__init__.py
ADDED
ase/scenario/model.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Scenario configuration models consumed by the ASE runtime."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from ase.conformance.model import CertificationLevel
|
|
11
|
+
|
|
12
|
+
# Version of the scenario spec format understood by this model tree.
SCENARIO_SPEC_VERSION = 1
# Metric names compared by default when a baseline config does not list its own.
DEFAULT_BASELINE_METRICS = [
    "total_tool_calls",
    "total_llm_calls",
    "total_tokens_used",
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EnvironmentKind(StrEnum):
    """Which backend mode a scenario uses."""

    REAL = "real"  # live backends
    MOCK = "mock"  # stubbed backends
    SIMULATED = "simulated"  # deterministic simulated backends (the default; see EnvironmentConfig)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class AgentRuntimeMode(StrEnum):
    """Which ASE runtime path executes the scenario."""

    PROXY = "proxy"  # default when no mode is declared (see ScenarioConfig.runtime_mode)
    INSTRUMENTED = "instrumented"
    ADAPTER = "adapter"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AdapterTransport(StrEnum):
    """How adapter events are delivered into ASE."""

    JSONL_STDIO = "jsonl-stdio"  # newline-delimited JSON over stdio (AdapterConfig default)
    HTTP = "http"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BaselineMode(StrEnum):
    """How baseline comparison should behave."""

    TOOL_CALLS = "tool_calls"  # compare the tool-call sequence only
    METRICS = "metrics"  # compare numeric metrics only
    COMBINED = "combined"  # both comparisons (BaselineConfig default)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class AgentConfig(BaseModel):
    """How ASE launches the agent subprocess."""

    # Argv-style command used to start the agent.
    command: list[str] = Field(description="Shell command to run the agent")
    # Environment variables for the agent process (empty by default).
    env: dict[str, str] = Field(default_factory=dict)
    # Wall-clock limit for the run; must be at least one second.
    timeout_seconds: int = Field(default=60, ge=1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class AgentRuntimeConfig(BaseModel):
    """Describe the runtime and methodology of the agent under test.

    All fields are optional descriptors; when ``mode`` is unset the scenario
    falls back to the proxy runtime (see ScenarioConfig.runtime_mode).
    """

    mode: AgentRuntimeMode | None = None
    framework: str | None = None
    language: str | None = None
    version: str | None = None
    methodology: str | None = None
    entrypoint: str | None = None
    adapter_name: str | None = None
    event_source: str | None = None
    # Free-form extra descriptors not covered by the named fields.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AdapterConfig(BaseModel):
    """Declare how a non-proxy runtime integrates with ASE."""

    name: str
    # Event delivery channel; JSONL over stdio by default.
    transport: AdapterTransport = AdapterTransport.JSONL_STDIO
    # Argv-style launch command for stdio transports; may be empty.
    command: list[str] = Field(default_factory=list)
    # Endpoint for HTTP transport — presumably required when transport is HTTP;
    # TODO confirm, it is not validated here.
    url: str | None = None
    timeout_seconds: int = Field(default=60, ge=1)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class ApprovalFixture(BaseModel):
    """Declare one approval available during the run."""

    approval_id: str
    # Who granted (or denied) the approval; "system" by default.
    actor: str = "system"
    # Whether the approval is granted; set False to simulate a denial.
    granted: bool = True
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class HTTPRecordingFixture(BaseModel):
    """Store one inline request/response replay fixture."""

    # Matching request description (shape defined by the replay engine).
    request: dict[str, Any] = Field(default_factory=dict)
    # Canned response returned when the request matches.
    response: dict[str, Any] = Field(default_factory=dict)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class FilesystemEntryFixture(BaseModel):
    """Represent one deterministic filesystem fixture entry."""

    path: str
    # Initial file content; empty string creates an empty file.
    content: str = ""
    # Read-only by default.
    writable: bool = False
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class QueueMessageFixture(BaseModel):
    """Represent one deterministic queue message."""

    queue: str
    body: dict[str, Any] = Field(default_factory=dict)
    # Optional stable id — presumably auto-assigned when None; TODO confirm.
    message_id: str | None = None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class WebhookEventFixture(BaseModel):
    """Represent one deterministic webhook event."""

    endpoint: str
    method: str = "POST"
    payload: dict[str, Any] = Field(default_factory=dict)
    # Optional stable id — presumably auto-assigned when None; TODO confirm.
    event_id: str | None = None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class FixturesConfig(BaseModel):
    """Reusable fixtures that influence determinism and policy enforcement.

    Every list defaults to empty, so an absent fixtures section is valid.
    """

    approvals: list[ApprovalFixture] = Field(default_factory=list)
    http_recordings: list[HTTPRecordingFixture] = Field(default_factory=list)
    filesystem: list[FilesystemEntryFixture] = Field(default_factory=list)
    queue_messages: list[QueueMessageFixture] = Field(default_factory=list)
    webhook_events: list[WebhookEventFixture] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class DatabaseSeed(BaseModel):
    """SQL statements to run before a scenario starts."""

    # Executed in order by the environment setup, per the docstring.
    statements: list[str] = Field(default_factory=list)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class APISeed(BaseModel):
    """HTTP request/response pairs for record-replay."""

    # Free-form recording dicts (shape defined by the replay engine).
    recordings: list[dict[str, Any]] = Field(default_factory=list)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class BrowserSessionSeed(BaseModel):
    """Placeholder schema for future browser/session replay."""

    # Intentionally loose until the browser replay format is specified.
    sessions: list[dict[str, Any]] = Field(default_factory=list)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class EnvironmentConfig(BaseModel):
    """Environment configuration for the scenario."""

    # Simulated (deterministic) backends unless declared otherwise.
    kind: EnvironmentKind = EnvironmentKind.SIMULATED
    database: DatabaseSeed | None = None
    api: APISeed | None = None
    browser_session: BrowserSessionSeed | None = None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class SessionConfig(BaseModel):
    """Capture session expectations for stateful agents."""

    enabled: bool = False
    # Optional fixed session id — presumably generated when None; TODO confirm.
    session_id: str | None = None
    stateful: bool = True
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class HandoffConfig(BaseModel):
    """Declare expected multi-agent handoff behavior."""

    enabled: bool = False
    # Expected handoff graph edges (shape defined by the evaluator).
    expected_edges: list[dict[str, Any]] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class HumanFeedbackConfig(BaseModel):
    """Declare expected human feedback checkpoints."""

    enabled: bool = False
    # Actors whose feedback must appear in the trace, per the docstring.
    required_actors: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class StreamingConfig(BaseModel):
    """Declare streaming-output expectations."""

    enabled: bool = False
    # Exact chunk count to expect; None disables the count check.
    expected_chunk_count: int | None = Field(default=None, ge=0)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class RealtimeConfig(BaseModel):
    """Declare realtime transport expectations."""

    enabled: bool = False
    # Transport name (free-form string, not an enum).
    transport: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class MCPConfig(BaseModel):
    """Declare MCP servers and resources involved in the scenario."""

    enabled: bool = False
    servers: list[str] = Field(default_factory=list)
    resources: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class InterAgentConfig(BaseModel):
    """Declare non-local inter-agent protocol expectations."""

    enabled: bool = False
    # Protocol and topology names (free-form strings, not enums).
    protocol: str | None = None
    topology: str | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class AssertionConfig(BaseModel):
    """One evaluator invocation attached to a scenario."""

    # Registered evaluator name to invoke.
    evaluator: str
    # Keyword parameters forwarded to the evaluator.
    params: dict[str, Any] = Field(default_factory=dict)
    # Optional pillar override — presumably the evaluator's own pillar
    # applies when None; TODO confirm against the evaluation engine.
    pillar: str | None = None
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class PolicyConfig(BaseModel):
    """One policy rule compiled into an assertion at evaluation time."""

    evaluator: str
    params: dict[str, Any] = Field(default_factory=dict)
    policy_id: str | None = None
    # Policies default to the safety pillar, unlike AssertionConfig.
    pillar: str | None = "safety"
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class BaselineConfig(BaseModel):
    """Baseline comparison settings for regression detection."""

    # Compare both tool calls and metrics unless narrowed.
    mode: BaselineMode = BaselineMode.COMBINED
    # Path to the baseline trace; the only required field.
    trace_file: str
    compare_tool_calls: bool = False
    compare_metrics: bool = False
    # Metric names to compare; a fresh copy of the module default each time.
    metrics: list[str] = Field(default_factory=lambda: list(DEFAULT_BASELINE_METRICS))
    # Relative tolerance for metric drift (non-negative, default 10%).
    metrics_tolerance: float = Field(default=0.1, ge=0)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class ScenarioConfig(BaseModel):
    """Complete definition of a single ASE scenario."""

    # Spec format version; defaults to the current SCENARIO_SPEC_VERSION.
    spec_version: int = SCENARIO_SPEC_VERSION
    scenario_id: str
    name: str
    description: str = ""
    # Required: how the agent subprocess is launched.
    agent: AgentConfig
    agent_runtime: AgentRuntimeConfig | None = None
    adapter: AdapterConfig | None = None
    environment: EnvironmentConfig = Field(default_factory=EnvironmentConfig)
    fixtures: FixturesConfig = Field(default_factory=FixturesConfig)
    # Optional capability sections; None means the capability is not declared.
    session: SessionConfig | None = None
    handoffs: HandoffConfig | None = None
    human_feedback: HumanFeedbackConfig | None = None
    streaming: StreamingConfig | None = None
    realtime: RealtimeConfig | None = None
    mcp: MCPConfig | None = None
    inter_agent: InterAgentConfig | None = None
    assertions: list[AssertionConfig] = Field(default_factory=list)
    policies: list[PolicyConfig] = Field(default_factory=list)
    baselines: BaselineConfig | None = None
    tags: dict[str, str] = Field(default_factory=dict)
    # Free-form run context; parser.parse_file injects the source path here.
    run_metadata: dict[str, Any] = Field(default_factory=dict)

    @property
    def runtime_mode(self) -> AgentRuntimeMode:
        """Return the effective execution mode for the scenario."""
        # Proxy is the fallback when no explicit runtime mode is declared.
        if self.agent_runtime is None or self.agent_runtime.mode is None:
            return AgentRuntimeMode.PROXY
        return self.agent_runtime.mode

    @property
    def certification_level(self) -> CertificationLevel | None:
        """Infer a coarse certification level from scenario capabilities."""
        # Checks run from most to least specific capability; first match wins.
        if self.mcp and self.mcp.enabled:
            return CertificationLevel.MCP
        if self.handoffs and self.handoffs.enabled:
            return CertificationLevel.MULTI_AGENT
        if self.session and self.session.enabled:
            return CertificationLevel.STATEFUL
        # NOTE(review): REALTIME is keyed off `streaming`, not `realtime` —
        # confirm this is intentional.
        if self.streaming and self.streaming.enabled:
            return CertificationLevel.REALTIME
        # Any declared runtime (even without capabilities) earns CORE.
        if self.agent_runtime is not None:
            return CertificationLevel.CORE
        return None
|
ase/scenario/parser.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""YAML parser for ASE scenarios."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from ase.errors import ConfigError
|
|
10
|
+
from ase.scenario.model import ScenarioConfig
|
|
11
|
+
|
|
12
|
+
# Schema file three directory levels above this module (src-layout checkout).
# NOTE(review): in an installed wheel, parents[3] resolves outside the package
# tree — confirm schema_path() is only meant for source checkouts.
_SCHEMA_PATH = Path(__file__).resolve().parents[3] / "schemas" / "ase_scenario.schema.json"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_file(path: Path) -> ScenarioConfig:
    """Load one YAML scenario file and attach its source path metadata.

    Every failure mode (missing file, unreadable file, invalid YAML,
    non-mapping root, spec validation error) is wrapped in ConfigError so
    callers handle a single exception type.
    """
    if not path.exists():
        raise ConfigError(f"failed to parse scenario: file not found {path}")
    try:
        text = path.read_text(encoding="utf-8")
    except OSError as exc:
        raise ConfigError(f"failed to parse scenario: could not read {path}: {exc}") from exc
    try:
        loaded = yaml.safe_load(text) or {}
    except yaml.YAMLError as exc:
        raise ConfigError(f"failed to parse scenario: invalid YAML in {path}: {exc}") from exc
    if not isinstance(loaded, dict):
        raise ConfigError(f"failed to parse scenario: root must be a mapping in {path}")
    # Record where the scenario came from without clobbering a caller-set source.
    metadata = dict(loaded.get("run_metadata") or {})
    metadata.setdefault("source", str(path.resolve()))
    loaded["run_metadata"] = metadata
    try:
        return ScenarioConfig.model_validate(loaded)
    except Exception as exc:
        raise ConfigError(f"failed to parse scenario: invalid spec in {path}: {exc}") from exc
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def schema_path() -> Path:
    """Return the public scenario schema location in the repo.

    The path is resolved once at import time relative to this module
    (see _SCHEMA_PATH above); this accessor just exposes it.
    """
    return _SCHEMA_PATH
|
ase/storage/__init__.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Persistent local storage for ASE traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sqlite3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import structlog
|
|
11
|
+
|
|
12
|
+
from ase.trace.model import Trace
|
|
13
|
+
from ase.trace.serializer import deserialize, serialize
|
|
14
|
+
|
|
15
|
+
log = structlog.get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TraceStore:
    """Store and query traces in a local SQLite database.

    NOTE(review): the public methods are declared ``async`` but perform
    blocking sqlite3 calls — presumably for interface symmetry with other
    stores; confirm callers tolerate the blocking behavior.
    """

    # Default database location under the user's home directory.
    _DEFAULT_PATH = Path.home() / ".ase" / "traces.db"

    def __init__(self, db_path: Path | None = None) -> None:
        # The connection is created lazily by setup(); None until then.
        self._db_path = db_path or self._DEFAULT_PATH
        self._conn: sqlite3.Connection | None = None

    async def setup(self) -> None:
        """Create the database and table if they do not already exist."""
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(self._db_path)
        # Row factory makes rows dict-convertible for list_traces().
        self._conn.row_factory = sqlite3.Row
        self._conn.execute(
            """
            CREATE TABLE IF NOT EXISTS traces (
                trace_id TEXT PRIMARY KEY,
                scenario_id TEXT NOT NULL,
                scenario_name TEXT NOT NULL,
                status TEXT NOT NULL,
                evaluation_status TEXT,
                ase_score REAL,
                runtime_mode TEXT,
                certification_level TEXT,
                started_at_ms REAL,
                trace_json TEXT NOT NULL
            )
            """
        )
        self._conn.commit()
        log.debug("trace_store_ready", path=str(self._db_path))

    async def close(self) -> None:
        """Close the SQLite connection if it is open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    async def save_trace(self, trace: Trace, ase_score: float | None = None) -> None:
        """Persist one trace and its summary columns.

        Args:
            trace: The trace to persist (a sanitized copy is stored).
            ase_score: Optional explicit score; when None, the score from
                the trace's own evaluation (if any) is used.

        Raises:
            RuntimeError: if setup() has not been called.
        """
        conn = _require_conn(self._conn)
        stored = sanitize_trace_for_storage(trace)
        evaluation_status = None
        if stored.evaluation is not None:
            evaluation_status = "passed" if stored.evaluation.passed else "failed"
        # Explicit ase_score argument wins over the evaluation's own score.
        score_value = ase_score
        if score_value is None and stored.evaluation is not None:
            score_value = stored.evaluation.ase_score
        # INSERT OR REPLACE keeps re-saving the same trace_id idempotent.
        conn.execute(
            """
            INSERT OR REPLACE INTO traces (
                trace_id, scenario_id, scenario_name, status, evaluation_status,
                ase_score, runtime_mode, certification_level, started_at_ms, trace_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                stored.trace_id,
                stored.scenario_id,
                stored.scenario_name,
                stored.status.value,
                evaluation_status,
                score_value,
                stored.runtime_provenance.mode if stored.runtime_provenance else None,
                stored.certification_level.value if stored.certification_level else None,
                stored.started_at_ms,
                serialize(stored),
            ),
        )
        conn.commit()
        log.debug("trace_saved", trace_id=stored.trace_id)

    async def list_traces(
        self,
        scenario_id: str | None = None,
        status: str | None = None,
        limit: int = 50,
    ) -> list[dict[str, Any]]:
        """Return recent trace rows for history views.

        Filters are ANDed when both are given; results are newest-first.

        Raises:
            RuntimeError: if setup() has not been called.
        """
        conn = _require_conn(self._conn)
        query = "SELECT * FROM traces"
        clauses: list[str] = []
        params: list[Any] = []
        if scenario_id:
            clauses.append("scenario_id = ?")
            params.append(scenario_id)
        if status:
            clauses.append("status = ?")
            params.append(status)
        if clauses:
            query += " WHERE " + " AND ".join(clauses)
        # limit is bound as a parameter, same as the filters.
        query += " ORDER BY started_at_ms DESC LIMIT ?"
        params.append(limit)
        rows = conn.execute(query, params).fetchall()
        return [dict(row) for row in rows]

    async def get_trace(self, trace_id: str) -> Trace | None:
        """Return one persisted trace by id, or None when absent.

        Raises:
            RuntimeError: if setup() has not been called.
        """
        conn = _require_conn(self._conn)
        row = conn.execute(
            "SELECT trace_json FROM traces WHERE trace_id = ?",
            (trace_id,),
        ).fetchone()
        if row is None:
            return None
        return deserialize(row["trace_json"])
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def sanitize_trace_for_storage(trace: Trace) -> Trace:
    """Copy a trace into a storage-safe form without mutating the caller."""
    # Round-trip through JSON so the stored copy shares no mutable state
    # with the caller's object graph.
    serialized = trace.model_dump_json()
    return Trace.model_validate(json.loads(serialized))
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _require_conn(conn: sqlite3.Connection | None) -> sqlite3.Connection:
|
|
133
|
+
"""Ensure the trace store is initialized before use."""
|
|
134
|
+
if conn is None:
|
|
135
|
+
raise RuntimeError("trace store is not initialized")
|
|
136
|
+
return conn
|