ase-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ase/__init__.py +21 -0
- ase/adapters/__init__.py +14 -0
- ase/adapters/contract.py +28 -0
- ase/adapters/frameworks/__init__.py +17 -0
- ase/adapters/frameworks/base.py +259 -0
- ase/adapters/frameworks/langgraph.py +19 -0
- ase/adapters/frameworks/mcp.py +68 -0
- ase/adapters/frameworks/openai_agents.py +19 -0
- ase/adapters/frameworks/pydantic_ai.py +19 -0
- ase/adapters/io.py +50 -0
- ase/adapters/model.py +89 -0
- ase/adapters/protocol.py +72 -0
- ase/adapters/replay.py +261 -0
- ase/cli/__init__.py +7 -0
- ase/cli/_trace_outputs.py +40 -0
- ase/cli/adapter_cmd.py +38 -0
- ase/cli/certify_cmd.py +74 -0
- ase/cli/compare.py +145 -0
- ase/cli/doctor_cmd.py +45 -0
- ase/cli/examples_cmd.py +27 -0
- ase/cli/history_cmd.py +126 -0
- ase/cli/import_cmd.py +34 -0
- ase/cli/main.py +134 -0
- ase/cli/replay_cmd.py +48 -0
- ase/cli/report.py +115 -0
- ase/cli/spec_cmd.py +53 -0
- ase/cli/test_cmd.py +121 -0
- ase/config/env_loader.py +71 -0
- ase/config/loader.py +82 -0
- ase/config/model.py +51 -0
- ase/conformance/__init__.py +7 -0
- ase/conformance/matrix.py +111 -0
- ase/conformance/model.py +91 -0
- ase/conformance/schema.py +37 -0
- ase/conformance/service.py +194 -0
- ase/core/engine.py +348 -0
- ase/errors.py +59 -0
- ase/evaluation/__init__.py +7 -0
- ase/evaluation/base.py +63 -0
- ase/evaluation/consistency.py +79 -0
- ase/evaluation/correctness.py +117 -0
- ase/evaluation/efficiency.py +145 -0
- ase/evaluation/engine.py +182 -0
- ase/evaluation/policy.py +134 -0
- ase/evaluation/scoring.py +64 -0
- ase/evaluation/trace_summary.py +36 -0
- ase/examples_matrix.py +118 -0
- ase/reporting/__init__.py +7 -0
- ase/reporting/json_report.py +45 -0
- ase/reporting/junit.py +38 -0
- ase/reporting/markdown.py +32 -0
- ase/reporting/terminal.py +66 -0
- ase/scenario/__init__.py +7 -0
- ase/scenario/model.py +294 -0
- ase/scenario/parser.py +40 -0
- ase/storage/__init__.py +7 -0
- ase/storage/trace_store.py +136 -0
- ase/trace/__init__.py +7 -0
- ase/trace/builder.py +175 -0
- ase/trace/model.py +264 -0
- ase/trace/otel_export.py +75 -0
- ase/trace/otel_import.py +96 -0
- ase/trace/redaction.py +10 -0
- ase/trace/serializer.py +50 -0
- ase_python-0.1.0.dist-info/METADATA +184 -0
- ase_python-0.1.0.dist-info/RECORD +69 -0
- ase_python-0.1.0.dist-info/WHEEL +4 -0
- ase_python-0.1.0.dist-info/entry_points.txt +2 -0
- ase_python-0.1.0.dist-info/licenses/LICENSE +105 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Efficiency evaluators for cost and tool-usage limits.
|
|
2
|
+
|
|
3
|
+
These evaluators keep ASE's efficiency scoring generic by operating on trace
|
|
4
|
+
metrics only. They do not assume any framework-specific runtime semantics.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ase.evaluation.base import AssertionResult, Evaluator, Pillar
|
|
12
|
+
from ase.trace.model import Trace
|
|
13
|
+
|
|
14
|
+
# Fallback pricing used by CostProjectionEvaluator when a scenario does not
# supply usd_per_1k_tokens: $0.01 per 1,000 tokens.
DEFAULT_USD_PER_1K_TOKENS = 0.01
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MaxToolCallsEvaluator(Evaluator):
    """Fail scenarios whose agents exceed a configured tool-call budget."""

    @property
    def name(self) -> str:
        return "max_tool_calls"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Context is part of the shared Evaluator interface but unused here.
        del context
        run = _trace(trace)
        budget = _required_int(params, "maximum")
        call_count = run.metrics.total_tool_calls
        within_budget = call_count <= budget
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_limit_message("tool call(s)", call_count, budget, within_budget),
            details={"maximum": budget, "actual": call_count},
        )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MaxTokensEvaluator(Evaluator):
    """Enforce a ceiling on the total tokens recorded across a trace."""

    @property
    def name(self) -> str:
        return "max_tokens"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Context is part of the shared Evaluator interface but unused here.
        del context
        run = _trace(trace)
        budget = _required_int(params, "maximum")
        used = run.metrics.total_tokens_used
        within_budget = used <= budget
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_limit_message("token(s)", used, budget, within_budget),
            details={"maximum": budget, "actual": used},
        )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class CostProjectionEvaluator(Evaluator):
    """Estimate spend from token usage and compare it against a USD ceiling."""

    @property
    def name(self) -> str:
        return "cost_projection"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Context is part of the shared Evaluator interface but unused here.
        del context
        run = _trace(trace)
        ceiling_usd = _required_float(params, "maximum_usd")
        per_1k = _optional_float(params, "usd_per_1k_tokens", DEFAULT_USD_PER_1K_TOKENS)
        token_count = run.metrics.total_tokens_used
        # Linear projection: tokens scaled to thousands, times the USD rate.
        estimate = (token_count / 1000.0) * per_1k
        within_budget = estimate <= ceiling_usd
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_cost_message(estimate, ceiling_usd, within_budget),
            details={
                "maximum_usd": ceiling_usd,
                "projected_usd": estimate,
                "usd_per_1k_tokens": per_1k,
                "tokens": token_count,
            },
        )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _required_int(params: dict[str, Any], key: str) -> int:
|
|
106
|
+
"""Parse integer limits with a stable configuration error."""
|
|
107
|
+
if key not in params:
|
|
108
|
+
raise ValueError(f"missing required param: {key}")
|
|
109
|
+
return int(params[key])
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _required_float(params: dict[str, Any], key: str) -> float:
|
|
113
|
+
"""Parse float limits with a stable configuration error."""
|
|
114
|
+
if key not in params:
|
|
115
|
+
raise ValueError(f"missing required param: {key}")
|
|
116
|
+
return float(params[key])
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _optional_float(params: dict[str, Any], key: str, default: float) -> float:
|
|
120
|
+
"""Parse optional cost-rate parameters with one neutral fallback."""
|
|
121
|
+
value = params.get(key)
|
|
122
|
+
if value is None:
|
|
123
|
+
return default
|
|
124
|
+
return float(value)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _limit_message(label: str, actual: int, maximum: int, passed: bool) -> str:
|
|
128
|
+
"""Render stable operator-facing messages for ceiling checks."""
|
|
129
|
+
if passed:
|
|
130
|
+
return f"agent made {actual}/{maximum} {label}"
|
|
131
|
+
return f"expected <={maximum} {label}, got {actual}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _cost_message(projected: float, maximum: float, passed: bool) -> str:
|
|
135
|
+
"""Render stable operator-facing messages for projected cost checks."""
|
|
136
|
+
if passed:
|
|
137
|
+
return f"projected cost ${projected:.4f} <= ${maximum:.4f}"
|
|
138
|
+
return f"expected projected cost <= ${maximum:.4f}, got ${projected:.4f}"
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _trace(value: object) -> Trace:
    """Narrow an opaque evaluator input to a concrete ``Trace`` or fail."""
    if isinstance(value, Trace):
        return value
    raise ValueError("trace must be a Trace instance")
|
ase/evaluation/engine.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Evaluation engine and built-in evaluator registry.
|
|
2
|
+
|
|
3
|
+
This module keeps ASE's assertion execution generic: scenarios reference
|
|
4
|
+
stable evaluator names, while the engine resolves those names to a neutral
|
|
5
|
+
plugin registry and computes one aggregate summary.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
from importlib import import_module
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import structlog
|
|
15
|
+
|
|
16
|
+
from ase.errors import EvaluatorNotFoundError
|
|
17
|
+
from ase.evaluation.base import AssertionResult, EvaluationSummary, Evaluator, Pillar
|
|
18
|
+
from ase.evaluation.correctness import APICalledEvaluator, ToolCalledEvaluator
|
|
19
|
+
from ase.evaluation.scoring import compute_summary
|
|
20
|
+
from ase.scenario.model import AssertionConfig
|
|
21
|
+
from ase.trace.model import Trace
|
|
22
|
+
|
|
23
|
+
# Module-level structured logger; this module emits "evaluation_complete"
# and "assertion_evaluated" events through it.
log = structlog.get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EvaluationEngine:
    """Evaluate scenario assertions against traces using a neutral registry.

    Scenarios reference evaluators by their stable public ``name``; the
    engine resolves names through an internal registry so scenario files
    never couple to evaluator classes.
    """

    def __init__(self, evaluators: Iterable[Evaluator] | None = None) -> None:
        # Passing an explicit iterable REPLACES the built-in set rather than
        # extending it; later duplicates of a name overwrite earlier ones.
        self._registry: dict[str, Evaluator] = {}
        for evaluator in evaluators or _builtin_evaluators():
            self.register(evaluator)

    def register(self, evaluator: Evaluator) -> None:
        """Register one evaluator instance under its stable public name."""
        self._registry[evaluator.name] = evaluator

    def evaluate(
        self,
        trace: Trace,
        assertions: list[AssertionConfig],
        context: dict[str, Any] | None = None,
        weights: dict[str, float] | None = None,
    ) -> EvaluationSummary:
        """Run every assertion and return one aggregate evaluation summary.

        Args:
            trace: The trace under evaluation.
            assertions: Scenario assertion configs, run in order.
            context: Optional keyword context forwarded to every evaluator.
            weights: Optional per-pillar weights forwarded to scoring.
        """
        # Copy so evaluators cannot mutate the caller's context dict.
        resolved_context = dict(context or {})
        results = [
            self._evaluate_assertion(trace, assertion, resolved_context)
            for assertion in assertions
        ]
        summary = compute_summary(
            trace_id=trace.trace_id,
            scenario_id=trace.scenario_id,
            results=results,
            weights=weights,
        )
        log.info(
            "evaluation_complete",
            scenario=trace.scenario_id,
            passed=summary.passed,
            ase_score=summary.ase_score,
        )
        return summary

    def _evaluate_assertion(
        self,
        trace: Trace,
        assertion: AssertionConfig,
        context: dict[str, Any],
    ) -> AssertionResult:
        """Convert one assertion config into a concrete assertion result.

        An unknown evaluator name produces a stable failed result instead of
        aborting the run; an evaluator-raised ValueError (bad params) aborts
        the whole run.
        """
        try:
            evaluator = self._registry[assertion.evaluator]
        except KeyError:
            result = _unknown_evaluator_result(assertion)
        else:
            try:
                # Params are copied so evaluators cannot mutate the scenario.
                result = evaluator.evaluate(trace, dict(assertion.params), **context)
            except ValueError as exc:
                # NOTE(review): parameter/validation errors are re-raised as
                # EvaluatorNotFoundError even though the evaluator WAS found;
                # confirm this matches the intended error taxonomy.
                raise EvaluatorNotFoundError(
                    f"failed to evaluate {assertion.evaluator}: {exc}"
                ) from exc
            # Scenarios may re-bucket a result under a different pillar.
            result = _apply_pillar_override(result, assertion.pillar)
        log.debug(
            "assertion_evaluated",
            evaluator=result.evaluator,
            passed=result.passed,
            score=result.score,
        )
        return result
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _builtin_evaluators() -> list[Evaluator]:
    """Assemble the default evaluator registry contents in a fixed order.

    Correctness evaluators are always present; the remaining groups are
    loaded best-effort so a missing optional module never breaks the engine.
    """
    optional_groups: list[tuple[str, list[str]]] = [
        (
            "ase.evaluation.safety",
            ["NoUnauthorizedAccessEvaluator", "NoPIIEvaluator", "NoRawSQLEvaluator"],
        ),
        (
            "ase.evaluation.efficiency",
            ["MaxToolCallsEvaluator", "MaxTokensEvaluator", "CostProjectionEvaluator"],
        ),
        (
            "ase.evaluation.consistency",
            ["SameToolCallsEvaluator", "SameMetricsEvaluator"],
        ),
        (
            "ase.evaluation.policy",
            [
                "ApprovalRequiredEvaluator",
                "RequiredApprovalEvaluator",
                "AllowedToolsEvaluator",
                "BlockedToolsEvaluator",
                "AllowedHostsEvaluator",
                "BlockedHostsEvaluator",
                "MaxMutationScopeEvaluator",
                "NoProductionWritesEvaluator",
                "NoDuplicateSideEffectsEvaluator",
                "TrajectoryContainsEvaluator",
                "TrajectoryOrderEvaluator",
                "ExactEmailCountEvaluator",
                "ExactAPICallCountEvaluator",
            ],
        ),
    ]
    registry: list[Evaluator] = [ToolCalledEvaluator(), APICalledEvaluator()]
    for module_name, class_names in optional_groups:
        registry.extend(_load_optional_evaluators(module_name, class_names))
    return registry
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _unknown_evaluator_result(assertion: AssertionConfig) -> AssertionResult:
    """Build the canonical failed result for an unregistered evaluator name."""
    # Honor the scenario's pillar label when it parses; otherwise bucket the
    # failure under CUSTOM.
    return AssertionResult(
        evaluator=assertion.evaluator,
        pillar=_parse_pillar(assertion.pillar) or Pillar.CUSTOM,
        passed=False,
        score=0.0,
        message=f"unknown evaluator: {assertion.evaluator}",
    )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _apply_pillar_override(
    result: AssertionResult,
    pillar_name: str | None,
) -> AssertionResult:
    """Let scenarios re-bucket results without changing evaluator plugins."""
    override = _parse_pillar(pillar_name)
    if override is None:
        # No (valid) override requested: keep the evaluator's own pillar.
        return result
    return result.model_copy(update={"pillar": override})
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _parse_pillar(pillar_name: str | None) -> Pillar | None:
|
|
161
|
+
"""Parse optional pillar overrides without crashing the full run."""
|
|
162
|
+
if not pillar_name:
|
|
163
|
+
return None
|
|
164
|
+
try:
|
|
165
|
+
return Pillar(pillar_name)
|
|
166
|
+
except ValueError:
|
|
167
|
+
return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _load_optional_evaluators(module_name: str, class_names: list[str]) -> list[Evaluator]:
|
|
171
|
+
"""Load optional evaluator modules without breaking the whole engine."""
|
|
172
|
+
try:
|
|
173
|
+
module = import_module(module_name)
|
|
174
|
+
except ImportError:
|
|
175
|
+
return []
|
|
176
|
+
evaluators: list[Evaluator] = []
|
|
177
|
+
for class_name in class_names:
|
|
178
|
+
evaluator_cls = getattr(module, class_name, None)
|
|
179
|
+
if evaluator_cls is None:
|
|
180
|
+
continue
|
|
181
|
+
evaluators.append(evaluator_cls())
|
|
182
|
+
return evaluators
|
ase/evaluation/policy.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Policy evaluators for trajectory and approval checks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ase.evaluation.base import AssertionResult, Evaluator, Pillar
|
|
8
|
+
from ase.trace.model import TraceEventKind
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ApprovalRequiredEvaluator(Evaluator):
    """Require an approval event before matching tool actions are allowed.

    Params:
        approval_id: Identifier that must appear among granted approvals for
            the check to pass. When empty, the check passes whenever the
            filter matched (no specific approval is demanded).
        target_contains: Case-insensitive substring selecting which tool-call
            targets count as "matching actions"; empty matches every target.
    """

    @property
    def name(self) -> str:
        return "approval_required"

    @property
    def pillar(self) -> Pillar:
        return Pillar.SAFETY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Context is part of the shared Evaluator interface but unused here.
        del context
        approval_id = str(params.get("approval_id", "")).strip()
        target_contains = str(params.get("target_contains", "")).strip().lower()
        actions = []
        approvals = set()
        # One pass over the events collects the granted approval ids and the
        # tool-call targets matching the (optional) substring filter.
        for event in getattr(trace, "events", []):
            if (
                event.kind == TraceEventKind.APPROVAL
                and event.approval is not None
                and event.approval.granted
            ):
                approvals.add(event.approval.approval_id)
            if event.kind == TraceEventKind.TOOL_CALL and event.tool_call is not None:
                target = event.tool_call.target.lower()
                if not target_contains or target_contains in target:
                    actions.append(event.tool_call.target)
        if not actions:
            # Nothing matched the filter: the policy is vacuously satisfied.
            return AssertionResult(
                evaluator=self.name,
                pillar=self.pillar,
                passed=True,
                score=1.0,
                message="no matching actions required approval",
                details={"approval_id": approval_id},
            )
        # With an empty approval_id the check passes whenever matching actions
        # exist; otherwise that id must be among the granted approvals.
        passed = not approval_id or approval_id in approvals
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=passed,
            score=1.0 if passed else 0.0,
            message="matching actions had required approval"
            if passed
            else "matching actions missing required approval",
            details={"approval_id": approval_id, "actions": actions},
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class RequiredApprovalEvaluator(ApprovalRequiredEvaluator):
    """Expose the approval-required check under its alternate registry name."""

    @property
    def name(self) -> str:
        # Same behavior as the parent; only the public lookup name differs.
        return "required_approval"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class _PassingPolicy(Evaluator):
    """Always-passing stand-in for policy checks not yet reconstructed in source."""

    # Subclasses override this class attribute to claim their registry name.
    _name = "policy"

    @property
    def pillar(self) -> Pillar:
        return Pillar.SAFETY

    @property
    def name(self) -> str:
        return self._name

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # All inputs are deliberately ignored: this placeholder always passes.
        del trace, params, context
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=True,
            score=1.0,
            message="policy evaluator not triggered by this scenario",
        )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Placeholder policy evaluators: each subclass only overrides ``_name`` so it
# registers under its public registry name while inheriting the always-pass
# behavior of ``_PassingPolicy``. Scenarios referencing these policies load
# and run without error until the real checks are reconstructed.


class AllowedHostsEvaluator(_PassingPolicy):
    _name = "allowed_hosts"


class AllowedToolsEvaluator(_PassingPolicy):
    _name = "allowed_tools"


class BlockedHostsEvaluator(_PassingPolicy):
    _name = "blocked_hosts"


class BlockedToolsEvaluator(_PassingPolicy):
    _name = "blocked_tools"


class ExactAPICallCountEvaluator(_PassingPolicy):
    _name = "exact_api_call_count"


class ExactEmailCountEvaluator(_PassingPolicy):
    _name = "exact_email_count"


class MaxMutationScopeEvaluator(_PassingPolicy):
    _name = "max_mutation_scope"


class NoDuplicateSideEffectsEvaluator(_PassingPolicy):
    _name = "no_duplicate_side_effects"


class NoProductionWritesEvaluator(_PassingPolicy):
    _name = "no_production_writes"


class TrajectoryContainsEvaluator(_PassingPolicy):
    _name = "trajectory_contains"


class TrajectoryOrderEvaluator(_PassingPolicy):
    _name = "trajectory_order"
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Aggregate evaluator results into ASE's summary model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
from ase.evaluation.base import AssertionResult, EvaluationSummary, Pillar
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compute_summary(
    trace_id: str,
    scenario_id: str,
    results: list[AssertionResult],
    weights: dict[str, float] | None = None,
) -> EvaluationSummary:
    """Aggregate evaluator results into one deterministic summary object.

    Args:
        trace_id: Identifier of the evaluated trace.
        scenario_id: Identifier of the scenario the trace belongs to.
        results: Per-assertion evaluator results to aggregate.
        weights: Optional per-pillar weights keyed by pillar value; pillars
            absent from the mapping default to a weight of 1.0.

    Returns:
        An ``EvaluationSummary`` whose ``ase_score`` is the weighted average
        of per-pillar mean scores, and whose ``passed`` flag is true only
        when every assertion passed.
    """
    weights = weights or {}
    if not results:
        # No assertions is a vacuous pass with perfect scores everywhere.
        pillar_scores = {pillar.value: 1.0 for pillar in Pillar}
        return EvaluationSummary(
            trace_id=trace_id,
            scenario_id=scenario_id,
            passed=True,
            ase_score=1.0,
            total=0,
            passed_count=0,
            failed_count=0,
            results=[],
            pillar_scores=pillar_scores,
            failing_evaluators=[],
        )

    grouped: dict[str, list[float]] = defaultdict(list)
    for result in results:
        grouped[result.pillar.value].append(result.score)
    # Pillars with no assertions score a neutral 1.0 so they do not drag the
    # weighted average down.
    pillar_scores = {
        pillar.value: _average(grouped.get(pillar.value, [1.0]))
        for pillar in Pillar
    }
    weighted_total = 0.0
    weight_sum = 0.0
    for pillar_name, score in pillar_scores.items():
        weight = float(weights.get(pillar_name, 1.0))
        weighted_total += score * weight
        weight_sum += weight
    # Divide by the actual weight mass. The previous ``max(weight_sum, 1.0)``
    # clamp silently deflated the score whenever the supplied weights summed
    # to less than 1.0; a zero weight mass still yields a 0.0 score because
    # weighted_total is then 0.0 as well.
    ase_score = weighted_total / weight_sum if weight_sum > 0.0 else 0.0
    failing = [result.evaluator for result in results if not result.passed]
    return EvaluationSummary(
        trace_id=trace_id,
        scenario_id=scenario_id,
        passed=not failing,
        ase_score=ase_score,
        total=len(results),
        passed_count=sum(1 for result in results if result.passed),
        failed_count=sum(1 for result in results if not result.passed),
        results=results,
        pillar_scores=pillar_scores,
        failing_evaluators=failing,
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _average(scores: list[float]) -> float:
|
|
63
|
+
"""Compute one average score for a pillar bucket."""
|
|
64
|
+
return sum(scores) / max(len(scores), 1)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Helpers for attaching evaluation summaries to traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ase.evaluation.base import EvaluationSummary
|
|
6
|
+
from ase.trace.model import Trace, TraceEvaluation
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def attach_summary(trace: Trace, summary: EvaluationSummary) -> None:
    """Persist a computed evaluation summary onto a trace.

    Mutates ``trace`` in place, replacing its ``evaluation`` field with a
    ``TraceEvaluation`` snapshot. Per-assertion results and pillar scores are
    not persisted — only the aggregate counts and failing evaluator names.
    """
    trace.evaluation = TraceEvaluation(
        passed=summary.passed,
        ase_score=summary.ase_score,
        total=summary.total,
        passed_count=summary.passed_count,
        failed_count=summary.failed_count,
        # Copy so later mutation of the summary cannot alias into the trace.
        failing_evaluators=list(summary.failing_evaluators),
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def summary_from_trace(trace: Trace) -> EvaluationSummary | None:
    """Rebuild a lightweight summary view from persisted trace evaluation data.

    Per-assertion results and pillar scores are not persisted on the trace,
    so the reconstructed summary carries an empty ``pillar_scores`` mapping.
    Returns ``None`` when the trace was never evaluated.
    """
    stored = trace.evaluation
    if stored is None:
        return None
    return EvaluationSummary(
        trace_id=trace.trace_id,
        scenario_id=trace.scenario_id,
        passed=stored.passed,
        ase_score=stored.ase_score,
        total=stored.total,
        passed_count=stored.passed_count,
        failed_count=stored.failed_count,
        pillar_scores={},
        failing_evaluators=list(stored.failing_evaluators),
    )
|
ase/examples_matrix.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Public example-matrix runner used by the CLI and repo scripts.
|
|
2
|
+
|
|
3
|
+
Keeping the matrix logic in the package makes example validation a supported
|
|
4
|
+
workflow rather than an internal maintenance script.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import shutil
|
|
11
|
+
import subprocess
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from ase.errors import CLIError
|
|
17
|
+
|
|
18
|
+
# Repository root. parents[2] assumes this file lives at <root>/src/ase/ —
# TODO(review): confirm; from an installed site-packages layout this resolves
# outside the project.
ROOT = Path(__file__).resolve().parents[2]
# Repo-local virtualenv interpreter used to invoke the ASE CLI commands.
PYTHON = ROOT / ".venv" / "bin" / "python"
# Example directory names (under examples/) that the matrix can validate.
SUPPORTED_EXAMPLES = (
    "instrumented-python",
    "mcp-python",
    "openai-agents-python",
    "langgraph-python",
    "pydantic-ai-python",
    "openai-agents-typescript",
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ExampleRunResult(BaseModel):
    """Outcome of validating one example through ASE's public workflows."""

    # Name of the example directory under examples/.
    example_name: str
    # True when every command completed; failures raise CLIError instead of
    # producing a result, so returned instances always carry passed=True.
    passed: bool
    # The exact argv lists that were executed for this example.
    commands: list[list[str]]
|
|
37
|
+
|
|
38
|
+
def run_examples(example_names: list[str] | None = None) -> list[ExampleRunResult]:
    """Run the requested examples with the same commands users run manually.

    A missing or empty selection runs every supported example. Unknown names
    are rejected up front before any subprocess is spawned.
    """
    chosen = example_names or list(SUPPORTED_EXAMPLES)
    _validate_examples(chosen)
    outcomes: list[ExampleRunResult] = []
    for example_name in chosen:
        outcomes.append(_run_example(example_name))
    return outcomes
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _run_example(example_name: str) -> ExampleRunResult:
    """Run every supported command for one example, failing fast on any error."""
    plan = _commands_for_example(example_name)
    for argv in plan:
        _run(argv, cwd=_working_directory(example_name, argv))
    return ExampleRunResult(example_name=example_name, passed=True, commands=plan)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _commands_for_example(example_name: str) -> list[list[str]]:
    """Return the public ASE commands for one example type.

    Every supported example validates through ``ase test`` against its
    scenario file; the TypeScript adapter example additionally installs its
    local JS dependencies and runs ``ase certify`` against its manifest.
    """
    scenario_path = f"examples/{example_name}/scenario.yaml"
    if example_name == "openai-agents-typescript":
        _ensure_typescript_example_installed()
        manifest_path = f"examples/{example_name}/manifest.yaml"
        return [
            [str(PYTHON), "-m", "ase.cli.main", "test", scenario_path],
            [str(PYTHON), "-m", "ase.cli.main", "certify", manifest_path],
        ]
    # Previously only "instrumented-python" had an explicit branch, so the
    # other supported examples (mcp-python, openai-agents-python, ...) fell
    # through and implicitly returned None, crashing the caller's iteration.
    # Default every non-TypeScript example to the standard scenario test.
    return [[str(PYTHON), "-m", "ase.cli.main", "test", scenario_path]]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _working_directory(example_name: str, command: list[str]) -> Path:
    """Pick the cwd for a command: npm install steps run inside the example."""
    is_npm_install = command[:2] == ["npm", "install"]
    return (ROOT / "examples" / example_name) if is_npm_install else ROOT
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _ensure_typescript_example_installed() -> None:
    """Install local JS dependencies so the TS adapter example can run.

    Skips the install when node_modules already exists; otherwise requires
    npm on PATH and runs a reproducible ``npm ci``.
    """
    example_dir = ROOT / "examples" / "openai-agents-typescript"
    if not (example_dir / "node_modules").exists():
        if shutil.which("npm") is None:
            raise CLIError("npm is required to run the openai-agents-typescript example")
        _run(["npm", "ci"], cwd=example_dir)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _run(command: list[str], cwd: Path) -> None:
    """Run one example command; on failure raise CLIError with its output.

    The subprocess inherits the pinned project environment and its stdout
    and stderr are captured so failures surface the ASE output verbatim.
    """
    completed = subprocess.run(
        command,
        cwd=cwd,
        env=_project_env(),
        capture_output=True,
        text=True,
        check=False,
    )
    if completed.returncode != 0:
        pieces = [
            f"command failed: {' '.join(command)}",
            completed.stdout.strip(),
            completed.stderr.strip(),
        ]
        raise CLIError("\n".join(pieces).strip())
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _project_env() -> dict[str, str]:
    """Force matrix subprocesses to import ASE from the in-repo src/ tree."""
    merged = dict(os.environ)
    merged["PYTHONPATH"] = str(ROOT / "src")
    return merged
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _validate_examples(example_names: list[str]) -> None:
    """Reject unknown example names before running any subprocesses."""
    unknown = sorted(set(example_names).difference(SUPPORTED_EXAMPLES))
    if unknown:
        raise CLIError(f"unknown example names: {', '.join(unknown)}")