ase-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. ase/__init__.py +21 -0
  2. ase/adapters/__init__.py +14 -0
  3. ase/adapters/contract.py +28 -0
  4. ase/adapters/frameworks/__init__.py +17 -0
  5. ase/adapters/frameworks/base.py +259 -0
  6. ase/adapters/frameworks/langgraph.py +19 -0
  7. ase/adapters/frameworks/mcp.py +68 -0
  8. ase/adapters/frameworks/openai_agents.py +19 -0
  9. ase/adapters/frameworks/pydantic_ai.py +19 -0
  10. ase/adapters/io.py +50 -0
  11. ase/adapters/model.py +89 -0
  12. ase/adapters/protocol.py +72 -0
  13. ase/adapters/replay.py +261 -0
  14. ase/cli/__init__.py +7 -0
  15. ase/cli/_trace_outputs.py +40 -0
  16. ase/cli/adapter_cmd.py +38 -0
  17. ase/cli/certify_cmd.py +74 -0
  18. ase/cli/compare.py +145 -0
  19. ase/cli/doctor_cmd.py +45 -0
  20. ase/cli/examples_cmd.py +27 -0
  21. ase/cli/history_cmd.py +126 -0
  22. ase/cli/import_cmd.py +34 -0
  23. ase/cli/main.py +134 -0
  24. ase/cli/replay_cmd.py +48 -0
  25. ase/cli/report.py +115 -0
  26. ase/cli/spec_cmd.py +53 -0
  27. ase/cli/test_cmd.py +121 -0
  28. ase/config/env_loader.py +71 -0
  29. ase/config/loader.py +82 -0
  30. ase/config/model.py +51 -0
  31. ase/conformance/__init__.py +7 -0
  32. ase/conformance/matrix.py +111 -0
  33. ase/conformance/model.py +91 -0
  34. ase/conformance/schema.py +37 -0
  35. ase/conformance/service.py +194 -0
  36. ase/core/engine.py +348 -0
  37. ase/errors.py +59 -0
  38. ase/evaluation/__init__.py +7 -0
  39. ase/evaluation/base.py +63 -0
  40. ase/evaluation/consistency.py +79 -0
  41. ase/evaluation/correctness.py +117 -0
  42. ase/evaluation/efficiency.py +145 -0
  43. ase/evaluation/engine.py +182 -0
  44. ase/evaluation/policy.py +134 -0
  45. ase/evaluation/scoring.py +64 -0
  46. ase/evaluation/trace_summary.py +36 -0
  47. ase/examples_matrix.py +118 -0
  48. ase/reporting/__init__.py +7 -0
  49. ase/reporting/json_report.py +45 -0
  50. ase/reporting/junit.py +38 -0
  51. ase/reporting/markdown.py +32 -0
  52. ase/reporting/terminal.py +66 -0
  53. ase/scenario/__init__.py +7 -0
  54. ase/scenario/model.py +294 -0
  55. ase/scenario/parser.py +40 -0
  56. ase/storage/__init__.py +7 -0
  57. ase/storage/trace_store.py +136 -0
  58. ase/trace/__init__.py +7 -0
  59. ase/trace/builder.py +175 -0
  60. ase/trace/model.py +264 -0
  61. ase/trace/otel_export.py +75 -0
  62. ase/trace/otel_import.py +96 -0
  63. ase/trace/redaction.py +10 -0
  64. ase/trace/serializer.py +50 -0
  65. ase_python-0.1.0.dist-info/METADATA +184 -0
  66. ase_python-0.1.0.dist-info/RECORD +69 -0
  67. ase_python-0.1.0.dist-info/WHEEL +4 -0
  68. ase_python-0.1.0.dist-info/entry_points.txt +2 -0
  69. ase_python-0.1.0.dist-info/licenses/LICENSE +105 -0
@@ -0,0 +1,145 @@
1
+ """Efficiency evaluators for cost and tool-usage limits.
2
+
3
+ These evaluators keep ASE's efficiency scoring generic by operating on trace
4
+ metrics only. They do not assume any framework-specific runtime semantics.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from ase.evaluation.base import AssertionResult, Evaluator, Pillar
12
+ from ase.trace.model import Trace
13
+
14
DEFAULT_USD_PER_1K_TOKENS = 0.01  # neutral blended fallback rate; scenarios override via the "usd_per_1k_tokens" param
15
+
16
+
17
class MaxToolCallsEvaluator(Evaluator):
    """Fail the efficiency pillar when the trace exceeds a tool-call budget.

    Operates purely on ``trace.metrics.total_tool_calls`` so any framework
    adapter that populates trace metrics is supported.
    """

    @property
    def name(self) -> str:
        return "max_tool_calls"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # context is unused by this metric-only check
        resolved_trace = _trace(trace)
        limit = _required_int(params, "maximum")
        observed = resolved_trace.metrics.total_tool_calls
        within_budget = observed <= limit
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_limit_message("tool call(s)", observed, limit, within_budget),
            details={"maximum": limit, "actual": observed},
        )
42
+
43
+
44
class MaxTokensEvaluator(Evaluator):
    """Fail the efficiency pillar when total token usage exceeds a ceiling.

    Reads ``trace.metrics.total_tokens_used``, which aggregates request and
    response tokens across the whole trace.
    """

    @property
    def name(self) -> str:
        return "max_tokens"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # context is unused by this metric-only check
        resolved_trace = _trace(trace)
        limit = _required_int(params, "maximum")
        observed = resolved_trace.metrics.total_tokens_used
        within_budget = observed <= limit
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_limit_message("token(s)", observed, limit, within_budget),
            details={"maximum": limit, "actual": observed},
        )
69
+
70
+
71
class CostProjectionEvaluator(Evaluator):
    """Enforce a projected USD spend ceiling derived from token usage.

    Projection is a simple linear model: tokens billed per thousand at one
    blended rate (``usd_per_1k_tokens``, defaulting to
    ``DEFAULT_USD_PER_1K_TOKENS``).
    """

    @property
    def name(self) -> str:
        return "cost_projection"

    @property
    def pillar(self) -> Pillar:
        return Pillar.EFFICIENCY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # context is unused by this metric-only check
        resolved_trace = _trace(trace)
        budget = _required_float(params, "maximum_usd")
        blended_rate = _optional_float(params, "usd_per_1k_tokens", DEFAULT_USD_PER_1K_TOKENS)
        token_count = resolved_trace.metrics.total_tokens_used
        # Linear projection from total tokens at the configured per-1k rate.
        estimate = (token_count / 1000.0) * blended_rate
        within_budget = estimate <= budget
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=within_budget,
            score=1.0 if within_budget else 0.0,
            message=_cost_message(estimate, budget, within_budget),
            details={
                "maximum_usd": budget,
                "projected_usd": estimate,
                "usd_per_1k_tokens": blended_rate,
                "tokens": token_count,
            },
        )
103
+
104
+
105
+ def _required_int(params: dict[str, Any], key: str) -> int:
106
+ """Parse integer limits with a stable configuration error."""
107
+ if key not in params:
108
+ raise ValueError(f"missing required param: {key}")
109
+ return int(params[key])
110
+
111
+
112
+ def _required_float(params: dict[str, Any], key: str) -> float:
113
+ """Parse float limits with a stable configuration error."""
114
+ if key not in params:
115
+ raise ValueError(f"missing required param: {key}")
116
+ return float(params[key])
117
+
118
+
119
+ def _optional_float(params: dict[str, Any], key: str, default: float) -> float:
120
+ """Parse optional cost-rate parameters with one neutral fallback."""
121
+ value = params.get(key)
122
+ if value is None:
123
+ return default
124
+ return float(value)
125
+
126
+
127
+ def _limit_message(label: str, actual: int, maximum: int, passed: bool) -> str:
128
+ """Render stable operator-facing messages for ceiling checks."""
129
+ if passed:
130
+ return f"agent made {actual}/{maximum} {label}"
131
+ return f"expected <={maximum} {label}, got {actual}"
132
+
133
+
134
+ def _cost_message(projected: float, maximum: float, passed: bool) -> str:
135
+ """Render stable operator-facing messages for projected cost checks."""
136
+ if passed:
137
+ return f"projected cost ${projected:.4f} <= ${maximum:.4f}"
138
+ return f"expected projected cost <= ${maximum:.4f}, got ${projected:.4f}"
139
+
140
+
141
def _trace(value: object) -> Trace:
    """Narrow the generic evaluator input to a ``Trace`` before reading metrics."""
    if isinstance(value, Trace):
        return value
    raise ValueError("trace must be a Trace instance")
@@ -0,0 +1,182 @@
1
+ """Evaluation engine and built-in evaluator registry.
2
+
3
+ This module keeps ASE's assertion execution generic: scenarios reference
4
+ stable evaluator names, while the engine resolves those names to a neutral
5
+ plugin registry and computes one aggregate summary.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable
11
+ from importlib import import_module
12
+ from typing import Any
13
+
14
+ import structlog
15
+
16
+ from ase.errors import EvaluatorNotFoundError
17
+ from ase.evaluation.base import AssertionResult, EvaluationSummary, Evaluator, Pillar
18
+ from ase.evaluation.correctness import APICalledEvaluator, ToolCalledEvaluator
19
+ from ase.evaluation.scoring import compute_summary
20
+ from ase.scenario.model import AssertionConfig
21
+ from ase.trace.model import Trace
22
+
23
+ log = structlog.get_logger(__name__)
24
+
25
+
26
class EvaluationEngine:
    """Evaluate scenario assertions against traces using a neutral registry.

    Scenarios reference evaluators by stable public name; the engine resolves
    each name through its registry and aggregates the results into one
    summary via :func:`compute_summary`.
    """

    def __init__(self, evaluators: Iterable[Evaluator] | None = None) -> None:
        # Maps stable evaluator names to instances; later registrations win.
        self._registry: dict[str, Evaluator] = {}
        initial = evaluators or _builtin_evaluators()
        for entry in initial:
            self.register(entry)

    def register(self, evaluator: Evaluator) -> None:
        """Register one evaluator instance under its stable public name."""
        self._registry[evaluator.name] = evaluator

    def evaluate(
        self,
        trace: Trace,
        assertions: list[AssertionConfig],
        context: dict[str, Any] | None = None,
        weights: dict[str, float] | None = None,
    ) -> EvaluationSummary:
        """Run every assertion and return one aggregate evaluation summary.

        Args:
            trace: The trace being judged.
            assertions: Scenario assertion configs, executed in order.
            context: Extra keyword context forwarded to every evaluator.
            weights: Optional per-pillar weights for the aggregate score.
        """
        shared_context = dict(context or {})
        results: list[AssertionResult] = []
        for entry in assertions:
            results.append(self._evaluate_assertion(trace, entry, shared_context))
        summary = compute_summary(
            trace_id=trace.trace_id,
            scenario_id=trace.scenario_id,
            results=results,
            weights=weights,
        )
        log.info(
            "evaluation_complete",
            scenario=trace.scenario_id,
            passed=summary.passed,
            ase_score=summary.ase_score,
        )
        return summary

    def _evaluate_assertion(
        self,
        trace: Trace,
        assertion: AssertionConfig,
        context: dict[str, Any],
    ) -> AssertionResult:
        """Convert one assertion config into a concrete assertion result.

        Unknown evaluator names become a failed result; a ``ValueError`` from
        a known evaluator is treated as a configuration problem and aborts
        the run.
        """
        evaluator = self._registry.get(assertion.evaluator)
        if evaluator is None:
            result = _unknown_evaluator_result(assertion)
        else:
            try:
                result = evaluator.evaluate(trace, dict(assertion.params), **context)
            except ValueError as exc:
                # Bad params are raised, not scored, so misconfiguration is loud.
                raise EvaluatorNotFoundError(
                    f"failed to evaluate {assertion.evaluator}: {exc}"
                ) from exc
            result = _apply_pillar_override(result, assertion.pillar)
        log.debug(
            "assertion_evaluated",
            evaluator=result.evaluator,
            passed=result.passed,
            score=result.score,
        )
        return result
91
+
92
+
93
def _builtin_evaluators() -> list[Evaluator]:
    """Return ASE's built-in evaluator set in one neutral registry order.

    Correctness evaluators are always available; the remaining pillars are
    loaded best-effort so a partially-installed package still works.
    """
    registry: list[Evaluator] = [ToolCalledEvaluator(), APICalledEvaluator()]
    # (module, class names) pairs, in the canonical registration order.
    optional_sources = (
        (
            "ase.evaluation.safety",
            ["NoUnauthorizedAccessEvaluator", "NoPIIEvaluator", "NoRawSQLEvaluator"],
        ),
        (
            "ase.evaluation.efficiency",
            ["MaxToolCallsEvaluator", "MaxTokensEvaluator", "CostProjectionEvaluator"],
        ),
        (
            "ase.evaluation.consistency",
            ["SameToolCallsEvaluator", "SameMetricsEvaluator"],
        ),
        (
            "ase.evaluation.policy",
            [
                "ApprovalRequiredEvaluator",
                "RequiredApprovalEvaluator",
                "AllowedToolsEvaluator",
                "BlockedToolsEvaluator",
                "AllowedHostsEvaluator",
                "BlockedHostsEvaluator",
                "MaxMutationScopeEvaluator",
                "NoProductionWritesEvaluator",
                "NoDuplicateSideEffectsEvaluator",
                "TrajectoryContainsEvaluator",
                "TrajectoryOrderEvaluator",
                "ExactEmailCountEvaluator",
                "ExactAPICallCountEvaluator",
            ],
        ),
    )
    for module_name, class_names in optional_sources:
        registry.extend(_load_optional_evaluators(module_name, class_names))
    return registry
135
+
136
+
137
def _unknown_evaluator_result(assertion: AssertionConfig) -> AssertionResult:
    """Build a stable failed result for an evaluator name missing from the registry."""
    resolved_pillar = _parse_pillar(assertion.pillar)
    if resolved_pillar is None:
        resolved_pillar = Pillar.CUSTOM
    return AssertionResult(
        evaluator=assertion.evaluator,
        pillar=resolved_pillar,
        passed=False,
        score=0.0,
        message=f"unknown evaluator: {assertion.evaluator}",
    )
147
+
148
+
149
def _apply_pillar_override(
    result: AssertionResult,
    pillar_name: str | None,
) -> AssertionResult:
    """Re-bucket a result under a scenario-chosen pillar, when one parses."""
    override = _parse_pillar(pillar_name)
    return result if override is None else result.model_copy(update={"pillar": override})
158
+
159
+
160
def _parse_pillar(pillar_name: str | None) -> Pillar | None:
    """Parse an optional pillar override; unknown or empty names yield None."""
    try:
        return Pillar(pillar_name) if pillar_name else None
    except ValueError:
        # Unknown pillar names are ignored rather than crashing the run.
        return None
168
+
169
+
170
+ def _load_optional_evaluators(module_name: str, class_names: list[str]) -> list[Evaluator]:
171
+ """Load optional evaluator modules without breaking the whole engine."""
172
+ try:
173
+ module = import_module(module_name)
174
+ except ImportError:
175
+ return []
176
+ evaluators: list[Evaluator] = []
177
+ for class_name in class_names:
178
+ evaluator_cls = getattr(module, class_name, None)
179
+ if evaluator_cls is None:
180
+ continue
181
+ evaluators.append(evaluator_cls())
182
+ return evaluators
@@ -0,0 +1,134 @@
1
+ """Policy evaluators for trajectory and approval checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from ase.evaluation.base import AssertionResult, Evaluator, Pillar
8
+ from ase.trace.model import TraceEventKind
9
+
10
+
11
class ApprovalRequiredEvaluator(Evaluator):
    """Require an approval event before matching tool actions are allowed.

    NOTE(review): the check is order-insensitive — a granted approval
    anywhere in the trace satisfies tool calls recorded earlier; confirm
    that is intended.
    """

    @property
    def name(self) -> str:
        return "approval_required"

    @property
    def pillar(self) -> Pillar:
        return Pillar.SAFETY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        del context  # context is unused by this event-scan check
        approval_id = str(params.get("approval_id", "")).strip()
        target_contains = str(params.get("target_contains", "")).strip().lower()
        matched_targets: list[str] = []
        granted_ids: set[str] = set()
        for event in getattr(trace, "events", []):
            granted = (
                event.kind == TraceEventKind.APPROVAL
                and event.approval is not None
                and event.approval.granted
            )
            if granted:
                granted_ids.add(event.approval.approval_id)
            if event.kind == TraceEventKind.TOOL_CALL and event.tool_call is not None:
                # Substring match on the lowercased target; empty filter matches all.
                candidate = event.tool_call.target.lower()
                if not target_contains or target_contains in candidate:
                    matched_targets.append(event.tool_call.target)
        if not matched_targets:
            # Nothing matched the filter, so no approval was needed at all.
            return AssertionResult(
                evaluator=self.name,
                pillar=self.pillar,
                passed=True,
                score=1.0,
                message="no matching actions required approval",
                details={"approval_id": approval_id},
            )
        passed = not approval_id or approval_id in granted_ids
        verdict = (
            "matching actions had required approval"
            if passed
            else "matching actions missing required approval"
        )
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=passed,
            score=1.0 if passed else 0.0,
            message=verdict,
            details={"approval_id": approval_id, "actions": matched_targets},
        )
59
+
60
+
61
class RequiredApprovalEvaluator(ApprovalRequiredEvaluator):
    """Alias the approval-required policy under a second public name."""

    # Behavior is inherited unchanged; only the registry name differs.
    @property
    def name(self) -> str:
        return "required_approval"
67
+
68
+
69
class _PassingPolicy(Evaluator):
    """Fallback evaluator for policy types not yet reconstructed in source.

    Always passes with score 1.0; subclasses override ``_name`` only, so each
    public policy name resolves in the registry without enforcing anything.
    """

    # Registry name; overridden by each alias subclass.
    _name = "policy"

    @property
    def name(self) -> str:
        return self._name

    @property
    def pillar(self) -> Pillar:
        return Pillar.SAFETY

    def evaluate(self, trace: object, params: dict[str, Any], **context: Any) -> AssertionResult:
        # Inputs are deliberately ignored: this is a stand-in, not a check.
        del trace, params, context
        return AssertionResult(
            evaluator=self.name,
            pillar=self.pillar,
            passed=True,
            score=1.0,
            message="policy evaluator not triggered by this scenario",
        )
91
+
92
+
93
# Each alias below pins one public registry name onto the always-passing
# fallback; none of them enforces real policy yet (see _PassingPolicy).
class AllowedHostsEvaluator(_PassingPolicy):
    _name = "allowed_hosts"


class AllowedToolsEvaluator(_PassingPolicy):
    _name = "allowed_tools"


class BlockedHostsEvaluator(_PassingPolicy):
    _name = "blocked_hosts"


class BlockedToolsEvaluator(_PassingPolicy):
    _name = "blocked_tools"


class ExactAPICallCountEvaluator(_PassingPolicy):
    _name = "exact_api_call_count"


class ExactEmailCountEvaluator(_PassingPolicy):
    _name = "exact_email_count"


class MaxMutationScopeEvaluator(_PassingPolicy):
    _name = "max_mutation_scope"


class NoDuplicateSideEffectsEvaluator(_PassingPolicy):
    _name = "no_duplicate_side_effects"


class NoProductionWritesEvaluator(_PassingPolicy):
    _name = "no_production_writes"


class TrajectoryContainsEvaluator(_PassingPolicy):
    _name = "trajectory_contains"


class TrajectoryOrderEvaluator(_PassingPolicy):
    _name = "trajectory_order"
@@ -0,0 +1,64 @@
1
+ """Aggregate evaluator results into ASE's summary model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import defaultdict
6
+
7
+ from ase.evaluation.base import AssertionResult, EvaluationSummary, Pillar
8
+
9
+
10
def compute_summary(
    trace_id: str,
    scenario_id: str,
    results: list[AssertionResult],
    weights: dict[str, float] | None = None,
) -> EvaluationSummary:
    """Aggregate evaluator results into one deterministic summary object.

    Pillars with no results default to a perfect 1.0; the overall ASE score
    is the weighted mean of per-pillar averages (weight 1.0 per pillar unless
    overridden). An empty result list yields a passing summary with score 1.0.
    """
    weight_map = weights or {}
    if not results:
        return EvaluationSummary(
            trace_id=trace_id,
            scenario_id=scenario_id,
            passed=True,
            ase_score=1.0,
            total=0,
            passed_count=0,
            failed_count=0,
            results=[],
            pillar_scores={pillar.value: 1.0 for pillar in Pillar},
            failing_evaluators=[],
        )

    # Bucket scores by pillar, then average each bucket (missing buckets -> 1.0).
    buckets: dict[str, list[float]] = defaultdict(list)
    for item in results:
        buckets[item.pillar.value].append(item.score)
    pillar_scores = {
        pillar.value: _average(buckets.get(pillar.value, [1.0]))
        for pillar in Pillar
    }

    # Weighted mean over pillars; weight_sum >= 1 in practice, but guard anyway.
    weight_sum = sum(float(weight_map.get(name, 1.0)) for name in pillar_scores)
    weighted_total = sum(
        score * float(weight_map.get(name, 1.0))
        for name, score in pillar_scores.items()
    )
    ase_score = weighted_total / max(weight_sum, 1.0)

    failing = [item.evaluator for item in results if not item.passed]
    pass_total = sum(1 for item in results if item.passed)
    return EvaluationSummary(
        trace_id=trace_id,
        scenario_id=scenario_id,
        passed=not failing,
        ase_score=ase_score,
        total=len(results),
        passed_count=pass_total,
        failed_count=len(results) - pass_total,
        results=results,
        pillar_scores=pillar_scores,
        failing_evaluators=failing,
    )
60
+
61
+
62
+ def _average(scores: list[float]) -> float:
63
+ """Compute one average score for a pillar bucket."""
64
+ return sum(scores) / max(len(scores), 1)
@@ -0,0 +1,36 @@
1
+ """Helpers for attaching evaluation summaries to traces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ase.evaluation.base import EvaluationSummary
6
+ from ase.trace.model import Trace, TraceEvaluation
7
+
8
+
9
def attach_summary(trace: Trace, summary: EvaluationSummary) -> None:
    """Persist a computed evaluation summary onto a trace.

    Copies only the scalar verdict fields; per-result details and per-pillar
    scores are not persisted here, so a round trip through
    ``summary_from_trace`` is lossy by design.
    """
    trace.evaluation = TraceEvaluation(
        passed=summary.passed,
        ase_score=summary.ase_score,
        total=summary.total,
        passed_count=summary.passed_count,
        failed_count=summary.failed_count,
        # Defensive copy so later mutation of the summary can't alias in.
        failing_evaluators=list(summary.failing_evaluators),
    )
19
+
20
+
21
def summary_from_trace(trace: Trace) -> EvaluationSummary | None:
    """Rebuild a lightweight summary view from persisted trace evaluation data.

    Returns None when the trace was never evaluated. The rebuilt summary omits
    per-assertion results and pillar scores because ``attach_summary`` does not
    persist them — hence the empty ``pillar_scores`` below.
    """
    if trace.evaluation is None:
        return None
    evaluation = trace.evaluation
    return EvaluationSummary(
        trace_id=trace.trace_id,
        scenario_id=trace.scenario_id,
        passed=evaluation.passed,
        ase_score=evaluation.ase_score,
        total=evaluation.total,
        passed_count=evaluation.passed_count,
        failed_count=evaluation.failed_count,
        # Not persisted by attach_summary, so this view has no pillar breakdown.
        pillar_scores={},
        failing_evaluators=list(evaluation.failing_evaluators),
    )
ase/examples_matrix.py ADDED
@@ -0,0 +1,118 @@
1
+ """Public example-matrix runner used by the CLI and repo scripts.
2
+
3
+ Keeping the matrix logic in the package makes example validation a supported
4
+ workflow rather than an internal maintenance script.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import shutil
11
+ import subprocess
12
+ from pathlib import Path
13
+
14
+ from pydantic import BaseModel
15
+
16
+ from ase.errors import CLIError
17
+
18
# Repo root — assumes a src-layout checkout (<root>/src/ase/...); TODO confirm
# this still resolves sensibly when the package is installed from a wheel.
ROOT = Path(__file__).resolve().parents[2]
# Interpreter pinned to the repo virtualenv (POSIX path layout — presumably
# not valid on Windows; verify).
PYTHON = ROOT / ".venv" / "bin" / "python"
# Example directories under examples/ that this runner knows how to drive.
SUPPORTED_EXAMPLES = (
    "instrumented-python",
    "mcp-python",
    "openai-agents-python",
    "langgraph-python",
    "pydantic-ai-python",
    "openai-agents-typescript",
)
28
+
29
+
30
class ExampleRunResult(BaseModel):
    """Outcome of validating one example through ASE's public workflows."""

    # Name of the example directory under examples/.
    example_name: str
    # Always True on a returned result — command failures raise CLIError instead.
    passed: bool
    # The exact argv lists that were executed, in order.
    commands: list[list[str]]
36
+
37
+
38
def run_examples(example_names: list[str] | None = None) -> list[ExampleRunResult]:
    """Run the requested examples with the same commands users run manually.

    With no names given, every supported example is run. Unknown names raise
    CLIError before any subprocess starts.
    """
    chosen = example_names or list(SUPPORTED_EXAMPLES)
    _validate_examples(chosen)
    outcomes: list[ExampleRunResult] = []
    for name in chosen:
        outcomes.append(_run_example(name))
    return outcomes
43
+
44
+
45
def _run_example(example_name: str) -> ExampleRunResult:
    """Execute the supported workflow for one example; failures raise CLIError."""
    planned = _commands_for_example(example_name)
    for cmd in planned:
        _run(cmd, cwd=_working_directory(example_name, cmd))
    return ExampleRunResult(example_name=example_name, passed=True, commands=planned)
51
+
52
+
53
def _commands_for_example(example_name: str) -> list[list[str]]:
    """Return the public ASE commands for one example type.

    Every supported example runs ``ase test`` on its scenario file; the
    TypeScript example additionally certifies its adapter manifest after
    installing its JS dependencies.
    """
    scenario_path = f"examples/{example_name}/scenario.yaml"
    if example_name == "openai-agents-typescript":
        _ensure_typescript_example_installed()
        manifest_path = f"examples/{example_name}/manifest.yaml"
        return [
            [str(PYTHON), "-m", "ase.cli.main", "test", scenario_path],
            [str(PYTHON), "-m", "ase.cli.main", "certify", manifest_path],
        ]
    # BUG FIX: the original fell off the end (implicit None) for every
    # supported Python example other than instrumented-python, making
    # _run_example crash iterating None. All Python examples share the
    # same `ase test` workflow.
    return [[str(PYTHON), "-m", "ase.cli.main", "test", scenario_path]]
65
+
66
+
67
def _working_directory(example_name: str, command: list[str]) -> Path:
    """Run npm commands inside the example directory; everything else at ROOT.

    The original guard matched only ``["npm", "install"]``, but the one npm
    invocation in this module is ``["npm", "ci"]`` — match any npm command so
    the guard is consistent rather than dead.
    """
    if command and command[0] == "npm":
        return ROOT / "examples" / example_name
    return ROOT
72
+
73
+
74
def _ensure_typescript_example_installed() -> None:
    """Install local JS dependencies so the TS adapter example can run.

    A clean, lockfile-driven `npm ci` is used, and only when node_modules is
    absent. Raises CLIError when npm is not on PATH.
    """
    target = ROOT / "examples" / "openai-agents-typescript"
    if not (target / "node_modules").exists():
        if shutil.which("npm") is None:
            raise CLIError("npm is required to run the openai-agents-typescript example")
        _run(["npm", "ci"], cwd=target)
82
+
83
+
84
def _run(command: list[str], cwd: Path) -> None:
    """Run one example command; on failure raise CLIError with its output.

    Output is captured (not streamed) so the error message carries the full
    stdout/stderr of the failed command.
    """
    completed = subprocess.run(
        command,
        cwd=cwd,
        env=_project_env(),
        capture_output=True,
        text=True,
        check=False,
    )
    if completed.returncode == 0:
        return
    parts = [
        f"command failed: {' '.join(command)}",
        completed.stdout.strip(),
        completed.stderr.strip(),
    ]
    raise CLIError("\n".join(parts).strip())
104
+
105
+
106
def _project_env() -> dict[str, str]:
    """Build a subprocess environment pinned to the in-repo ASE package."""
    child_env = dict(os.environ)
    # Force the src-layout package onto the import path for child processes.
    child_env["PYTHONPATH"] = str(ROOT / "src")
    return child_env
111
+
112
+
113
def _validate_examples(example_names: list[str]) -> None:
    """Reject unknown example names before any subprocess is launched."""
    unknown = set(example_names).difference(SUPPORTED_EXAMPLES)
    if unknown:
        raise CLIError(f"unknown example names: {', '.join(sorted(unknown))}")
@@ -0,0 +1,7 @@
1
"""Source-backed reporting package that composes with recovery overlays."""

from __future__ import annotations

from pkgutil import extend_path

# Merge any other sys.path entries that also provide an "ase.reporting"
# package, so overlay distributions can extend this one in place.
__path__ = extend_path(__path__, __name__)