aimeval 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. aimeval/__init__.py +191 -0
  2. aimeval/_beta.py +61 -0
  3. aimeval/_eval.py +316 -0
  4. aimeval/_exceptions.py +119 -0
  5. aimeval/_logging.py +139 -0
  6. aimeval/_otel.py +184 -0
  7. aimeval/_pagination.py +181 -0
  8. aimeval/_polling.py +40 -0
  9. aimeval/_progress.py +111 -0
  10. aimeval/_report.py +500 -0
  11. aimeval/_repr_html.py +147 -0
  12. aimeval/_streaming.py +519 -0
  13. aimeval/_trace.py +431 -0
  14. aimeval/_trace_io.py +190 -0
  15. aimeval/_types.py +659 -0
  16. aimeval/_webhook_verify.py +166 -0
  17. aimeval/cli/__init__.py +33 -0
  18. aimeval/cli/_config.py +113 -0
  19. aimeval/cli/_exit.py +56 -0
  20. aimeval/cli/_format.py +101 -0
  21. aimeval/cli/_output.py +378 -0
  22. aimeval/cli/_resolve.py +75 -0
  23. aimeval/cli/_session.py +90 -0
  24. aimeval/cli/app.py +116 -0
  25. aimeval/cli/commands/__init__.py +0 -0
  26. aimeval/cli/commands/annotations.py +125 -0
  27. aimeval/cli/commands/auth.py +67 -0
  28. aimeval/cli/commands/compare.py +225 -0
  29. aimeval/cli/commands/doctor.py +265 -0
  30. aimeval/cli/commands/evaluate.py +92 -0
  31. aimeval/cli/commands/gate_extra.py +95 -0
  32. aimeval/cli/commands/init.py +294 -0
  33. aimeval/cli/commands/models_extra.py +46 -0
  34. aimeval/cli/commands/resources.py +207 -0
  35. aimeval/cli/commands/run.py +546 -0
  36. aimeval/cli/commands/search.py +26 -0
  37. aimeval/client.py +888 -0
  38. aimeval/enums.py +97 -0
  39. aimeval/events.py +64 -0
  40. aimeval/metrics.py +368 -0
  41. aimeval/presentations.py +190 -0
  42. aimeval/py.typed +0 -0
  43. aimeval/pytest_plugin.py +81 -0
  44. aimeval/resources/__init__.py +41 -0
  45. aimeval/resources/analytics.py +101 -0
  46. aimeval/resources/annotations.py +275 -0
  47. aimeval/resources/collections.py +148 -0
  48. aimeval/resources/compare.py +100 -0
  49. aimeval/resources/datasets.py +481 -0
  50. aimeval/resources/evaluate.py +52 -0
  51. aimeval/resources/gates.py +219 -0
  52. aimeval/resources/metrics.py +31 -0
  53. aimeval/resources/models.py +316 -0
  54. aimeval/resources/observability.py +94 -0
  55. aimeval/resources/prompts.py +623 -0
  56. aimeval/resources/regression_sets.py +159 -0
  57. aimeval/resources/runs.py +774 -0
  58. aimeval/resources/search.py +27 -0
  59. aimeval/resources/webhooks.py +160 -0
  60. aimeval/resources/wizard.py +30 -0
  61. aimeval/types.py +378 -0
  62. aimeval-0.6.0.dist-info/METADATA +20 -0
  63. aimeval-0.6.0.dist-info/RECORD +66 -0
  64. aimeval-0.6.0.dist-info/WHEEL +5 -0
  65. aimeval-0.6.0.dist-info/entry_points.txt +5 -0
  66. aimeval-0.6.0.dist-info/top_level.txt +1 -0
aimeval/__init__.py ADDED
@@ -0,0 +1,191 @@
1
+ """
2
+ AIMEval Python SDK — programmatic access for CI/CD pipelines.
3
+
4
+ Quick start::
5
+
6
+ from aimeval import AIMEval
7
+
8
+ client = AIMEval(api_key="aime_sk_...", base_url="https://app.aimeval.com")
9
+
10
+ # End-to-end workflow with a live progress bar (TTY only):
11
+ run = client.evaluate(
12
+ name="nightly-regression",
13
+ model=MODEL_ID,
14
+ dataset=DATASET_ID,
15
+ metrics=COLLECTION_ID,
16
+ )
17
+ sys.exit(0 if run.passed else 1)
18
+
19
+ # Or step by step (industry-standard resource namespaces):
20
+ run = client.runs.create(name="...", model=..., dataset=..., metrics=...)
21
+ run = client.runs.wait(run.id, progress=True)
22
+
23
+ # Auto-pagination — iterate every page transparently:
24
+ for run in client.runs.list(status="completed"):
25
+ print(run.id, run.score)
26
+ """
27
+ from aimeval.client import AIMEval, AsyncAIMEval
28
+ from aimeval._otel import disable_otel, enable_otel, is_available as otel_is_available, otel_enabled
29
+ from aimeval import events
30
+ from aimeval._trace import Span, TraceCollector, records, span, trace
31
+ from aimeval._trace_io import (
32
+ ReplayResult,
33
+ load_trace,
34
+ load_trace_iter,
35
+ replay_trace,
36
+ save_trace,
37
+ )
38
+ from aimeval._webhook_verify import (
39
+ WebhookVerificationError,
40
+ construct_event,
41
+ verify_webhook,
42
+ )
43
+ from aimeval._eval import (
44
+ Eval,
45
+ EvalAssertionError,
46
+ EvalResult,
47
+ EvalScore,
48
+ assert_test,
49
+ eval_from_run,
50
+ )
51
+ from aimeval import metrics
52
+ from aimeval._exceptions import (
53
+ AIMEvalError,
54
+ AIMEvalSecurityWarning,
55
+ APIConnectionError,
56
+ APIServerError,
57
+ APITimeoutError,
58
+ AuthenticationError,
59
+ BadRequestError,
60
+ ConflictError,
61
+ NotFoundError,
62
+ RateLimitError,
63
+ )
64
+ from aimeval._streaming import (
65
+ AsyncRunSSEStream,
66
+ AsyncRunStream,
67
+ RunEvent,
68
+ RunSSEStream,
69
+ RunStream,
70
+ )
71
+ from aimeval._types import (
72
+ Annotation,
73
+ AnnotationsBootstrapResult,
74
+ AuditEntry,
75
+ CompareHistoryItem,
76
+ Comparison,
77
+ ConnectionTestResult,
78
+ CostEstimate,
79
+ Dataset,
80
+ GateCompatibility,
81
+ MetricCollection,
82
+ MetricDistribution,
83
+ MetricRegistry,
84
+ MetricRegistryEntry,
85
+ MetricSummary,
86
+ Model,
87
+ Prompt,
88
+ PromptDiff,
89
+ PromptTestResult,
90
+ PromptTestRun,
91
+ PromptVersion,
92
+ QualityGate,
93
+ RegressionSet,
94
+ Resource,
95
+ Run,
96
+ SearchHit,
97
+ SearchResults,
98
+ UploadStatus,
99
+ UploadTicket,
100
+ Usage,
101
+ Webhook,
102
+ WizardBootstrap,
103
+ )
104
+
105
+
106
# Package version string; also re-exported through ``__all__`` below.
__version__ = "0.6.0"

# Public surface of the package. Every name listed here is bound by one of
# the imports at the top of this module (or is ``__version__`` itself).
__all__ = [
    # --- Clients ---------------------------------------------------------
    "AIMEval",
    "AsyncAIMEval",
    "__version__",
    # --- Tracing (local span capture -> eval datasets; optional OTel mirror)
    "trace",
    "span",
    "TraceCollector",
    "Span",
    "records",
    "events",
    # --- Trace recording + replay (prod-to-eval pattern) -----------------
    "save_trace",
    "load_trace",
    "load_trace_iter",
    "replay_trace",
    "ReplayResult",
    # OpenTelemetry switches imported from ``aimeval._otel``.
    "enable_otel",
    "disable_otel",
    "otel_enabled",
    "otel_is_available",
    # --- Eval harness + metrics (CI assertion surface) -------------------
    "Eval",
    "eval_from_run",
    "assert_test",
    "EvalResult",
    "EvalScore",
    "EvalAssertionError",
    "metrics",
    # --- Errors ----------------------------------------------------------
    "AIMEvalError",
    "AIMEvalSecurityWarning",
    "APIConnectionError",
    "APIServerError",
    "APITimeoutError",
    "AuthenticationError",
    "BadRequestError",
    "ConflictError",
    "NotFoundError",
    "RateLimitError",
    # --- Response types --------------------------------------------------
    "Annotation",
    "AnnotationsBootstrapResult",
    "AuditEntry",
    "CompareHistoryItem",
    "Comparison",
    "ConnectionTestResult",
    "CostEstimate",
    "Dataset",
    "GateCompatibility",
    "MetricCollection",
    "MetricDistribution",
    "MetricRegistry",
    "MetricRegistryEntry",
    "MetricSummary",
    "Model",
    "Prompt",
    "PromptDiff",
    "PromptTestResult",
    "PromptTestRun",
    "PromptVersion",
    "QualityGate",
    "RegressionSet",
    "Resource",
    "Run",
    "SearchHit",
    "SearchResults",
    "UploadStatus",
    "UploadTicket",
    "Usage",
    "Webhook",
    "WizardBootstrap",
    # --- Webhook verifier (security helper) ------------------------------
    "WebhookVerificationError",
    "construct_event",
    "verify_webhook",
    # --- Streaming -------------------------------------------------------
    "AsyncRunSSEStream",
    "AsyncRunStream",
    "RunEvent",
    "RunSSEStream",
    "RunStream",
]
aimeval/_beta.py ADDED
@@ -0,0 +1,61 @@
1
+ """``client.beta.*`` — experimental surface area.
2
+
3
+ Industry pattern from OpenAI / Anthropic: features that haven't earned
4
+ SemVer protection yet live under a dedicated ``beta`` namespace. This
5
+ gives them three properties at once:
6
+
7
+ 1. **Visible**: a glance at ``client.beta.<something>`` tells the
8
+ reader "this isn't stable yet — pin your SDK and read the
9
+ CHANGELOG before bumping".
10
+ 2. **Safe to remove / rename** without bumping the major version of
11
+ the SDK. The ``beta`` package itself is the contract.
12
+ 3. **Cheap to ship**: graduation to the stable surface (e.g.
13
+ ``client.replay``) is a one-line re-export when the API settles.
14
+
15
+ Currently in beta:
16
+
17
+ - :attr:`Beta.replay` — trace recording + replay helpers
18
+ (:func:`aimeval.save_trace` / :func:`aimeval.load_trace` /
19
+ :func:`aimeval.replay_trace`). Wrapped under ``client.beta.replay``
20
+ so the helpers feel discoverable from autocomplete even when
21
+ callers don't import them directly.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ from typing import TYPE_CHECKING
26
+
27
+ from aimeval import _trace_io
28
+
29
+ if TYPE_CHECKING: # pragma: no cover
30
+ from aimeval.client import AIMEval, AsyncAIMEval
31
+
32
+
33
class _Replay:
    """Beta-namespace facade over :mod:`aimeval._trace_io`.

    Reached as ``client.beta.replay``. The four trace helpers are
    re-exposed as static methods, so ``client.beta.replay.save_trace(...)``
    calls the very same function object as ``aimeval._trace_io.save_trace``.
    Keeping them under ``beta`` makes them discoverable through
    autocomplete while leaving their rename / removal on a separate
    timeline from the stable client.
    """

    # ``staticmethod`` stops Python from binding the ``_Replay`` instance as
    # the first positional argument when a helper is called via the instance.
    save_trace = staticmethod(_trace_io.save_trace)
    load_trace = staticmethod(_trace_io.load_trace)
    load_trace_iter = staticmethod(_trace_io.load_trace_iter)
    replay_trace = staticmethod(_trace_io.replay_trace)

    def __init__(self, _client: "AIMEval | AsyncAIMEval"):
        # Nothing in this class reads the client; it is stored only so a
        # later beta feature that needs the network has a handle to use.
        self._client = _client
52
+
53
+
54
class Beta:
    """Holder for experimental SDK features (exposed as ``client.beta``).

    Each attribute is a sub-namespace; at present the only one is
    ``replay``, an instance of :class:`_Replay`.
    """

    def __init__(self, client: "AIMEval | AsyncAIMEval"):
        # The owning client is handed straight to the sub-namespace.
        replay_namespace = _Replay(client)
        self.replay = replay_namespace
59
+
60
+
61
# Only the namespace class is public; ``_Replay`` stays an implementation detail.
__all__ = [
    "Beta",
]
aimeval/_eval.py ADDED
@@ -0,0 +1,316 @@
1
+ """``aimeval.Eval`` + ``assert_test`` — the CI evaluation harness.
2
+
3
+ Competitors expose a one-call eval harness (Braintrust ``Eval(name,
4
+ data, task, scores)``, DeepEval ``evaluate(...)`` + ``assert_test``,
5
+ LangSmith ``client.evaluate(...)``). This module is AIMEval's version,
6
+ scoped honestly to how AIMEval actually scores.
7
+
8
+ **How AIMEval scoring works (and why this layer is honest).** The
9
+ metrics (visual faithfulness, hallucination, …) are computed
10
+ server-side — the VLM judge + computation pipeline need the image,
11
+ which the SDK doesn't have. So :func:`Eval` does NOT re-score locally.
12
+ It:
13
+
14
+ 1. launches a server-side run (``client.runs.evaluate``) — or takes an
15
+ already-finished run via :func:`eval_from_run`;
16
+ 2. reads the run's server-computed per-metric scores;
17
+ 3. interprets each score against the local :class:`~aimeval.metrics.Metric`
18
+ threshold you passed (direction-aware: lower-is-better metrics use
19
+ ``<=``);
20
+ 4. returns a structured :class:`EvalResult` with a single ``passed``
21
+ boolean for CI.
22
+
23
+ :func:`assert_test` is the pytest-friendly wrapper: it raises
24
+ ``AssertionError`` listing exactly which metrics fell below threshold —
25
+ the DeepEval ``assert_test`` ergonomics, on AIMEval's server-scored
26
+ runs.
27
+
28
+ import aimeval
29
+ from aimeval.metrics import VisualFaithfulness, Hallucination
30
+
31
+ result = aimeval.Eval(
32
+ name="ci-nightly", model=M, dataset=D, metrics=C,
33
+ scorers=[VisualFaithfulness(threshold=0.8), Hallucination(threshold=0.1)],
34
+ )
35
+ aimeval.assert_test(result) # raises if any scorer failed
36
+ """
37
+ from __future__ import annotations
38
+
39
+ import os
40
+ from typing import TYPE_CHECKING, Any, Sequence
41
+
42
+ from aimeval._types import Resource, Run
43
+ from aimeval.metrics import Metric, resolve
44
+
45
+ if TYPE_CHECKING: # pragma: no cover
46
+ from aimeval.client import AIMEval
47
+
48
+
49
class EvalScore(Resource):
    """Verdict for a single metric inside an :class:`EvalResult`."""

    # Metric slug this verdict belongs to.
    metric: str = ""
    # Human-readable metric label.
    label: str = ""
    # Value reported for the metric, or ``None`` when none was found.
    score: float | None = None
    # Threshold the score was compared against; ``None`` means "not checked".
    threshold: float | None = None
    # Comparison operator shown next to the threshold in ``repr``.
    operator: str = ">="
    # Whether the score satisfied the threshold.
    passed: bool = True

    def __repr__(self) -> str:
        """Return a compact one-line summary, e.g. ``EvalScore(✓ slug=0.9 >= 0.8)``."""
        status = "✓" if self.passed else "✗"
        if self.threshold is None:
            # No threshold -> omit the "<operator> <threshold>" suffix.
            bound = ""
        else:
            bound = f" {self.operator} {self.threshold}"
        return f"EvalScore({status} {self.metric}={self.score}{bound})"
63
+
64
+
65
class EvalResult(Resource):
    """Result of an :func:`Eval`: run identifiers plus per-metric verdicts.

    ``passed`` is the single CI signal. It is true only when the run's
    ``gate_status`` is not ``"fail"`` (compared case-insensitively; a
    missing status counts as OK) **and** every entry in ``scores`` passed.
    """

    name: str = ""
    run_id: str = ""
    gate_status: str | None = None
    overall_score: float | None = None
    scores: list[EvalScore] | None = None

    @property
    def passed(self) -> bool:
        """True when the gate did not fail and no scorer failed."""
        if (self.gate_status or "").lower() == "fail":
            return False
        return all(entry.passed for entry in (self.scores or []))

    @property
    def failures(self) -> list[EvalScore]:
        """Return the scores whose ``passed`` flag is false (possibly empty)."""
        failed: list[EvalScore] = []
        for entry in self.scores or []:
            if not entry.passed:
                failed.append(entry)
        return failed

    def __repr__(self) -> str:
        """Return a one-line PASS/FAIL summary with score and failure counts."""
        verdict = "PASS" if self.passed else "FAIL"
        total = len(self.scores or [])
        failed = len(self.failures)
        return (
            f"EvalResult({verdict} name={self.name!r} run={self.run_id!r} "
            f"scores={total} failures={failed})"
        )

    def _repr_html_(self) -> str:
        """Return the HTML that Jupyter renders for this object in a cell."""
        # Imported lazily, only when a rich repr is actually requested.
        from aimeval._repr_html import eval_result_html

        return eval_result_html(self)
99
+
100
+
101
+ # ── score extraction ────────────────────────────────────────────────────
102
+
103
def _all_scores(run: Run) -> dict[str, float]:
    """Flatten a run's ``gt_scores`` and ``judge_scores`` into one mapping.

    The two maps are merged in that order, so when both contain the same
    key the ``judge_scores`` value wins. Keys are stringified and values
    are converted to ``float``; entries whose value is not an ``int`` or
    ``float`` are dropped, and a source that is not a ``dict`` is skipped.

    Note: ``bool`` is a subclass of ``int``, so boolean values pass the
    numeric check and are stored as ``1.0`` / ``0.0``.
    """
    flat: dict[str, float] = {}
    for bucket in (run.gt_scores, run.judge_scores):
        if not isinstance(bucket, dict):
            continue
        flat.update(
            (str(slug), float(raw))
            for slug, raw in bucket.items()
            if isinstance(raw, (int, float))
        )
    return flat
115
+
116
+
117
def _score_for(metric: Metric, scores: dict[str, float]) -> float | None:
    """Look up ``metric``'s value in ``scores``, tolerating alias keys.

    An exact match on ``metric.slug`` is returned directly. Otherwise each
    key is passed through :func:`resolve`, and the first key that resolves
    to a metric with the same slug supplies the value (this is how a key
    such as ``reference_attribute_match`` can satisfy
    ``reference_attr_match``). Returns ``None`` when nothing matches.
    """
    try:
        return scores[metric.slug]
    except KeyError:
        pass

    # Fall back to mapping every returned key onto its canonical metric.
    for returned_key, returned_value in scores.items():
        candidate = resolve(returned_key)
        if candidate is None:
            continue
        if candidate.slug == metric.slug:
            return returned_value
    return None
129
+
130
+
131
def _build_scores(
    run: Run, scorers: Sequence[Metric] | None,
) -> list[EvalScore]:
    """Turn a run's flattened scores into a list of :class:`EvalScore`.

    With scorers: one entry per scorer, in the given order, carrying the
    scorer's threshold/operator and the verdict from ``scorer.passed(value)``
    (``value`` is ``None`` when the run has no score for that metric).

    Without scorers (``None`` or empty): one entry per score found on the
    run, with no threshold and ``passed=True``. Keys that :func:`resolve`
    recognises are reported under the resolved slug/label; unknown keys
    are reported as-is.
    """
    server_scores = _all_scores(run)

    if not scorers:
        # Report-only mode: nothing to compare against, so nothing can fail.
        report: list[EvalScore] = []
        for key, value in server_scores.items():
            known = resolve(key)
            report.append(EvalScore(
                metric=(known.slug if known else key),
                label=(known.label if known else key),
                score=value, threshold=None,
                operator=">=", passed=True,
            ))
        return report

    verdicts: list[EvalScore] = []
    for scorer in scorers:
        observed = _score_for(scorer, server_scores)
        verdicts.append(EvalScore(
            metric=scorer.slug, label=scorer.label, score=observed,
            threshold=scorer.threshold, operator=scorer.operator,
            passed=scorer.passed(observed),
        ))
    return verdicts
155
+
156
+
157
def _coerce_one(item: Metric | str) -> Metric:
    """Return ``item`` as a ``Metric`` instance (slug strings are resolved)."""
    if isinstance(item, Metric):
        return item
    if not isinstance(item, str):
        raise TypeError(f"scorer must be a Metric or slug str, got {type(item)}")
    metric_cls = resolve(item)
    if metric_cls is None:
        raise KeyError(f"unknown metric {item!r}")
    # A bare slug carries no threshold, so the class is built with defaults.
    return metric_cls()


def _coerce_scorers(
    scorers: Sequence[Metric | str] | None,
) -> list[Metric] | None:
    """Normalise a mixed list of ``Metric`` objects and slug strings.

    Returns ``None`` unchanged (meaning "no scorers given"); otherwise a
    new list of ``Metric`` instances in the same order.

    Raises:
        KeyError: a string did not resolve to a known metric.
        TypeError: an item is neither a ``Metric`` nor a ``str``.
    """
    if scorers is None:
        return None
    return [_coerce_one(item) for item in scorers]
174
+
175
+
176
def _resolve_client(client: "AIMEval | None") -> "AIMEval":
    """Return ``client`` if given, else build one from environment variables.

    Raises:
        RuntimeError: no client was passed and ``AIMEVAL_API_KEY`` or
            ``AIMEVAL_BASE_URL`` is unset or empty.
    """
    if client is not None:
        return client

    # Fail here with an actionable message instead of constructing a
    # client from incomplete configuration.
    required_vars = ("AIMEVAL_API_KEY", "AIMEVAL_BASE_URL")
    if not all(os.environ.get(var) for var in required_vars):
        raise RuntimeError(
            "Eval() needs a client: pass client=AIMEval(...) or set "
            "AIMEVAL_API_KEY + AIMEVAL_BASE_URL env vars.",
        )

    # Imported lazily to avoid a hard import cycle at module load.
    from aimeval.client import AIMEval

    return AIMEval()
189
+
190
+
191
+ # ── public API ──────────────────────────────────────────────────────────
192
+
193
def Eval(
    name: str,
    *,
    model: str,
    dataset: str,
    metrics: str,
    scorers: Sequence[Metric | str] | None = None,
    client: "AIMEval | None" = None,
    timeout: int = 600,
    progress: bool | str = "auto",
    **run_kwargs: Any,
) -> EvalResult:
    """Launch a server-side evaluation and judge it against thresholds.

    The client is resolved and the scorers are validated *before* the run
    is started, so a missing client or an unknown scorer fails without
    launching anything.

    Args:
        name: Run name; also used as the name of the returned result.
        model / dataset / metrics: IDs forwarded to ``runs.evaluate``
            (``metrics`` is the metric-collection ID that selects which
            metrics the server computes).
        scorers: :class:`~aimeval.metrics.Metric` instances (with
            thresholds) or metric slugs, each checked against the run's
            score for that metric. Omit to report every returned score
            without thresholds.
        client: An ``AIMEval``; when omitted one is built from the
            ``AIMEVAL_API_KEY`` / ``AIMEVAL_BASE_URL`` env vars.
        timeout / progress / run_kwargs: Forwarded unchanged to
            ``runs.evaluate``.

    Returns:
        :class:`EvalResult` — ``.passed`` is the CI signal.

    Raises:
        RuntimeError: no ``client`` given and the env vars are not set.
        KeyError / TypeError: a scorer is an unknown slug / wrong type.
    """
    active_client = _resolve_client(client)
    thresholds = _coerce_scorers(scorers)
    finished_run = active_client.runs.evaluate(
        name=name,
        model=model,
        dataset=dataset,
        metrics=metrics,
        timeout=timeout,
        progress=progress,
        **run_kwargs,
    )
    return eval_from_run(finished_run, scorers=thresholds, name=name)
229
+
230
+
231
def eval_from_run(
    run: Run,
    *,
    scorers: Sequence[Metric | str] | None = None,
    name: str | None = None,
) -> EvalResult:
    """Build an :class:`EvalResult` from an existing :class:`Run`.

    Use this when the evaluation was run elsewhere (the UI, an earlier CI
    step) and only the threshold verdict is needed::

        run = client.runs.retrieve(run_id)
        result = aimeval.eval_from_run(run, scorers=[Hallucination(threshold=0.1)])

    Args:
        run: The run whose scores are interpreted.
        scorers: Metric instances or slugs to check; omit to report every
            score on the run without thresholds.
        name: Result name; falls back to ``run.name``, then to ``""``.
    """
    thresholds = _coerce_scorers(scorers)
    fields = {
        "name": name or run.name or "",
        "run_id": run.id,
        "gate_status": run.gate_status,
        "overall_score": run.overall_score,
        "scores": _build_scores(run, thresholds),
    }
    return EvalResult(**fields)
253
+
254
+
255
class EvalAssertionError(AssertionError):
    """Raised by :func:`assert_test` when a scorer or the run's gate failed.

    Subclasses ``AssertionError``, so pytest reports it as an ordinary
    assertion failure.
    """
257
+
258
+
259
def assert_test(
    result: EvalResult | Run,
    scorers: Sequence[Metric | str] | None = None,
) -> EvalResult:
    """Raise unless every scorer met its threshold and the gate did not fail.

    pytest-friendly (DeepEval ``assert_test`` ergonomics)::

        def test_nightly_quality():
            result = aimeval.Eval(name="t", model=M, dataset=D, metrics=C,
                                  scorers=[VisualFaithfulness(threshold=0.8)])
            aimeval.assert_test(result)

    Args:
        result: An :class:`EvalResult`, or a :class:`Run` that is
            interpreted on the spot using ``scorers``.
        scorers: Only used when ``result`` is a ``Run``.

    Returns:
        The (possibly freshly built) :class:`EvalResult`, for inspection.

    Raises:
        EvalAssertionError: one or more scorers failed, or the run's
            ``gate_status`` is ``"fail"``.
        ValueError: ``scorers`` was given together with an ``EvalResult``
            that holds no scores.
    """
    if isinstance(result, Run):
        result = eval_from_run(result, scorers=scorers)
    elif scorers is not None and not (result.scores or []):
        # Re-interpreting would need the raw run, which is not available
        # here; refuse rather than silently ignore the scorers.
        raise ValueError(
            "assert_test received scorers but the EvalResult has no scores; "
            "pass scorers to Eval()/eval_from_run() instead.",
        )

    failed = result.failures
    gate_failed = (result.gate_status or "").lower() == "fail"

    if not failed:
        if gate_failed:
            raise EvalAssertionError(
                f"Eval '{result.name}' (run {result.run_id}) — gate_status=fail",
            )
        return result

    # One line per failing metric, plus a trailing gate line when relevant.
    detail_lines = [
        f" ✗ {f.label} ({f.metric}): score={f.score} "
        f"{f.operator} {f.threshold} → FAILED"
        for f in failed
    ]
    lines = "\n".join(detail_lines)
    if gate_failed:
        gate_line = f"\n ✗ gate_status={result.gate_status}"
    else:
        gate_line = ""
    raise EvalAssertionError(
        f"Eval '{result.name}' (run {result.run_id}) failed "
        f"{len(failed)} metric threshold(s):\n{lines}{gate_line}",
    )
307
+
308
+
309
# Public names of this module; the underscore-prefixed helpers stay private.
__all__ = [
    # Entry points
    "Eval",
    "eval_from_run",
    "assert_test",
    # Result types
    "EvalResult",
    "EvalScore",
    # Errors
    "EvalAssertionError",
]
aimeval/_exceptions.py ADDED
@@ -0,0 +1,119 @@
1
+ """Error hierarchy.
2
+
3
+ Catch-by-status pattern (industry standard — OpenAI, Anthropic, Stripe)::
4
+
5
+ try:
6
+ client.runs.create(...)
7
+ except aimeval.RateLimitError:
8
+ time.sleep(30)
9
+ except aimeval.AuthenticationError:
10
+ sys.exit("bad key")
11
+ except aimeval.AIMEvalError:
12
+ ... # everything else
13
+ """
14
+ from __future__ import annotations
15
+
16
+
17
class AIMEvalError(Exception):
    """Root of every exception the SDK raises.

    Besides the message, each error carries request context so it can be
    quoted in a support ticket:

    * ``status_code`` — HTTP status, ``0`` when not supplied.
    * ``details`` — extra error info; always a dict (empty if none given).
    * ``request_id`` — the upstream ``X-Request-Id`` value, if supplied.
    * ``response`` — the raw response object, if supplied.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int = 0,
        details: dict | None = None,
        request_id: str | None = None,
        response: object | None = None,
    ):
        super().__init__(message)
        self.status_code = status_code
        # Normalise "no details" (``None`` or empty) to a fresh dict so
        # callers can index into it without a ``None`` check.
        self.details = details if details else {}
        self.request_id = request_id
        # Raw httpx.Response for power users that need headers / body.
        # Typed as ``object`` to avoid leaking httpx into the public type.
        self.response = response

    def __str__(self) -> str:
        """Return the message, suffixed with the request id when one is set."""
        text = super().__str__()
        if not self.request_id:
            return text
        return f"{text} (request_id={self.request_id})"
48
+
49
+
50
class APIConnectionError(AIMEvalError):
    """No HTTP response was received (network, DNS or TLS failure)."""


class APITimeoutError(APIConnectionError):
    """The request did not complete within the configured timeout."""


class AuthenticationError(AIMEvalError):
    """HTTP 401 — the API key is missing, invalid, or revoked."""


class PermissionError_(AIMEvalError):  # trailing underscore: don't shadow the builtin
    """HTTP 403 — the key is valid but not allowed to perform this action."""


class NotFoundError(AIMEvalError):
    """HTTP 404 — the resource doesn't exist (or isn't visible to this key)."""


class ConflictError(AIMEvalError):
    """HTTP 409 — concurrent edit or duplicate idempotency key."""


class BadRequestError(AIMEvalError):
    """HTTP 400 / 422 — malformed request; ``details`` has field-level info."""


class RateLimitError(AIMEvalError):
    """HTTP 429 — too many requests; retry after ``Retry-After`` seconds."""


class APIServerError(AIMEvalError):
    """HTTP 5xx — the failure happened on the server side."""
84
+
85
+
86
class AIMEvalSecurityWarning(UserWarning):
    """Warning category for SDK usage that risks leaking credentials.

    Being a ``UserWarning`` subclass, it can be silenced with
    ``warnings.simplefilter('ignore', AIMEvalSecurityWarning)``.
    """
91
+
92
+
93
def from_status(
    status_code: int,
    message: str,
    details: dict | None = None,
    *,
    request_id: str | None = None,
    response: object | None = None,
) -> AIMEvalError:
    """Build the most specific SDK error for an HTTP status code.

    Codes listed in ``_STATUS_MAP`` get their dedicated subclass. Any
    other code becomes :class:`APIServerError` when it is ``>= 500`` and
    a plain :class:`AIMEvalError` otherwise.

    The exception is *returned*, not raised; the caller decides when to
    raise it.
    """
    try:
        error_type = _STATUS_MAP[status_code]
    except KeyError:
        error_type = APIServerError if status_code >= 500 else AIMEvalError
    return error_type(
        message,
        status_code=status_code,
        details=details,
        request_id=request_id,
        response=response,
    )
109
+
110
+
111
# Explicit status -> exception mapping consulted by ``from_status``.
# Codes absent from this table are handled by ``from_status``'s fallback.
_STATUS_MAP: dict[int, type[AIMEvalError]] = {
    400: BadRequestError,
    401: AuthenticationError,
    403: PermissionError_,
    404: NotFoundError,
    409: ConflictError,
    422: BadRequestError,  # validation failures share the 400 error type
    429: RateLimitError,
}