aimeval 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aimeval/__init__.py +191 -0
- aimeval/_beta.py +61 -0
- aimeval/_eval.py +316 -0
- aimeval/_exceptions.py +119 -0
- aimeval/_logging.py +139 -0
- aimeval/_otel.py +184 -0
- aimeval/_pagination.py +181 -0
- aimeval/_polling.py +40 -0
- aimeval/_progress.py +111 -0
- aimeval/_report.py +500 -0
- aimeval/_repr_html.py +147 -0
- aimeval/_streaming.py +519 -0
- aimeval/_trace.py +431 -0
- aimeval/_trace_io.py +190 -0
- aimeval/_types.py +659 -0
- aimeval/_webhook_verify.py +166 -0
- aimeval/cli/__init__.py +33 -0
- aimeval/cli/_config.py +113 -0
- aimeval/cli/_exit.py +56 -0
- aimeval/cli/_format.py +101 -0
- aimeval/cli/_output.py +378 -0
- aimeval/cli/_resolve.py +75 -0
- aimeval/cli/_session.py +90 -0
- aimeval/cli/app.py +116 -0
- aimeval/cli/commands/__init__.py +0 -0
- aimeval/cli/commands/annotations.py +125 -0
- aimeval/cli/commands/auth.py +67 -0
- aimeval/cli/commands/compare.py +225 -0
- aimeval/cli/commands/doctor.py +265 -0
- aimeval/cli/commands/evaluate.py +92 -0
- aimeval/cli/commands/gate_extra.py +95 -0
- aimeval/cli/commands/init.py +294 -0
- aimeval/cli/commands/models_extra.py +46 -0
- aimeval/cli/commands/resources.py +207 -0
- aimeval/cli/commands/run.py +546 -0
- aimeval/cli/commands/search.py +26 -0
- aimeval/client.py +888 -0
- aimeval/enums.py +97 -0
- aimeval/events.py +64 -0
- aimeval/metrics.py +368 -0
- aimeval/presentations.py +190 -0
- aimeval/py.typed +0 -0
- aimeval/pytest_plugin.py +81 -0
- aimeval/resources/__init__.py +41 -0
- aimeval/resources/analytics.py +101 -0
- aimeval/resources/annotations.py +275 -0
- aimeval/resources/collections.py +148 -0
- aimeval/resources/compare.py +100 -0
- aimeval/resources/datasets.py +481 -0
- aimeval/resources/evaluate.py +52 -0
- aimeval/resources/gates.py +219 -0
- aimeval/resources/metrics.py +31 -0
- aimeval/resources/models.py +316 -0
- aimeval/resources/observability.py +94 -0
- aimeval/resources/prompts.py +623 -0
- aimeval/resources/regression_sets.py +159 -0
- aimeval/resources/runs.py +774 -0
- aimeval/resources/search.py +27 -0
- aimeval/resources/webhooks.py +160 -0
- aimeval/resources/wizard.py +30 -0
- aimeval/types.py +378 -0
- aimeval-0.6.0.dist-info/METADATA +20 -0
- aimeval-0.6.0.dist-info/RECORD +66 -0
- aimeval-0.6.0.dist-info/WHEEL +5 -0
- aimeval-0.6.0.dist-info/entry_points.txt +5 -0
- aimeval-0.6.0.dist-info/top_level.txt +1 -0
aimeval/__init__.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AIMEval Python SDK — programmatic access for CI/CD pipelines.
|
|
3
|
+
|
|
4
|
+
Quick start::
|
|
5
|
+
|
|
6
|
+
from aimeval import AIMEval
|
|
7
|
+
|
|
8
|
+
client = AIMEval(api_key="aime_sk_...", base_url="https://app.aimeval.com")
|
|
9
|
+
|
|
10
|
+
# End-to-end workflow with a live progress bar (TTY only):
|
|
11
|
+
run = client.evaluate(
|
|
12
|
+
name="nightly-regression",
|
|
13
|
+
model=MODEL_ID,
|
|
14
|
+
dataset=DATASET_ID,
|
|
15
|
+
metrics=COLLECTION_ID,
|
|
16
|
+
)
|
|
17
|
+
sys.exit(0 if run.passed else 1)
|
|
18
|
+
|
|
19
|
+
# Or step by step (industry-standard resource namespaces):
|
|
20
|
+
run = client.runs.create(name="...", model=..., dataset=..., metrics=...)
|
|
21
|
+
run = client.runs.wait(run.id, progress=True)
|
|
22
|
+
|
|
23
|
+
# Auto-pagination — iterate every page transparently:
|
|
24
|
+
for run in client.runs.list(status="completed"):
|
|
25
|
+
print(run.id, run.score)
|
|
26
|
+
"""
|
|
27
|
+
from aimeval.client import AIMEval, AsyncAIMEval
|
|
28
|
+
from aimeval._otel import disable_otel, enable_otel, is_available as otel_is_available, otel_enabled
|
|
29
|
+
from aimeval import events
|
|
30
|
+
from aimeval._trace import Span, TraceCollector, records, span, trace
|
|
31
|
+
from aimeval._trace_io import (
|
|
32
|
+
ReplayResult,
|
|
33
|
+
load_trace,
|
|
34
|
+
load_trace_iter,
|
|
35
|
+
replay_trace,
|
|
36
|
+
save_trace,
|
|
37
|
+
)
|
|
38
|
+
from aimeval._webhook_verify import (
|
|
39
|
+
WebhookVerificationError,
|
|
40
|
+
construct_event,
|
|
41
|
+
verify_webhook,
|
|
42
|
+
)
|
|
43
|
+
from aimeval._eval import (
|
|
44
|
+
Eval,
|
|
45
|
+
EvalAssertionError,
|
|
46
|
+
EvalResult,
|
|
47
|
+
EvalScore,
|
|
48
|
+
assert_test,
|
|
49
|
+
eval_from_run,
|
|
50
|
+
)
|
|
51
|
+
from aimeval import metrics
|
|
52
|
+
from aimeval._exceptions import (
|
|
53
|
+
AIMEvalError,
|
|
54
|
+
AIMEvalSecurityWarning,
|
|
55
|
+
APIConnectionError,
|
|
56
|
+
APIServerError,
|
|
57
|
+
APITimeoutError,
|
|
58
|
+
AuthenticationError,
|
|
59
|
+
BadRequestError,
|
|
60
|
+
ConflictError,
|
|
61
|
+
NotFoundError,
|
|
62
|
+
RateLimitError,
|
|
63
|
+
)
|
|
64
|
+
from aimeval._streaming import (
|
|
65
|
+
AsyncRunSSEStream,
|
|
66
|
+
AsyncRunStream,
|
|
67
|
+
RunEvent,
|
|
68
|
+
RunSSEStream,
|
|
69
|
+
RunStream,
|
|
70
|
+
)
|
|
71
|
+
from aimeval._types import (
|
|
72
|
+
Annotation,
|
|
73
|
+
AnnotationsBootstrapResult,
|
|
74
|
+
AuditEntry,
|
|
75
|
+
CompareHistoryItem,
|
|
76
|
+
Comparison,
|
|
77
|
+
ConnectionTestResult,
|
|
78
|
+
CostEstimate,
|
|
79
|
+
Dataset,
|
|
80
|
+
GateCompatibility,
|
|
81
|
+
MetricCollection,
|
|
82
|
+
MetricDistribution,
|
|
83
|
+
MetricRegistry,
|
|
84
|
+
MetricRegistryEntry,
|
|
85
|
+
MetricSummary,
|
|
86
|
+
Model,
|
|
87
|
+
Prompt,
|
|
88
|
+
PromptDiff,
|
|
89
|
+
PromptTestResult,
|
|
90
|
+
PromptTestRun,
|
|
91
|
+
PromptVersion,
|
|
92
|
+
QualityGate,
|
|
93
|
+
RegressionSet,
|
|
94
|
+
Resource,
|
|
95
|
+
Run,
|
|
96
|
+
SearchHit,
|
|
97
|
+
SearchResults,
|
|
98
|
+
UploadStatus,
|
|
99
|
+
UploadTicket,
|
|
100
|
+
Usage,
|
|
101
|
+
Webhook,
|
|
102
|
+
WizardBootstrap,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
__version__ = "0.6.0"
|
|
107
|
+
|
|
108
|
+
__all__ = [
|
|
109
|
+
# Clients
|
|
110
|
+
"AIMEval",
|
|
111
|
+
"AsyncAIMEval",
|
|
112
|
+
"__version__",
|
|
113
|
+
# Tracing (local span capture → eval datasets; optional OTel mirror)
|
|
114
|
+
"trace",
|
|
115
|
+
"span",
|
|
116
|
+
"TraceCollector",
|
|
117
|
+
"Span",
|
|
118
|
+
"records",
|
|
119
|
+
"events",
|
|
120
|
+
# Trace recording + replay (prod-to-eval pattern)
|
|
121
|
+
"save_trace",
|
|
122
|
+
"load_trace",
|
|
123
|
+
"load_trace_iter",
|
|
124
|
+
"replay_trace",
|
|
125
|
+
"ReplayResult",
|
|
126
|
+
"enable_otel",
|
|
127
|
+
"disable_otel",
|
|
128
|
+
"otel_enabled",
|
|
129
|
+
"otel_is_available",
|
|
130
|
+
# Eval harness + metrics (CI assertion surface)
|
|
131
|
+
"Eval",
|
|
132
|
+
"eval_from_run",
|
|
133
|
+
"assert_test",
|
|
134
|
+
"EvalResult",
|
|
135
|
+
"EvalScore",
|
|
136
|
+
"EvalAssertionError",
|
|
137
|
+
"metrics",
|
|
138
|
+
# Errors
|
|
139
|
+
"AIMEvalError",
|
|
140
|
+
"AIMEvalSecurityWarning",
|
|
141
|
+
"APIConnectionError",
|
|
142
|
+
"APIServerError",
|
|
143
|
+
"APITimeoutError",
|
|
144
|
+
"AuthenticationError",
|
|
145
|
+
"BadRequestError",
|
|
146
|
+
"ConflictError",
|
|
147
|
+
"NotFoundError",
|
|
148
|
+
"RateLimitError",
|
|
149
|
+
# Response types
|
|
150
|
+
"Annotation",
|
|
151
|
+
"AnnotationsBootstrapResult",
|
|
152
|
+
"AuditEntry",
|
|
153
|
+
"CompareHistoryItem",
|
|
154
|
+
"Comparison",
|
|
155
|
+
"ConnectionTestResult",
|
|
156
|
+
"CostEstimate",
|
|
157
|
+
"Dataset",
|
|
158
|
+
"GateCompatibility",
|
|
159
|
+
"MetricCollection",
|
|
160
|
+
"MetricDistribution",
|
|
161
|
+
"MetricRegistry",
|
|
162
|
+
"MetricRegistryEntry",
|
|
163
|
+
"MetricSummary",
|
|
164
|
+
"Model",
|
|
165
|
+
"Prompt",
|
|
166
|
+
"PromptDiff",
|
|
167
|
+
"PromptTestResult",
|
|
168
|
+
"PromptTestRun",
|
|
169
|
+
"PromptVersion",
|
|
170
|
+
"QualityGate",
|
|
171
|
+
"RegressionSet",
|
|
172
|
+
"Resource",
|
|
173
|
+
"Run",
|
|
174
|
+
"SearchHit",
|
|
175
|
+
"SearchResults",
|
|
176
|
+
"UploadStatus",
|
|
177
|
+
"UploadTicket",
|
|
178
|
+
"Usage",
|
|
179
|
+
"Webhook",
|
|
180
|
+
"WizardBootstrap",
|
|
181
|
+
# Webhook verifier (security helper)
|
|
182
|
+
"WebhookVerificationError",
|
|
183
|
+
"construct_event",
|
|
184
|
+
"verify_webhook",
|
|
185
|
+
# Streaming
|
|
186
|
+
"AsyncRunSSEStream",
|
|
187
|
+
"AsyncRunStream",
|
|
188
|
+
"RunEvent",
|
|
189
|
+
"RunSSEStream",
|
|
190
|
+
"RunStream",
|
|
191
|
+
]
|
aimeval/_beta.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""``client.beta.*`` — experimental surface area.
|
|
2
|
+
|
|
3
|
+
Industry pattern from OpenAI / Anthropic: features that haven't earned
|
|
4
|
+
SemVer protection yet live under a dedicated ``beta`` namespace. This
|
|
5
|
+
gives them three properties at once:
|
|
6
|
+
|
|
7
|
+
1. **Visible**: a glance at ``client.beta.<something>`` tells the
|
|
8
|
+
reader "this isn't stable yet — pin your SDK and read the
|
|
9
|
+
CHANGELOG before bumping".
|
|
10
|
+
2. **Safe to remove / rename** without bumping the major version of
|
|
11
|
+
the SDK. The ``beta`` package itself is the contract.
|
|
12
|
+
3. **Cheap to ship**: graduation to the stable surface (e.g.
|
|
13
|
+
``client.replay``) is a one-line re-export when the API settles.
|
|
14
|
+
|
|
15
|
+
Currently in beta:
|
|
16
|
+
|
|
17
|
+
- :attr:`Beta.replay` — trace recording + replay helpers
|
|
18
|
+
(:func:`aimeval.save_trace` / :func:`aimeval.load_trace` /
|
|
19
|
+
:func:`aimeval.replay_trace`). Wrapped under ``client.beta.replay``
|
|
20
|
+
so the helpers feel discoverable from autocomplete even when
|
|
21
|
+
callers don't import them directly.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from typing import TYPE_CHECKING
|
|
26
|
+
|
|
27
|
+
from aimeval import _trace_io
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
30
|
+
from aimeval.client import AIMEval, AsyncAIMEval
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _Replay:
    """Beta-namespaced access to the helpers in :mod:`aimeval._trace_io`.

    Reached as ``client.beta.replay.<helper>(...)``. Every helper below is
    the ``_trace_io`` function itself, so calling it through this wrapper is
    the same as calling the module-level function directly.
    """

    # Wrapped in ``staticmethod`` so that looking a helper up on an instance
    # does not bind ``self`` as its first argument.
    save_trace = staticmethod(_trace_io.save_trace)
    load_trace = staticmethod(_trace_io.load_trace)
    load_trace_iter = staticmethod(_trace_io.load_trace_iter)
    replay_trace = staticmethod(_trace_io.replay_trace)

    def __init__(self, _client: "AIMEval | AsyncAIMEval"):
        # None of the helpers above read this attribute; the client is only
        # stored so that a later beta feature needing network access has
        # something to call.
        self._client = _client
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Beta:
    """Holder for experimental SDK features (exposed as ``client.beta``).

    Attributes:
        replay: A :class:`_Replay` wrapper built around the given client.
    """

    def __init__(self, client: "AIMEval | AsyncAIMEval"):
        replay_helpers = _Replay(client)
        self.replay = replay_helpers
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
__all__ = ["Beta"]
|
aimeval/_eval.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""``aimeval.Eval`` + ``assert_test`` — the CI evaluation harness.
|
|
2
|
+
|
|
3
|
+
Competitors expose a one-call eval harness (Braintrust ``Eval(name,
|
|
4
|
+
data, task, scores)``, DeepEval ``evaluate(...)`` + ``assert_test``,
|
|
5
|
+
LangSmith ``client.evaluate(...)``). This module is AIMEval's version,
|
|
6
|
+
scoped honestly to how AIMEval actually scores.
|
|
7
|
+
|
|
8
|
+
**How AIMEval scoring works (and why this layer is honest).** The
|
|
9
|
+
metrics (visual faithfulness, hallucination, …) are computed
|
|
10
|
+
server-side — the VLM judge + computation pipeline need the image,
|
|
11
|
+
which the SDK doesn't have. So :func:`Eval` does NOT re-score locally.
|
|
12
|
+
It:
|
|
13
|
+
|
|
14
|
+
1. launches a server-side run (``client.runs.evaluate``) — or takes an
|
|
15
|
+
already-finished run via :func:`eval_from_run`;
|
|
16
|
+
2. reads the run's server-computed per-metric scores;
|
|
17
|
+
3. interprets each score against the local :class:`~aimeval.metrics.Metric`
|
|
18
|
+
threshold you passed (direction-aware: lower-is-better metrics use
|
|
19
|
+
``<=``);
|
|
20
|
+
4. returns a structured :class:`EvalResult` with a single ``passed``
|
|
21
|
+
boolean for CI.
|
|
22
|
+
|
|
23
|
+
:func:`assert_test` is the pytest-friendly wrapper: it raises
|
|
24
|
+
``EvalAssertionError`` (an ``AssertionError`` subclass) listing exactly which metrics missed their threshold —
|
|
25
|
+
the DeepEval ``assert_test`` ergonomics, on AIMEval's server-scored
|
|
26
|
+
runs.
|
|
27
|
+
|
|
28
|
+
import aimeval
|
|
29
|
+
from aimeval.metrics import VisualFaithfulness, Hallucination
|
|
30
|
+
|
|
31
|
+
result = aimeval.Eval(
|
|
32
|
+
name="ci-nightly", model=M, dataset=D, metrics=C,
|
|
33
|
+
scorers=[VisualFaithfulness(threshold=0.8), Hallucination(threshold=0.1)],
|
|
34
|
+
)
|
|
35
|
+
aimeval.assert_test(result) # raises if any scorer failed
|
|
36
|
+
"""
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import os
|
|
40
|
+
from typing import TYPE_CHECKING, Any, Sequence
|
|
41
|
+
|
|
42
|
+
from aimeval._types import Resource, Run
|
|
43
|
+
from aimeval.metrics import Metric, resolve
|
|
44
|
+
|
|
45
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
46
|
+
from aimeval.client import AIMEval
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class EvalScore(Resource):
    """Verdict for a single metric within an :class:`EvalResult`."""

    # Metric slug and its display label.
    metric: str = ""
    label: str = ""
    # Score read from the run; ``None`` when no score was found.
    score: float | None = None
    # Threshold and comparison operator the score was checked with;
    # ``threshold`` is ``None`` when the score is report-only.
    threshold: float | None = None
    operator: str = ">="
    passed: bool = True

    def __repr__(self) -> str:
        """Return a compact ``EvalScore(<mark> slug=score [op threshold])``."""
        if self.passed:
            mark = "✓"
        else:
            mark = "✗"
        # The comparison suffix is only shown when a threshold exists.
        if self.threshold is None:
            bound = ""
        else:
            bound = f" {self.operator} {self.threshold}"
        return f"EvalScore({mark} {self.metric}={self.score}{bound})"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class EvalResult(Resource):
    """Result of an :func:`Eval`: run identity plus per-metric verdicts.

    ``passed`` is the one value CI should look at. It is true only when no
    entry in ``scores`` failed **and** ``gate_status`` is not ``"fail"``
    (compared case-insensitively).
    """

    name: str = ""
    run_id: str = ""
    gate_status: str | None = None
    overall_score: float | None = None
    scores: list[EvalScore] | None = None

    @property
    def passed(self) -> bool:
        """True when the gate did not fail and every score passed."""
        if (self.gate_status or "").lower() == "fail":
            return False
        return all(item.passed for item in (self.scores or []))

    @property
    def failures(self) -> list[EvalScore]:
        """The subset of ``scores`` whose ``passed`` flag is false."""
        failed: list[EvalScore] = []
        for item in self.scores or []:
            if not item.passed:
                failed.append(item)
        return failed

    def __repr__(self) -> str:
        """Return a one-line PASS/FAIL summary with score/failure counts."""
        verdict = "PASS" if self.passed else "FAIL"
        total = len(self.scores or [])
        failed = len(self.failures)
        return (
            f"EvalResult({verdict} name={self.name!r} run={self.run_id!r} "
            f"scores={total} failures={failed})"
        )

    def _repr_html_(self) -> str:
        """Return the HTML rendering used by Jupyter for notebook cells."""
        # Imported on demand so the HTML helper is only loaded when needed.
        from aimeval._repr_html import eval_result_html

        return eval_result_html(self)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── score extraction ────────────────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
def _all_scores(run: Run) -> dict[str, float]:
|
|
104
|
+
"""Merge a run's judge + gt + top-level score maps into one slug→value
|
|
105
|
+
dict. Server returns scores split across ``judge_scores`` /
|
|
106
|
+
``gt_scores``; we flatten so a metric lookup finds its value wherever
|
|
107
|
+
the pipeline emitted it."""
|
|
108
|
+
merged: dict[str, float] = {}
|
|
109
|
+
for source in (run.gt_scores, run.judge_scores):
|
|
110
|
+
if isinstance(source, dict):
|
|
111
|
+
for k, v in source.items():
|
|
112
|
+
if isinstance(v, (int, float)):
|
|
113
|
+
merged[str(k)] = float(v)
|
|
114
|
+
return merged
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _score_for(metric: Metric, scores: dict[str, float]) -> float | None:
|
|
118
|
+
"""Find a metric's score, tolerating alias/label keys the server may
|
|
119
|
+
have used (e.g. ``reference_attribute_match`` for
|
|
120
|
+
``reference_attr_match``)."""
|
|
121
|
+
if metric.slug in scores:
|
|
122
|
+
return scores[metric.slug]
|
|
123
|
+
# Try resolving each returned key back to the canonical slug.
|
|
124
|
+
for key, value in scores.items():
|
|
125
|
+
cls = resolve(key)
|
|
126
|
+
if cls is not None and cls.slug == metric.slug:
|
|
127
|
+
return value
|
|
128
|
+
return None
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _build_scores(
    run: Run, scorers: Sequence[Metric] | None,
) -> list[EvalScore]:
    """Turn a run's flattened scores into a list of :class:`EvalScore`.

    With a non-empty ``scorers``, one entry is produced per scorer, in
    order, carrying that scorer's threshold/operator and the verdict from
    ``scorer.passed(value)``. With ``None`` or an empty ``scorers``, every
    score found on the run is reported with no threshold and ``passed=True``.
    """
    raw = _all_scores(run)

    if not scorers:
        # Report-only mode: nothing to compare against, so nothing can fail.
        reported: list[EvalScore] = []
        for slug, value in raw.items():
            known = resolve(slug)
            reported.append(EvalScore(
                metric=(known.slug if known else slug),
                label=(known.label if known else slug),
                score=value,
                threshold=None,
                operator=">=",
                passed=True,
            ))
        return reported

    judged: list[EvalScore] = []
    for scorer in scorers:
        found = _score_for(scorer, raw)
        judged.append(EvalScore(
            metric=scorer.slug,
            label=scorer.label,
            score=found,
            threshold=scorer.threshold,
            operator=scorer.operator,
            passed=scorer.passed(found),
        ))
    return judged
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _coerce_scorers(
    scorers: Sequence[Metric | str] | None,
) -> list[Metric] | None:
    """Normalise ``scorers`` into a list of :class:`Metric` instances.

    Args:
        scorers: ``None``, or a sequence whose items are ``Metric`` instances
            or metric slug strings. A single bare slug string or a single
            ``Metric`` is also accepted and treated as a one-item sequence.

    Returns:
        ``None`` when ``scorers`` is ``None``; otherwise a new list in which
        each slug has been replaced by a default-constructed instance of the
        class ``resolve`` returned for it.

    Raises:
        KeyError: A slug string that ``resolve`` does not recognise.
        TypeError: An item that is neither a ``Metric`` nor a ``str``.
    """
    if scorers is None:
        return None
    # A ``str`` is itself a sequence of one-character strings, so a bare slug
    # would otherwise be looked up letter by letter; a bare ``Metric`` would
    # fail as "not iterable". Wrap either into a one-element list.
    if isinstance(scorers, (str, Metric)):
        scorers = [scorers]
    out: list[Metric] = []
    for item in scorers:
        if isinstance(item, Metric):
            out.append(item)
        elif isinstance(item, str):
            metric_cls = resolve(item)
            if metric_cls is None:
                raise KeyError(f"unknown metric {item!r}")
            # Slugs carry no threshold, so the class defaults apply.
            out.append(metric_cls())
        else:
            raise TypeError(f"scorer must be a Metric or slug str, got {type(item)}")
    return out
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _resolve_client(client: "AIMEval | None") -> "AIMEval":
|
|
177
|
+
if client is not None:
|
|
178
|
+
return client
|
|
179
|
+
# Lazy import to avoid a hard import cycle at module load.
|
|
180
|
+
from aimeval.client import AIMEval
|
|
181
|
+
# Honest failure: if env isn't set, AIMEval() raises a clear error —
|
|
182
|
+
# we don't silently fabricate a client.
|
|
183
|
+
if not os.environ.get("AIMEVAL_API_KEY") or not os.environ.get("AIMEVAL_BASE_URL"):
|
|
184
|
+
raise RuntimeError(
|
|
185
|
+
"Eval() needs a client: pass client=AIMEval(...) or set "
|
|
186
|
+
"AIMEVAL_API_KEY + AIMEVAL_BASE_URL env vars.",
|
|
187
|
+
)
|
|
188
|
+
return AIMEval()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ── public API ──────────────────────────────────────────────────────────
|
|
192
|
+
|
|
193
|
+
def Eval(
    name: str,
    *,
    model: str,
    dataset: str,
    metrics: str,
    scorers: Sequence[Metric | str] | None = None,
    client: "AIMEval | None" = None,
    timeout: int = 600,
    progress: bool | str = "auto",
    **run_kwargs: Any,
) -> EvalResult:
    """Launch a run via ``client.runs.evaluate`` and judge it locally.

    Args:
        name: Run name; also used as the name of the returned result.
        model: Model ID, forwarded to ``runs.evaluate``.
        dataset: Dataset ID, forwarded to ``runs.evaluate``.
        metrics: Metric-collection ID, forwarded to ``runs.evaluate``; it
            selects which metrics the server computes.
        scorers: :class:`~aimeval.metrics.Metric` instances (carrying
            thresholds) or metric slugs to check the run's scores against.
            Leave as ``None`` to report every returned score with no
            threshold.
        client: The ``AIMEval`` client to use. When ``None``, one is built
            from environment variables.
        timeout: Forwarded to ``runs.evaluate``.
        progress: Forwarded to ``runs.evaluate``.
        **run_kwargs: Extra keyword arguments forwarded to ``runs.evaluate``.

    Returns:
        An :class:`EvalResult`; its ``.passed`` is the CI signal.
    """
    # Resolve the client before coercing scorers so a missing client is
    # reported first.
    api = _resolve_client(client)
    thresholds = _coerce_scorers(scorers)
    finished_run = api.runs.evaluate(
        name=name,
        model=model,
        dataset=dataset,
        metrics=metrics,
        timeout=timeout,
        progress=progress,
        **run_kwargs,
    )
    return eval_from_run(finished_run, scorers=thresholds, name=name)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def eval_from_run(
    run: Run,
    *,
    scorers: Sequence[Metric | str] | None = None,
    name: str | None = None,
) -> EvalResult:
    """Judge an existing :class:`Run` against thresholds without re-running.

    Handy when the evaluation was produced elsewhere (the UI, an earlier CI
    step) and only the verdict is needed::

        run = client.runs.retrieve(run_id)
        result = aimeval.eval_from_run(run, scorers=[Hallucination(threshold=0.1)])

    Args:
        run: The run whose scores, ``gate_status`` and ``overall_score``
            are read.
        scorers: Metrics or slugs to check; ``None`` reports all scores
            with no threshold.
        name: Name for the result. Falls back to ``run.name``, then ``""``.

    Returns:
        The assembled :class:`EvalResult`.
    """
    thresholds = _coerce_scorers(scorers)
    fields = {
        "name": name or run.name or "",
        "run_id": run.id,
        "gate_status": run.gate_status,
        "overall_score": run.overall_score,
        "scores": _build_scores(run, thresholds),
    }
    return EvalResult(**fields)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class EvalAssertionError(AssertionError):
    """Error :func:`assert_test` raises when a scorer or the run's gate failed."""
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def assert_test(
    result: EvalResult | Run,
    scorers: Sequence[Metric | str] | None = None,
) -> EvalResult:
    """Raise unless every scorer passed and the gate did not fail.

    Intended for pytest (DeepEval ``assert_test`` ergonomics)::

        def test_nightly_quality():
            result = aimeval.Eval(name="t", model=M, dataset=D, metrics=C,
                                  scorers=[VisualFaithfulness(threshold=0.8)])
            aimeval.assert_test(result)

    Args:
        result: An :class:`EvalResult`, or a :class:`Run` that is judged on
            the spot using ``scorers``.
        scorers: Only used when ``result`` is a ``Run``.

    Returns:
        The (possibly freshly built) :class:`EvalResult`, so callers can
        inspect it after a successful assertion.

    Raises:
        ValueError: ``scorers`` was given together with an ``EvalResult``
            that holds no scores.
        EvalAssertionError: At least one score failed, or ``gate_status``
            is ``"fail"`` (case-insensitive).
    """
    if isinstance(result, Run):
        result = eval_from_run(result, scorers=scorers)
    elif scorers is not None and not (result.scores or []):
        # The raw run is not available here, so the scorers cannot be
        # applied after the fact. Refuse instead of dropping them quietly.
        raise ValueError(
            "assert_test received scorers but the EvalResult has no scores; "
            "pass scorers to Eval()/eval_from_run() instead.",
        )

    failures = result.failures
    gate_failed = (result.gate_status or "").lower() == "fail"

    if not failures:
        if gate_failed:
            raise EvalAssertionError(
                f"Eval '{result.name}' (run {result.run_id}) — gate_status=fail",
            )
        return result

    # One line per failed metric, plus a trailing gate line when it failed too.
    report = [
        f" ✗ {f.label} ({f.metric}): score={f.score} "
        f"{f.operator} {f.threshold} → FAILED"
        for f in failures
    ]
    if gate_failed:
        report.append(f" ✗ gate_status={result.gate_status}")
    body = "\n".join(report)
    raise EvalAssertionError(
        f"Eval '{result.name}' (run {result.run_id}) failed "
        f"{len(failures)} metric threshold(s):\n{body}",
    )
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
__all__ = [
|
|
310
|
+
"Eval",
|
|
311
|
+
"eval_from_run",
|
|
312
|
+
"assert_test",
|
|
313
|
+
"EvalResult",
|
|
314
|
+
"EvalScore",
|
|
315
|
+
"EvalAssertionError",
|
|
316
|
+
]
|
aimeval/_exceptions.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Error hierarchy.
|
|
2
|
+
|
|
3
|
+
Catch-by-status pattern (industry standard — OpenAI, Anthropic, Stripe)::
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
client.runs.create(...)
|
|
7
|
+
except aimeval.RateLimitError:
|
|
8
|
+
time.sleep(30)
|
|
9
|
+
except aimeval.AuthenticationError:
|
|
10
|
+
sys.exit("bad key")
|
|
11
|
+
except aimeval.AIMEvalError:
|
|
12
|
+
... # everything else
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AIMEvalError(Exception):
    """Root of the SDK's exception hierarchy.

    Besides the message, each error carries request context so it can be
    quoted in a support ticket.

    Attributes:
        status_code: HTTP status associated with the error; ``0`` by default.
        details: Extra error information; always a dict (empty when none
            was supplied).
        request_id: Request identifier, or ``None``. When set it is appended
            to ``str(error)``.
        response: The raw response object, or ``None``.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int = 0,
        details: dict | None = None,
        request_id: str | None = None,
        response: object | None = None,
    ):
        super().__init__(message)
        self.status_code = status_code
        # Falsy input (``None`` or an empty dict) is replaced by a new dict.
        self.details = details if details else {}
        self.request_id = request_id
        # Typed as plain ``object`` so no HTTP library type appears in the
        # public signature (presumably an httpx.Response in practice).
        self.response = response

    def __str__(self) -> str:
        """Return the message, suffixed with the request id when present."""
        text = super().__str__()
        if not self.request_id:
            return text
        return f"{text} (request_id={self.request_id})"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class APIConnectionError(AIMEvalError):
    """No HTTP response arrived: the failure was at the network, DNS or TLS level."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class APITimeoutError(APIConnectionError):
    """The request did not complete before the configured timeout elapsed."""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class AuthenticationError(AIMEvalError):
    """HTTP 401: the API key is missing, invalid, or has been revoked."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class PermissionError_(AIMEvalError):  # trailing underscore: don't shadow the builtin
    """HTTP 403: the key is valid but lacks permission for this operation."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class NotFoundError(AIMEvalError):
    """HTTP 404: the resource does not exist, or this key cannot see it."""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class ConflictError(AIMEvalError):
    """HTTP 409: a concurrent edit or a duplicate idempotency key."""
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class BadRequestError(AIMEvalError):
    """HTTP 400 or 422: malformed request; see ``details`` for field-level info."""
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class RateLimitError(AIMEvalError):
    """HTTP 429: request rate exceeded; retry after ``Retry-After`` seconds."""
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class APIServerError(AIMEvalError):
    """HTTP 5xx: the failure happened on the server side."""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class AIMEvalSecurityWarning(UserWarning):
    """Warning category for SDK usage that risks leaking credentials.

    Silence it with
    ``warnings.simplefilter('ignore', AIMEvalSecurityWarning)``.
    """
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def from_status(
    status_code: int,
    message: str,
    details: dict | None = None,
    *,
    request_id: str | None = None,
    response: object | None = None,
) -> AIMEvalError:
    """Build (but do not raise) the error class matching an HTTP status.

    Codes listed in ``_STATUS_MAP`` get their dedicated subclass. Any other
    code of 500 or above becomes :class:`APIServerError`; everything else
    becomes the base :class:`AIMEvalError`.

    Args:
        status_code: HTTP status code of the response.
        message: Error message.
        details: Optional extra error information.
        request_id: Optional request identifier.
        response: Optional raw response object.

    Returns:
        A new exception instance carrying all of the arguments.
    """
    error_type = _STATUS_MAP.get(status_code)
    if error_type is None:
        # Unmapped status: only the 5xx range is treated as a server error.
        if status_code >= 500:
            error_type = APIServerError
        else:
            error_type = AIMEvalError
    return error_type(
        message,
        status_code=status_code,
        details=details,
        request_id=request_id,
        response=response,
    )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# HTTP status -> exception class, consulted by ``from_status``. Statuses not
# listed here are handled by ``from_status``'s fallback.
_STATUS_MAP: dict[int, type[AIMEvalError]] = {
    400: BadRequestError,
    401: AuthenticationError,
    403: PermissionError_,
    404: NotFoundError,
    409: ConflictError,
    422: BadRequestError,  # validation failures share the 400 class
    429: RateLimitError,
}
|