aimeval 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aimeval-0.6.0/PKG-INFO +20 -0
- aimeval-0.6.0/README.md +330 -0
- aimeval-0.6.0/aimeval/__init__.py +191 -0
- aimeval-0.6.0/aimeval/_beta.py +61 -0
- aimeval-0.6.0/aimeval/_eval.py +316 -0
- aimeval-0.6.0/aimeval/_exceptions.py +119 -0
- aimeval-0.6.0/aimeval/_logging.py +139 -0
- aimeval-0.6.0/aimeval/_otel.py +184 -0
- aimeval-0.6.0/aimeval/_pagination.py +181 -0
- aimeval-0.6.0/aimeval/_polling.py +40 -0
- aimeval-0.6.0/aimeval/_progress.py +111 -0
- aimeval-0.6.0/aimeval/_report.py +500 -0
- aimeval-0.6.0/aimeval/_repr_html.py +147 -0
- aimeval-0.6.0/aimeval/_streaming.py +519 -0
- aimeval-0.6.0/aimeval/_trace.py +431 -0
- aimeval-0.6.0/aimeval/_trace_io.py +190 -0
- aimeval-0.6.0/aimeval/_types.py +659 -0
- aimeval-0.6.0/aimeval/_webhook_verify.py +166 -0
- aimeval-0.6.0/aimeval/cli/__init__.py +33 -0
- aimeval-0.6.0/aimeval/cli/_config.py +113 -0
- aimeval-0.6.0/aimeval/cli/_exit.py +56 -0
- aimeval-0.6.0/aimeval/cli/_format.py +101 -0
- aimeval-0.6.0/aimeval/cli/_output.py +378 -0
- aimeval-0.6.0/aimeval/cli/_resolve.py +75 -0
- aimeval-0.6.0/aimeval/cli/_session.py +90 -0
- aimeval-0.6.0/aimeval/cli/app.py +116 -0
- aimeval-0.6.0/aimeval/cli/commands/__init__.py +0 -0
- aimeval-0.6.0/aimeval/cli/commands/annotations.py +125 -0
- aimeval-0.6.0/aimeval/cli/commands/auth.py +67 -0
- aimeval-0.6.0/aimeval/cli/commands/compare.py +225 -0
- aimeval-0.6.0/aimeval/cli/commands/doctor.py +265 -0
- aimeval-0.6.0/aimeval/cli/commands/evaluate.py +92 -0
- aimeval-0.6.0/aimeval/cli/commands/gate_extra.py +95 -0
- aimeval-0.6.0/aimeval/cli/commands/init.py +294 -0
- aimeval-0.6.0/aimeval/cli/commands/models_extra.py +46 -0
- aimeval-0.6.0/aimeval/cli/commands/resources.py +207 -0
- aimeval-0.6.0/aimeval/cli/commands/run.py +546 -0
- aimeval-0.6.0/aimeval/cli/commands/search.py +26 -0
- aimeval-0.6.0/aimeval/client.py +888 -0
- aimeval-0.6.0/aimeval/enums.py +97 -0
- aimeval-0.6.0/aimeval/events.py +64 -0
- aimeval-0.6.0/aimeval/metrics.py +368 -0
- aimeval-0.6.0/aimeval/presentations.py +190 -0
- aimeval-0.6.0/aimeval/py.typed +0 -0
- aimeval-0.6.0/aimeval/pytest_plugin.py +81 -0
- aimeval-0.6.0/aimeval/resources/__init__.py +41 -0
- aimeval-0.6.0/aimeval/resources/analytics.py +101 -0
- aimeval-0.6.0/aimeval/resources/annotations.py +275 -0
- aimeval-0.6.0/aimeval/resources/collections.py +148 -0
- aimeval-0.6.0/aimeval/resources/compare.py +100 -0
- aimeval-0.6.0/aimeval/resources/datasets.py +481 -0
- aimeval-0.6.0/aimeval/resources/evaluate.py +52 -0
- aimeval-0.6.0/aimeval/resources/gates.py +219 -0
- aimeval-0.6.0/aimeval/resources/metrics.py +31 -0
- aimeval-0.6.0/aimeval/resources/models.py +316 -0
- aimeval-0.6.0/aimeval/resources/observability.py +94 -0
- aimeval-0.6.0/aimeval/resources/prompts.py +623 -0
- aimeval-0.6.0/aimeval/resources/regression_sets.py +159 -0
- aimeval-0.6.0/aimeval/resources/runs.py +774 -0
- aimeval-0.6.0/aimeval/resources/search.py +27 -0
- aimeval-0.6.0/aimeval/resources/webhooks.py +160 -0
- aimeval-0.6.0/aimeval/resources/wizard.py +30 -0
- aimeval-0.6.0/aimeval/types.py +378 -0
- aimeval-0.6.0/aimeval.egg-info/PKG-INFO +20 -0
- aimeval-0.6.0/aimeval.egg-info/SOURCES.txt +101 -0
- aimeval-0.6.0/aimeval.egg-info/dependency_links.txt +1 -0
- aimeval-0.6.0/aimeval.egg-info/entry_points.txt +5 -0
- aimeval-0.6.0/aimeval.egg-info/requires.txt +20 -0
- aimeval-0.6.0/aimeval.egg-info/top_level.txt +1 -0
- aimeval-0.6.0/pyproject.toml +56 -0
- aimeval-0.6.0/setup.cfg +4 -0
- aimeval-0.6.0/tests/test_catchup.py +486 -0
- aimeval-0.6.0/tests/test_catchup_full.py +683 -0
- aimeval-0.6.0/tests/test_cli_s1_contract.py +116 -0
- aimeval-0.6.0/tests/test_client.py +255 -0
- aimeval-0.6.0/tests/test_compare_render.py +72 -0
- aimeval-0.6.0/tests/test_cost_estimate.py +211 -0
- aimeval-0.6.0/tests/test_doctor.py +69 -0
- aimeval-0.6.0/tests/test_dx_polish.py +134 -0
- aimeval-0.6.0/tests/test_endpoint_gaps.py +229 -0
- aimeval-0.6.0/tests/test_eval.py +180 -0
- aimeval-0.6.0/tests/test_exceptions.py +50 -0
- aimeval-0.6.0/tests/test_gate_render.py +40 -0
- aimeval-0.6.0/tests/test_init_cli.py +105 -0
- aimeval-0.6.0/tests/test_logging_and_transport.py +224 -0
- aimeval-0.6.0/tests/test_metrics.py +155 -0
- aimeval-0.6.0/tests/test_observability.py +161 -0
- aimeval-0.6.0/tests/test_otel.py +143 -0
- aimeval-0.6.0/tests/test_pagination.py +127 -0
- aimeval-0.6.0/tests/test_polling.py +35 -0
- aimeval-0.6.0/tests/test_pretty_print.py +108 -0
- aimeval-0.6.0/tests/test_pytest_plugin.py +76 -0
- aimeval-0.6.0/tests/test_regression_sets_from_run.py +163 -0
- aimeval-0.6.0/tests/test_resolve_run.py +99 -0
- aimeval-0.6.0/tests/test_result_stream.py +109 -0
- aimeval-0.6.0/tests/test_run_results_render.py +236 -0
- aimeval-0.6.0/tests/test_span_and_events.py +133 -0
- aimeval-0.6.0/tests/test_streaming.py +73 -0
- aimeval-0.6.0/tests/test_trace.py +289 -0
- aimeval-0.6.0/tests/test_trace_io.py +250 -0
- aimeval-0.6.0/tests/test_types.py +62 -0
- aimeval-0.6.0/tests/test_webhooks.py +250 -0
- aimeval-0.6.0/tests/test_with_options.py +136 -0
aimeval-0.6.0/PKG-INFO
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aimeval
|
|
3
|
+
Version: 0.6.0
|
|
4
|
+
Summary: AIMEval Python SDK — Vision-Text Evaluation Platform
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: httpx>=0.25
|
|
7
|
+
Requires-Dist: pydantic>=2.0
|
|
8
|
+
Requires-Dist: tqdm>=4.65
|
|
9
|
+
Provides-Extra: cli
|
|
10
|
+
Requires-Dist: typer>=0.12; extra == "cli"
|
|
11
|
+
Requires-Dist: rich>=13.7; extra == "cli"
|
|
12
|
+
Requires-Dist: tomli>=2.0; python_version < "3.11" and extra == "cli"
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
15
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
16
|
+
Requires-Dist: pytest-httpx>=0.30; extra == "dev"
|
|
17
|
+
Requires-Dist: pyyaml>=6; extra == "dev"
|
|
18
|
+
Provides-Extra: otel
|
|
19
|
+
Requires-Dist: opentelemetry-api>=1.20; extra == "otel"
|
|
20
|
+
Requires-Dist: opentelemetry-sdk>=1.20; extra == "otel"
|
aimeval-0.6.0/README.md
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
# aimeval — AIMEval Python SDK
|
|
2
|
+
|
|
3
|
+
The Python SDK + CLI for [AIMEval](https://aimeval.com) — the
|
|
4
|
+
vision-language evaluation platform for e-commerce product listings.
|
|
5
|
+
|
|
6
|
+
```python
|
|
7
|
+
import aimeval
|
|
8
|
+
from aimeval.metrics import VisualFaithfulness, Hallucination
|
|
9
|
+
|
|
10
|
+
# One-call CI gate. Returns when the run finishes; assert_test raises
|
|
11
|
+
# (with the per-metric breakdown) if any scorer fell below threshold.
|
|
12
|
+
result = aimeval.Eval(
|
|
13
|
+
name="nightly",
|
|
14
|
+
model="m_42", dataset="d_77", metrics="c_5", gate="g_strict",
|
|
15
|
+
scorers=[
|
|
16
|
+
VisualFaithfulness(threshold=0.80),
|
|
17
|
+
Hallucination(threshold=0.10), # ← lower is better, SDK knows
|
|
18
|
+
],
|
|
19
|
+
)
|
|
20
|
+
aimeval.assert_test(result)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
> **The IDs above are placeholders.** `m_42` / `d_77` / `c_5` / `g_strict`
|
|
24
|
+
> stand in for real IDs — on AIMEval every object ID is a UUID
|
|
25
|
+
> (`a1b2c3d4-…`). Get yours from the dashboard, the CLI
|
|
26
|
+
> (`aimeval model list` · `dataset list` · `collection list` · `gate list`),
|
|
27
|
+
> or in code:
|
|
28
|
+
>
|
|
29
|
+
> ```python
|
|
30
|
+
> from aimeval import AIMEval
|
|
31
|
+
> c = AIMEval()
|
|
32
|
+
> model_id = c.models.list().data[0].id # '9133edaa-f0d4-46c1-…'
|
|
33
|
+
> dataset_id = c.datasets.list().data[0].id
|
|
34
|
+
> coll_id = c.collections.list().data[0].id
|
|
35
|
+
> ```
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install aimeval # base: typed responses + async + retry
|
|
41
|
+
pip install 'aimeval[cli]' # adds the `aimeval` CLI + Rich panels
|
|
42
|
+
pip install 'aimeval[otel]' # OpenTelemetry export for @aimeval.trace
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Python 3.10+.
|
|
46
|
+
|
|
47
|
+
## 60-second quickstart
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install 'aimeval[cli]'
|
|
51
|
+
|
|
52
|
+
aimeval init my-project --github-action
|
|
53
|
+
cd my-project
|
|
54
|
+
cp .env.example .env && $EDITOR .env # AIMEVAL_API_KEY=…
|
|
55
|
+
aimeval doctor # verify env + auth + resources
|
|
56
|
+
$EDITOR evals/ci.py # plug in your real IDs
|
|
57
|
+
python evals/ci.py
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Done. The scaffolded `ci.py` exits 0 on pass, 2 on review, 1 on fail — the
|
|
61
|
+
GitHub Action wires those into PR checks automatically.
|
|
62
|
+
|
|
63
|
+
## What's in the box
|
|
64
|
+
|
|
65
|
+
| Surface | What you get |
|
|
66
|
+
|---|---|
|
|
67
|
+
| **`client.runs / models / datasets / gates / collections / prompts`** | Full CRUD + lifecycle for every product object |
|
|
68
|
+
| **`client.runs.create(…, gate=, prompt=, baseline_run=)`** + **`apply_gate`** / **`promote`** / **`unpromote`** / **`export_results`** | Wires the v7 evaluation flow end-to-end |
|
|
69
|
+
| **`aimeval.Eval(name, model, dataset, metrics, scorers=[...])`** + **`assert_test`** | One-call CI harness with direction-aware threshold checking |
|
|
70
|
+
| **`aimeval.metrics`** (15 typed metric handles) | `VisualFaithfulness`, `Hallucination`, `BrandSupport`, `PIIExposure`, … — each encodes its direction so `passed(score)` can't be inverted |
|
|
71
|
+
| **`@aimeval.trace`** + **`aimeval.span(…)`** + **`TraceCollector`** | Local span capture, parent/child nesting, sync/async/gen; no fake trace endpoint |
|
|
72
|
+
| **`aimeval.save_trace`** / **`load_trace`** / **`replay_trace`** | JSONL persistence + replay through a candidate model (prod-to-eval) |
|
|
73
|
+
| **`aimeval.enable_otel()`** | Mirror spans into Datadog / Honeycomb / Phoenix / Langfuse via OpenInference convention |
|
|
74
|
+
| **`client.webhooks.*`** + **`aimeval.verify_webhook(...)`** | CRUD + HMAC-SHA256 verifier (Stripe-style: constant-time + 5-min replay window) |
|
|
75
|
+
| **`client.runs.estimate_cost(model=, samples=)`** | Honest cost preview from your project's own run history — no stale price tables |
|
|
76
|
+
| **`client.regression_sets.from_run(run_id, n=20)`** | Pin the worst-scoring samples for re-test on every commit |
|
|
77
|
+
| **`client.audit_log()`** / **`client.usage()`** | Settings observability (compliance + quota dashboards) |
|
|
78
|
+
| **`client.beta.replay.*`** | Experimental surface (OpenAI/Anthropic pattern; pin SDK + read CHANGELOG before bumping) |
|
|
79
|
+
| **`client.compare`** / **`compare_history`** / **`search`** / **`metrics_catalog`** / **`wizard_bootstrap`** | Top-level convenience methods |
|
|
80
|
+
| **CLI** — `aimeval init / doctor / evaluate / run / annotations / gate / model / search / compare` | Every workflow scriptable from the terminal |
|
|
81
|
+
|
|
82
|
+
Every resource comes in both flavours: synchronous on `AIMEval`, asynchronous on `AsyncAIMEval`.
|
|
83
|
+
|
|
84
|
+
## Configure
|
|
85
|
+
|
|
86
|
+
The SDK reads from the environment by default:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
export AIMEVAL_API_KEY="aime_sk_..."
|
|
90
|
+
export AIMEVAL_BASE_URL="https://app.aimeval.com"
|
|
91
|
+
|
|
92
|
+
# Optional:
|
|
93
|
+
export AIMEVAL_LOG=info # logfmt request logs on stderr
|
|
94
|
+
export AIMEVAL_OTEL=1 # mirror @trace spans to OTel
|
|
95
|
+
export AIMEVAL_WEBHOOK_SECRET=... # for verify_webhook
|
|
96
|
+
export AIMEVAL_MODEL_AUTH=... # Custom Endpoint test_connection
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Production-shaped API keys (`aime_sk_live_*`) passed inline (instead of via
|
|
100
|
+
env var) raise `AIMEvalSecurityWarning` so a leaked key surfaces before it
|
|
101
|
+
lands in shell history or a committed notebook. Test/sandbox keys are
|
|
102
|
+
silent.
|
|
103
|
+
|
|
104
|
+
`api_key` is **masked** in `__repr__`, in error messages, in CLI output,
|
|
105
|
+
and in the dev logger handler.
|
|
106
|
+
|
|
107
|
+
## Selected recipes
|
|
108
|
+
|
|
109
|
+
### CI gate that blocks merges on regression
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
# evals/ci.py — scaffolded by `aimeval init`
|
|
113
|
+
import os

import aimeval
|
|
114
|
+
from aimeval.metrics import VisualFaithfulness, Hallucination, PIIExposure
|
|
115
|
+
|
|
116
|
+
result = aimeval.Eval(
|
|
117
|
+
name=f"ci-{os.environ['AIMEVAL_CI_COMMIT'][:7]}",
|
|
118
|
+
model="m_gpt4o", dataset="d_amazon", metrics="c_standard",
|
|
119
|
+
gate="g_strict",
|
|
120
|
+
idempotency_key=os.environ["AIMEVAL_CI_COMMIT"], # auto-generated if omitted
|
|
121
|
+
scorers=[
|
|
122
|
+
VisualFaithfulness(threshold=0.80),
|
|
123
|
+
Hallucination(threshold=0.10),
|
|
124
|
+
PIIExposure(threshold=0.05),
|
|
125
|
+
],
|
|
126
|
+
)
|
|
127
|
+
aimeval.assert_test(result)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Pair with the bundled GitHub Action:
|
|
131
|
+
|
|
132
|
+
```yaml
|
|
133
|
+
# .github/workflows/aimeval-ci.yml
|
|
134
|
+
- uses: rybalena/aimeval-backend/.github/actions/aimeval-eval@main
|
|
135
|
+
with:
|
|
136
|
+
script: ./evals/ci.py
|
|
137
|
+
api-key: ${{ secrets.AIMEVAL_API_KEY }}
|
|
138
|
+
aimeval-version: "0.6.0"
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### pytest plugin (auto-loaded via the `pytest11` entry point)
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
@pytest.mark.aimeval_eval
|
|
145
|
+
def test_amazon_nightly():
|
|
146
|
+
result = aimeval.Eval(name="t", model="m", dataset="d", metrics="c",
|
|
147
|
+
scorers=[VisualFaithfulness(threshold=0.8)])
|
|
148
|
+
aimeval.assert_test(result)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Failures render a dedicated "AIMEval eval failure" section with the
|
|
152
|
+
per-metric breakdown — no traceback noise.
|
|
153
|
+
|
|
154
|
+
### Instrument production, replay as an eval
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
@aimeval.trace
|
|
158
|
+
def describe(image_url: str) -> str:
|
|
159
|
+
return my_vlm(image_url)
|
|
160
|
+
|
|
161
|
+
# In prod: capture every call
|
|
162
|
+
with aimeval.TraceCollector() as spans:
|
|
163
|
+
for url in queue:
|
|
164
|
+
describe(url)
|
|
165
|
+
aimeval.save_trace(spans, "captures/2026-06-07.jsonl")
|
|
166
|
+
|
|
167
|
+
# Later: replay through a candidate model
|
|
168
|
+
results = aimeval.replay_trace(
|
|
169
|
+
"captures/2026-06-07.jsonl",
|
|
170
|
+
task=lambda inp: candidate_vlm(inp["image_url"]),
|
|
171
|
+
)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### OpenTelemetry mirror — Datadog / Honeycomb / Phoenix / Langfuse
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
aimeval.enable_otel() # or AIMEVAL_OTEL=1
|
|
178
|
+
# Spans now also go to whichever OTLP collector OTEL_EXPORTER_OTLP_* points at.
|
|
179
|
+
# Attributes follow the OpenInference semantic convention so Phoenix /
|
|
180
|
+
# Langfuse / Patronus / LangSmith / Opik / Weave all consume them natively.
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Webhook signature verification
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from aimeval import verify_webhook, WebhookVerificationError
|
|
187
|
+
|
|
188
|
+
# In your FastAPI / Flask handler:
|
|
189
|
+
try:
|
|
190
|
+
verify_webhook(
|
|
191
|
+
secret=os.environ["AIMEVAL_WEBHOOK_SECRET"],
|
|
192
|
+
body=request.body, # raw bytes!
|
|
193
|
+
signature=request.headers["X-AIMEval-Signature"],
|
|
194
|
+
timestamp=request.headers["X-AIMEval-Timestamp"],
|
|
195
|
+
)
|
|
196
|
+
except WebhookVerificationError as exc:
|
|
197
|
+
return Response(str(exc), status=400)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Constant-time HMAC + 5-minute default tolerance window. Same protections
|
|
201
|
+
Stripe / GitHub ship.
|
|
202
|
+
|
|
203
|
+
### Honest cost preview (no stale vendor price tables)
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
est = client.runs.estimate_cost(model="m_gpt4o", samples=5000)
|
|
207
|
+
print(est)
|
|
208
|
+
# CostEstimate(model='m_gpt4o', samples=5000, mean=$90.00,
|
|
209
|
+
# range=$75.00–$110.00, based_on_runs=8)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Averages `cost_usd / samples_processed` across the project's most recent
|
|
213
|
+
completed runs of the given model. With no history yet: returns
|
|
214
|
+
`known=False` plus a note ("run sample_limit=5 first") instead of
|
|
215
|
+
fabricating a number.
|
|
216
|
+
|
|
217
|
+
### Enterprise: corporate proxy / mTLS
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
import httpx
|
|
221
|
+
proxied = httpx.Client(
|
|
222
|
+
proxy="http://corp-proxy:8080",  # httpx>=0.26; on httpx 0.25.x spell it proxies=
|
|
223
|
+
verify="/etc/corp/ca.pem",
|
|
224
|
+
)
|
|
225
|
+
client = AIMEval(api_key="...", base_url="...", http_client=proxied)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
The SDK installs auth + UA + `base_url` **on top** of your client — no
|
|
229
|
+
second client is constructed behind the scenes.
|
|
230
|
+
|
|
231
|
+
### Per-call overrides + raw response
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
# OpenAI/Anthropic with_options pattern — immutable copy, never mutates self.
|
|
235
|
+
slow_run = client.with_options(timeout=300, max_retries=10).runs.create(...)
|
|
236
|
+
|
|
237
|
+
# Read request_id + headers without losing the typed surface:
|
|
238
|
+
resp = client.with_raw_response.runs.retrieve("run_abc")
|
|
239
|
+
print(resp.headers["x-request-id"])
|
|
240
|
+
run = resp.parse() # same object a normal call would return
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Every error also carries `exc.request_id` + `exc.response` so support
|
|
244
|
+
tickets are diagnosable.
|
|
245
|
+
|
|
246
|
+
## CLI
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
aimeval --install-completion bash # shell tab-completion
|
|
250
|
+
|
|
251
|
+
aimeval init my-project [--github-action] # scaffold evals/ + .env.example
|
|
252
|
+
aimeval doctor # env + DNS + auth + resources
|
|
253
|
+
aimeval evaluate --name x --model m_… --dataset d_… --collection c_…
|
|
254
|
+
|
|
255
|
+
aimeval run apply-gate / promote / unpromote / report / export-results
|
|
256
|
+
aimeval run estimate-cost --model M --samples 1000
|
|
257
|
+
|
|
258
|
+
aimeval annotations bootstrap / list / label / flag
|
|
259
|
+
aimeval gate duplicate / check-compatibility / metric-registry
|
|
260
|
+
aimeval model test-connection --model M # AIMEVAL_MODEL_AUTH from env
|
|
261
|
+
|
|
262
|
+
aimeval search "amazon-nightly"
|
|
263
|
+
aimeval compare run_baseline run_candidate
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Global flags: `-o json|table|markdown`, `-q` (quiet), `-p PROFILE`,
|
|
267
|
+
`--api-key`, `--base-url`.
|
|
268
|
+
|
|
269
|
+
## Async
|
|
270
|
+
|
|
271
|
+
Every method has an `async` twin:
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
import asyncio
|
|
275
|
+
from aimeval import AsyncAIMEval
|
|
276
|
+
|
|
277
|
+
async def main():
|
|
278
|
+
async with AsyncAIMEval() as client:
|
|
279
|
+
results = await asyncio.gather(*[
|
|
280
|
+
client.runs.estimate_cost(model="m1", samples=n) for n in (100, 1000, 10000)
|
|
281
|
+
])
|
|
282
|
+
for r in results:
|
|
283
|
+
print(r)
|
|
284
|
+
|
|
285
|
+
asyncio.run(main())
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Debugging — `AIMEVAL_LOG`
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
$ AIMEVAL_LOG=info python evals/ci.py
|
|
292
|
+
07:31:39.575 aimeval INFO op=http method=GET path=/runs/page \
|
|
293
|
+
status=200 request_id=req_abc duration_ms=42
|
|
294
|
+
07:31:40.211 aimeval WARNING op=http method=POST path=/runs \
|
|
295
|
+
status=429 request_id=req_def attempt=1
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
logfmt — `grep request_id=req_…` works straight from `cat`/`less` without
|
|
299
|
+
JSON parsing. Promotes to WARNING on 4xx/5xx and on every retry attempt.
|
|
300
|
+
Zero hot-path cost when the env var is unset.
|
|
301
|
+
|
|
302
|
+
## Error taxonomy
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from aimeval import (
|
|
306
|
+
AIMEvalError, APIConnectionError, APITimeoutError,
|
|
307
|
+
AuthenticationError, NotFoundError, ConflictError,
|
|
308
|
+
BadRequestError, RateLimitError, APIServerError,
|
|
309
|
+
WebhookVerificationError, EvalAssertionError,
|
|
310
|
+
)
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Each maps to an HTTP status range. Retries (408 / 429 / 502 / 503 / 504)
|
|
314
|
+
are automatic with exponential backoff + `Retry-After` honoured. Every
|
|
315
|
+
error carries `exc.request_id` + `exc.response`.
|
|
316
|
+
|
|
317
|
+
## Versioning
|
|
318
|
+
|
|
319
|
+
This SDK follows the backend's API contract. Minor versions
|
|
320
|
+
(`0.5.x → 0.6.0`) introduce new endpoint wrappers and ergonomic surface;
|
|
321
|
+
breaking changes only happen at `0.x → 1.0`. `client.beta.*` is exempt —
|
|
322
|
+
that's the unstable surface by contract.
|
|
323
|
+
|
|
324
|
+
Current: **v0.6.0** — see [CHANGELOG.md](CHANGELOG.md).
|
|
325
|
+
|
|
326
|
+
## More
|
|
327
|
+
|
|
328
|
+
- Recipes: [EXAMPLES.md](EXAMPLES.md)
|
|
329
|
+
- Backend API reference: https://app.aimeval.com/api/docs
|
|
330
|
+
- Source: https://github.com/rybalena/aimeval-backend
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AIMEval Python SDK — programmatic access for CI/CD pipelines.
|
|
3
|
+
|
|
4
|
+
Quick start::
|
|
5
|
+
|
|
6
|
+
from aimeval import AIMEval
|
|
7
|
+
|
|
8
|
+
client = AIMEval(api_key="aime_sk_...", base_url="https://app.aimeval.com")
|
|
9
|
+
|
|
10
|
+
# End-to-end workflow with a live progress bar (TTY only):
|
|
11
|
+
run = client.evaluate(
|
|
12
|
+
name="nightly-regression",
|
|
13
|
+
model=MODEL_ID,
|
|
14
|
+
dataset=DATASET_ID,
|
|
15
|
+
metrics=COLLECTION_ID,
|
|
16
|
+
)
|
|
17
|
+
sys.exit(0 if run.passed else 1)
|
|
18
|
+
|
|
19
|
+
# Or step by step (industry-standard resource namespaces):
|
|
20
|
+
run = client.runs.create(name="...", model=..., dataset=..., metrics=...)
|
|
21
|
+
run = client.runs.wait(run.id, progress=True)
|
|
22
|
+
|
|
23
|
+
# Auto-pagination — iterate every page transparently:
|
|
24
|
+
for run in client.runs.list(status="completed"):
|
|
25
|
+
print(run.id, run.score)
|
|
26
|
+
"""
|
|
27
|
+
from aimeval.client import AIMEval, AsyncAIMEval
|
|
28
|
+
from aimeval._otel import disable_otel, enable_otel, is_available as otel_is_available, otel_enabled
|
|
29
|
+
from aimeval import events
|
|
30
|
+
from aimeval._trace import Span, TraceCollector, records, span, trace
|
|
31
|
+
from aimeval._trace_io import (
|
|
32
|
+
ReplayResult,
|
|
33
|
+
load_trace,
|
|
34
|
+
load_trace_iter,
|
|
35
|
+
replay_trace,
|
|
36
|
+
save_trace,
|
|
37
|
+
)
|
|
38
|
+
from aimeval._webhook_verify import (
|
|
39
|
+
WebhookVerificationError,
|
|
40
|
+
construct_event,
|
|
41
|
+
verify_webhook,
|
|
42
|
+
)
|
|
43
|
+
from aimeval._eval import (
|
|
44
|
+
Eval,
|
|
45
|
+
EvalAssertionError,
|
|
46
|
+
EvalResult,
|
|
47
|
+
EvalScore,
|
|
48
|
+
assert_test,
|
|
49
|
+
eval_from_run,
|
|
50
|
+
)
|
|
51
|
+
from aimeval import metrics
|
|
52
|
+
from aimeval._exceptions import (
|
|
53
|
+
AIMEvalError,
|
|
54
|
+
AIMEvalSecurityWarning,
|
|
55
|
+
APIConnectionError,
|
|
56
|
+
APIServerError,
|
|
57
|
+
APITimeoutError,
|
|
58
|
+
AuthenticationError,
|
|
59
|
+
BadRequestError,
|
|
60
|
+
ConflictError,
|
|
61
|
+
NotFoundError,
|
|
62
|
+
RateLimitError,
|
|
63
|
+
)
|
|
64
|
+
from aimeval._streaming import (
|
|
65
|
+
AsyncRunSSEStream,
|
|
66
|
+
AsyncRunStream,
|
|
67
|
+
RunEvent,
|
|
68
|
+
RunSSEStream,
|
|
69
|
+
RunStream,
|
|
70
|
+
)
|
|
71
|
+
from aimeval._types import (
|
|
72
|
+
Annotation,
|
|
73
|
+
AnnotationsBootstrapResult,
|
|
74
|
+
AuditEntry,
|
|
75
|
+
CompareHistoryItem,
|
|
76
|
+
Comparison,
|
|
77
|
+
ConnectionTestResult,
|
|
78
|
+
CostEstimate,
|
|
79
|
+
Dataset,
|
|
80
|
+
GateCompatibility,
|
|
81
|
+
MetricCollection,
|
|
82
|
+
MetricDistribution,
|
|
83
|
+
MetricRegistry,
|
|
84
|
+
MetricRegistryEntry,
|
|
85
|
+
MetricSummary,
|
|
86
|
+
Model,
|
|
87
|
+
Prompt,
|
|
88
|
+
PromptDiff,
|
|
89
|
+
PromptTestResult,
|
|
90
|
+
PromptTestRun,
|
|
91
|
+
PromptVersion,
|
|
92
|
+
QualityGate,
|
|
93
|
+
RegressionSet,
|
|
94
|
+
Resource,
|
|
95
|
+
Run,
|
|
96
|
+
SearchHit,
|
|
97
|
+
SearchResults,
|
|
98
|
+
UploadStatus,
|
|
99
|
+
UploadTicket,
|
|
100
|
+
Usage,
|
|
101
|
+
Webhook,
|
|
102
|
+
WizardBootstrap,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
__version__ = "0.6.0"
|
|
107
|
+
|
|
108
|
+
__all__ = [
|
|
109
|
+
# Clients
|
|
110
|
+
"AIMEval",
|
|
111
|
+
"AsyncAIMEval",
|
|
112
|
+
"__version__",
|
|
113
|
+
# Tracing (local span capture → eval datasets; optional OTel mirror)
|
|
114
|
+
"trace",
|
|
115
|
+
"span",
|
|
116
|
+
"TraceCollector",
|
|
117
|
+
"Span",
|
|
118
|
+
"records",
|
|
119
|
+
"events",
|
|
120
|
+
# Trace recording + replay (prod-to-eval pattern)
|
|
121
|
+
"save_trace",
|
|
122
|
+
"load_trace",
|
|
123
|
+
"load_trace_iter",
|
|
124
|
+
"replay_trace",
|
|
125
|
+
"ReplayResult",
|
|
126
|
+
"enable_otel",
|
|
127
|
+
"disable_otel",
|
|
128
|
+
"otel_enabled",
|
|
129
|
+
"otel_is_available",
|
|
130
|
+
# Eval harness + metrics (CI assertion surface)
|
|
131
|
+
"Eval",
|
|
132
|
+
"eval_from_run",
|
|
133
|
+
"assert_test",
|
|
134
|
+
"EvalResult",
|
|
135
|
+
"EvalScore",
|
|
136
|
+
"EvalAssertionError",
|
|
137
|
+
"metrics",
|
|
138
|
+
# Errors
|
|
139
|
+
"AIMEvalError",
|
|
140
|
+
"AIMEvalSecurityWarning",
|
|
141
|
+
"APIConnectionError",
|
|
142
|
+
"APIServerError",
|
|
143
|
+
"APITimeoutError",
|
|
144
|
+
"AuthenticationError",
|
|
145
|
+
"BadRequestError",
|
|
146
|
+
"ConflictError",
|
|
147
|
+
"NotFoundError",
|
|
148
|
+
"RateLimitError",
|
|
149
|
+
# Response types
|
|
150
|
+
"Annotation",
|
|
151
|
+
"AnnotationsBootstrapResult",
|
|
152
|
+
"AuditEntry",
|
|
153
|
+
"CompareHistoryItem",
|
|
154
|
+
"Comparison",
|
|
155
|
+
"ConnectionTestResult",
|
|
156
|
+
"CostEstimate",
|
|
157
|
+
"Dataset",
|
|
158
|
+
"GateCompatibility",
|
|
159
|
+
"MetricCollection",
|
|
160
|
+
"MetricDistribution",
|
|
161
|
+
"MetricRegistry",
|
|
162
|
+
"MetricRegistryEntry",
|
|
163
|
+
"MetricSummary",
|
|
164
|
+
"Model",
|
|
165
|
+
"Prompt",
|
|
166
|
+
"PromptDiff",
|
|
167
|
+
"PromptTestResult",
|
|
168
|
+
"PromptTestRun",
|
|
169
|
+
"PromptVersion",
|
|
170
|
+
"QualityGate",
|
|
171
|
+
"RegressionSet",
|
|
172
|
+
"Resource",
|
|
173
|
+
"Run",
|
|
174
|
+
"SearchHit",
|
|
175
|
+
"SearchResults",
|
|
176
|
+
"UploadStatus",
|
|
177
|
+
"UploadTicket",
|
|
178
|
+
"Usage",
|
|
179
|
+
"Webhook",
|
|
180
|
+
"WizardBootstrap",
|
|
181
|
+
# Webhook verifier (security helper)
|
|
182
|
+
"WebhookVerificationError",
|
|
183
|
+
"construct_event",
|
|
184
|
+
"verify_webhook",
|
|
185
|
+
# Streaming
|
|
186
|
+
"AsyncRunSSEStream",
|
|
187
|
+
"AsyncRunStream",
|
|
188
|
+
"RunEvent",
|
|
189
|
+
"RunSSEStream",
|
|
190
|
+
"RunStream",
|
|
191
|
+
]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""``client.beta.*`` — experimental surface area.
|
|
2
|
+
|
|
3
|
+
Industry pattern from OpenAI / Anthropic: features that haven't earned
|
|
4
|
+
SemVer protection yet live under a dedicated ``beta`` namespace. This
|
|
5
|
+
gives them three properties at once:
|
|
6
|
+
|
|
7
|
+
1. **Visible**: a glance at ``client.beta.<something>`` tells the
|
|
8
|
+
reader "this isn't stable yet — pin your SDK and read the
|
|
9
|
+
CHANGELOG before bumping".
|
|
10
|
+
2. **Safe to remove / rename** without bumping the major version of
|
|
11
|
+
the SDK. The ``beta`` namespace itself is the contract.
|
|
12
|
+
3. **Cheap to ship**: graduation to the stable surface (e.g.
|
|
13
|
+
``client.replay``) is a one-line re-export when the API settles.
|
|
14
|
+
|
|
15
|
+
Currently in beta:
|
|
16
|
+
|
|
17
|
+
- :attr:`Beta.replay` — trace recording + replay helpers
|
|
18
|
+
(:func:`aimeval.save_trace` / :func:`aimeval.load_trace` /
|
|
19
|
+
:func:`aimeval.replay_trace`). Wrapped under ``client.beta.replay``
|
|
20
|
+
so the helpers feel discoverable from autocomplete even when
|
|
21
|
+
callers don't import them directly.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from typing import TYPE_CHECKING
|
|
26
|
+
|
|
27
|
+
from aimeval import _trace_io
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING: # pragma: no cover
|
|
30
|
+
from aimeval.client import AIMEval, AsyncAIMEval
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _Replay:
    """Expose the :mod:`aimeval._trace_io` helpers as ``client.beta.replay``.

    Each helper is attached as a static method, so calling
    ``client.beta.replay.save_trace(...)`` is the same as calling the
    module-level function. The wrapper exists so the helpers can be found
    through autocomplete on a client, while their names stay free to be
    renamed or removed on the beta timeline rather than the stable one.
    """

    # Straight pass-throughs: ``staticmethod`` keeps the instance from being
    # bound as the first positional argument of the underlying function.
    replay_trace = staticmethod(_trace_io.replay_trace)
    load_trace_iter = staticmethod(_trace_io.load_trace_iter)
    load_trace = staticmethod(_trace_io.load_trace)
    save_trace = staticmethod(_trace_io.save_trace)

    def __init__(self, _client: "AIMEval | AsyncAIMEval"):
        # None of the replay helpers use the client (they work offline);
        # it is retained only so that a later beta feature which needs to
        # make requests has a handle to do so.
        self._client = _client
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Beta:
    """Holder for experimental client features, reached as ``client.beta``.

    The only attribute set here is ``replay``, a ``_Replay`` wrapper built
    around the client passed to the constructor.
    """

    def __init__(self, client: "AIMEval | AsyncAIMEval"):
        # Build the replay wrapper once, handing it the owning client.
        replay_namespace = _Replay(client)
        self.replay = replay_namespace
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
__all__ = ["Beta"]
|