aimeval 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. aimeval-0.6.0/PKG-INFO +20 -0
  2. aimeval-0.6.0/README.md +330 -0
  3. aimeval-0.6.0/aimeval/__init__.py +191 -0
  4. aimeval-0.6.0/aimeval/_beta.py +61 -0
  5. aimeval-0.6.0/aimeval/_eval.py +316 -0
  6. aimeval-0.6.0/aimeval/_exceptions.py +119 -0
  7. aimeval-0.6.0/aimeval/_logging.py +139 -0
  8. aimeval-0.6.0/aimeval/_otel.py +184 -0
  9. aimeval-0.6.0/aimeval/_pagination.py +181 -0
  10. aimeval-0.6.0/aimeval/_polling.py +40 -0
  11. aimeval-0.6.0/aimeval/_progress.py +111 -0
  12. aimeval-0.6.0/aimeval/_report.py +500 -0
  13. aimeval-0.6.0/aimeval/_repr_html.py +147 -0
  14. aimeval-0.6.0/aimeval/_streaming.py +519 -0
  15. aimeval-0.6.0/aimeval/_trace.py +431 -0
  16. aimeval-0.6.0/aimeval/_trace_io.py +190 -0
  17. aimeval-0.6.0/aimeval/_types.py +659 -0
  18. aimeval-0.6.0/aimeval/_webhook_verify.py +166 -0
  19. aimeval-0.6.0/aimeval/cli/__init__.py +33 -0
  20. aimeval-0.6.0/aimeval/cli/_config.py +113 -0
  21. aimeval-0.6.0/aimeval/cli/_exit.py +56 -0
  22. aimeval-0.6.0/aimeval/cli/_format.py +101 -0
  23. aimeval-0.6.0/aimeval/cli/_output.py +378 -0
  24. aimeval-0.6.0/aimeval/cli/_resolve.py +75 -0
  25. aimeval-0.6.0/aimeval/cli/_session.py +90 -0
  26. aimeval-0.6.0/aimeval/cli/app.py +116 -0
  27. aimeval-0.6.0/aimeval/cli/commands/__init__.py +0 -0
  28. aimeval-0.6.0/aimeval/cli/commands/annotations.py +125 -0
  29. aimeval-0.6.0/aimeval/cli/commands/auth.py +67 -0
  30. aimeval-0.6.0/aimeval/cli/commands/compare.py +225 -0
  31. aimeval-0.6.0/aimeval/cli/commands/doctor.py +265 -0
  32. aimeval-0.6.0/aimeval/cli/commands/evaluate.py +92 -0
  33. aimeval-0.6.0/aimeval/cli/commands/gate_extra.py +95 -0
  34. aimeval-0.6.0/aimeval/cli/commands/init.py +294 -0
  35. aimeval-0.6.0/aimeval/cli/commands/models_extra.py +46 -0
  36. aimeval-0.6.0/aimeval/cli/commands/resources.py +207 -0
  37. aimeval-0.6.0/aimeval/cli/commands/run.py +546 -0
  38. aimeval-0.6.0/aimeval/cli/commands/search.py +26 -0
  39. aimeval-0.6.0/aimeval/client.py +888 -0
  40. aimeval-0.6.0/aimeval/enums.py +97 -0
  41. aimeval-0.6.0/aimeval/events.py +64 -0
  42. aimeval-0.6.0/aimeval/metrics.py +368 -0
  43. aimeval-0.6.0/aimeval/presentations.py +190 -0
  44. aimeval-0.6.0/aimeval/py.typed +0 -0
  45. aimeval-0.6.0/aimeval/pytest_plugin.py +81 -0
  46. aimeval-0.6.0/aimeval/resources/__init__.py +41 -0
  47. aimeval-0.6.0/aimeval/resources/analytics.py +101 -0
  48. aimeval-0.6.0/aimeval/resources/annotations.py +275 -0
  49. aimeval-0.6.0/aimeval/resources/collections.py +148 -0
  50. aimeval-0.6.0/aimeval/resources/compare.py +100 -0
  51. aimeval-0.6.0/aimeval/resources/datasets.py +481 -0
  52. aimeval-0.6.0/aimeval/resources/evaluate.py +52 -0
  53. aimeval-0.6.0/aimeval/resources/gates.py +219 -0
  54. aimeval-0.6.0/aimeval/resources/metrics.py +31 -0
  55. aimeval-0.6.0/aimeval/resources/models.py +316 -0
  56. aimeval-0.6.0/aimeval/resources/observability.py +94 -0
  57. aimeval-0.6.0/aimeval/resources/prompts.py +623 -0
  58. aimeval-0.6.0/aimeval/resources/regression_sets.py +159 -0
  59. aimeval-0.6.0/aimeval/resources/runs.py +774 -0
  60. aimeval-0.6.0/aimeval/resources/search.py +27 -0
  61. aimeval-0.6.0/aimeval/resources/webhooks.py +160 -0
  62. aimeval-0.6.0/aimeval/resources/wizard.py +30 -0
  63. aimeval-0.6.0/aimeval/types.py +378 -0
  64. aimeval-0.6.0/aimeval.egg-info/PKG-INFO +20 -0
  65. aimeval-0.6.0/aimeval.egg-info/SOURCES.txt +101 -0
  66. aimeval-0.6.0/aimeval.egg-info/dependency_links.txt +1 -0
  67. aimeval-0.6.0/aimeval.egg-info/entry_points.txt +5 -0
  68. aimeval-0.6.0/aimeval.egg-info/requires.txt +20 -0
  69. aimeval-0.6.0/aimeval.egg-info/top_level.txt +1 -0
  70. aimeval-0.6.0/pyproject.toml +56 -0
  71. aimeval-0.6.0/setup.cfg +4 -0
  72. aimeval-0.6.0/tests/test_catchup.py +486 -0
  73. aimeval-0.6.0/tests/test_catchup_full.py +683 -0
  74. aimeval-0.6.0/tests/test_cli_s1_contract.py +116 -0
  75. aimeval-0.6.0/tests/test_client.py +255 -0
  76. aimeval-0.6.0/tests/test_compare_render.py +72 -0
  77. aimeval-0.6.0/tests/test_cost_estimate.py +211 -0
  78. aimeval-0.6.0/tests/test_doctor.py +69 -0
  79. aimeval-0.6.0/tests/test_dx_polish.py +134 -0
  80. aimeval-0.6.0/tests/test_endpoint_gaps.py +229 -0
  81. aimeval-0.6.0/tests/test_eval.py +180 -0
  82. aimeval-0.6.0/tests/test_exceptions.py +50 -0
  83. aimeval-0.6.0/tests/test_gate_render.py +40 -0
  84. aimeval-0.6.0/tests/test_init_cli.py +105 -0
  85. aimeval-0.6.0/tests/test_logging_and_transport.py +224 -0
  86. aimeval-0.6.0/tests/test_metrics.py +155 -0
  87. aimeval-0.6.0/tests/test_observability.py +161 -0
  88. aimeval-0.6.0/tests/test_otel.py +143 -0
  89. aimeval-0.6.0/tests/test_pagination.py +127 -0
  90. aimeval-0.6.0/tests/test_polling.py +35 -0
  91. aimeval-0.6.0/tests/test_pretty_print.py +108 -0
  92. aimeval-0.6.0/tests/test_pytest_plugin.py +76 -0
  93. aimeval-0.6.0/tests/test_regression_sets_from_run.py +163 -0
  94. aimeval-0.6.0/tests/test_resolve_run.py +99 -0
  95. aimeval-0.6.0/tests/test_result_stream.py +109 -0
  96. aimeval-0.6.0/tests/test_run_results_render.py +236 -0
  97. aimeval-0.6.0/tests/test_span_and_events.py +133 -0
  98. aimeval-0.6.0/tests/test_streaming.py +73 -0
  99. aimeval-0.6.0/tests/test_trace.py +289 -0
  100. aimeval-0.6.0/tests/test_trace_io.py +250 -0
  101. aimeval-0.6.0/tests/test_types.py +62 -0
  102. aimeval-0.6.0/tests/test_webhooks.py +250 -0
  103. aimeval-0.6.0/tests/test_with_options.py +136 -0
aimeval-0.6.0/PKG-INFO ADDED
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: aimeval
3
+ Version: 0.6.0
4
+ Summary: AIMEval Python SDK — Vision-Text Evaluation Platform
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: httpx>=0.25
7
+ Requires-Dist: pydantic>=2.0
8
+ Requires-Dist: tqdm>=4.65
9
+ Provides-Extra: cli
10
+ Requires-Dist: typer>=0.12; extra == "cli"
11
+ Requires-Dist: rich>=13.7; extra == "cli"
12
+ Requires-Dist: tomli>=2.0; python_version < "3.11" and extra == "cli"
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest>=8; extra == "dev"
15
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
16
+ Requires-Dist: pytest-httpx>=0.30; extra == "dev"
17
+ Requires-Dist: pyyaml>=6; extra == "dev"
18
+ Provides-Extra: otel
19
+ Requires-Dist: opentelemetry-api>=1.20; extra == "otel"
20
+ Requires-Dist: opentelemetry-sdk>=1.20; extra == "otel"
@@ -0,0 +1,330 @@
1
+ # aimeval — AIMEval Python SDK
2
+
3
+ The Python SDK + CLI for [AIMEval](https://aimeval.com) — the
4
+ vision-language evaluation platform for e-commerce product listings.
5
+
6
+ ```python
7
+ import aimeval
8
+ from aimeval.metrics import VisualFaithfulness, Hallucination
9
+
10
+ # One-call CI gate. Returns when the run finishes; assert_test raises
11
+ # (with the per-metric breakdown) if any scorer fell below threshold.
12
+ result = aimeval.Eval(
13
+ name="nightly",
14
+ model="m_42", dataset="d_77", metrics="c_5", gate="g_strict",
15
+ scorers=[
16
+ VisualFaithfulness(threshold=0.80),
17
+ Hallucination(threshold=0.10), # ← lower is better, SDK knows
18
+ ],
19
+ )
20
+ aimeval.assert_test(result)
21
+ ```
22
+
23
+ > **The IDs above are placeholders.** `m_42` / `d_77` / `c_5` / `g_strict`
24
+ > stand in for real IDs — on AIMEval every object ID is a UUID
25
+ > (`a1b2c3d4-…`). Get yours from the dashboard, the CLI
26
+ > (`aimeval model list` · `dataset list` · `collection list` · `gate list`),
27
+ > or in code:
28
+ >
29
+ > ```python
30
+ > from aimeval import AIMEval
31
+ > c = AIMEval()
32
+ > model_id = c.models.list().data[0].id # '9133edaa-f0d4-46c1-…'
33
+ > dataset_id = c.datasets.list().data[0].id
34
+ > coll_id = c.collections.list().data[0].id
35
+ > ```
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install aimeval # base: typed responses + async + retry
41
+ pip install 'aimeval[cli]' # adds the `aimeval` CLI + Rich panels
42
+ pip install 'aimeval[otel]' # OpenTelemetry export for @aimeval.trace
43
+ ```
44
+
45
+ Python 3.10+.
46
+
47
+ ## 60-second quickstart
48
+
49
+ ```bash
50
+ pip install 'aimeval[cli]'
51
+
52
+ aimeval init my-project --github-action
53
+ cd my-project
54
+ cp .env.example .env && $EDITOR .env # AIMEVAL_API_KEY=…
55
+ aimeval doctor # verify env + auth + resources
56
+ $EDITOR evals/ci.py # plug in your real IDs
57
+ python evals/ci.py
58
+ ```
59
+
60
+ Done. The scaffolded `ci.py` exits 0 on pass, 2 on review, 1 on fail — the
61
+ GitHub Action wires those into PR checks automatically.
62
+
63
+ ## What's in the box
64
+
65
+ | Surface | What you get |
66
+ |---|---|
67
+ | **`client.runs / models / datasets / gates / collections / prompts`** | Full CRUD + lifecycle for every product object |
68
+ | **`client.runs.create(…, gate=, prompt=, baseline_run=)`** + **`apply_gate`** / **`promote`** / **`unpromote`** / **`export_results`** | Wires the v7 evaluation flow end-to-end |
69
+ | **`aimeval.Eval(name, model, dataset, metrics, scorers=[...])`** + **`assert_test`** | One-call CI harness with direction-aware threshold checking |
70
+ | **`aimeval.metrics`** (15 typed metric handles) | `VisualFaithfulness`, `Hallucination`, `BrandSupport`, `PIIExposure`, … — each encodes its direction so `passed(score)` can't be inverted |
71
+ | **`@aimeval.trace`** + **`aimeval.span(…)`** + **`TraceCollector`** | Local span capture, parent/child nesting, sync/async/gen; no fake trace endpoint |
72
+ | **`aimeval.save_trace`** / **`load_trace`** / **`replay_trace`** | JSONL persistence + replay through a candidate model (prod-to-eval) |
73
+ | **`aimeval.enable_otel()`** | Mirror spans into Datadog / Honeycomb / Phoenix / Langfuse via OpenInference convention |
74
+ | **`client.webhooks.*`** + **`aimeval.verify_webhook(...)`** | CRUD + HMAC-SHA256 verifier (Stripe-style: constant-time + 5-min replay window) |
75
+ | **`client.runs.estimate_cost(model=, samples=)`** | Honest cost preview from your project's own run history — no stale price tables |
76
+ | **`client.regression_sets.from_run(run_id, n=20)`** | Pin the worst-scoring samples for re-test on every commit |
77
+ | **`client.audit_log()`** / **`client.usage()`** | Settings observability (compliance + quota dashboards) |
78
+ | **`client.beta.replay.*`** | Experimental surface (OpenAI/Anthropic pattern; pin SDK + read CHANGELOG before bumping) |
79
+ | **`client.compare`** / **`compare_history`** / **`search`** / **`metrics_catalog`** / **`wizard_bootstrap`** | Top-level convenience methods |
80
+ | **CLI** — `aimeval init / doctor / evaluate / run / annotations / gate / model / search / compare` | Every workflow scriptable from the terminal |
81
+
82
+ Every resource is available in both flavours: synchronous on `AIMEval` and asynchronous on `AsyncAIMEval`.
83
+
84
+ ## Configure
85
+
86
+ The SDK reads from the environment by default:
87
+
88
+ ```bash
89
+ export AIMEVAL_API_KEY="aime_sk_..."
90
+ export AIMEVAL_BASE_URL="https://app.aimeval.com"
91
+
92
+ # Optional:
93
+ export AIMEVAL_LOG=info # logfmt request logs on stderr
94
+ export AIMEVAL_OTEL=1 # mirror @trace spans to OTel
95
+ export AIMEVAL_WEBHOOK_SECRET=... # for verify_webhook
96
+ export AIMEVAL_MODEL_AUTH=... # Custom Endpoint test_connection
97
+ ```
98
+
99
+ Production-shaped API keys (`aime_sk_live_*`) passed inline (instead of via
100
+ env var) raise `AIMEvalSecurityWarning` so a leaked key surfaces before it
101
+ lands in shell history or a committed notebook. Test/sandbox keys are
102
+ silent.
103
+
104
+ `api_key` is **masked** in `__repr__`, in error messages, in CLI output,
105
+ and in the dev logger handler.
106
+
107
+ ## Selected recipes
108
+
109
+ ### CI gate that blocks merges on regression
110
+
111
+ ```python
112
+ # evals/ci.py — scaffolded by `aimeval init`
113
+ import os
+ import aimeval
114
+ from aimeval.metrics import VisualFaithfulness, Hallucination, PIIExposure
115
+
116
+ result = aimeval.Eval(
117
+ name=f"ci-{os.environ['AIMEVAL_CI_COMMIT'][:7]}",
118
+ model="m_gpt4o", dataset="d_amazon", metrics="c_standard",
119
+ gate="g_strict",
120
+ idempotency_key=os.environ["AIMEVAL_CI_COMMIT"], # auto-generated if omitted
121
+ scorers=[
122
+ VisualFaithfulness(threshold=0.80),
123
+ Hallucination(threshold=0.10),
124
+ PIIExposure(threshold=0.05),
125
+ ],
126
+ )
127
+ aimeval.assert_test(result)
128
+ ```
129
+
130
+ Pair with the bundled GitHub Action:
131
+
132
+ ```yaml
133
+ # .github/workflows/aimeval-ci.yml
134
+ - uses: rybalena/aimeval-backend/.github/actions/aimeval-eval@main
135
+ with:
136
+ script: ./evals/ci.py
137
+ api-key: ${{ secrets.AIMEVAL_API_KEY }}
138
+ aimeval-version: "0.6.0"
139
+ ```
140
+
141
+ ### pytest plugin (auto-loaded via the `pytest11` entry point)
142
+
143
+ ```python
144
+ @pytest.mark.aimeval_eval
145
+ def test_amazon_nightly():
146
+ result = aimeval.Eval(name="t", model="m", dataset="d", metrics="c",
147
+ scorers=[VisualFaithfulness(threshold=0.8)])
148
+ aimeval.assert_test(result)
149
+ ```
150
+
151
+ Failures render a dedicated "AIMEval eval failure" section with the
152
+ per-metric breakdown — no traceback noise.
153
+
154
+ ### Instrument production, replay as an eval
155
+
156
+ ```python
157
+ @aimeval.trace
158
+ def describe(image_url: str) -> str:
159
+ return my_vlm(image_url)
160
+
161
+ # In prod: capture every call
162
+ with aimeval.TraceCollector() as spans:
163
+ for url in queue:
164
+ describe(url)
165
+ aimeval.save_trace(spans, "captures/2026-06-07.jsonl")
166
+
167
+ # Later: replay through a candidate model
168
+ results = aimeval.replay_trace(
169
+ "captures/2026-06-07.jsonl",
170
+ task=lambda inp: candidate_vlm(inp["image_url"]),
171
+ )
172
+ ```
173
+
174
+ ### OpenTelemetry mirror — Datadog / Honeycomb / Phoenix / Langfuse
175
+
176
+ ```python
177
+ aimeval.enable_otel() # or AIMEVAL_OTEL=1
178
+ # Spans now also go to whichever OTLP collector OTEL_EXPORTER_OTLP_* points at.
179
+ # Attributes follow the OpenInference semantic convention so Phoenix /
180
+ # Langfuse / Patronus / LangSmith / Opik / Weave all consume them natively.
181
+ ```
182
+
183
+ ### Webhook signature verification
184
+
185
+ ```python
186
+ from aimeval import verify_webhook, WebhookVerificationError
187
+
188
+ # In your FastAPI / Flask handler:
189
+ try:
190
+ verify_webhook(
191
+ secret=os.environ["AIMEVAL_WEBHOOK_SECRET"],
192
+ body=request.body, # raw bytes!
193
+ signature=request.headers["X-AIMEval-Signature"],
194
+ timestamp=request.headers["X-AIMEval-Timestamp"],
195
+ )
196
+ except WebhookVerificationError as exc:
197
+ return Response(str(exc), status=400)
198
+ ```
199
+
200
+ Constant-time HMAC + 5-minute default tolerance window. Same protections
201
+ Stripe / GitHub ship.
202
+
203
+ ### Honest cost preview (no stale vendor price tables)
204
+
205
+ ```python
206
+ est = client.runs.estimate_cost(model="m_gpt4o", samples=5000)
207
+ print(est)
208
+ # CostEstimate(model='m_gpt4o', samples=5000, mean=$90.00,
209
+ # range=$75.00–$110.00, based_on_runs=8)
210
+ ```
211
+
212
+ Averages `cost_usd / samples_processed` across the project's most recent
213
+ completed runs of the given model. With no history yet: returns
214
+ `known=False` plus a note ("run sample_limit=5 first") instead of
215
+ fabricating a number.
216
+
217
+ ### Enterprise: corporate proxy / mTLS
218
+
219
+ ```python
220
+ import httpx
221
+ proxied = httpx.Client(
222
+ proxy="http://corp-proxy:8080", # httpx >= 0.26; the older proxies= argument was removed in httpx 0.28
223
+ verify="/etc/corp/ca.pem",
224
+ )
225
+ client = AIMEval(api_key="...", base_url="...", http_client=proxied)
226
+ ```
227
+
228
+ The SDK installs auth + UA + `base_url` **on top** of your client — no
229
+ second client is constructed behind the scenes.
230
+
231
+ ### Per-call overrides + raw response
232
+
233
+ ```python
234
+ # OpenAI/Anthropic with_options pattern — immutable copy, never mutates self.
235
+ slow_run = client.with_options(timeout=300, max_retries=10).runs.create(...)
236
+
237
+ # Read request_id + headers without losing the typed surface:
238
+ resp = client.with_raw_response.runs.retrieve("run_abc")
239
+ print(resp.headers["x-request-id"])
240
+ run = resp.parse() # same dict a normal call would return
241
+ ```
242
+
243
+ Every error also carries `exc.request_id` + `exc.response` so support
244
+ tickets are diagnosable.
245
+
246
+ ## CLI
247
+
248
+ ```bash
249
+ aimeval --install-completion bash # shell tab-completion
250
+
251
+ aimeval init my-project [--github-action] # scaffold evals/ + .env.example
252
+ aimeval doctor # env + DNS + auth + resources
253
+ aimeval evaluate --name x --model m_… --dataset d_… --collection c_…
254
+
255
+ aimeval run apply-gate / promote / unpromote / report / export-results
256
+ aimeval run estimate-cost --model M --samples 1000
257
+
258
+ aimeval annotations bootstrap / list / label / flag
259
+ aimeval gate duplicate / check-compatibility / metric-registry
260
+ aimeval model test-connection --model M # AIMEVAL_MODEL_AUTH from env
261
+
262
+ aimeval search "amazon-nightly"
263
+ aimeval compare run_baseline run_candidate
264
+ ```
265
+
266
+ Global flags: `-o json|table|markdown`, `-q` (quiet), `-p PROFILE`,
267
+ `--api-key`, `--base-url`.
268
+
269
+ ## Async
270
+
271
+ Every method has an `async` twin:
272
+
273
+ ```python
274
+ import asyncio
275
+ from aimeval import AsyncAIMEval
276
+
277
+ async def main():
278
+ async with AsyncAIMEval() as client:
279
+ results = await asyncio.gather(*[
280
+ client.runs.estimate_cost(model="m1", samples=n) for n in (100, 1000, 10000)
281
+ ])
282
+ for r in results:
283
+ print(r)
284
+
285
+ asyncio.run(main())
286
+ ```
287
+
288
+ ## Debugging — `AIMEVAL_LOG`
289
+
290
+ ```bash
291
+ $ AIMEVAL_LOG=info python evals/ci.py
292
+ 07:31:39.575 aimeval INFO op=http method=GET path=/runs/page \
293
+ status=200 request_id=req_abc duration_ms=42
294
+ 07:31:40.211 aimeval WARNING op=http method=POST path=/runs \
295
+ status=429 request_id=req_def attempt=1
296
+ ```
297
+
298
+ logfmt — `grep request_id=req_…` works straight from `cat`/`less` without
299
+ JSON parsing. Log lines are promoted to WARNING on 4xx/5xx and on every retry attempt.
300
+ Zero hot-path cost when the env var is unset.
301
+
302
+ ## Error taxonomy
303
+
304
+ ```python
305
+ from aimeval import (
306
+ AIMEvalError, APIConnectionError, APITimeoutError,
307
+ AuthenticationError, NotFoundError, ConflictError, # PermissionError_ is not re-exported by the top-level package
308
+ BadRequestError, RateLimitError, APIServerError,
309
+ WebhookVerificationError, EvalAssertionError,
310
+ )
311
+ ```
312
+
313
+ Each maps to an HTTP status range. Retries (408 / 429 / 502 / 503 / 504)
314
+ are automatic with exponential backoff + `Retry-After` honoured. Every
315
+ error carries `exc.request_id` + `exc.response`.
316
+
317
+ ## Versioning
318
+
319
+ This SDK follows the backend's API contract. Minor versions
320
+ (`0.5.x → 0.6.0`) introduce new endpoint wrappers and ergonomic surface;
321
+ breaking changes only happen at `0.x → 1.0`. `client.beta.*` is exempt —
322
+ that's the unstable surface by contract.
323
+
324
+ Current: **v0.6.0** — see [CHANGELOG.md](CHANGELOG.md).
325
+
326
+ ## More
327
+
328
+ - Recipes: [EXAMPLES.md](EXAMPLES.md)
329
+ - Backend API reference: https://app.aimeval.com/api/docs
330
+ - Source: https://github.com/rybalena/aimeval-backend
@@ -0,0 +1,191 @@
1
+ """
2
+ AIMEval Python SDK — programmatic access for CI/CD pipelines.
3
+
4
+ Quick start::
5
+
6
+ from aimeval import AIMEval
7
+
8
+ client = AIMEval(api_key="aime_sk_...", base_url="https://app.aimeval.com")
9
+
10
+ # End-to-end workflow with a live progress bar (TTY only):
11
+ run = client.evaluate(
12
+ name="nightly-regression",
13
+ model=MODEL_ID,
14
+ dataset=DATASET_ID,
15
+ metrics=COLLECTION_ID,
16
+ )
17
+ sys.exit(0 if run.passed else 1)
18
+
19
+ # Or step by step (industry-standard resource namespaces):
20
+ run = client.runs.create(name="...", model=..., dataset=..., metrics=...)
21
+ run = client.runs.wait(run.id, progress=True)
22
+
23
+ # Auto-pagination — iterate every page transparently:
24
+ for run in client.runs.list(status="completed"):
25
+ print(run.id, run.score)
26
+ """
27
+ from aimeval.client import AIMEval, AsyncAIMEval
28
+ from aimeval._otel import disable_otel, enable_otel, is_available as otel_is_available, otel_enabled
29
+ from aimeval import events
30
+ from aimeval._trace import Span, TraceCollector, records, span, trace
31
+ from aimeval._trace_io import (
32
+ ReplayResult,
33
+ load_trace,
34
+ load_trace_iter,
35
+ replay_trace,
36
+ save_trace,
37
+ )
38
+ from aimeval._webhook_verify import (
39
+ WebhookVerificationError,
40
+ construct_event,
41
+ verify_webhook,
42
+ )
43
+ from aimeval._eval import (
44
+ Eval,
45
+ EvalAssertionError,
46
+ EvalResult,
47
+ EvalScore,
48
+ assert_test,
49
+ eval_from_run,
50
+ )
51
+ from aimeval import metrics
52
+ from aimeval._exceptions import (
53
+ AIMEvalError,
54
+ AIMEvalSecurityWarning,
55
+ APIConnectionError,
56
+ APIServerError,
57
+ APITimeoutError,
58
+ AuthenticationError,
59
+ BadRequestError,
60
+ ConflictError,
61
+ NotFoundError,
62
+ RateLimitError,
63
+ )
64
+ from aimeval._streaming import (
65
+ AsyncRunSSEStream,
66
+ AsyncRunStream,
67
+ RunEvent,
68
+ RunSSEStream,
69
+ RunStream,
70
+ )
71
+ from aimeval._types import (
72
+ Annotation,
73
+ AnnotationsBootstrapResult,
74
+ AuditEntry,
75
+ CompareHistoryItem,
76
+ Comparison,
77
+ ConnectionTestResult,
78
+ CostEstimate,
79
+ Dataset,
80
+ GateCompatibility,
81
+ MetricCollection,
82
+ MetricDistribution,
83
+ MetricRegistry,
84
+ MetricRegistryEntry,
85
+ MetricSummary,
86
+ Model,
87
+ Prompt,
88
+ PromptDiff,
89
+ PromptTestResult,
90
+ PromptTestRun,
91
+ PromptVersion,
92
+ QualityGate,
93
+ RegressionSet,
94
+ Resource,
95
+ Run,
96
+ SearchHit,
97
+ SearchResults,
98
+ UploadStatus,
99
+ UploadTicket,
100
+ Usage,
101
+ Webhook,
102
+ WizardBootstrap,
103
+ )
104
+
105
+
106
+ __version__ = "0.6.0"
107
+
108
+ __all__ = [
109
+ # Clients
110
+ "AIMEval",
111
+ "AsyncAIMEval",
112
+ "__version__",
113
+ # Tracing (local span capture → eval datasets; optional OTel mirror)
114
+ "trace",
115
+ "span",
116
+ "TraceCollector",
117
+ "Span",
118
+ "records",
119
+ "events",
120
+ # Trace recording + replay (prod-to-eval pattern)
121
+ "save_trace",
122
+ "load_trace",
123
+ "load_trace_iter",
124
+ "replay_trace",
125
+ "ReplayResult",
126
+ "enable_otel",
127
+ "disable_otel",
128
+ "otel_enabled",
129
+ "otel_is_available",
130
+ # Eval harness + metrics (CI assertion surface)
131
+ "Eval",
132
+ "eval_from_run",
133
+ "assert_test",
134
+ "EvalResult",
135
+ "EvalScore",
136
+ "EvalAssertionError",
137
+ "metrics",
138
+ # Errors
139
+ "AIMEvalError",
140
+ "AIMEvalSecurityWarning",
141
+ "APIConnectionError",
142
+ "APIServerError",
143
+ "APITimeoutError",
144
+ "AuthenticationError",
145
+ "BadRequestError",
146
+ "ConflictError",
147
+ "NotFoundError",
148
+ "RateLimitError",
149
+ # Response types
150
+ "Annotation",
151
+ "AnnotationsBootstrapResult",
152
+ "AuditEntry",
153
+ "CompareHistoryItem",
154
+ "Comparison",
155
+ "ConnectionTestResult",
156
+ "CostEstimate",
157
+ "Dataset",
158
+ "GateCompatibility",
159
+ "MetricCollection",
160
+ "MetricDistribution",
161
+ "MetricRegistry",
162
+ "MetricRegistryEntry",
163
+ "MetricSummary",
164
+ "Model",
165
+ "Prompt",
166
+ "PromptDiff",
167
+ "PromptTestResult",
168
+ "PromptTestRun",
169
+ "PromptVersion",
170
+ "QualityGate",
171
+ "RegressionSet",
172
+ "Resource",
173
+ "Run",
174
+ "SearchHit",
175
+ "SearchResults",
176
+ "UploadStatus",
177
+ "UploadTicket",
178
+ "Usage",
179
+ "Webhook",
180
+ "WizardBootstrap",
181
+ # Webhook verifier (security helper)
182
+ "WebhookVerificationError",
183
+ "construct_event",
184
+ "verify_webhook",
185
+ # Streaming
186
+ "AsyncRunSSEStream",
187
+ "AsyncRunStream",
188
+ "RunEvent",
189
+ "RunSSEStream",
190
+ "RunStream",
191
+ ]
@@ -0,0 +1,61 @@
1
+ """``client.beta.*`` — experimental surface area.
2
+
3
+ Industry pattern from OpenAI / Anthropic: features that haven't earned
4
+ SemVer protection yet live under a dedicated ``beta`` namespace. This
5
+ gives them three properties at once:
6
+
7
+ 1. **Visible**: a glance at ``client.beta.<something>`` tells the
8
+ reader "this isn't stable yet — pin your SDK and read the
9
+ CHANGELOG before bumping".
10
+ 2. **Safe to remove / rename** without bumping the major version of
11
+ the SDK. The ``beta`` package itself is the contract.
12
+ 3. **Cheap to ship**: graduation to the stable surface (e.g.
13
+ ``client.replay``) is a one-line re-export when the API settles.
14
+
15
+ Currently in beta:
16
+
17
+ - :attr:`Beta.replay` — trace recording + replay helpers
18
+ (:func:`aimeval.save_trace` / :func:`aimeval.load_trace` /
19
+ :func:`aimeval.replay_trace`). Wrapped under ``client.beta.replay``
20
+ so the helpers feel discoverable from autocomplete even when
21
+ callers don't import them directly.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ from typing import TYPE_CHECKING
26
+
27
+ from aimeval import _trace_io
28
+
29
+ if TYPE_CHECKING: # pragma: no cover
30
+ from aimeval.client import AIMEval, AsyncAIMEval
31
+
32
+
33
+ class _Replay:
34
+ """Beta wrapper around :mod:`aimeval._trace_io`.
35
+
36
+ Stays opt-in via ``client.beta.replay.save_trace(...)`` for the
37
+ same reason every premium SDK quarantines unstable surface:
38
+ discoverable via autocomplete, but the rename / removal is on
39
+ a separate timeline from the stable client.
40
+ """
41
+
42
+ def __init__(self, _client: "AIMEval | AsyncAIMEval"):
43
+ # Client kept on the instance only so a future beta feature
44
+ # that *does* hit the network has something to call. The replay
45
+ # helpers themselves are pure offline.
46
+ self._client = _client
47
+
48
+ save_trace = staticmethod(_trace_io.save_trace)
49
+ load_trace = staticmethod(_trace_io.load_trace)
50
+ load_trace_iter = staticmethod(_trace_io.load_trace_iter)
51
+ replay_trace = staticmethod(_trace_io.replay_trace)
52
+
53
+
54
+ class Beta:
55
+ """Namespace for experimental features. See module docstring."""
56
+
57
+ def __init__(self, client: "AIMEval | AsyncAIMEval"):
58
+ self.replay = _Replay(client)
59
+
60
+
61
+ __all__ = ["Beta"]