arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (42)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +11 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +39 -36
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/evaluation.py +4 -4
  6. phoenix/db/insertion/helpers.py +4 -12
  7. phoenix/db/insertion/span.py +3 -3
  8. phoenix/db/models.py +1 -1
  9. phoenix/experiments/__init__.py +6 -0
  10. phoenix/experiments/evaluators/__init__.py +29 -0
  11. phoenix/experiments/evaluators/base.py +153 -0
  12. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +7 -7
  13. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +9 -9
  14. phoenix/{datasets → experiments}/evaluators/utils.py +38 -141
  15. phoenix/{datasets/experiments.py → experiments/functions.py} +248 -182
  16. phoenix/experiments/types.py +722 -0
  17. phoenix/experiments/utils.py +9 -0
  18. phoenix/server/api/context.py +2 -0
  19. phoenix/server/api/dataloaders/__init__.py +2 -0
  20. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  21. phoenix/server/api/routers/v1/__init__.py +1 -1
  22. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  23. phoenix/server/api/routers/v1/datasets.py +6 -6
  24. phoenix/server/api/routers/v1/evaluations.py +4 -11
  25. phoenix/server/api/routers/v1/experiment_evaluations.py +22 -23
  26. phoenix/server/api/routers/v1/experiment_runs.py +4 -16
  27. phoenix/server/api/routers/v1/experiments.py +5 -5
  28. phoenix/server/api/routers/v1/spans.py +6 -4
  29. phoenix/server/api/types/Experiment.py +7 -0
  30. phoenix/server/app.py +2 -0
  31. phoenix/server/static/index.js +648 -570
  32. phoenix/session/client.py +256 -85
  33. phoenix/trace/fixtures.py +6 -6
  34. phoenix/utilities/json.py +8 -8
  35. phoenix/version.py +1 -1
  36. phoenix/datasets/__init__.py +0 -0
  37. phoenix/datasets/evaluators/__init__.py +0 -18
  38. phoenix/datasets/types.py +0 -178
  39. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  40. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  41. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
  42. /phoenix/{datasets → experiments}/tracing.py +0 -0
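
The largest source change in this release is the move of the experiments code from phoenix/datasets to phoenix/experiments (items 9-17 and 36-38 above), shown in the diff below for phoenix/{datasets/experiments.py → experiments/functions.py}: run_experiment now takes evaluators as an optional third positional parameter, gains dry_run and print_summary keyword arguments, and returns a RanExperiment instead of an Experiment. A minimal usage sketch under stated assumptions — that run_experiment is importable from phoenix.experiments.functions, that px.Client().get_dataset exists in this build, and that plain functions are accepted as task and evaluators — not a definitive example:

import phoenix as px
from phoenix.experiments.functions import run_experiment  # assumed import path

def task(example):
    # Hypothetical task: echo the example input back as the task output.
    return {"echo": example.input}

def exact_match(output, expected):
    # Hypothetical evaluator: 1.0 when the task output equals the expected output.
    return float(output == expected)

dataset = px.Client().get_dataset(name="my-dataset")  # assumed client helper and dataset name
ran_experiment = run_experiment(
    dataset,
    task,
    evaluators=[exact_match],   # now an optional third positional parameter
    experiment_name="rc6-smoke-test",
    dry_run=3,                  # new: sample 3 examples deterministically, skip all server POSTs
    print_summary=True,         # new: print the returned RanExperiment summary
)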
phoenix/{datasets/experiments.py → experiments/functions.py}
@@ -3,13 +3,13 @@ import json
 from binascii import hexlify
 from contextlib import ExitStack
 from copy import deepcopy
+from dataclasses import replace
 from datetime import datetime, timezone
 from itertools import product
 from typing import (
     Any,
     Awaitable,
     Dict,
-    Iterable,
     Mapping,
     Optional,
     Sequence,
@@ -22,6 +22,7 @@ from urllib.parse import urljoin

 import httpx
 import opentelemetry.sdk.trace as trace_sdk
+import pandas as pd
 from openinference.semconv.resource import ResourceAttributes
 from openinference.semconv.trace import (
     OpenInferenceMimeTypeValues,
@@ -33,76 +34,51 @@ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExport
 from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import Span
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
-from opentelemetry.trace import Status, StatusCode
+from opentelemetry.trace import Status, StatusCode, Tracer
 from typing_extensions import TypeAlias

-from phoenix.config import (
-    get_env_client_headers,
-    get_env_collector_endpoint,
-    get_env_host,
-    get_env_port,
-)
-from phoenix.datasets.evaluators.utils import (
+from phoenix.config import get_base_url, get_env_client_headers
+from phoenix.evals.executors import get_executor_on_sync_context
+from phoenix.evals.models.rate_limiters import RateLimiter
+from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+from phoenix.experiments.evaluators import create_evaluator
+from phoenix.experiments.evaluators.base import (
     Evaluator,
-    EvaluatorName,
     ExperimentEvaluator,
-    create_evaluator,
 )
-from phoenix.datasets.tracing import capture_spans
-from phoenix.datasets.types import (
+from phoenix.experiments.tracing import capture_spans
+from phoenix.experiments.types import (
+    DRY_RUN,
     Dataset,
+    EvaluationParameters,
     EvaluationResult,
+    EvaluationSummary,
+    EvaluatorName,
     Example,
     Experiment,
     ExperimentEvaluationRun,
+    ExperimentParameters,
     ExperimentResult,
     ExperimentRun,
-    ExperimentRunId,
     ExperimentTask,
+    RanExperiment,
+    TaskSummary,
     TestCase,
+    _asdict,
+    _replace,
 )
-from phoenix.evals.executors import get_executor_on_sync_context
-from phoenix.evals.models.rate_limiters import RateLimiter
-from phoenix.evals.utils import get_tqdm_progress_bar_formatter
-from phoenix.session.session import active_session
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
 from phoenix.trace.attributes import flatten
 from phoenix.utilities.json import jsonify


-def _get_base_url() -> str:
-    host = get_env_host()
-    if host == "0.0.0.0":
-        host = "127.0.0.1"
-    base_url = get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
-    return base_url if base_url.endswith("/") else base_url + "/"
-
-
-def _get_web_base_url() -> str:
-    """Return the web UI base URL.
-
-    Returns:
-        str: the web UI base URL
-    """
-    if session := active_session():
-        return session.url
-    return _get_base_url()
-
-
-def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
-    return f"{_get_web_base_url()}datasets/{dataset_id}/compare?experimentId={experiment_id}"
-
-
-def _get_dataset_experiments_url(*, dataset_id: str) -> str:
-    return f"{_get_web_base_url()}datasets/{dataset_id}/experiments"
-
-
 def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
     headers = get_env_client_headers()
     return httpx.Client(
-        base_url=_get_base_url(),
+        base_url=get_base_url(),
         headers=headers,
     ), httpx.AsyncClient(
-        base_url=_get_base_url(),
+        base_url=get_base_url(),
         headers=headers,
     )

@@ -114,16 +90,23 @@ Evaluators: TypeAlias = Union[
 ]


+RateLimitErrors: TypeAlias = Union[Type[BaseException], Sequence[Type[BaseException]]]
+
+
 def run_experiment(
     dataset: Dataset,
     task: ExperimentTask,
+    evaluators: Optional[Evaluators] = None,
     *,
     experiment_name: Optional[str] = None,
     experiment_description: Optional[str] = None,
     experiment_metadata: Optional[Mapping[str, Any]] = None,
-    evaluators: Optional[Evaluators] = None,
-    rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
-) -> Experiment:
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+) -> RanExperiment:
+    if not dataset.examples:
+        raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
     # Add this to the params once supported in the UI
     repetitions = 1
     assert repetitions > 0, "Must run the experiment at least once."
@@ -131,44 +114,60 @@ def run_experiment(

     sync_client, async_client = _phoenix_clients()

-    experiment_response = sync_client.post(
-        f"/v1/datasets/{dataset.id}/experiments",
-        json={
-            "version-id": dataset.version_id,
-            "name": experiment_name,
-            "description": experiment_description,
-            "metadata": experiment_metadata,
-            "repetitions": repetitions,
-        },
-    )
-    experiment_response.raise_for_status()
-    exp_json = experiment_response.json()
-    experiment_id = exp_json["id"]
-    project_name = exp_json["project_name"]
+    payload = {
+        "version_id": dataset.version_id,
+        "name": experiment_name,
+        "description": experiment_description,
+        "metadata": experiment_metadata,
+        "repetitions": repetitions,
+    }
+    if not dry_run:
+        experiment_response = sync_client.post(
+            f"/v1/datasets/{dataset.id}/experiments",
+            json=payload,
+        )
+        experiment_response.raise_for_status()
+        exp_json = experiment_response.json()["data"]
+        project_name = exp_json["project_name"]
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=exp_json["id"],
+            project_name=project_name,
+        )
+    else:
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=DRY_RUN,
+            project_name="",
+        )

-    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
-    tracer_provider = trace_sdk.TracerProvider(resource=resource)
-    tracer_provider.add_span_processor(
-        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
-    )
-    tracer = tracer_provider.get_tracer(__name__)
+    tracer, resource = _get_tracer(experiment.project_name)
     root_span_name = f"Task: {_get_task_name(task)}"
     root_span_kind = CHAIN

-    dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
-    experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
     print("🧪 Experiment started.")
-    print(f"📺 View dataset experiments: {dataset_experiments_url}")
-    print(f"🔗 View this experiment: {experiment_compare_url}")
-
-    errors: Tuple[Optional[Type[BaseException]], ...]
-    if not hasattr(rate_limit_errors, "__iter__"):
-        errors = (rate_limit_errors,)
+    if dry_run:
+        examples = {
+            (ex := dataset[i]).id: ex
+            for i in pd.Series(range(len(dataset)))
+            .sample(min(len(dataset), int(dry_run)), random_state=42)
+            .sort_values()
+        }
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+        dataset = replace(dataset, examples=examples)
     else:
-        rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
-        errors = rate_limit_errors
-
-    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors]
+        dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+        experiment_compare_url = get_experiment_url(
+            dataset_id=dataset.id,
+            experiment_id=experiment.id,
+        )
+        print(f"📺 View dataset experiments: {dataset_experiments_url}")
+        print(f"🔗 View this experiment: {experiment_compare_url}")

     def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
         example, repetition_number = test_case.example, test_case.repetition_number
@@ -193,6 +192,7 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+            output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
             if result := ExperimentResult(result=output) if output is not None else None:
@@ -208,21 +208,21 @@ def run_experiment(
         assert isinstance(
             output, (dict, list, str, int, float, bool, type(None))
         ), "Output must be JSON serializable"
-        experiment_run = ExperimentRun(
+        exp_run = ExperimentRun(
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
-            experiment_id=experiment_id,
+            experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
             output=result,
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
-        resp = sync_client.post(
-            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
-        )
-        resp.raise_for_status()
-        return experiment_run
+        if not dry_run:
+            resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run

     async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
         example, repetition_number = test_case.example, test_case.repetition_number
@@ -247,6 +247,7 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+            output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
             if result := ExperimentResult(result=output) if output is not None else None:
@@ -262,21 +263,31 @@ def run_experiment(
         assert isinstance(
             output, (dict, list, str, int, float, bool, type(None))
         ), "Output must be JSON serializable"
-        experiment_run = ExperimentRun(
+        exp_run = ExperimentRun(
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
-            experiment_id=experiment_id,
+            experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
             output=result,
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
         )
-        resp = await async_client.post(
-            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
-        )
-        resp.raise_for_status()
-        return experiment_run
+        if not dry_run:
+            resp = await async_client.post(
+                f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run)
+            )
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not hasattr(rate_limit_errors, "__iter__"):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]

     rate_limited_sync_run_experiment = functools.reduce(
         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
@@ -295,84 +306,97 @@ def run_experiment(
     )

     test_cases = [
-        TestCase(example=ex, repetition_number=rep)
-        for ex, rep in product(dataset.examples, range(1, repetitions + 1))
+        TestCase(example=deepcopy(ex), repetition_number=rep)
+        for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
     ]
-    _, _execution_details = executor.run(test_cases)
-    experiment = Experiment(
-        id=experiment_id,
-        dataset_id=dataset.id,
-        dataset_version_id=dataset.version_id,
-        project_name=project_name,
-    )
-
+    task_runs, _execution_details = executor.run(test_cases)
     print("✅ Task runs completed.")
-
+    params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+    task_summary = TaskSummary.from_task_runs(params, task_runs)
+    ran_experiment: RanExperiment = object.__new__(RanExperiment)
+    ran_experiment.__init__(  # type: ignore[misc]
+        params=params,
+        dataset=dataset,
+        runs={r.id: r for r in task_runs},
+        task_summary=task_summary,
+        **_asdict(experiment),
+    )
     if evaluators_by_name:
-        _evaluate_experiment(
-            experiment,
+        return evaluate_experiment(
+            ran_experiment,
             evaluators=evaluators_by_name,
-            dataset_examples=dataset.examples,
-            clients=(sync_client, async_client),
+            dry_run=dry_run,
+            print_summary=print_summary,
+            rate_limit_errors=rate_limit_errors,
         )
-
-    return experiment
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment


 def evaluate_experiment(
     experiment: Experiment,
-    evaluators: Union[
-        ExperimentEvaluator,
-        Sequence[ExperimentEvaluator],
-        Mapping[EvaluatorName, ExperimentEvaluator],
-    ],
-) -> None:
+    evaluators: Evaluators,
+    *,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+) -> RanExperiment:
+    if not dry_run and _is_dry_run(experiment):
+        dry_run = True
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    if not evaluators_by_name:
+        raise ValueError("Must specify at least one Evaluator")
     sync_client, async_client = _phoenix_clients()
     dataset_id = experiment.dataset_id
     dataset_version_id = experiment.dataset_version_id
-
-    dataset_examples = [
-        Example.from_dict(ex)
-        for ex in (
+    if isinstance(experiment, RanExperiment):
+        ran_experiment: RanExperiment = experiment
+    else:
+        dataset = Dataset.from_dict(
             sync_client.get(
                 f"/v1/datasets/{dataset_id}/examples",
-                params={"version-id": str(dataset_version_id)},
-            )
-            .json()
-            .get("data", {})
-            .get("examples", [])
+                params={"version_id": str(dataset_version_id)},
+            ).json()["data"]
         )
-    ]
-    _evaluate_experiment(
-        experiment,
-        evaluators=evaluators,
-        dataset_examples=dataset_examples,
-        clients=(sync_client, async_client),
-    )
-
-
-def _evaluate_experiment(
-    experiment: Experiment,
-    *,
-    evaluators: Evaluators,
-    dataset_examples: Iterable[Example],
-    clients: Tuple[httpx.Client, httpx.AsyncClient],
-) -> None:
-    evaluators_by_name = _evaluators_by_name(evaluators)
-    if not evaluators_by_name:
-        raise ValueError("Must specify at least one Evaluator")
-    experiment_id = experiment.id
-    sync_client, async_client = clients
-    experiment_runs = [
-        ExperimentRun.from_dict(exp_run)
-        for exp_run in sync_client.get(f"/v1/experiments/{experiment_id}/runs").json()
-    ]
-
+        if not dataset.examples:
+            raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+        experiment_runs = tuple(
+            ExperimentRun.from_dict(exp_run)
+            for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+        )
+        if not experiment_runs:
+            raise ValueError("Experiment has not been run")
+        params = ExperimentParameters(n_examples=len(dataset.examples))
+        task_summary = TaskSummary.from_task_runs(params, experiment_runs)
+        ran_experiment = object.__new__(RanExperiment)
+        ran_experiment.__init__(  # type: ignore[misc]
+            dataset=dataset,
+            params=params,
+            runs=experiment_runs,
+            task_summary=task_summary,
+            **_asdict(experiment),
+        )
+    print("🧠 Evaluation started.")
+    examples = ran_experiment.dataset.examples
+    if dry_run:
+        if not _is_dry_run(ran_experiment):
+            dataset = ran_experiment.dataset
+            examples = {
+                (ex := dataset[i]).id: ex
+                for i in pd.Series(range(len(dataset)))
+                .sample(min(len(dataset), int(dry_run)), random_state=42)
+                .sort_values()
+            }
+            dataset = replace(ran_experiment.dataset, examples=examples)
+            ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
     # not all dataset examples have associated experiment runs, so we need to pair them up
     example_run_pairs = []
-    examples_by_id = {example.id: example for example in dataset_examples}
-    for exp_run in experiment_runs:
-        example = examples_by_id.get(exp_run.dataset_example_id)
+    examples = ran_experiment.dataset.examples
+    for exp_run in ran_experiment.runs.values():
+        example = examples.get(exp_run.dataset_example_id)
         if example:
             example_run_pairs.append((deepcopy(example), exp_run))
     evaluation_input = [
@@ -380,13 +404,7 @@ def _evaluate_experiment(
         for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
     ]

-    project_name = "evaluators"
-    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
-    tracer_provider = trace_sdk.TracerProvider(resource=resource)
-    tracer_provider.add_span_processor(
-        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
-    )
-    tracer = tracer_provider.get_tracer(__name__)
+    tracer, resource = _get_tracer(None if dry_run else "evaluators")
     root_span_kind = EVALUATOR

     def sync_evaluate_run(
@@ -404,7 +422,7 @@ def _evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = evaluator.evaluate(
-                    output=None if experiment_run.output is None else experiment_run.output.result,
+                    output=experiment_run.task_output,
                     expected=example.output,
                     input=example.input,
                     metadata=example.metadata,
418
436
  span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
419
437
  span.set_status(status)
420
438
 
421
- evaluator_payload = ExperimentEvaluationRun(
422
- experiment_run_id=cast(ExperimentRunId, experiment_run.id),
439
+ eval_run = ExperimentEvaluationRun(
440
+ experiment_run_id=experiment_run.id,
423
441
  start_time=_decode_unix_nano(cast(int, span.start_time)),
424
442
  end_time=_decode_unix_nano(cast(int, span.end_time)),
425
443
  name=evaluator.name,
@@ -428,9 +446,11 @@ def _evaluate_experiment(
428
446
  result=result,
429
447
  trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
430
448
  )
431
- resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(evaluator_payload))
432
- resp.raise_for_status()
433
- return evaluator_payload
449
+ if not dry_run:
450
+ resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
451
+ resp.raise_for_status()
452
+ eval_run = replace(eval_run, id=resp.json()["data"]["id"])
453
+ return eval_run
434
454
 
435
455
  async def async_evaluate_run(
436
456
  obj: Tuple[Example, ExperimentRun, Evaluator],
@@ -447,7 +467,7 @@ def _evaluate_experiment(
447
467
  stack.enter_context(capture_spans(resource))
448
468
  try:
449
469
  result = await evaluator.async_evaluate(
450
- output=None if experiment_run.output is None else experiment_run.output.result,
470
+ output=experiment_run.task_output,
451
471
  expected=example.output,
452
472
  input=example.input,
453
473
  metadata=example.metadata,
@@ -461,8 +481,8 @@ def _evaluate_experiment(
461
481
  span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
462
482
  span.set_status(status)
463
483
 
464
- evaluator_payload = ExperimentEvaluationRun(
465
- experiment_run_id=cast(ExperimentRunId, experiment_run.id),
484
+ eval_run = ExperimentEvaluationRun(
485
+ experiment_run_id=experiment_run.id,
466
486
  start_time=_decode_unix_nano(cast(int, span.start_time)),
467
487
  end_time=_decode_unix_nano(cast(int, span.end_time)),
468
488
  name=evaluator.name,
@@ -471,22 +491,47 @@ def _evaluate_experiment(
471
491
  result=result,
472
492
  trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
473
493
  )
474
- resp = await async_client.post(
475
- "/v1/experiment_evaluations", json=jsonify(evaluator_payload)
476
- )
477
- resp.raise_for_status()
478
- return evaluator_payload
494
+ if not dry_run:
495
+ resp = await async_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
496
+ resp.raise_for_status()
497
+ eval_run = replace(eval_run, id=resp.json()["data"]["id"])
498
+ return eval_run
499
+
500
+ _errors: Tuple[Type[BaseException], ...]
501
+ if not hasattr(rate_limit_errors, "__iter__"):
502
+ _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
503
+ else:
504
+ rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
505
+ _errors = tuple(filter(None, rate_limit_errors))
506
+ rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
507
+
508
+ rate_limited_sync_evaluate_run = functools.reduce(
509
+ lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
510
+ )
511
+ rate_limited_async_evaluate_run = functools.reduce(
512
+ lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
513
+ )
479
514
 
480
515
  executor = get_executor_on_sync_context(
481
- sync_evaluate_run,
482
- async_evaluate_run,
516
+ rate_limited_sync_evaluate_run,
517
+ rate_limited_async_evaluate_run,
483
518
  max_retries=0,
484
519
  exit_on_error=False,
485
520
  fallback_return_value=None,
486
521
  tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
487
522
  )
488
- print("🧠 Evaluation started.")
489
- _, _execution_details = executor.run(evaluation_input)
523
+ eval_runs, _execution_details = executor.run(evaluation_input)
524
+ eval_summary = EvaluationSummary.from_eval_runs(
525
+ EvaluationParameters(
526
+ eval_names=frozenset(evaluators_by_name),
527
+ exp_params=ran_experiment.params,
528
+ ),
529
+ *eval_runs,
530
+ )
531
+ ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
532
+ if print_summary:
533
+ print(ran_experiment)
534
+ return ran_experiment
490
535
 
491
536
 
492
537
  def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
@@ -519,6 +564,18 @@ def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Eva
519
564
  return evaluators_by_name
520
565
 
521
566
 
567
+ def _get_tracer(project_name: Optional[str] = None) -> Tuple[Tracer, Resource]:
568
+ resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
569
+ tracer_provider = trace_sdk.TracerProvider(resource=resource)
570
+ span_processor = (
571
+ SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
572
+ if project_name
573
+ else _NoOpProcessor()
574
+ )
575
+ tracer_provider.add_span_processor(span_processor)
576
+ return tracer_provider.get_tracer(__name__), resource
577
+
578
+
522
579
  def _str_trace_id(id_: int) -> str:
523
580
  return hexlify(id_.to_bytes(16, "big")).decode()
524
581
 
@@ -539,6 +596,15 @@ def _get_task_name(task: ExperimentTask) -> str:
539
596
  return str(task)
540
597
 
541
598
 
599
+ def _is_dry_run(obj: Any) -> bool:
600
+ return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
601
+
602
+
603
+ class _NoOpProcessor(trace_sdk.SpanProcessor):
604
+ def force_flush(self, *_: Any) -> bool:
605
+ return True
606
+
607
+
542
608
  INPUT_VALUE = SpanAttributes.INPUT_VALUE
543
609
  OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
544
610
  INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
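
Since evaluate_experiment above now accepts a RanExperiment directly, plus dry_run and rate_limit_errors keyword arguments, here is a hedged follow-up sketch that re-scores the result of the earlier example with an extra evaluator; the evaluator and its signature are hypothetical, and rate_limit_errors is left as None rather than guessing a provider-specific exception type:

from phoenix.experiments.functions import evaluate_experiment  # assumed import path

def contains_answer(output, expected):
    # Hypothetical evaluator: does the expected text appear anywhere in the task output?
    return float(str(expected) in str(output))

ran_experiment = evaluate_experiment(
    ran_experiment,                                  # the RanExperiment returned by run_experiment
    evaluators={"contains_answer": contains_answer},
    dry_run=True,        # evaluate locally; nothing is POSTed to /v1/experiment_evaluations
    rate_limit_errors=None,  # or an exception type / sequence of types to back off on
    print_summary=True,
)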