arize-phoenix 4.5.0__py3-none-any.whl → 4.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (123)
  1. {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.2.dist-info}/METADATA +16 -8
  2. {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.2.dist-info}/RECORD +122 -58
  3. {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.2.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +0 -27
  5. phoenix/config.py +42 -7
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +64 -62
  8. phoenix/core/model_schema_adapter.py +27 -25
  9. phoenix/datetime_utils.py +4 -0
  10. phoenix/db/bulk_inserter.py +54 -14
  11. phoenix/db/insertion/dataset.py +237 -0
  12. phoenix/db/insertion/evaluation.py +10 -10
  13. phoenix/db/insertion/helpers.py +17 -14
  14. phoenix/db/insertion/span.py +3 -3
  15. phoenix/db/migrations/types.py +29 -0
  16. phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
  17. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
  18. phoenix/db/models.py +236 -4
  19. phoenix/experiments/__init__.py +6 -0
  20. phoenix/experiments/evaluators/__init__.py +29 -0
  21. phoenix/experiments/evaluators/base.py +153 -0
  22. phoenix/experiments/evaluators/code_evaluators.py +99 -0
  23. phoenix/experiments/evaluators/llm_evaluators.py +244 -0
  24. phoenix/experiments/evaluators/utils.py +186 -0
  25. phoenix/experiments/functions.py +757 -0
  26. phoenix/experiments/tracing.py +85 -0
  27. phoenix/experiments/types.py +753 -0
  28. phoenix/experiments/utils.py +24 -0
  29. phoenix/inferences/fixtures.py +23 -23
  30. phoenix/inferences/inferences.py +7 -7
  31. phoenix/inferences/validation.py +1 -1
  32. phoenix/server/api/context.py +20 -0
  33. phoenix/server/api/dataloaders/__init__.py +20 -0
  34. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  35. phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
  36. phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
  37. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
  38. phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
  39. phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  40. phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
  41. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  42. phoenix/server/api/dataloaders/span_descendants.py +2 -3
  43. phoenix/server/api/dataloaders/span_projects.py +33 -0
  44. phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
  45. phoenix/server/api/helpers/dataset_helpers.py +179 -0
  46. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  47. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  48. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  49. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  50. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  51. phoenix/server/api/input_types/DatasetSort.py +17 -0
  52. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  53. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  54. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  55. phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
  56. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  57. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  58. phoenix/server/api/mutations/__init__.py +13 -0
  59. phoenix/server/api/mutations/auth.py +11 -0
  60. phoenix/server/api/mutations/dataset_mutations.py +520 -0
  61. phoenix/server/api/mutations/experiment_mutations.py +65 -0
  62. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
  63. phoenix/server/api/mutations/project_mutations.py +47 -0
  64. phoenix/server/api/openapi/__init__.py +0 -0
  65. phoenix/server/api/openapi/main.py +6 -0
  66. phoenix/server/api/openapi/schema.py +16 -0
  67. phoenix/server/api/queries.py +503 -0
  68. phoenix/server/api/routers/v1/__init__.py +77 -2
  69. phoenix/server/api/routers/v1/dataset_examples.py +178 -0
  70. phoenix/server/api/routers/v1/datasets.py +965 -0
  71. phoenix/server/api/routers/v1/evaluations.py +8 -13
  72. phoenix/server/api/routers/v1/experiment_evaluations.py +143 -0
  73. phoenix/server/api/routers/v1/experiment_runs.py +220 -0
  74. phoenix/server/api/routers/v1/experiments.py +302 -0
  75. phoenix/server/api/routers/v1/spans.py +9 -5
  76. phoenix/server/api/routers/v1/traces.py +1 -4
  77. phoenix/server/api/schema.py +2 -303
  78. phoenix/server/api/types/AnnotatorKind.py +10 -0
  79. phoenix/server/api/types/Cluster.py +19 -19
  80. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  81. phoenix/server/api/types/Dataset.py +282 -63
  82. phoenix/server/api/types/DatasetExample.py +85 -0
  83. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  84. phoenix/server/api/types/DatasetVersion.py +14 -0
  85. phoenix/server/api/types/Dimension.py +30 -29
  86. phoenix/server/api/types/EmbeddingDimension.py +40 -34
  87. phoenix/server/api/types/Event.py +16 -16
  88. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  89. phoenix/server/api/types/Experiment.py +147 -0
  90. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  91. phoenix/server/api/types/ExperimentComparison.py +19 -0
  92. phoenix/server/api/types/ExperimentRun.py +91 -0
  93. phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
  94. phoenix/server/api/types/Inferences.py +80 -0
  95. phoenix/server/api/types/InferencesRole.py +23 -0
  96. phoenix/server/api/types/Model.py +43 -42
  97. phoenix/server/api/types/Project.py +26 -12
  98. phoenix/server/api/types/Span.py +79 -2
  99. phoenix/server/api/types/TimeSeries.py +6 -6
  100. phoenix/server/api/types/Trace.py +15 -4
  101. phoenix/server/api/types/UMAPPoints.py +1 -1
  102. phoenix/server/api/types/node.py +5 -111
  103. phoenix/server/api/types/pagination.py +10 -52
  104. phoenix/server/app.py +103 -49
  105. phoenix/server/main.py +49 -27
  106. phoenix/server/openapi/docs.py +3 -0
  107. phoenix/server/static/index.js +2300 -1294
  108. phoenix/server/templates/index.html +1 -0
  109. phoenix/services.py +15 -15
  110. phoenix/session/client.py +581 -22
  111. phoenix/session/session.py +47 -37
  112. phoenix/trace/exporter.py +14 -9
  113. phoenix/trace/fixtures.py +133 -7
  114. phoenix/trace/schemas.py +1 -2
  115. phoenix/trace/span_evaluations.py +3 -3
  116. phoenix/trace/trace_dataset.py +6 -6
  117. phoenix/utilities/json.py +61 -0
  118. phoenix/utilities/re.py +50 -0
  119. phoenix/version.py +1 -1
  120. phoenix/server/api/types/DatasetRole.py +0 -23
  121. {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.2.dist-info}/licenses/IP_NOTICE +0 -0
  122. {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.2.dist-info}/licenses/LICENSE +0 -0
  123. /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/experiments/functions.py (new file)
@@ -0,0 +1,757 @@
+ import functools
+ import inspect
+ import json
+ import traceback
+ from binascii import hexlify
+ from contextlib import ExitStack
+ from copy import deepcopy
+ from dataclasses import replace
+ from datetime import datetime, timezone
+ from itertools import product
+ from typing import (
+     Any,
+     Awaitable,
+     Dict,
+     Literal,
+     Mapping,
+     Optional,
+     Sequence,
+     Tuple,
+     Type,
+     Union,
+     cast,
+ )
+ from urllib.parse import urljoin
+
+ import httpx
+ import opentelemetry.sdk.trace as trace_sdk
+ import pandas as pd
+ from openinference.semconv.resource import ResourceAttributes
+ from openinference.semconv.trace import (
+     OpenInferenceMimeTypeValues,
+     OpenInferenceSpanKindValues,
+     SpanAttributes,
+ )
+ from opentelemetry.context import Context
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import Span
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+ from opentelemetry.trace import Status, StatusCode, Tracer
+ from typing_extensions import TypeAlias
+
+ from phoenix.config import get_base_url, get_env_client_headers
+ from phoenix.evals.executors import get_executor_on_sync_context
+ from phoenix.evals.models.rate_limiters import RateLimiter
+ from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+ from phoenix.experiments.evaluators import create_evaluator
+ from phoenix.experiments.evaluators.base import (
+     Evaluator,
+     ExperimentEvaluator,
+ )
+ from phoenix.experiments.tracing import capture_spans
+ from phoenix.experiments.types import (
+     DRY_RUN,
+     Dataset,
+     EvaluationParameters,
+     EvaluationResult,
+     EvaluationSummary,
+     EvaluatorName,
+     Example,
+     Experiment,
+     ExperimentEvaluationRun,
+     ExperimentParameters,
+     ExperimentRun,
+     ExperimentRunOutput,
+     ExperimentTask,
+     RanExperiment,
+     TaskSummary,
+     TestCase,
+     _asdict,
+     _replace,
+ )
+ from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url, get_func_name
+ from phoenix.trace.attributes import flatten
+ from phoenix.utilities.json import jsonify
+
+
+ def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
+     headers = get_env_client_headers()
+     return httpx.Client(
+         base_url=get_base_url(),
+         headers=headers,
+     ), httpx.AsyncClient(
+         base_url=get_base_url(),
+         headers=headers,
+     )
+
+
+ Evaluators: TypeAlias = Union[
+     ExperimentEvaluator,
+     Sequence[ExperimentEvaluator],
+     Mapping[EvaluatorName, ExperimentEvaluator],
+ ]
+
+
+ RateLimitErrors: TypeAlias = Union[Type[BaseException], Sequence[Type[BaseException]]]
+
+
+ def run_experiment(
+     dataset: Dataset,
+     task: ExperimentTask,
+     evaluators: Optional[Evaluators] = None,
+     *,
+     experiment_name: Optional[str] = None,
+     experiment_description: Optional[str] = None,
+     experiment_metadata: Optional[Mapping[str, Any]] = None,
+     rate_limit_errors: Optional[RateLimitErrors] = None,
+     dry_run: Union[bool, int] = False,
+     print_summary: bool = True,
+ ) -> RanExperiment:
+     """
+     Runs an experiment using a given dataset of examples.
+
+     An experiment is a user-defined task that runs on each example in a dataset. The results from
+     each experiment can be evaluated using any number of evaluators to measure the behavior of the
+     task. The experiment and evaluation results are stored in the Phoenix database for comparison
+     and analysis.
+
+     A `task` is either a synchronous or asynchronous function that returns a JSON serializable
+     output. If the `task` is a function of one argument then that argument will be bound to the
+     `input` field of the dataset example. Alternatively, the `task` can be a function of any
+     combination of specific argument names that will be bound to special values:
+         `input`: The input field of the dataset example
+         `expected`: The expected or reference output of the dataset example
+         `reference`: An alias for `expected`
+         `metadata`: Metadata associated with the dataset example
+         `example`: The dataset `Example` object with all associated fields
+
+     An `evaluator` is either a synchronous or asynchronous function that returns either a boolean
+     or numeric "score". If the `evaluator` is a function of one argument then that argument will be
+     bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any
+     combination of specific argument names that will be bound to special values:
+         `input`: The input field of the dataset example
+         `output`: The output of the task
+         `expected`: The expected or reference output of the dataset example
+         `reference`: An alias for `expected`
+         `metadata`: Metadata associated with the dataset example
+
+     Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module.
+
+     Args:
+         dataset (Dataset): The dataset on which to run the experiment.
+         task (ExperimentTask): The task to run on each example in the dataset.
+         evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to
+             evaluate the results of the experiment. Defaults to None.
+         experiment_name (Optional[str]): The name of the experiment. Defaults to None.
+         experiment_description (Optional[str]): A description of the experiment. Defaults to None.
+         experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the
+             experiment. Defaults to None.
+         rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or
+             sequence of exceptions to adaptively throttle on. Defaults to None.
+         dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will
+             not be recorded in Phoenix. If True, the experiment will run on a random dataset
+             example. If an integer, the experiment will run on a random sample of the dataset
+             examples of the given size. Defaults to False.
+         print_summary (bool): Whether to print a summary of the experiment and evaluation results.
+             Defaults to True.
+
+     Returns:
+         RanExperiment: The results of the experiment and evaluation. Additional evaluations can be
+             added to the experiment using the `evaluate_experiment` function.
+     """
+     task_signature = inspect.signature(task)
+     _validate_task_signature(task_signature)
+
+     if not dataset.examples:
+         raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
+     # Add this to the params once supported in the UI
+     repetitions = 1
+     assert repetitions > 0, "Must run the experiment at least once."
+     evaluators_by_name = _evaluators_by_name(evaluators)
+
+     sync_client, async_client = _phoenix_clients()
+
+     payload = {
+         "version_id": dataset.version_id,
+         "name": experiment_name,
+         "description": experiment_description,
+         "metadata": experiment_metadata,
+         "repetitions": repetitions,
+     }
+     if not dry_run:
+         experiment_response = sync_client.post(
+             f"/v1/datasets/{dataset.id}/experiments",
+             json=payload,
+         )
+         experiment_response.raise_for_status()
+         exp_json = experiment_response.json()["data"]
+         project_name = exp_json["project_name"]
+         experiment = Experiment(
+             dataset_id=dataset.id,
+             dataset_version_id=dataset.version_id,
+             repetitions=repetitions,
+             id=exp_json["id"],
+             project_name=project_name,
+         )
+     else:
+         experiment = Experiment(
+             dataset_id=dataset.id,
+             dataset_version_id=dataset.version_id,
+             repetitions=repetitions,
+             id=DRY_RUN,
+             project_name="",
+         )
+
+     tracer, resource = _get_tracer(experiment.project_name)
+     root_span_name = f"Task: {get_func_name(task)}"
+     root_span_kind = CHAIN
+
+     print("🧪 Experiment started.")
+     if dry_run:
+         examples = {
+             (ex := dataset[i]).id: ex
+             for i in pd.Series(range(len(dataset)))
+             .sample(min(len(dataset), int(dry_run)), random_state=42)
+             .sort_values()
+         }
+         id_selection = "\n".join(examples)
+         print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+         dataset = replace(dataset, examples=examples)
+     else:
+         dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+         experiment_compare_url = get_experiment_url(
+             dataset_id=dataset.id,
+             experiment_id=experiment.id,
+         )
+         print(f"📺 View dataset experiments: {dataset_experiments_url}")
+         print(f"🔗 View this experiment: {experiment_compare_url}")
+
+     def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 bound_task_args = _bind_task_signature(task_signature, example)
+                 _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                 if isinstance(_output, Awaitable):
+                     sync_error_message = (
+                         "Task is async and cannot be run within an existing event loop. "
+                         "Consider the following options:\n\n"
+                         "1. Pass in a synchronous task callable.\n"
+                         "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                     )
+                     raise RuntimeError(sync_error_message)
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=repetition_number,
+                     kind="task",
+                 )
+             output = jsonify(output)
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if output is not None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+             span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         exp_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment.id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             experiment_run_output=ExperimentRunOutput(task_output=output),
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+             resp.raise_for_status()
+             exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+         return exp_run
+
+     async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 bound_task_args = _bind_task_signature(task_signature, example)
+                 _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                 if isinstance(_output, Awaitable):
+                     output = await _output
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=repetition_number,
+                     kind="task",
+                 )
+             output = jsonify(output)
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if output is not None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         exp_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment.id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             experiment_run_output=ExperimentRunOutput(task_output=output),
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = await async_client.post(
+                 f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run)
+             )
+             resp.raise_for_status()
+             exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+         return exp_run
+
+     _errors: Tuple[Type[BaseException], ...]
+     if not hasattr(rate_limit_errors, "__iter__"):
+         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+     else:
+         rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+         _errors = tuple(filter(None, rate_limit_errors))
+     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+     rate_limited_sync_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
+     )
+     rate_limited_async_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
+     )
+
+     executor = get_executor_on_sync_context(
+         rate_limited_sync_run_experiment,
+         rate_limited_async_run_experiment,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+         tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+     )
+
+     test_cases = [
+         TestCase(example=deepcopy(ex), repetition_number=rep)
+         for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
+     ]
+     task_runs, _execution_details = executor.run(test_cases)
+     print("✅ Task runs completed.")
+     params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+     task_summary = TaskSummary.from_task_runs(params, task_runs)
+     ran_experiment: RanExperiment = object.__new__(RanExperiment)
+     ran_experiment.__init__(  # type: ignore[misc]
+         params=params,
+         dataset=dataset,
+         runs={r.id: r for r in task_runs},
+         task_summary=task_summary,
+         **_asdict(experiment),
+     )
+     if evaluators_by_name:
+         return evaluate_experiment(
+             ran_experiment,
+             evaluators=evaluators_by_name,
+             dry_run=dry_run,
+             print_summary=print_summary,
+             rate_limit_errors=rate_limit_errors,
+         )
+     if print_summary:
+         print(ran_experiment)
+     return ran_experiment
+
+
+ def evaluate_experiment(
+     experiment: Experiment,
+     evaluators: Evaluators,
+     *,
+     dry_run: Union[bool, int] = False,
+     print_summary: bool = True,
+     rate_limit_errors: Optional[RateLimitErrors] = None,
+ ) -> RanExperiment:
+     if not dry_run and _is_dry_run(experiment):
+         dry_run = True
+     evaluators_by_name = _evaluators_by_name(evaluators)
+     if not evaluators_by_name:
+         raise ValueError("Must specify at least one Evaluator")
+     sync_client, async_client = _phoenix_clients()
+     dataset_id = experiment.dataset_id
+     dataset_version_id = experiment.dataset_version_id
+     if isinstance(experiment, RanExperiment):
+         ran_experiment: RanExperiment = experiment
+     else:
+         dataset = Dataset.from_dict(
+             sync_client.get(
+                 f"/v1/datasets/{dataset_id}/examples",
+                 params={"version_id": str(dataset_version_id)},
+             ).json()["data"]
+         )
+         if not dataset.examples:
+             raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+         experiment_runs = tuple(
+             ExperimentRun.from_dict(exp_run)
+             for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+         )
+         if not experiment_runs:
+             raise ValueError("Experiment has not been run")
+         params = ExperimentParameters(n_examples=len(dataset.examples))
+         task_summary = TaskSummary.from_task_runs(params, experiment_runs)
+         ran_experiment = object.__new__(RanExperiment)
+         ran_experiment.__init__(  # type: ignore[misc]
+             dataset=dataset,
+             params=params,
+             runs=experiment_runs,
+             task_summary=task_summary,
+             **_asdict(experiment),
+         )
+     print("🧠 Evaluation started.")
+     examples = ran_experiment.dataset.examples
+     if dry_run:
+         if not _is_dry_run(ran_experiment):
+             dataset = ran_experiment.dataset
+             examples = {
+                 (ex := dataset[i]).id: ex
+                 for i in pd.Series(range(len(dataset)))
+                 .sample(min(len(dataset), int(dry_run)), random_state=42)
+                 .sort_values()
+             }
+             dataset = replace(ran_experiment.dataset, examples=examples)
+             ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+         id_selection = "\n".join(examples)
+         print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+     # not all dataset examples have associated experiment runs, so we need to pair them up
+     example_run_pairs = []
+     examples = ran_experiment.dataset.examples
+     for exp_run in ran_experiment.runs.values():
+         example = examples.get(exp_run.dataset_example_id)
+         if example:
+             example_run_pairs.append((deepcopy(example), exp_run))
+     evaluation_input = [
+         (example, run, evaluator)
+         for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
+     ]
+
+     tracer, resource = _get_tracer(None if dry_run else "evaluators")
+     root_span_kind = EVALUATOR
+
+     def sync_evaluate_run(
+         obj: Tuple[Example, ExperimentRun, Evaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {evaluator.name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 result = evaluator.evaluate(
+                     output=experiment_run.output,
+                     expected=example.output,
+                     reference=example.output,
+                     input=example.input,
+                     metadata=example.metadata,
+                 )
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=experiment_run.repetition_number,
+                     kind="evaluator",
+                 )
+             if result:
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         eval_run = ExperimentEvaluationRun(
+             experiment_run_id=experiment_run.id,
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+             resp.raise_for_status()
+             eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+         return eval_run
+
+     async def async_evaluate_run(
+         obj: Tuple[Example, ExperimentRun, Evaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {evaluator.name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 result = await evaluator.async_evaluate(
+                     output=experiment_run.output,
+                     expected=example.output,
+                     reference=example.output,
+                     input=example.input,
+                     metadata=example.metadata,
+                 )
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+                 _print_experiment_error(
+                     exc,
+                     example_id=example.id,
+                     repetition_number=experiment_run.repetition_number,
+                     kind="evaluator",
+                 )
+             if result:
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         eval_run = ExperimentEvaluationRun(
+             experiment_run_id=experiment_run.id,
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         if not dry_run:
+             resp = await async_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+             resp.raise_for_status()
+             eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+         return eval_run
+
+     _errors: Tuple[Type[BaseException], ...]
+     if not hasattr(rate_limit_errors, "__iter__"):
+         _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+     else:
+         rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+         _errors = tuple(filter(None, rate_limit_errors))
+     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+     rate_limited_sync_evaluate_run = functools.reduce(
+         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
+     )
+     rate_limited_async_evaluate_run = functools.reduce(
+         lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
+     )
+
+     executor = get_executor_on_sync_context(
+         rate_limited_sync_evaluate_run,
+         rate_limited_async_evaluate_run,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+         tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
+     )
+     eval_runs, _execution_details = executor.run(evaluation_input)
+     eval_summary = EvaluationSummary.from_eval_runs(
+         EvaluationParameters(
+             eval_names=frozenset(evaluators_by_name),
+             exp_params=ran_experiment.params,
+         ),
+         *eval_runs,
+     )
+     ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
+     if print_summary:
+         print(ran_experiment)
+     return ran_experiment
+
+
+ def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
+     evaluators_by_name: Dict[EvaluatorName, Evaluator] = {}
+     if obj is None:
+         return evaluators_by_name
+     if isinstance(mapping := obj, Mapping):
+         for name, value in mapping.items():
+             evaluator = (
+                 create_evaluator(name=name)(value) if not isinstance(value, Evaluator) else value
+             )
+             name = evaluator.name
+             if name in evaluators_by_name:
+                 raise ValueError(f"Two evaluators have the same name: {name}")
+             evaluators_by_name[name] = evaluator
+     elif isinstance(seq := obj, Sequence):
+         for value in seq:
+             evaluator = create_evaluator()(value) if not isinstance(value, Evaluator) else value
+             name = evaluator.name
+             if name in evaluators_by_name:
+                 raise ValueError(f"Two evaluators have the same name: {name}")
+             evaluators_by_name[name] = evaluator
+     else:
+         assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
+         evaluator = create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
+         name = evaluator.name
+         if name in evaluators_by_name:
+             raise ValueError(f"Two evaluators have the same name: {name}")
+         evaluators_by_name[name] = evaluator
+     return evaluators_by_name
+
+
+ def _get_tracer(project_name: Optional[str] = None) -> Tuple[Tracer, Resource]:
+     resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+     tracer_provider = trace_sdk.TracerProvider(resource=resource)
+     span_processor = (
+         SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
+         if project_name
+         else _NoOpProcessor()
+     )
+     tracer_provider.add_span_processor(span_processor)
+     return tracer_provider.get_tracer(__name__), resource
+
+
+ def _str_trace_id(id_: int) -> str:
+     return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+ def _decode_unix_nano(time_unix_nano: int) -> datetime:
+     return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+ def _is_dry_run(obj: Any) -> bool:
+     return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+
+ def _validate_task_signature(sig: inspect.Signature) -> None:
+     # Check that the function signature has a valid signature for use as a task
+     # If it does not, raise an error to exit early before running an experiment
+     params = sig.parameters
+     valid_named_params = {"input", "expected", "reference", "metadata", "example"}
+     if len(params) == 0:
+         raise ValueError("Task function must have at least one parameter.")
+     if len(params) > 1:
+         for not_found in set(params) - valid_named_params:
+             param = params[not_found]
+             if (
+                 param.kind is inspect.Parameter.VAR_KEYWORD
+                 or param.default is not inspect.Parameter.empty
+             ):
+                 continue
+             raise ValueError(
+                 (
+                     f"Invalid parameter names in task function: {', '.join(not_found)}. "
+                     "Parameters names for multi-argument functions must be "
+                     f"any of: {', '.join(valid_named_params)}."
+                 )
+             )
+
+
+ def _bind_task_signature(sig: inspect.Signature, example: Example) -> inspect.BoundArguments:
+     parameter_mapping = {
+         "input": example.input,
+         "expected": example.output,
+         "reference": example.output,  # Alias for "expected"
+         "metadata": example.metadata,
+         "example": example,
+     }
+     params = sig.parameters
+     if len(params) == 1:
+         parameter_name = next(iter(params))
+         if parameter_name in parameter_mapping:
+             return sig.bind(parameter_mapping[parameter_name])
+         else:
+             return sig.bind(parameter_mapping["input"])
+     return sig.bind_partial(
+         **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+     )
+
+
+ def _print_experiment_error(
+     error: BaseException,
+     /,
+     *,
+     example_id: str,
+     repetition_number: int,
+     kind: Literal["evaluator", "task"],
+ ) -> None:
+     """
+     Prints an experiment error.
+     """
+     display_error = RuntimeError(
+         f"{kind} failed for example id {repr(example_id)}, " f"repetition {repr(repetition_number)}"
+     )
+     display_error.__cause__ = error
+     formatted_exception = "".join(traceback.format_exception(display_error))  # type: ignore[arg-type, call-arg, unused-ignore]
+     print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
+
+
+ class _NoOpProcessor(trace_sdk.SpanProcessor):
+     def force_flush(self, *_: Any) -> bool:
+         return True
+
+
+ INPUT_VALUE = SpanAttributes.INPUT_VALUE
+ OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+ INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+ OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+ OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+
+ CHAIN = OpenInferenceSpanKindValues.CHAIN.value
+ EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
+ JSON = OpenInferenceMimeTypeValues.JSON
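
Usage note (not part of the diff): the `run_experiment` docstring above describes how task and evaluator parameters are bound by name. The sketch below is a rough illustration of that API under two assumptions not shown in this hunk: that the new `phoenix.experiments` package re-exports `run_experiment` (its `__init__.py` gains six lines in this release), and that the session client's `get_dataset` helper added in `phoenix/session/client.py` is available against a running Phoenix server.

    import phoenix as px
    from phoenix.experiments import run_experiment  # assumed re-export of phoenix.experiments.functions.run_experiment


    def answer_question(input, expected):
        # Task: parameters named `input` and `expected` are bound to the example's
        # input and reference output by _bind_task_signature.
        return {"answer": f"echo: {input}"}


    def non_empty(output):
        # Evaluator: a single-argument function is bound to the task output and
        # returns a boolean score, per the run_experiment docstring.
        return bool(output and output.get("answer"))


    dataset = px.Client().get_dataset(name="my-dataset")  # assumed client helper added in this release
    run_experiment(
        dataset,
        task=answer_question,
        evaluators=[non_empty],
        experiment_name="echo-baseline",
        dry_run=3,  # sample 3 examples; nothing is recorded in Phoenix
    )

Plain functions passed in `evaluators` are wrapped by `_evaluators_by_name` via `create_evaluator()`, so no decorator is required for simple boolean or numeric scorers.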