arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (52)
  1. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +12 -6
  2. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +47 -42
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/dataset.py +19 -16
  6. phoenix/db/insertion/evaluation.py +4 -4
  7. phoenix/db/insertion/helpers.py +4 -12
  8. phoenix/db/insertion/span.py +3 -3
  9. phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
  10. phoenix/db/models.py +8 -3
  11. phoenix/experiments/__init__.py +6 -0
  12. phoenix/experiments/evaluators/__init__.py +29 -0
  13. phoenix/experiments/evaluators/base.py +153 -0
  14. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +25 -53
  15. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +62 -31
  16. phoenix/experiments/evaluators/utils.py +189 -0
  17. phoenix/experiments/functions.py +616 -0
  18. phoenix/{datasets → experiments}/tracing.py +19 -0
  19. phoenix/experiments/types.py +722 -0
  20. phoenix/experiments/utils.py +9 -0
  21. phoenix/server/api/context.py +4 -0
  22. phoenix/server/api/dataloaders/__init__.py +4 -0
  23. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  24. phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  25. phoenix/server/api/helpers/dataset_helpers.py +8 -7
  26. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  27. phoenix/server/api/mutations/project_mutations.py +9 -4
  28. phoenix/server/api/routers/v1/__init__.py +1 -1
  29. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  30. phoenix/server/api/routers/v1/datasets.py +152 -48
  31. phoenix/server/api/routers/v1/evaluations.py +4 -11
  32. phoenix/server/api/routers/v1/experiment_evaluations.py +23 -23
  33. phoenix/server/api/routers/v1/experiment_runs.py +5 -17
  34. phoenix/server/api/routers/v1/experiments.py +5 -5
  35. phoenix/server/api/routers/v1/spans.py +6 -4
  36. phoenix/server/api/types/Experiment.py +12 -0
  37. phoenix/server/api/types/ExperimentRun.py +1 -1
  38. phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
  39. phoenix/server/app.py +4 -0
  40. phoenix/server/static/index.js +712 -588
  41. phoenix/session/client.py +321 -28
  42. phoenix/trace/fixtures.py +6 -6
  43. phoenix/utilities/json.py +8 -8
  44. phoenix/version.py +1 -1
  45. phoenix/datasets/__init__.py +0 -0
  46. phoenix/datasets/evaluators/__init__.py +0 -18
  47. phoenix/datasets/evaluators/_utils.py +0 -13
  48. phoenix/datasets/experiments.py +0 -485
  49. phoenix/datasets/types.py +0 -212
  50. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  51. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  52. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0

phoenix/experiments/functions.py
@@ -0,0 +1,616 @@
+import functools
+import json
+from binascii import hexlify
+from contextlib import ExitStack
+from copy import deepcopy
+from dataclasses import replace
+from datetime import datetime, timezone
+from itertools import product
+from typing import (
+    Any,
+    Awaitable,
+    Dict,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
+from urllib.parse import urljoin
+
+import httpx
+import opentelemetry.sdk.trace as trace_sdk
+import pandas as pd
+from openinference.semconv.resource import ResourceAttributes
+from openinference.semconv.trace import (
+    OpenInferenceMimeTypeValues,
+    OpenInferenceSpanKindValues,
+    SpanAttributes,
+)
+from opentelemetry.context import Context
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import Span
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.trace import Status, StatusCode, Tracer
+from typing_extensions import TypeAlias
+
+from phoenix.config import get_base_url, get_env_client_headers
+from phoenix.evals.executors import get_executor_on_sync_context
+from phoenix.evals.models.rate_limiters import RateLimiter
+from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+from phoenix.experiments.evaluators import create_evaluator
+from phoenix.experiments.evaluators.base import (
+    Evaluator,
+    ExperimentEvaluator,
+)
+from phoenix.experiments.tracing import capture_spans
+from phoenix.experiments.types import (
+    DRY_RUN,
+    Dataset,
+    EvaluationParameters,
+    EvaluationResult,
+    EvaluationSummary,
+    EvaluatorName,
+    Example,
+    Experiment,
+    ExperimentEvaluationRun,
+    ExperimentParameters,
+    ExperimentResult,
+    ExperimentRun,
+    ExperimentTask,
+    RanExperiment,
+    TaskSummary,
+    TestCase,
+    _asdict,
+    _replace,
+)
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
+from phoenix.trace.attributes import flatten
+from phoenix.utilities.json import jsonify
+
+
+def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
+    headers = get_env_client_headers()
+    return httpx.Client(
+        base_url=get_base_url(),
+        headers=headers,
+    ), httpx.AsyncClient(
+        base_url=get_base_url(),
+        headers=headers,
+    )
+
+
+Evaluators: TypeAlias = Union[
+    ExperimentEvaluator,
+    Sequence[ExperimentEvaluator],
+    Mapping[EvaluatorName, ExperimentEvaluator],
+]
+
+
+RateLimitErrors: TypeAlias = Union[Type[BaseException], Sequence[Type[BaseException]]]
+
+
+def run_experiment(
+    dataset: Dataset,
+    task: ExperimentTask,
+    evaluators: Optional[Evaluators] = None,
+    *,
+    experiment_name: Optional[str] = None,
+    experiment_description: Optional[str] = None,
+    experiment_metadata: Optional[Mapping[str, Any]] = None,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+) -> RanExperiment:
+    if not dataset.examples:
+        raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
+    # Add this to the params once supported in the UI
+    repetitions = 1
+    assert repetitions > 0, "Must run the experiment at least once."
+    evaluators_by_name = _evaluators_by_name(evaluators)
+
+    sync_client, async_client = _phoenix_clients()
+
+    payload = {
+        "version_id": dataset.version_id,
+        "name": experiment_name,
+        "description": experiment_description,
+        "metadata": experiment_metadata,
+        "repetitions": repetitions,
+    }
+    if not dry_run:
+        experiment_response = sync_client.post(
+            f"/v1/datasets/{dataset.id}/experiments",
+            json=payload,
+        )
+        experiment_response.raise_for_status()
+        exp_json = experiment_response.json()["data"]
+        project_name = exp_json["project_name"]
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=exp_json["id"],
+            project_name=project_name,
+        )
+    else:
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=DRY_RUN,
+            project_name="",
+        )
+
+    tracer, resource = _get_tracer(experiment.project_name)
+    root_span_name = f"Task: {_get_task_name(task)}"
+    root_span_kind = CHAIN
+
+    print("🧪 Experiment started.")
+    if dry_run:
+        examples = {
+            (ex := dataset[i]).id: ex
+            for i in pd.Series(range(len(dataset)))
+            .sample(min(len(dataset), int(dry_run)), random_state=42)
+            .sort_values()
+        }
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+        dataset = replace(dataset, examples=examples)
+    else:
+        dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+        experiment_compare_url = get_experiment_url(
+            dataset_id=dataset.id,
+            experiment_id=experiment.id,
+        )
+        print(f"📺 View dataset experiments: {dataset_experiments_url}")
+        print(f"🔗 View this experiment: {experiment_compare_url}")
+
+    def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
+        example, repetition_number = test_case.example, test_case.repetition_number
+        output = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                # Do not use keyword arguments, which can fail at runtime
+                # even when function obeys protocol, because keyword arguments
+                # are implementation details.
+                _output = task(example)
+                if isinstance(_output, Awaitable):
+                    raise RuntimeError("Task is async but running in sync context")
+                else:
+                    output = _output
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+            output = jsonify(output)
+            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if result := ExperimentResult(result=output) if output is not None else None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+        exp_run = ExperimentRun(
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            experiment_id=experiment.id,
+            dataset_example_id=example.id,
+            repetition_number=repetition_number,
+            output=result,
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
+        example, repetition_number = test_case.example, test_case.repetition_number
+        output = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                # Do not use keyword arguments, which can fail at runtime
+                # even when function obeys protocol, because keyword arguments
+                # are implementation details.
+                _output = task(example)
+                if isinstance(_output, Awaitable):
+                    output = await _output
+                else:
+                    output = _output
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+            output = jsonify(output)
+            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if result := ExperimentResult(result=output) if output is not None else None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+        exp_run = ExperimentRun(
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            experiment_id=experiment.id,
+            dataset_example_id=example.id,
+            repetition_number=repetition_number,
+            output=result,
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = await async_client.post(
+                f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run)
+            )
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not hasattr(rate_limit_errors, "__iter__"):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+    rate_limited_sync_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
+    )
+    rate_limited_async_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
+    )
+
+    executor = get_executor_on_sync_context(
+        rate_limited_sync_run_experiment,
+        rate_limited_async_run_experiment,
+        max_retries=0,
+        exit_on_error=False,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+    )
+
+    test_cases = [
+        TestCase(example=deepcopy(ex), repetition_number=rep)
+        for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
+    ]
+    task_runs, _execution_details = executor.run(test_cases)
+    print("✅ Task runs completed.")
+    params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+    task_summary = TaskSummary.from_task_runs(params, task_runs)
+    ran_experiment: RanExperiment = object.__new__(RanExperiment)
+    ran_experiment.__init__(  # type: ignore[misc]
+        params=params,
+        dataset=dataset,
+        runs={r.id: r for r in task_runs},
+        task_summary=task_summary,
+        **_asdict(experiment),
+    )
+    if evaluators_by_name:
+        return evaluate_experiment(
+            ran_experiment,
+            evaluators=evaluators_by_name,
+            dry_run=dry_run,
+            print_summary=print_summary,
+            rate_limit_errors=rate_limit_errors,
+        )
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment
+
+
+def evaluate_experiment(
+    experiment: Experiment,
+    evaluators: Evaluators,
+    *,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+) -> RanExperiment:
+    if not dry_run and _is_dry_run(experiment):
+        dry_run = True
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    if not evaluators_by_name:
+        raise ValueError("Must specify at least one Evaluator")
+    sync_client, async_client = _phoenix_clients()
+    dataset_id = experiment.dataset_id
+    dataset_version_id = experiment.dataset_version_id
+    if isinstance(experiment, RanExperiment):
+        ran_experiment: RanExperiment = experiment
+    else:
+        dataset = Dataset.from_dict(
+            sync_client.get(
+                f"/v1/datasets/{dataset_id}/examples",
+                params={"version_id": str(dataset_version_id)},
+            ).json()["data"]
+        )
+        if not dataset.examples:
+            raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+        experiment_runs = tuple(
+            ExperimentRun.from_dict(exp_run)
+            for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+        )
+        if not experiment_runs:
+            raise ValueError("Experiment has not been run")
+        params = ExperimentParameters(n_examples=len(dataset.examples))
+        task_summary = TaskSummary.from_task_runs(params, experiment_runs)
+        ran_experiment = object.__new__(RanExperiment)
+        ran_experiment.__init__(  # type: ignore[misc]
+            dataset=dataset,
+            params=params,
+            runs=experiment_runs,
+            task_summary=task_summary,
+            **_asdict(experiment),
+        )
+    print("🧠 Evaluation started.")
+    examples = ran_experiment.dataset.examples
+    if dry_run:
+        if not _is_dry_run(ran_experiment):
+            dataset = ran_experiment.dataset
+            examples = {
+                (ex := dataset[i]).id: ex
+                for i in pd.Series(range(len(dataset)))
+                .sample(min(len(dataset), int(dry_run)), random_state=42)
+                .sort_values()
+            }
+            dataset = replace(ran_experiment.dataset, examples=examples)
+            ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+    # not all dataset examples have associated experiment runs, so we need to pair them up
+    example_run_pairs = []
+    examples = ran_experiment.dataset.examples
+    for exp_run in ran_experiment.runs.values():
+        example = examples.get(exp_run.dataset_example_id)
+        if example:
+            example_run_pairs.append((deepcopy(example), exp_run))
+    evaluation_input = [
+        (example, run, evaluator)
+        for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
+    ]
+
+    tracer, resource = _get_tracer(None if dry_run else "evaluators")
+    root_span_kind = EVALUATOR
+
+    def sync_evaluate_run(
+        obj: Tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: Optional[EvaluationResult] = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                result = evaluator.evaluate(
+                    output=experiment_run.task_output,
+                    expected=example.output,
+                    input=example.input,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+            if result:
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run
+
+    async def async_evaluate_run(
+        obj: Tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: Optional[EvaluationResult] = None
+        error: Optional[BaseException] = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(root_span_name, context=Context())
+            )
+            stack.enter_context(capture_spans(resource))
+            try:
+                result = await evaluator.async_evaluate(
+                    output=experiment_run.task_output,
+                    expected=example.output,
+                    input=example.input,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                span.record_exception(exc)
+                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                error = exc
+            if result:
+                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=_decode_unix_nano(cast(int, span.end_time)),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+        )
+        if not dry_run:
+            resp = await async_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not hasattr(rate_limit_errors, "__iter__"):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+    rate_limited_sync_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
+    )
+    rate_limited_async_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
+    )
+
+    executor = get_executor_on_sync_context(
+        rate_limited_sync_evaluate_run,
+        rate_limited_async_evaluate_run,
+        max_retries=0,
+        exit_on_error=False,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
+    )
+    eval_runs, _execution_details = executor.run(evaluation_input)
+    eval_summary = EvaluationSummary.from_eval_runs(
+        EvaluationParameters(
+            eval_names=frozenset(evaluators_by_name),
+            exp_params=ran_experiment.params,
+        ),
+        *eval_runs,
+    )
+    ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment
+
+
+def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
+    evaluators_by_name: Dict[EvaluatorName, Evaluator] = {}
+    if obj is None:
+        return evaluators_by_name
+    if isinstance(mapping := obj, Mapping):
+        for name, value in mapping.items():
+            evaluator = (
+                create_evaluator(name=name)(value) if not isinstance(value, Evaluator) else value
+            )
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    elif isinstance(seq := obj, Sequence):
+        for value in seq:
+            evaluator = create_evaluator()(value) if not isinstance(value, Evaluator) else value
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    else:
+        assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
+        evaluator = create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
+        name = evaluator.name
+        if name in evaluators_by_name:
+            raise ValueError(f"Two evaluators have the same name: {name}")
+        evaluators_by_name[name] = evaluator
+    return evaluators_by_name
+
+
+def _get_tracer(project_name: Optional[str] = None) -> Tuple[Tracer, Resource]:
+    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+    tracer_provider = trace_sdk.TracerProvider(resource=resource)
+    span_processor = (
+        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
+        if project_name
+        else _NoOpProcessor()
+    )
+    tracer_provider.add_span_processor(span_processor)
+    return tracer_provider.get_tracer(__name__), resource
+
+
+def _str_trace_id(id_: int) -> str:
+    return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+def _decode_unix_nano(time_unix_nano: int) -> datetime:
+    return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+def _get_task_name(task: ExperimentTask) -> str:
+    """
+    Makes a best-effort attempt to get the name of the task.
+    """
+
+    if isinstance(task, functools.partial):
+        return task.func.__qualname__
+    if hasattr(task, "__qualname__"):
+        return task.__qualname__
+    return str(task)
+
+
+def _is_dry_run(obj: Any) -> bool:
+    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+
+class _NoOpProcessor(trace_sdk.SpanProcessor):
+    def force_flush(self, *_: Any) -> bool:
+        return True
+
+
+INPUT_VALUE = SpanAttributes.INPUT_VALUE
+OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+
+CHAIN = OpenInferenceSpanKindValues.CHAIN.value
+EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
+JSON = OpenInferenceMimeTypeValues.JSON
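
The new phoenix.experiments.functions module above replaces the removed phoenix.datasets.experiments. Below is a minimal usage sketch, not a definitive example: it assumes a reachable Phoenix server, assumes the Phoenix client exposes a get_dataset helper returning a Dataset (suggested by the phoenix/session/client.py changes listed above but not shown in this diff), and uses hypothetical answer_question and exact_match functions; how plain callables are wrapped into evaluators lives in phoenix/experiments/evaluators/utils.py, which is also not shown here.

# Sketch: driving the new experiments API. Client.get_dataset and the
# task/evaluator names below are assumptions for illustration only.
import phoenix as px
from phoenix.experiments.functions import evaluate_experiment, run_experiment
from phoenix.experiments.types import Example

client = px.Client()  # base URL and headers come from the environment (see phoenix/config.py)
dataset = client.get_dataset(name="my-dataset")  # assumed client helper returning a Dataset


def answer_question(example: Example) -> str:
    # Hypothetical task: call the application under test with the example's
    # input and return a JSON-serializable output.
    return f"echo: {example.input}"


def exact_match(output: str, expected: dict) -> float:
    # Hypothetical evaluator: plain callables are wrapped via create_evaluator
    # inside _evaluators_by_name; binding by parameter name is assumed here.
    return float(output == expected.get("answer"))


ran = run_experiment(
    dataset,
    task=answer_question,
    evaluators=[exact_match],
    experiment_name="demo",
    dry_run=3,  # sample up to 3 examples and skip all POSTs to the server
)
# Evaluations can also be attached to an already-ran experiment afterwards:
ran = evaluate_experiment(ran, evaluators={"exact-match": exact_match}, dry_run=True)

With dry_run set, the sketch exercises the task and evaluators locally without creating anything on the server, mirroring the dry-run branches in run_experiment and evaluate_experiment above.
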
phoenix/experiments/tracing.py
@@ -12,12 +12,22 @@ from wrapt import apply_patch, resolve_path, wrap_function_wrapper
 
 
 class SpanModifier:
+    """
+    A class that modifies spans with the specified resource attributes.
+    """
+
     __slots__ = ("_resource",)
 
     def __init__(self, resource: Resource) -> None:
         self._resource = resource
 
     def modify_resource(self, span: ReadableSpan) -> None:
+        """
+        Takes a span and merges in the resource attributes specified in the constructor.
+
+        Args:
+            span: ReadableSpan: the span to modify
+        """
         if (ctx := span._context) is None or ctx.span_id == INVALID_TRACE_ID:
             return
         span._resource = span._resource.merge(self._resource)
@@ -59,6 +69,15 @@ def _monkey_patch_span_init() -> Iterator[None]:
 
 @contextmanager
 def capture_spans(resource: Resource) -> Iterator[SpanModifier]:
+    """
+    A context manager that captures spans and modifies them with the specified resources.
+
+    Args:
+        resource: Resource: The resource to merge into the spans created within the context.
+
+    Returns:
+        modifier: Iterator[SpanModifier]: The span modifier that is active within the context.
+    """
    modifier = SpanModifier(resource)
    with _monkey_patch_span_init():
        token = _ACTIVE_MODIFIER.set(modifier)
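
For context on the docstrings added above: capture_spans is what the new functions.py module enters around each task and evaluator call so that spans emitted inside the block carry the experiment's project resource. A minimal sketch mirroring sync_run_experiment, with an illustrative project name and span name:

# Sketch: pairing capture_spans with a tracer, as the experiment runner does.
# The project name and span name are illustrative, not taken from this diff.
from contextlib import ExitStack

import opentelemetry.sdk.trace as trace_sdk
from openinference.semconv.resource import ResourceAttributes
from opentelemetry.sdk.resources import Resource

from phoenix.experiments.tracing import capture_spans

resource = Resource({ResourceAttributes.PROJECT_NAME: "my-experiment-project"})
tracer = trace_sdk.TracerProvider(resource=resource).get_tracer(__name__)

with ExitStack() as stack:
    stack.enter_context(tracer.start_as_current_span("Task: my_task"))
    stack.enter_context(capture_spans(resource))
    # Spans created here (for example by auto-instrumented libraries inside the
    # task) pass through SpanModifier.modify_resource, which merges `resource`
    # into each span's own resource attributes.
    ...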