arize 8.0.0a14__py3-none-any.whl → 8.0.0a16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. arize/__init__.py +70 -1
  2. arize/_flight/client.py +163 -43
  3. arize/_flight/types.py +1 -0
  4. arize/_generated/api_client/__init__.py +5 -1
  5. arize/_generated/api_client/api/datasets_api.py +6 -6
  6. arize/_generated/api_client/api/experiments_api.py +924 -61
  7. arize/_generated/api_client/api_client.py +1 -1
  8. arize/_generated/api_client/configuration.py +1 -1
  9. arize/_generated/api_client/exceptions.py +1 -1
  10. arize/_generated/api_client/models/__init__.py +3 -1
  11. arize/_generated/api_client/models/dataset.py +2 -2
  12. arize/_generated/api_client/models/dataset_version.py +1 -1
  13. arize/_generated/api_client/models/datasets_create_request.py +3 -3
  14. arize/_generated/api_client/models/datasets_list200_response.py +1 -1
  15. arize/_generated/api_client/models/datasets_list_examples200_response.py +1 -1
  16. arize/_generated/api_client/models/error.py +1 -1
  17. arize/_generated/api_client/models/experiment.py +6 -6
  18. arize/_generated/api_client/models/experiments_create_request.py +98 -0
  19. arize/_generated/api_client/models/experiments_list200_response.py +1 -1
  20. arize/_generated/api_client/models/experiments_runs_list200_response.py +92 -0
  21. arize/_generated/api_client/rest.py +1 -1
  22. arize/_generated/api_client/test/test_dataset.py +2 -1
  23. arize/_generated/api_client/test/test_dataset_version.py +1 -1
  24. arize/_generated/api_client/test/test_datasets_api.py +1 -1
  25. arize/_generated/api_client/test/test_datasets_create_request.py +2 -1
  26. arize/_generated/api_client/test/test_datasets_list200_response.py +1 -1
  27. arize/_generated/api_client/test/test_datasets_list_examples200_response.py +1 -1
  28. arize/_generated/api_client/test/test_error.py +1 -1
  29. arize/_generated/api_client/test/test_experiment.py +6 -1
  30. arize/_generated/api_client/test/test_experiments_api.py +23 -2
  31. arize/_generated/api_client/test/test_experiments_create_request.py +61 -0
  32. arize/_generated/api_client/test/test_experiments_list200_response.py +1 -1
  33. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +56 -0
  34. arize/_generated/api_client_README.md +13 -8
  35. arize/client.py +19 -2
  36. arize/config.py +50 -3
  37. arize/constants/config.py +8 -2
  38. arize/constants/openinference.py +14 -0
  39. arize/constants/pyarrow.py +1 -0
  40. arize/datasets/__init__.py +0 -70
  41. arize/datasets/client.py +106 -19
  42. arize/datasets/errors.py +61 -0
  43. arize/datasets/validation.py +46 -0
  44. arize/experiments/client.py +455 -0
  45. arize/experiments/evaluators/__init__.py +0 -0
  46. arize/experiments/evaluators/base.py +255 -0
  47. arize/experiments/evaluators/exceptions.py +10 -0
  48. arize/experiments/evaluators/executors.py +502 -0
  49. arize/experiments/evaluators/rate_limiters.py +277 -0
  50. arize/experiments/evaluators/types.py +122 -0
  51. arize/experiments/evaluators/utils.py +198 -0
  52. arize/experiments/functions.py +920 -0
  53. arize/experiments/tracing.py +276 -0
  54. arize/experiments/types.py +394 -0
  55. arize/models/client.py +4 -1
  56. arize/spans/client.py +16 -20
  57. arize/utils/arrow.py +4 -3
  58. arize/utils/openinference_conversion.py +56 -0
  59. arize/utils/proto.py +13 -0
  60. arize/utils/size.py +22 -0
  61. arize/version.py +1 -1
  62. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/METADATA +3 -1
  63. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/RECORD +65 -44
  64. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/WHEEL +0 -0
  65. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/licenses/LICENSE.md +0 -0
arize/experiments/functions.py (new file)
@@ -0,0 +1,920 @@
1
+ import dataclasses
2
+ import functools
3
+ import inspect
4
+ import json
5
+ import logging
6
+ import traceback
7
+ from binascii import hexlify
8
+ from contextlib import ExitStack
9
+ from copy import deepcopy
10
+ from datetime import date, datetime, time, timedelta, timezone
11
+ from enum import Enum
12
+ from itertools import product
13
+ from pathlib import Path
14
+ from typing import (
15
+ Any,
16
+ Awaitable,
17
+ Callable,
18
+ Dict,
19
+ List,
20
+ Literal,
21
+ Mapping,
22
+ Sequence,
23
+ Tuple,
24
+ Type,
25
+ Union,
26
+ cast,
27
+ get_args,
28
+ get_origin,
29
+ )
30
+
31
+ import numpy as np
32
+ import pandas as pd
33
+ from openinference.semconv.trace import (
34
+ OpenInferenceMimeTypeValues,
35
+ OpenInferenceSpanKindValues,
36
+ SpanAttributes,
37
+ )
38
+ from opentelemetry.context import Context
39
+ from opentelemetry.sdk.resources import Resource
40
+ from opentelemetry.sdk.trace import Span
41
+ from opentelemetry.trace import Status, StatusCode, Tracer
42
+ from typing_extensions import TypeAlias
43
+
44
+ from arize.experiments.evaluators.base import Evaluator, Evaluators
45
+ from arize.experiments.evaluators.executors import (
46
+ get_executor_on_sync_context,
47
+ )
48
+ from arize.experiments.evaluators.rate_limiters import RateLimiter
49
+ from arize.experiments.evaluators.types import (
50
+ EvaluationResult,
51
+ EvaluationResultFieldNames,
52
+ EvaluatorName,
53
+ )
54
+ from arize.experiments.evaluators.utils import create_evaluator
55
+ from arize.experiments.tracing import capture_spans, flatten
56
+ from arize.experiments.types import (
57
+ Example,
58
+ ExperimentEvaluationRun,
59
+ ExperimentRun,
60
+ ExperimentTask,
61
+ ExperimentTaskResultFieldNames,
62
+ _TaskSummary,
63
+ )
64
+
65
+ RateLimitErrors: TypeAlias = Union[
66
+ Type[BaseException], Sequence[Type[BaseException]]
67
+ ]
68
+
69
+ logger = logging.getLogger(__name__)
70
+
71
+
72
+ def run_experiment(
73
+ experiment_name: str,
74
+ experiment_id: str,
75
+ dataset: pd.DataFrame,
76
+ task: ExperimentTask,
77
+ tracer: Tracer,
78
+ resource: Resource,
79
+ rate_limit_errors: RateLimitErrors | None = None,
80
+ evaluators: Evaluators | None = None,
81
+ concurrency: int = 3,
82
+ exit_on_error: bool = False,
83
+ ) -> pd.DataFrame:
84
+ """
85
+ Run an experiment on a dataset.
86
+ Args:
87
+ experiment_name (str): The name for the experiment.
88
+ experiment_id (str): The ID for the experiment.
89
+ dataset (pd.DataFrame): The dataset to run the experiment on.
90
+ task (ExperimentTask): The task to be executed on the dataset.
91
+ tracer (Tracer): Tracer for tracing the experiment.
92
+ resource (Resource): The resource for tracing the experiment.
93
+ rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
94
+ evaluators (Optional[Evaluators]): Optional evaluators to assess the task.
95
+ concurrency (int): The number of concurrent tasks to run. Default is 3.
96
+ exit_on_error (bool): Whether to exit on error. Default is False.
97
+ Returns:
98
+ pd.DataFrame: The results of the experiment.
99
+ """
100
+ task_signature = inspect.signature(task)
101
+ _validate_task_signature(task_signature)
102
+
103
+ examples = _dataframe_to_examples(dataset)
104
+ if not examples:
105
+ raise ValueError("No examples found in the dataset.")
106
+
107
+ evaluators_by_name = _evaluators_by_name(evaluators)
108
+ root_span_name = f"Task: {get_func_name(task)}"
109
+ root_span_kind = CHAIN
110
+
111
+ logger.info("🧪 Experiment started.")
112
+
113
+ md = {"experiment_id": experiment_id}
114
+
115
+ def sync_run_experiment(example: Example) -> ExperimentRun:
116
+ output = None
117
+ error: BaseException | None = None
118
+ status = Status(StatusCode.OK)
119
+ with ExitStack() as stack:
120
+ span: Span = stack.enter_context(
121
+ cm=tracer.start_as_current_span(
122
+ name=root_span_name, context=Context()
123
+ )
124
+ ) # type:ignore
125
+ stack.enter_context(capture_spans(resource))
126
+ span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
127
+ try:
128
+ bound_task_args = _bind_task_signature(task_signature, example)
129
+ _output = task(*bound_task_args.args, **bound_task_args.kwargs)
130
+ if isinstance(_output, Awaitable):
131
+ sync_error_message = (
132
+ "Task is async and cannot be run within an existing event loop. "
133
+ "Consider the following options:\n\n"
134
+ "1. Pass in a synchronous task callable.\n"
135
+ "2. Use `nest_asyncio.apply()` to allow nesting event loops."
136
+ )
137
+ raise RuntimeError(sync_error_message)
138
+ else:
139
+ output = _output
140
+ except BaseException as exc:
141
+ if exit_on_error:
142
+ raise exc
143
+ span.record_exception(exc)
144
+ status = Status(
145
+ StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
146
+ )
147
+ error = exc
148
+ _print_experiment_error(exc, example_id=example.id, kind="task")
149
+
150
+ output = jsonify(output)
151
+ if example.input:
152
+ span.set_attribute(INPUT_VALUE, example.input) # type: ignore
153
+ else:
154
+ span.set_attribute(
155
+ INPUT_VALUE,
156
+ json.dumps(
157
+ obj=jsonify(example.dataset_row), ensure_ascii=False
158
+ ),
159
+ )
160
+ span.set_attribute(INPUT_MIME_TYPE, JSON.value)
161
+ if output is not None:
162
+ if isinstance(output, str):
163
+ span.set_attribute(OUTPUT_VALUE, output)
164
+ else:
165
+ span.set_attribute(
166
+ OUTPUT_VALUE, json.dumps(output, ensure_ascii=False)
167
+ )
168
+ span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
169
+ span.set_attribute(
170
+ SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind
171
+ )
172
+ span.set_status(status)
173
+
174
+ assert isinstance(
175
+ output, (dict, list, str, int, float, bool, type(None))
176
+ ), "Output must be JSON serializable"
177
+
178
+ exp_run = ExperimentRun(
179
+ experiment_id=experiment_name,
180
+ repetition_number=1,
181
+ start_time=_decode_unix_nano(cast(int, span.start_time)),
182
+ end_time=(
183
+ _decode_unix_nano(cast(int, span.end_time))
184
+ if span.end_time
185
+ else datetime.now()
186
+ ),
187
+ dataset_example_id=example.id,
188
+ output=output, # type:ignore
189
+ error=repr(error) if error else None,
190
+ trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore
191
+ )
192
+ return exp_run
193
+
194
+ async def async_run_experiment(example: Example) -> ExperimentRun:
195
+ output = None
196
+ error: BaseException | None = None
197
+ status = Status(StatusCode.OK)
198
+ with ExitStack() as stack:
199
+ span: Span = stack.enter_context(
200
+ cm=tracer.start_as_current_span(
201
+ name=root_span_name, context=Context()
202
+ )
203
+ ) # type:ignore
204
+ stack.enter_context(capture_spans(resource))
205
+ span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
206
+ try:
207
+ bound_task_args = _bind_task_signature(task_signature, example)
208
+ _output = task(*bound_task_args.args, **bound_task_args.kwargs)
209
+ if isinstance(_output, Awaitable):
210
+ output = await _output
211
+ else:
212
+ output = _output
213
+ except BaseException as exc:
214
+ if exit_on_error:
215
+ raise exc
216
+ span.record_exception(exc)
217
+ status = Status(
218
+ StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
219
+ )
220
+ error = exc
221
+ _print_experiment_error(exc, example_id=example.id, kind="task")
222
+ output = jsonify(output)
223
+ if example.input:
224
+ span.set_attribute(INPUT_VALUE, example.input) # type: ignore
225
+ else:
226
+ span.set_attribute(
227
+ INPUT_VALUE,
228
+ json.dumps(
229
+ obj=jsonify(example.dataset_row), ensure_ascii=False
230
+ ),
231
+ )
232
+ span.set_attribute(INPUT_MIME_TYPE, JSON.value)
233
+ if output is not None:
234
+ if isinstance(output, str):
235
+ span.set_attribute(OUTPUT_VALUE, output)
236
+ else:
237
+ span.set_attribute(
238
+ OUTPUT_VALUE, json.dumps(output, ensure_ascii=False)
239
+ )
240
+ span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
241
+ span.set_attribute(
242
+ SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind
243
+ )
244
+ span.set_status(status)
245
+
246
+ assert isinstance(
247
+ output, (dict, list, str, int, float, bool, type(None))
248
+ ), "Output must be JSON serializable"
249
+
250
+ exp_run = ExperimentRun(
251
+ experiment_id=experiment_name,
252
+ repetition_number=1,
253
+ start_time=_decode_unix_nano(cast(int, span.start_time)),
254
+ end_time=(
255
+ _decode_unix_nano(cast(int, span.end_time))
256
+ if span.end_time
257
+ else datetime.now()
258
+ ),
259
+ dataset_example_id=example.id,
260
+ output=output, # type: ignore
261
+ error=repr(error) if error else None,
262
+ trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore
263
+ )
264
+ return exp_run
265
+
266
+ _errors: Tuple[Type[BaseException], ...]
267
+ if not isinstance(rate_limit_errors, Sequence):
268
+ _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
269
+ else:
270
+ _errors = tuple(filter(None, rate_limit_errors))
271
+ rate_limiters = [RateLimiter(rate_limit_error=rle) for rle in _errors]
272
+ rate_limited_sync_run_experiment = functools.reduce(
273
+ lambda fn, limiter: limiter.limit(fn),
274
+ rate_limiters,
275
+ sync_run_experiment,
276
+ )
277
+ rate_limited_async_run_experiment = functools.reduce(
278
+ lambda fn, limiter: limiter.alimit(fn),
279
+ rate_limiters,
280
+ async_run_experiment,
281
+ )
282
+
283
+ executor = get_executor_on_sync_context(
284
+ sync_fn=rate_limited_sync_run_experiment,
285
+ async_fn=rate_limited_async_run_experiment,
286
+ max_retries=0,
287
+ exit_on_error=exit_on_error,
288
+ fallback_return_value=None,
289
+ tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
290
+ concurrency=concurrency,
291
+ )
292
+
293
+ runs, _ = executor.run(examples)
294
+ task_summary = _TaskSummary.from_task_runs(len(dataset), runs)
295
+
296
+ if exit_on_error and (None in runs):
297
+ # When exit_on_error is True, the result of a failed task execution is None
298
+ # If any task execution failed, raise an error to exit early
299
+ raise RuntimeError("An error occurred during execution of tasks.")
300
+
301
+ out_df = pd.DataFrame()
302
+ out_df["id"] = [run.id for run in runs]
303
+ out_df["example_id"] = [run.dataset_example_id for run in runs]
304
+ out_df["result"] = [run.output for run in runs]
305
+ out_df["result.trace.id"] = [run.trace_id for run in runs]
306
+ out_df["result.trace.timestamp"] = [
307
+ int(run.start_time.timestamp() * 1e3) for run in runs
308
+ ]
309
+ out_df.set_index("id", inplace=True, drop=False)
310
+ logger.info(f"✅ Task runs completed.\n{task_summary}")
311
+
312
+ if evaluators_by_name:
313
+ eval_results = evaluate_experiment(
314
+ experiment_name=experiment_name,
315
+ examples=examples,
316
+ experiment_results=runs,
317
+ evaluators=evaluators,
318
+ rate_limit_errors=rate_limit_errors,
319
+ concurrency=concurrency,
320
+ tracer=tracer,
321
+ resource=resource,
322
+ exit_on_error=exit_on_error,
323
+ )
324
+
325
+ if exit_on_error and (None in eval_results):
326
+ raise RuntimeError(
327
+ "An error occurred during execution of evaluators."
328
+ )
329
+
330
+ # group evaluation results by name
331
+ eval_results_by_name = {}
332
+ for r in eval_results:
333
+ if r is None:
334
+ continue
335
+ if r.name not in eval_results_by_name:
336
+ eval_results_by_name[r.name] = []
337
+ eval_results_by_name[r.name].append(r)
338
+
339
+ for eval_name, eval_res in eval_results_by_name.items():
340
+ eval_data = {
341
+ "score": lambda x: get_result_attr(x, "score", None),
342
+ "label": lambda x: get_result_attr(x, "label", None),
343
+ "explanation": lambda x: get_result_attr(
344
+ x, "explanation", None
345
+ ),
346
+ "trace.id": lambda x: x.trace_id,
347
+ "trace.timestamp": lambda x: int(
348
+ x.start_time.timestamp() * 1e3
349
+ ),
350
+ }
351
+
352
+ for attr, getter in eval_data.items():
353
+ out_df[f"eval.{eval_name}.{attr}"] = out_df.index.map(
354
+ {r.experiment_run_id: getter(r) for r in eval_res}
355
+ )
356
+ out_df = _add_metadata_to_output_df(out_df, eval_res, eval_name)
357
+ logger.info("✅ All evaluators completed.")
358
+ out_df.reset_index(drop=True, inplace=True)
359
+ return out_df
360
+
361
+
362
+ def evaluate_experiment(
363
+ experiment_name: str,
364
+ examples: Sequence[Example],
365
+ experiment_results: Sequence[ExperimentRun],
366
+ *,
367
+ evaluators: Evaluators | None = None,
368
+ rate_limit_errors: RateLimitErrors | None = None,
369
+ concurrency: int = 3,
370
+ tracer: Tracer | None = None,
371
+ resource: Resource | None = None,
372
+ exit_on_error: bool = False,
373
+ ):
374
+ """
375
+ Evaluate the results of an experiment using the provided evaluators.
376
+ Args:
377
+ experiment_name (str): The name of the experiment.
378
+ examples (Sequence[Example]): The examples to evaluate.
379
+ experiment_results (Sequence[ExperimentRun]): The results of the experiment.
380
+ evaluators (Evaluators): The evaluators to use for assessment.
381
+ rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
382
+ concurrency (int): The number of concurrent tasks to run. Default is 3.
383
+ tracer (Optional[Tracer]): Optional tracer for tracing the evaluation.
384
+ resource (Optional[Resource]): Optional resource for the evaluation.
385
+ exit_on_error (bool): Whether to exit on error. Default is False.
386
+ Returns:
387
+ List[ExperimentEvaluationRun]: The evaluation results.
388
+ """
389
+ evaluators_by_name = _evaluators_by_name(evaluators)
390
+ if not evaluators_by_name:
391
+ raise ValueError("Must specify at least one Evaluator")
392
+ experiment_result_dict = {
393
+ run.dataset_example_id: run for run in experiment_results
394
+ }
395
+ paired_list = [
396
+ (example, experiment_result_dict[example.id])
397
+ for example in examples
398
+ if example.id in experiment_result_dict
399
+ ]
400
+
401
+ evaluation_input = [
402
+ (example, run, evaluator)
403
+ for (example, run), evaluator in product(
404
+ paired_list, evaluators_by_name.values()
405
+ )
406
+ ]
407
+
408
+ root_span_kind = EVALUATOR
409
+ md = {"experiment_name": experiment_name}
410
+
411
+ def sync_eval_run(
412
+ obj: Tuple[Example, ExperimentRun, Evaluator],
413
+ ) -> ExperimentEvaluationRun:
414
+ example, experiment_run, evaluator = obj
415
+ result: EvaluationResult | None = None
416
+ error: BaseException | None = None
417
+ status = Status(StatusCode.OK)
418
+ root_span_name = f"Evaluation: {evaluator.name}"
419
+ with ExitStack() as stack:
420
+ span: Span = stack.enter_context(
421
+ tracer.start_as_current_span( # type:ignore
422
+ name=root_span_name, context=Context()
423
+ )
424
+ )
425
+ stack.enter_context(capture_spans(resource)) # type:ignore
426
+ span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
427
+ try:
428
+ result = evaluator.evaluate(
429
+ dataset_row=example.dataset_row,
430
+ input=example.input,
431
+ output=deepcopy(experiment_run.output),
432
+ experiment_output=deepcopy(experiment_run.output),
433
+ dataset_output=example.output,
434
+ metadata=example.metadata,
435
+ )
436
+ except BaseException as exc:
437
+ if exit_on_error:
438
+ raise exc
439
+ span.record_exception(exc)
440
+ status = Status(
441
+ StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
442
+ )
443
+ error = exc
444
+ _print_experiment_error(
445
+ exc,
446
+ example_id=example.id,
447
+ kind="evaluator",
448
+ )
449
+ if result:
450
+ span.set_attributes(
451
+ dict(flatten(jsonify(result), recurse_on_sequence=True))
452
+ )
453
+ span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
454
+ span.set_status(status)
455
+
456
+ eval_run = ExperimentEvaluationRun(
457
+ experiment_run_id=experiment_run.id,
458
+ start_time=_decode_unix_nano(cast(int, span.start_time)),
459
+ end_time=(
460
+ _decode_unix_nano(cast(int, span.end_time))
461
+ if span.end_time
462
+ else datetime.now()
463
+ ),
464
+ name=evaluator.name,
465
+ annotator_kind=evaluator.kind,
466
+ error=repr(error) if error else None,
467
+ result=result,
468
+ trace_id=_str_trace_id(span.get_span_context().trace_id), # type:ignore
469
+ )
470
+ return eval_run
471
+
472
+ async def async_eval_run(
473
+ obj: Tuple[Example, ExperimentRun, Evaluator],
474
+ ) -> ExperimentEvaluationRun:
475
+ example, experiment_run, evaluator = obj
476
+ result: EvaluationResult | None = None
477
+ error: BaseException | None = None
478
+ status = Status(StatusCode.OK)
479
+ root_span_name = f"Evaluation: {evaluator.name}"
480
+ with ExitStack() as stack:
481
+ span: Span = stack.enter_context(
482
+ tracer.start_as_current_span( # type:ignore
483
+ name=root_span_name, context=Context()
484
+ )
485
+ )
486
+ stack.enter_context(capture_spans(resource)) # type:ignore
487
+ span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
488
+ try:
489
+ result = await evaluator.async_evaluate(
490
+ dataset_row=example.dataset_row,
491
+ input=example.input,
492
+ output=deepcopy(experiment_run.output),
493
+ experiment_output=deepcopy(experiment_run.output),
494
+ dataset_output=example.output,
495
+ metadata=example.metadata,
496
+ )
497
+ except BaseException as exc:
498
+ if exit_on_error:
499
+ raise exc
500
+ span.record_exception(exc)
501
+ status = Status(
502
+ StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
503
+ )
504
+ error = exc
505
+ _print_experiment_error(
506
+ exc,
507
+ example_id=example.id,
508
+ kind="evaluator",
509
+ )
510
+ if result:
511
+ span.set_attributes(
512
+ dict(flatten(jsonify(result), recurse_on_sequence=True))
513
+ )
514
+ span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
515
+ span.set_status(status)
516
+ eval_run = ExperimentEvaluationRun(
517
+ experiment_run_id=experiment_run.id,
518
+ start_time=_decode_unix_nano(cast(int, span.start_time)),
519
+ end_time=(
520
+ _decode_unix_nano(cast(int, span.end_time))
521
+ if span.end_time
522
+ else datetime.now()
523
+ ),
524
+ name=evaluator.name,
525
+ annotator_kind=evaluator.kind,
526
+ error=repr(error) if error else None,
527
+ result=result,
528
+ trace_id=_str_trace_id(span.get_span_context().trace_id), # type:ignore
529
+ )
530
+ return eval_run
531
+
532
+ _errors: Tuple[Type[BaseException], ...]
533
+ if not isinstance(rate_limit_errors, Sequence):
534
+ _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
535
+ else:
536
+ _errors = tuple(filter(None, rate_limit_errors))
537
+ rate_limiters = [
538
+ RateLimiter(rate_limit_error=rate_limit_error)
539
+ for rate_limit_error in _errors
540
+ ]
541
+
542
+ rate_limited_sync_evaluate_run = functools.reduce(
543
+ lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_eval_run
544
+ )
545
+ rate_limited_async_evaluate_run = functools.reduce(
546
+ lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_eval_run
547
+ )
548
+
549
+ executor = get_executor_on_sync_context(
550
+ rate_limited_sync_evaluate_run,
551
+ rate_limited_async_evaluate_run,
552
+ max_retries=0,
553
+ exit_on_error=exit_on_error,
554
+ fallback_return_value=None,
555
+ tqdm_bar_format=get_tqdm_progress_bar_formatter(
556
+ "running experiment evaluations"
557
+ ),
558
+ concurrency=concurrency,
559
+ )
560
+ eval_runs, _ = executor.run(evaluation_input)
561
+ return eval_runs
562
+
563
+
564
+ def _add_metadata_to_output_df(
565
+ output_df: pd.DataFrame,
566
+ eval_runs: List[ExperimentEvaluationRun],
567
+ evaluator_name: str,
568
+ ):
569
+ for eval_run in eval_runs:
570
+ if eval_run.result is None:
571
+ continue
572
+ metadata = eval_run.result.metadata
573
+ for key, value in metadata.items():
574
+ column_name = f"eval.{evaluator_name}.metadata.{key}"
575
+ if column_name not in output_df.columns:
576
+ output_df[column_name] = None
577
+ # If the value is not a primitive type, try to convert it to a string
578
+ if value is not None and not isinstance(
579
+ value, (int, float, str, bool)
580
+ ):
581
+ try:
582
+ value = str(value)
583
+ except Exception as e:
584
+ raise ValueError(
585
+ f"Metadata value for key '{key}' in evaluator '{evaluator_name}' is not a primitive"
586
+ "type and cannot be converted to a string."
587
+ ) from e
588
+ output_df.loc[eval_run.experiment_run_id, column_name] = value
589
+ return output_df
590
+
591
+
592
+ def _dataframe_to_examples(dataset: pd.DataFrame) -> List[Example]:
593
+ for column in dataset.columns:
594
+ if pd.api.types.is_datetime64_any_dtype(dataset[column]):
595
+ dataset[column] = dataset[column].astype(str)
596
+ examples = []
597
+
598
+ for _, row in dataset.iterrows():
599
+ example = Example(dataset_row=row.to_dict())
600
+ examples.append(example)
601
+ return examples
602
+
603
+
604
+ def _validate_task_signature(sig: inspect.Signature) -> None:
605
+ # Check that the function signature has a valid signature for use as a task
606
+ # If it does not, raise an error to exit early before running an experiment
607
+ params = sig.parameters
608
+ valid_named_params = {"input", "output", "metadata", "dataset_row"}
609
+ if len(params) == 0:
610
+ raise ValueError("Task function must have at least one parameter.")
611
+ if len(params) > 1:
612
+ for not_found in set(params) - valid_named_params:
613
+ param = params[not_found]
614
+ if (
615
+ param.kind is inspect.Parameter.VAR_KEYWORD
616
+ or param.default is not inspect.Parameter.empty
617
+ ):
618
+ continue
619
+ raise ValueError(
620
+ f"Invalid parameter names in task function: {', '.join(not_found)}. "
621
+ "Parameters names for multi-argument functions must be "
622
+ f"any of: {', '.join(valid_named_params)}."
623
+ )
624
+
625
+
626
+ def _bind_task_signature(
627
+ sig: inspect.Signature, example: Example
628
+ ) -> inspect.BoundArguments:
629
+ parameter_mapping = {
630
+ "input": example.input,
631
+ "output": example.output,
632
+ "metadata": example.metadata,
633
+ "dataset_row": example.dataset_row,
634
+ }
635
+ params = sig.parameters
636
+ if len(params) == 1:
637
+ parameter_name = next(iter(params))
638
+ if parameter_name in parameter_mapping:
639
+ return sig.bind(parameter_mapping[parameter_name])
640
+ else:
641
+ return sig.bind(parameter_mapping["dataset_row"])
642
+ return sig.bind_partial(
643
+ **{
644
+ name: parameter_mapping[name]
645
+ for name in set(parameter_mapping).intersection(params)
646
+ }
647
+ )
648
+
649
+
650
+ def _evaluators_by_name(
651
+ obj: Evaluators | None,
652
+ ) -> Mapping[EvaluatorName, Evaluator]:
653
+ evaluators_by_name: Dict[EvaluatorName, Evaluator] = {}
654
+ if obj is None:
655
+ return evaluators_by_name
656
+ if isinstance(obj, Mapping):
657
+ for name, value in obj.items():
658
+ evaluator = (
659
+ create_evaluator(name=name)(value)
660
+ if not isinstance(value, Evaluator)
661
+ else value
662
+ )
663
+ name = evaluator.name
664
+ if name in evaluators_by_name:
665
+ raise ValueError(f"Two evaluators have the same name: {name}")
666
+ evaluators_by_name[name] = evaluator
667
+ elif isinstance(obj, Sequence):
668
+ for value in obj:
669
+ evaluator = (
670
+ create_evaluator()(value)
671
+ if not isinstance(value, Evaluator)
672
+ else value
673
+ )
674
+ name = evaluator.name
675
+ if name in evaluators_by_name:
676
+ raise ValueError(f"Two evaluators have the same name: {name}")
677
+ evaluators_by_name[name] = evaluator
678
+ else:
679
+ assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
680
+ evaluator = (
681
+ create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
682
+ )
683
+ name = evaluator.name
684
+ if name in evaluators_by_name:
685
+ raise ValueError(f"Two evaluators have the same name: {name}")
686
+ evaluators_by_name[name] = evaluator
687
+ return evaluators_by_name
688
+
689
+
690
+ def get_func_name(fn: Callable[..., Any]) -> str:
691
+ """
692
+ Makes a best-effort attempt to get the name of the function.
693
+ """
694
+ if isinstance(fn, functools.partial):
695
+ return fn.func.__qualname__
696
+ if hasattr(fn, "__qualname__") and not fn.__qualname__.endswith("<lambda>"):
697
+ return fn.__qualname__.split(".<locals>.")[-1]
698
+ return str(fn)
699
+
700
+
701
+ def _print_experiment_error(
702
+ error: BaseException,
703
+ /,
704
+ *,
705
+ example_id: str,
706
+ kind: Literal["evaluator", "task"],
707
+ ) -> None:
708
+ """
709
+ Prints an experiment error.
710
+ """
711
+ display_error = RuntimeError(
712
+ f"{kind} failed for example id {repr(example_id)}"
713
+ )
714
+ display_error.__cause__ = error
715
+ formatted_exception = "".join(
716
+ traceback.format_exception(
717
+ type(display_error), display_error, display_error.__traceback__
718
+ )
719
+ )
720
+ print("\033[91m" + formatted_exception + "\033[0m") # prints in red
721
+
722
+
723
+ def _decode_unix_nano(time_unix_nano: int) -> datetime:
724
+ return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
725
+
726
+
727
+ def _str_trace_id(id_: int) -> str:
728
+ return hexlify(id_.to_bytes(16, "big")).decode()
729
+
730
+
731
+ def get_tqdm_progress_bar_formatter(title: str) -> str:
732
+ """
733
+ Returns a progress bar formatter for use with tqdm.
734
+
735
+ Args:
736
+ title (str): The title of the progress bar, displayed as a prefix.
737
+
738
+ Returns:
739
+ str: A formatter to be passed to the bar_format argument of tqdm.
740
+
741
+ """
742
+ return (
743
+ title + " |{bar}| {n_fmt}/{total_fmt} ({percentage:3.1f}%) "
744
+ "| ⏳ {elapsed}<{remaining} | {rate_fmt}{postfix}"
745
+ )
746
+
747
+
748
+ INPUT_VALUE = SpanAttributes.INPUT_VALUE
749
+ OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
750
+ INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
751
+ OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
752
+ OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
753
+ METADATA = SpanAttributes.METADATA
754
+
755
+ CHAIN = OpenInferenceSpanKindValues.CHAIN.value
756
+ EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
757
+ JSON = OpenInferenceMimeTypeValues.JSON
758
+
759
+
760
+ def get_result_attr(r, attr, default=None):
761
+ return getattr(r.result, attr, default) if r.result else default
762
+
763
+
764
+ def transform_to_experiment_format(
765
+ experiment_runs: List[Dict[str, Any]] | pd.DataFrame,
766
+ task_fields: ExperimentTaskResultFieldNames,
767
+ evaluator_fields: Dict[str, EvaluationResultFieldNames] | None = None,
768
+ ) -> pd.DataFrame:
769
+ """
770
+ Transform a DataFrame to match the format returned by run_experiment().
771
+
772
+ Args:
773
+ df: Input DataFrame containing experiment results
774
+ task_columns: Column mapping for task results
775
+ evaluator_columns: Dictionary mapping evaluator names (str)
776
+ to their column mappings (EvaluationResultColumnNames)
777
+
778
+ Returns:
779
+ DataFrame in the format matching run_experiment() output
780
+ """
781
+ data = (
782
+ experiment_runs
783
+ if isinstance(experiment_runs, pd.DataFrame)
784
+ else pd.DataFrame(experiment_runs)
785
+ )
786
+ # Validate required columns
787
+ required_cols = {task_fields.example_id, task_fields.result}
788
+ missing_cols = required_cols - set(data.columns)
789
+ if missing_cols:
790
+ raise ValueError(f"Missing required columns: {missing_cols}")
791
+
792
+ # Initialize output DataFrame with required columns
793
+ out_df = data.copy()
794
+ out_df["id"] = range(len(data)) # Generate sequential IDs
795
+ out_df["example_id"] = data[task_fields.example_id]
796
+ if task_fields.example_id != "example_id":
797
+ out_df.drop(task_fields.example_id, axis=1, inplace=True)
798
+ out_df["result"] = data[task_fields.result].apply(
799
+ lambda x: json.dumps(x) if isinstance(x, dict) else x
800
+ )
801
+ if task_fields.result != "result":
802
+ out_df.drop(task_fields.result, axis=1, inplace=True)
803
+
804
+ # Process evaluator results
805
+ if evaluator_fields:
806
+ for evaluator_name, column_names in evaluator_fields.items():
807
+ _add_evaluator_columns(data, out_df, evaluator_name, column_names)
808
+
809
+ # Set index but keep id column
810
+ out_df.set_index("id", inplace=True, drop=False)
811
+ out_df.reset_index(drop=True, inplace=True)
812
+ return out_df
813
+
814
+
815
+ def _add_evaluator_columns(
816
+ input_df: pd.DataFrame,
817
+ output_df: pd.DataFrame,
818
+ evaluator_name: str,
819
+ column_names: EvaluationResultFieldNames,
820
+ ) -> None:
821
+ """Helper function to add evaluator columns to output DataFrame"""
822
+ # Add score if specified
823
+ if column_names.score and column_names.score in input_df.columns:
824
+ output_df[f"eval.{evaluator_name}.score"] = input_df[column_names.score]
825
+ output_df.drop(column_names.score, axis=1, inplace=True)
826
+
827
+ # Add label if specified
828
+ if column_names.label and column_names.label in input_df.columns:
829
+ output_df[f"eval.{evaluator_name}.label"] = input_df[column_names.label]
830
+ output_df.drop(column_names.label, axis=1, inplace=True)
831
+
832
+ # Add explanation if specified
833
+ if (
834
+ column_names.explanation
835
+ and column_names.explanation in input_df.columns
836
+ ):
837
+ output_df[f"eval.{evaluator_name}.explanation"] = input_df[
838
+ column_names.explanation
839
+ ]
840
+ output_df.drop(column_names.explanation, axis=1, inplace=True)
841
+
842
+ # Add metadata columns if specified
843
+ if column_names.metadata:
844
+ for metadata_key, column_name in column_names.metadata.items():
845
+ # If column_name not specified, use metadata_key as the column name
846
+ md_col_name = column_name if column_name else metadata_key
847
+
848
+ if md_col_name not in input_df.columns:
849
+ raise ValueError(
850
+ f"metadata column {md_col_name} not found in input DataFrame columns: "
851
+ f"{input_df.columns}"
852
+ )
853
+
854
+ output_col = f"eval.{evaluator_name}.metadata.{metadata_key}"
855
+ output_df.drop(md_col_name, axis=1, inplace=True)
856
+
857
+ output_vals = input_df[md_col_name].apply(
858
+ lambda x: str(x)
859
+ if x is not None and not isinstance(x, (int, float, str, bool))
860
+ else x
861
+ )
862
+ output_df[output_col] = output_vals
863
+
864
+
865
+ def jsonify(obj: Any) -> Any:
866
+ """
867
+ Coerce object to be json serializable.
868
+ """
869
+ if isinstance(obj, Enum):
870
+ return jsonify(obj.value)
871
+ if isinstance(obj, (str, int, float, bool)) or obj is None:
872
+ return obj
873
+ if isinstance(obj, (list, set, frozenset, Sequence)):
874
+ return [jsonify(v) for v in obj]
875
+ if isinstance(obj, (dict, Mapping)):
876
+ return {jsonify(k): jsonify(v) for k, v in obj.items()}
877
+ if dataclasses.is_dataclass(obj):
878
+ result = {}
879
+ for field in dataclasses.fields(obj):
880
+ k = field.name
881
+ v = getattr(obj, k)
882
+ if not (
883
+ v is None
884
+ and get_origin(field) is Union
885
+ and type(None) in get_args(field)
886
+ ):
887
+ result[k] = jsonify(v)
888
+ return result
889
+ if isinstance(obj, (date, datetime, time)):
890
+ return obj.isoformat()
891
+ if isinstance(obj, timedelta):
892
+ return obj.total_seconds()
893
+ if isinstance(obj, Path):
894
+ return str(obj)
895
+ if isinstance(obj, BaseException):
896
+ return str(obj)
897
+ if isinstance(obj, np.ndarray):
898
+ return [jsonify(v) for v in obj]
899
+ if hasattr(obj, "__float__"):
900
+ return float(obj)
901
+ if hasattr(obj, "model_dump") and callable(obj.model_dump):
902
+ # pydantic v2
903
+ try:
904
+ d = obj
905
+ assert isinstance(d, dict)
906
+ except BaseException:
907
+ pass
908
+ else:
909
+ return jsonify(d)
910
+ if hasattr(obj, "dict") and callable(obj.dict):
911
+ # pydantic v1
912
+ try:
913
+ d = obj.dict()
914
+ assert isinstance(d, dict)
915
+ except BaseException:
916
+ pass
917
+ else:
918
+ return jsonify(d)
919
+ cls = obj.__class__
920
+ return f"<{cls.__module__}.{cls.__name__} object>"