arize 8.0.0a14__py3-none-any.whl → 8.0.0a16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +70 -1
- arize/_flight/client.py +163 -43
- arize/_flight/types.py +1 -0
- arize/_generated/api_client/__init__.py +5 -1
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +924 -61
- arize/_generated/api_client/api_client.py +1 -1
- arize/_generated/api_client/configuration.py +1 -1
- arize/_generated/api_client/exceptions.py +1 -1
- arize/_generated/api_client/models/__init__.py +3 -1
- arize/_generated/api_client/models/dataset.py +2 -2
- arize/_generated/api_client/models/dataset_version.py +1 -1
- arize/_generated/api_client/models/datasets_create_request.py +3 -3
- arize/_generated/api_client/models/datasets_list200_response.py +1 -1
- arize/_generated/api_client/models/datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/models/error.py +1 -1
- arize/_generated/api_client/models/experiment.py +6 -6
- arize/_generated/api_client/models/experiments_create_request.py +98 -0
- arize/_generated/api_client/models/experiments_list200_response.py +1 -1
- arize/_generated/api_client/models/experiments_runs_list200_response.py +92 -0
- arize/_generated/api_client/rest.py +1 -1
- arize/_generated/api_client/test/test_dataset.py +2 -1
- arize/_generated/api_client/test/test_dataset_version.py +1 -1
- arize/_generated/api_client/test/test_datasets_api.py +1 -1
- arize/_generated/api_client/test/test_datasets_create_request.py +2 -1
- arize/_generated/api_client/test/test_datasets_list200_response.py +1 -1
- arize/_generated/api_client/test/test_datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/test/test_error.py +1 -1
- arize/_generated/api_client/test/test_experiment.py +6 -1
- arize/_generated/api_client/test/test_experiments_api.py +23 -2
- arize/_generated/api_client/test/test_experiments_create_request.py +61 -0
- arize/_generated/api_client/test/test_experiments_list200_response.py +1 -1
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +56 -0
- arize/_generated/api_client_README.md +13 -8
- arize/client.py +19 -2
- arize/config.py +50 -3
- arize/constants/config.py +8 -2
- arize/constants/openinference.py +14 -0
- arize/constants/pyarrow.py +1 -0
- arize/datasets/__init__.py +0 -70
- arize/datasets/client.py +106 -19
- arize/datasets/errors.py +61 -0
- arize/datasets/validation.py +46 -0
- arize/experiments/client.py +455 -0
- arize/experiments/evaluators/__init__.py +0 -0
- arize/experiments/evaluators/base.py +255 -0
- arize/experiments/evaluators/exceptions.py +10 -0
- arize/experiments/evaluators/executors.py +502 -0
- arize/experiments/evaluators/rate_limiters.py +277 -0
- arize/experiments/evaluators/types.py +122 -0
- arize/experiments/evaluators/utils.py +198 -0
- arize/experiments/functions.py +920 -0
- arize/experiments/tracing.py +276 -0
- arize/experiments/types.py +394 -0
- arize/models/client.py +4 -1
- arize/spans/client.py +16 -20
- arize/utils/arrow.py +4 -3
- arize/utils/openinference_conversion.py +56 -0
- arize/utils/proto.py +13 -0
- arize/utils/size.py +22 -0
- arize/version.py +1 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/METADATA +3 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/RECORD +65 -44
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/WHEEL +0 -0
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/licenses/LICENSE.md +0 -0
arize/experiments/functions.py (new file)
@@ -0,0 +1,920 @@
+import dataclasses
+import functools
+import inspect
+import json
+import logging
+import traceback
+from binascii import hexlify
+from contextlib import ExitStack
+from copy import deepcopy
+from datetime import date, datetime, time, timedelta, timezone
+from enum import Enum
+from itertools import product
+from pathlib import Path
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+)
+
+import numpy as np
+import pandas as pd
+from openinference.semconv.trace import (
+    OpenInferenceMimeTypeValues,
+    OpenInferenceSpanKindValues,
+    SpanAttributes,
+)
+from opentelemetry.context import Context
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import Span
+from opentelemetry.trace import Status, StatusCode, Tracer
+from typing_extensions import TypeAlias
+
+from arize.experiments.evaluators.base import Evaluator, Evaluators
+from arize.experiments.evaluators.executors import (
+    get_executor_on_sync_context,
+)
+from arize.experiments.evaluators.rate_limiters import RateLimiter
+from arize.experiments.evaluators.types import (
+    EvaluationResult,
+    EvaluationResultFieldNames,
+    EvaluatorName,
+)
+from arize.experiments.evaluators.utils import create_evaluator
+from arize.experiments.tracing import capture_spans, flatten
+from arize.experiments.types import (
+    Example,
+    ExperimentEvaluationRun,
+    ExperimentRun,
+    ExperimentTask,
+    ExperimentTaskResultFieldNames,
+    _TaskSummary,
+)
+
+RateLimitErrors: TypeAlias = Union[
+    Type[BaseException], Sequence[Type[BaseException]]
+]
+
+logger = logging.getLogger(__name__)
+
+
+def run_experiment(
+    experiment_name: str,
+    experiment_id: str,
+    dataset: pd.DataFrame,
+    task: ExperimentTask,
+    tracer: Tracer,
+    resource: Resource,
+    rate_limit_errors: RateLimitErrors | None = None,
+    evaluators: Evaluators | None = None,
+    concurrency: int = 3,
+    exit_on_error: bool = False,
+) -> pd.DataFrame:
+    """
+    Run an experiment on a dataset.
+    Args:
+        experiment_name (str): The name for the experiment.
+        experiment_id (str): The ID for the experiment.
+        dataset (pd.DataFrame): The dataset to run the experiment on.
+        task (ExperimentTask): The task to be executed on the dataset.
+        tracer (Tracer): Tracer for tracing the experiment.
+        resource (Resource): The resource for tracing the experiment.
+        rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
+        evaluators (Optional[Evaluators]): Optional evaluators to assess the task.
+        concurrency (int): The number of concurrent tasks to run. Default is 3.
+        exit_on_error (bool): Whether to exit on error. Default is False.
+    Returns:
+        pd.DataFrame: The results of the experiment.
+    """
+    task_signature = inspect.signature(task)
+    _validate_task_signature(task_signature)
+
+    examples = _dataframe_to_examples(dataset)
+    if not examples:
+        raise ValueError("No examples found in the dataset.")
+
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    root_span_name = f"Task: {get_func_name(task)}"
+    root_span_kind = CHAIN
+
+    logger.info("🧪 Experiment started.")
+
+    md = {"experiment_id": experiment_id}
+
+    def sync_run_experiment(example: Example) -> ExperimentRun:
+        output = None
+        error: BaseException | None = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                cm=tracer.start_as_current_span(
+                    name=root_span_name, context=Context()
+                )
+            )  # type:ignore
+            stack.enter_context(capture_spans(resource))
+            span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
+            try:
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                if isinstance(_output, Awaitable):
+                    sync_error_message = (
+                        "Task is async and cannot be run within an existing event loop. "
+                        "Consider the following options:\n\n"
+                        "1. Pass in a synchronous task callable.\n"
+                        "2. Use `nest_asyncio.apply()` to allow nesting event loops."
+                    )
+                    raise RuntimeError(sync_error_message)
+                else:
+                    output = _output
+            except BaseException as exc:
+                if exit_on_error:
+                    raise exc
+                span.record_exception(exc)
+                status = Status(
+                    StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                )
+                error = exc
+                _print_experiment_error(exc, example_id=example.id, kind="task")
+
+            output = jsonify(output)
+            if example.input:
+                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore
+            else:
+                span.set_attribute(
+                    INPUT_VALUE,
+                    json.dumps(
+                        obj=jsonify(example.dataset_row), ensure_ascii=False
+                    ),
+                )
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if output is not None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(
+                        OUTPUT_VALUE, json.dumps(output, ensure_ascii=False)
+                    )
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+            span.set_attribute(
+                SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind
+            )
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+
+        exp_run = ExperimentRun(
+            experiment_id=experiment_name,
+            repetition_number=1,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=(
+                _decode_unix_nano(cast(int, span.end_time))
+                if span.end_time
+                else datetime.now()
+            ),
+            dataset_example_id=example.id,
+            output=output,  # type:ignore
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore
+        )
+        return exp_run
+
+    async def async_run_experiment(example: Example) -> ExperimentRun:
+        output = None
+        error: BaseException | None = None
+        status = Status(StatusCode.OK)
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                cm=tracer.start_as_current_span(
+                    name=root_span_name, context=Context()
+                )
+            )  # type:ignore
+            stack.enter_context(capture_spans(resource))
+            span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
+            try:
+                bound_task_args = _bind_task_signature(task_signature, example)
+                _output = task(*bound_task_args.args, **bound_task_args.kwargs)
+                if isinstance(_output, Awaitable):
+                    output = await _output
+                else:
+                    output = _output
+            except BaseException as exc:
+                if exit_on_error:
+                    raise exc
+                span.record_exception(exc)
+                status = Status(
+                    StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                )
+                error = exc
+                _print_experiment_error(exc, example_id=example.id, kind="task")
+            output = jsonify(output)
+            if example.input:
+                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore
+            else:
+                span.set_attribute(
+                    INPUT_VALUE,
+                    json.dumps(
+                        obj=jsonify(example.dataset_row), ensure_ascii=False
+                    ),
+                )
+            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+            if output is not None:
+                if isinstance(output, str):
+                    span.set_attribute(OUTPUT_VALUE, output)
+                else:
+                    span.set_attribute(
+                        OUTPUT_VALUE, json.dumps(output, ensure_ascii=False)
+                    )
+                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+            span.set_attribute(
+                SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind
+            )
+            span.set_status(status)
+
+        assert isinstance(
+            output, (dict, list, str, int, float, bool, type(None))
+        ), "Output must be JSON serializable"
+
+        exp_run = ExperimentRun(
+            experiment_id=experiment_name,
+            repetition_number=1,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=(
+                _decode_unix_nano(cast(int, span.end_time))
+                if span.end_time
+                else datetime.now()
+            ),
+            dataset_example_id=example.id,
+            output=output,  # type: ignore
+            error=repr(error) if error else None,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore
+        )
+        return exp_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not isinstance(rate_limit_errors, Sequence):
+        _errors = (rate_limit_errors,)  # type: ignore
+    else:
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rle) for rle in _errors]
+    rate_limited_sync_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn),
+        rate_limiters,
+        sync_run_experiment,
+    )
+    rate_limited_async_run_experiment = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn),
+        rate_limiters,
+        async_run_experiment,
+    )
+
+    executor = get_executor_on_sync_context(
+        sync_fn=rate_limited_sync_run_experiment,
+        async_fn=rate_limited_async_run_experiment,
+        max_retries=0,
+        exit_on_error=exit_on_error,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+        concurrency=concurrency,
+    )
+
+    runs, _ = executor.run(examples)
+    task_summary = _TaskSummary.from_task_runs(len(dataset), runs)
+
+    if exit_on_error and (None in runs):
+        # When exit_on_error is True, the result of a failed task execution is None
+        # If any task execution failed, raise an error to exit early
+        raise RuntimeError("An error occurred during execution of tasks.")
+
+    out_df = pd.DataFrame()
+    out_df["id"] = [run.id for run in runs]
+    out_df["example_id"] = [run.dataset_example_id for run in runs]
+    out_df["result"] = [run.output for run in runs]
+    out_df["result.trace.id"] = [run.trace_id for run in runs]
+    out_df["result.trace.timestamp"] = [
+        int(run.start_time.timestamp() * 1e3) for run in runs
+    ]
+    out_df.set_index("id", inplace=True, drop=False)
+    logger.info(f"✅ Task runs completed.\n{task_summary}")
+
+    if evaluators_by_name:
+        eval_results = evaluate_experiment(
+            experiment_name=experiment_name,
+            examples=examples,
+            experiment_results=runs,
+            evaluators=evaluators,
+            rate_limit_errors=rate_limit_errors,
+            concurrency=concurrency,
+            tracer=tracer,
+            resource=resource,
+            exit_on_error=exit_on_error,
+        )
+
+        if exit_on_error and (None in eval_results):
+            raise RuntimeError(
+                "An error occurred during execution of evaluators."
+            )
+
+        # group evaluation results by name
+        eval_results_by_name = {}
+        for r in eval_results:
+            if r is None:
+                continue
+            if r.name not in eval_results_by_name:
+                eval_results_by_name[r.name] = []
+            eval_results_by_name[r.name].append(r)
+
+        for eval_name, eval_res in eval_results_by_name.items():
+            eval_data = {
+                "score": lambda x: get_result_attr(x, "score", None),
+                "label": lambda x: get_result_attr(x, "label", None),
+                "explanation": lambda x: get_result_attr(
+                    x, "explanation", None
+                ),
+                "trace.id": lambda x: x.trace_id,
+                "trace.timestamp": lambda x: int(
+                    x.start_time.timestamp() * 1e3
+                ),
+            }
+
+            for attr, getter in eval_data.items():
+                out_df[f"eval.{eval_name}.{attr}"] = out_df.index.map(
+                    {r.experiment_run_id: getter(r) for r in eval_res}
+                )
+            out_df = _add_metadata_to_output_df(out_df, eval_res, eval_name)
+        logger.info("✅ All evaluators completed.")
+    out_df.reset_index(drop=True, inplace=True)
+    return out_df
+
+
+def evaluate_experiment(
+    experiment_name: str,
+    examples: Sequence[Example],
+    experiment_results: Sequence[ExperimentRun],
+    *,
+    evaluators: Evaluators | None = None,
+    rate_limit_errors: RateLimitErrors | None = None,
+    concurrency: int = 3,
+    tracer: Tracer | None = None,
+    resource: Resource | None = None,
+    exit_on_error: bool = False,
+):
+    """
+    Evaluate the results of an experiment using the provided evaluators.
+    Args:
+        experiment_name (str): The name of the experiment.
+        examples (Sequence[Example]): The examples to evaluate.
+        experiment_results (Sequence[ExperimentRun]): The results of the experiment.
+        evaluators (Evaluators): The evaluators to use for assessment.
+        rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
+        concurrency (int): The number of concurrent tasks to run. Default is 3.
+        tracer (Optional[Tracer]): Optional tracer for tracing the evaluation.
+        resource (Optional[Resource]): Optional resource for the evaluation.
+        exit_on_error (bool): Whether to exit on error. Default is False.
+    Returns:
+        List[ExperimentEvaluationRun]: The evaluation results.
+    """
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    if not evaluators_by_name:
+        raise ValueError("Must specify at least one Evaluator")
+    experiment_result_dict = {
+        run.dataset_example_id: run for run in experiment_results
+    }
+    paired_list = [
+        (example, experiment_result_dict[example.id])
+        for example in examples
+        if example.id in experiment_result_dict
+    ]
+
+    evaluation_input = [
+        (example, run, evaluator)
+        for (example, run), evaluator in product(
+            paired_list, evaluators_by_name.values()
+        )
+    ]
+
+    root_span_kind = EVALUATOR
+    md = {"experiment_name": experiment_name}
+
+    def sync_eval_run(
+        obj: Tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: EvaluationResult | None = None
+        error: BaseException | None = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(  # type:ignore
+                    name=root_span_name, context=Context()
+                )
+            )
+            stack.enter_context(capture_spans(resource))  # type:ignore
+            span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
+            try:
+                result = evaluator.evaluate(
+                    dataset_row=example.dataset_row,
+                    input=example.input,
+                    output=deepcopy(experiment_run.output),
+                    experiment_output=deepcopy(experiment_run.output),
+                    dataset_output=example.output,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                if exit_on_error:
+                    raise exc
+                span.record_exception(exc)
+                status = Status(
+                    StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                )
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    kind="evaluator",
+                )
+            if result:
+                span.set_attributes(
+                    dict(flatten(jsonify(result), recurse_on_sequence=True))
+                )
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=(
+                _decode_unix_nano(cast(int, span.end_time))
+                if span.end_time
+                else datetime.now()
+            ),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type:ignore
+        )
+        return eval_run
+
+    async def async_eval_run(
+        obj: Tuple[Example, ExperimentRun, Evaluator],
+    ) -> ExperimentEvaluationRun:
+        example, experiment_run, evaluator = obj
+        result: EvaluationResult | None = None
+        error: BaseException | None = None
+        status = Status(StatusCode.OK)
+        root_span_name = f"Evaluation: {evaluator.name}"
+        with ExitStack() as stack:
+            span: Span = stack.enter_context(
+                tracer.start_as_current_span(  # type:ignore
+                    name=root_span_name, context=Context()
+                )
+            )
+            stack.enter_context(capture_spans(resource))  # type:ignore
+            span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
+            try:
+                result = await evaluator.async_evaluate(
+                    dataset_row=example.dataset_row,
+                    input=example.input,
+                    output=deepcopy(experiment_run.output),
+                    experiment_output=deepcopy(experiment_run.output),
+                    dataset_output=example.output,
+                    metadata=example.metadata,
+                )
+            except BaseException as exc:
+                if exit_on_error:
+                    raise exc
+                span.record_exception(exc)
+                status = Status(
+                    StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                )
+                error = exc
+                _print_experiment_error(
+                    exc,
+                    example_id=example.id,
+                    kind="evaluator",
+                )
+            if result:
+                span.set_attributes(
+                    dict(flatten(jsonify(result), recurse_on_sequence=True))
+                )
+            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+            span.set_status(status)
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
+            start_time=_decode_unix_nano(cast(int, span.start_time)),
+            end_time=(
+                _decode_unix_nano(cast(int, span.end_time))
+                if span.end_time
+                else datetime.now()
+            ),
+            name=evaluator.name,
+            annotator_kind=evaluator.kind,
+            error=repr(error) if error else None,
+            result=result,
+            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type:ignore
+        )
+        return eval_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not isinstance(rate_limit_errors, Sequence):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [
+        RateLimiter(rate_limit_error=rate_limit_error)
+        for rate_limit_error in _errors
+    ]
+
+    rate_limited_sync_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_eval_run
+    )
+    rate_limited_async_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_eval_run
+    )
+
+    executor = get_executor_on_sync_context(
+        rate_limited_sync_evaluate_run,
+        rate_limited_async_evaluate_run,
+        max_retries=0,
+        exit_on_error=exit_on_error,
+        fallback_return_value=None,
+        tqdm_bar_format=get_tqdm_progress_bar_formatter(
+            "running experiment evaluations"
+        ),
+        concurrency=concurrency,
+    )
+    eval_runs, _ = executor.run(evaluation_input)
+    return eval_runs
+
+
+def _add_metadata_to_output_df(
+    output_df: pd.DataFrame,
+    eval_runs: List[ExperimentEvaluationRun],
+    evaluator_name: str,
+):
+    for eval_run in eval_runs:
+        if eval_run.result is None:
+            continue
+        metadata = eval_run.result.metadata
+        for key, value in metadata.items():
+            column_name = f"eval.{evaluator_name}.metadata.{key}"
+            if column_name not in output_df.columns:
+                output_df[column_name] = None
+            # If the value is not a primitive type, try to convert it to a string
+            if value is not None and not isinstance(
+                value, (int, float, str, bool)
+            ):
+                try:
+                    value = str(value)
+                except Exception as e:
+                    raise ValueError(
+                        f"Metadata value for key '{key}' in evaluator '{evaluator_name}' is not a primitive"
+                        "type and cannot be converted to a string."
+                    ) from e
+            output_df.loc[eval_run.experiment_run_id, column_name] = value
+    return output_df
+
+
+def _dataframe_to_examples(dataset: pd.DataFrame) -> List[Example]:
+    for column in dataset.columns:
+        if pd.api.types.is_datetime64_any_dtype(dataset[column]):
+            dataset[column] = dataset[column].astype(str)
+    examples = []
+
+    for _, row in dataset.iterrows():
+        example = Example(dataset_row=row.to_dict())
+        examples.append(example)
+    return examples
+
+
+def _validate_task_signature(sig: inspect.Signature) -> None:
+    # Check that the function signature has a valid signature for use as a task
+    # If it does not, raise an error to exit early before running an experiment
+    params = sig.parameters
+    valid_named_params = {"input", "output", "metadata", "dataset_row"}
+    if len(params) == 0:
+        raise ValueError("Task function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                f"Invalid parameter names in task function: {', '.join(not_found)}. "
+                "Parameters names for multi-argument functions must be "
+                f"any of: {', '.join(valid_named_params)}."
+            )
+
+
+def _bind_task_signature(
+    sig: inspect.Signature, example: Example
+) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": example.input,
+        "output": example.output,
+        "metadata": example.metadata,
+        "dataset_row": example.dataset_row,
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["dataset_row"])
+    return sig.bind_partial(
+        **{
+            name: parameter_mapping[name]
+            for name in set(parameter_mapping).intersection(params)
+        }
+    )
+
+
+def _evaluators_by_name(
+    obj: Evaluators | None,
+) -> Mapping[EvaluatorName, Evaluator]:
+    evaluators_by_name: Dict[EvaluatorName, Evaluator] = {}
+    if obj is None:
+        return evaluators_by_name
+    if isinstance(obj, Mapping):
+        for name, value in obj.items():
+            evaluator = (
+                create_evaluator(name=name)(value)
+                if not isinstance(value, Evaluator)
+                else value
+            )
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    elif isinstance(obj, Sequence):
+        for value in obj:
+            evaluator = (
+                create_evaluator()(value)
+                if not isinstance(value, Evaluator)
+                else value
+            )
+            name = evaluator.name
+            if name in evaluators_by_name:
+                raise ValueError(f"Two evaluators have the same name: {name}")
+            evaluators_by_name[name] = evaluator
+    else:
+        assert not isinstance(obj, Mapping) and not isinstance(obj, Sequence)
+        evaluator = (
+            create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj
+        )
+        name = evaluator.name
+        if name in evaluators_by_name:
+            raise ValueError(f"Two evaluators have the same name: {name}")
+        evaluators_by_name[name] = evaluator
+    return evaluators_by_name
+
+
+def get_func_name(fn: Callable[..., Any]) -> str:
+    """
+    Makes a best-effort attempt to get the name of the function.
+    """
+    if isinstance(fn, functools.partial):
+        return fn.func.__qualname__
+    if hasattr(fn, "__qualname__") and not fn.__qualname__.endswith("<lambda>"):
+        return fn.__qualname__.split(".<locals>.")[-1]
+    return str(fn)
+
+
+def _print_experiment_error(
+    error: BaseException,
+    /,
+    *,
+    example_id: str,
+    kind: Literal["evaluator", "task"],
+) -> None:
+    """
+    Prints an experiment error.
+    """
+    display_error = RuntimeError(
+        f"{kind} failed for example id {repr(example_id)}"
+    )
+    display_error.__cause__ = error
+    formatted_exception = "".join(
+        traceback.format_exception(
+            type(display_error), display_error, display_error.__traceback__
+        )
+    )
+    print("\033[91m" + formatted_exception + "\033[0m")  # prints in red
+
+
+def _decode_unix_nano(time_unix_nano: int) -> datetime:
+    return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+def _str_trace_id(id_: int) -> str:
+    return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+def get_tqdm_progress_bar_formatter(title: str) -> str:
+    """
+    Returns a progress bar formatter for use with tqdm.
+
+    Args:
+        title (str): The title of the progress bar, displayed as a prefix.
+
+    Returns:
+        str: A formatter to be passed to the bar_format argument of tqdm.
+
+    """
+    return (
+        title + " |{bar}| {n_fmt}/{total_fmt} ({percentage:3.1f}%) "
+        "| ⏳ {elapsed}<{remaining} | {rate_fmt}{postfix}"
+    )
+
+
+INPUT_VALUE = SpanAttributes.INPUT_VALUE
+OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+METADATA = SpanAttributes.METADATA
+
+CHAIN = OpenInferenceSpanKindValues.CHAIN.value
+EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
+JSON = OpenInferenceMimeTypeValues.JSON
+
+
+def get_result_attr(r, attr, default=None):
+    return getattr(r.result, attr, default) if r.result else default
+
+
+def transform_to_experiment_format(
+    experiment_runs: List[Dict[str, Any]] | pd.DataFrame,
+    task_fields: ExperimentTaskResultFieldNames,
+    evaluator_fields: Dict[str, EvaluationResultFieldNames] | None = None,
+) -> pd.DataFrame:
+    """
+    Transform a DataFrame to match the format returned by run_experiment().
+
+    Args:
+        df: Input DataFrame containing experiment results
+        task_columns: Column mapping for task results
+        evaluator_columns: Dictionary mapping evaluator names (str)
+            to their column mappings (EvaluationResultColumnNames)
+
+    Returns:
+        DataFrame in the format matching run_experiment() output
+    """
+    data = (
+        experiment_runs
+        if isinstance(experiment_runs, pd.DataFrame)
+        else pd.DataFrame(experiment_runs)
+    )
+    # Validate required columns
+    required_cols = {task_fields.example_id, task_fields.result}
+    missing_cols = required_cols - set(data.columns)
+    if missing_cols:
+        raise ValueError(f"Missing required columns: {missing_cols}")
+
+    # Initialize output DataFrame with required columns
+    out_df = data.copy()
+    out_df["id"] = range(len(data))  # Generate sequential IDs
+    out_df["example_id"] = data[task_fields.example_id]
+    if task_fields.example_id != "example_id":
+        out_df.drop(task_fields.example_id, axis=1, inplace=True)
+    out_df["result"] = data[task_fields.result].apply(
+        lambda x: json.dumps(x) if isinstance(x, dict) else x
+    )
+    if task_fields.result != "result":
+        out_df.drop(task_fields.result, axis=1, inplace=True)
+
+    # Process evaluator results
+    if evaluator_fields:
+        for evaluator_name, column_names in evaluator_fields.items():
+            _add_evaluator_columns(data, out_df, evaluator_name, column_names)
+
+    # Set index but keep id column
+    out_df.set_index("id", inplace=True, drop=False)
+    out_df.reset_index(drop=True, inplace=True)
+    return out_df
+
+
+def _add_evaluator_columns(
+    input_df: pd.DataFrame,
+    output_df: pd.DataFrame,
+    evaluator_name: str,
+    column_names: EvaluationResultFieldNames,
+) -> None:
+    """Helper function to add evaluator columns to output DataFrame"""
+    # Add score if specified
+    if column_names.score and column_names.score in input_df.columns:
+        output_df[f"eval.{evaluator_name}.score"] = input_df[column_names.score]
+        output_df.drop(column_names.score, axis=1, inplace=True)
+
+    # Add label if specified
+    if column_names.label and column_names.label in input_df.columns:
+        output_df[f"eval.{evaluator_name}.label"] = input_df[column_names.label]
+        output_df.drop(column_names.label, axis=1, inplace=True)
+
+    # Add explanation if specified
+    if (
+        column_names.explanation
+        and column_names.explanation in input_df.columns
+    ):
+        output_df[f"eval.{evaluator_name}.explanation"] = input_df[
+            column_names.explanation
+        ]
+        output_df.drop(column_names.explanation, axis=1, inplace=True)
+
+    # Add metadata columns if specified
+    if column_names.metadata:
+        for metadata_key, column_name in column_names.metadata.items():
+            # If column_name not specified, use metadata_key as the column name
+            md_col_name = column_name if column_name else metadata_key
+
+            if md_col_name not in input_df.columns:
+                raise ValueError(
+                    f"metadata column {md_col_name} not found in input DataFrame columns: "
+                    f"{input_df.columns}"
+                )
+
+            output_col = f"eval.{evaluator_name}.metadata.{metadata_key}"
+            output_df.drop(md_col_name, axis=1, inplace=True)
+
+            output_vals = input_df[md_col_name].apply(
+                lambda x: str(x)
+                if x is not None and not isinstance(x, (int, float, str, bool))
+                else x
+            )
+            output_df[output_col] = output_vals
+
+
+def jsonify(obj: Any) -> Any:
+    """
+    Coerce object to be json serializable.
+    """
+    if isinstance(obj, Enum):
+        return jsonify(obj.value)
+    if isinstance(obj, (str, int, float, bool)) or obj is None:
+        return obj
+    if isinstance(obj, (list, set, frozenset, Sequence)):
+        return [jsonify(v) for v in obj]
+    if isinstance(obj, (dict, Mapping)):
+        return {jsonify(k): jsonify(v) for k, v in obj.items()}
+    if dataclasses.is_dataclass(obj):
+        result = {}
+        for field in dataclasses.fields(obj):
+            k = field.name
+            v = getattr(obj, k)
+            if not (
+                v is None
+                and get_origin(field) is Union
+                and type(None) in get_args(field)
+            ):
+                result[k] = jsonify(v)
+        return result
+    if isinstance(obj, (date, datetime, time)):
+        return obj.isoformat()
+    if isinstance(obj, timedelta):
+        return obj.total_seconds()
+    if isinstance(obj, Path):
+        return str(obj)
+    if isinstance(obj, BaseException):
+        return str(obj)
+    if isinstance(obj, np.ndarray):
+        return [jsonify(v) for v in obj]
+    if hasattr(obj, "__float__"):
+        return float(obj)
+    if hasattr(obj, "model_dump") and callable(obj.model_dump):
+        # pydantic v2
+        try:
+            d = obj
+            assert isinstance(d, dict)
+        except BaseException:
+            pass
+        else:
+            return jsonify(d)
+    if hasattr(obj, "dict") and callable(obj.dict):
+        # pydantic v1
+        try:
+            d = obj.dict()
+            assert isinstance(d, dict)
+        except BaseException:
+            pass
+        else:
+            return jsonify(d)
+    cls = obj.__class__
+    return f"<{cls.__module__}.{cls.__name__} object>"