arize 8.0.0a14__py3-none-any.whl → 8.0.0a16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +70 -1
- arize/_flight/client.py +163 -43
- arize/_flight/types.py +1 -0
- arize/_generated/api_client/__init__.py +5 -1
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +924 -61
- arize/_generated/api_client/api_client.py +1 -1
- arize/_generated/api_client/configuration.py +1 -1
- arize/_generated/api_client/exceptions.py +1 -1
- arize/_generated/api_client/models/__init__.py +3 -1
- arize/_generated/api_client/models/dataset.py +2 -2
- arize/_generated/api_client/models/dataset_version.py +1 -1
- arize/_generated/api_client/models/datasets_create_request.py +3 -3
- arize/_generated/api_client/models/datasets_list200_response.py +1 -1
- arize/_generated/api_client/models/datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/models/error.py +1 -1
- arize/_generated/api_client/models/experiment.py +6 -6
- arize/_generated/api_client/models/experiments_create_request.py +98 -0
- arize/_generated/api_client/models/experiments_list200_response.py +1 -1
- arize/_generated/api_client/models/experiments_runs_list200_response.py +92 -0
- arize/_generated/api_client/rest.py +1 -1
- arize/_generated/api_client/test/test_dataset.py +2 -1
- arize/_generated/api_client/test/test_dataset_version.py +1 -1
- arize/_generated/api_client/test/test_datasets_api.py +1 -1
- arize/_generated/api_client/test/test_datasets_create_request.py +2 -1
- arize/_generated/api_client/test/test_datasets_list200_response.py +1 -1
- arize/_generated/api_client/test/test_datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/test/test_error.py +1 -1
- arize/_generated/api_client/test/test_experiment.py +6 -1
- arize/_generated/api_client/test/test_experiments_api.py +23 -2
- arize/_generated/api_client/test/test_experiments_create_request.py +61 -0
- arize/_generated/api_client/test/test_experiments_list200_response.py +1 -1
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +56 -0
- arize/_generated/api_client_README.md +13 -8
- arize/client.py +19 -2
- arize/config.py +50 -3
- arize/constants/config.py +8 -2
- arize/constants/openinference.py +14 -0
- arize/constants/pyarrow.py +1 -0
- arize/datasets/__init__.py +0 -70
- arize/datasets/client.py +106 -19
- arize/datasets/errors.py +61 -0
- arize/datasets/validation.py +46 -0
- arize/experiments/client.py +455 -0
- arize/experiments/evaluators/__init__.py +0 -0
- arize/experiments/evaluators/base.py +255 -0
- arize/experiments/evaluators/exceptions.py +10 -0
- arize/experiments/evaluators/executors.py +502 -0
- arize/experiments/evaluators/rate_limiters.py +277 -0
- arize/experiments/evaluators/types.py +122 -0
- arize/experiments/evaluators/utils.py +198 -0
- arize/experiments/functions.py +920 -0
- arize/experiments/tracing.py +276 -0
- arize/experiments/types.py +394 -0
- arize/models/client.py +4 -1
- arize/spans/client.py +16 -20
- arize/utils/arrow.py +4 -3
- arize/utils/openinference_conversion.py +56 -0
- arize/utils/proto.py +13 -0
- arize/utils/size.py +22 -0
- arize/version.py +1 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/METADATA +3 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/RECORD +65 -44
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/WHEEL +0 -0
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import inspect
|
|
5
|
+
from abc import ABC
|
|
6
|
+
from types import MappingProxyType
|
|
7
|
+
from typing import Any, Awaitable, Callable, Mapping, Sequence
|
|
8
|
+
|
|
9
|
+
from arize.experiments.evaluators.types import (
|
|
10
|
+
AnnotatorKind,
|
|
11
|
+
EvaluationResult,
|
|
12
|
+
EvaluatorKind,
|
|
13
|
+
EvaluatorName,
|
|
14
|
+
EvaluatorOutput,
|
|
15
|
+
JSONSerializable,
|
|
16
|
+
)
|
|
17
|
+
from arize.experiments.types import (
|
|
18
|
+
ExampleInput,
|
|
19
|
+
ExampleMetadata,
|
|
20
|
+
ExampleOutput,
|
|
21
|
+
TaskOutput,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Evaluator(ABC):
    """
    A helper super class to guide the implementation of an `Evaluator` object.
    Subclasses must implement either the `evaluate` or `async_evaluate` method.
    Implementing both methods is recommended, but not required.

    This Class is intended to be subclassed, and should not be instantiated directly.
    """

    # Optional subclass overrides; when absent, the `kind`/`name` properties
    # below fall back to computed defaults.
    _kind: EvaluatorKind
    _name: EvaluatorName

    @functools.cached_property
    def name(self) -> EvaluatorName:
        # Display name: the `_name` override if a subclass set one,
        # otherwise the subclass's own class name.
        if hasattr(self, "_name"):
            return self._name
        return self.__class__.__name__

    @functools.cached_property
    def kind(self) -> EvaluatorKind:
        # Evaluator kind: the `_kind` override if a subclass set one,
        # otherwise the CODE annotator kind.
        # NOTE(review): this default is `AnnotatorKind.CODE.value`, while
        # CodeEvaluator/LLMEvaluator below set `_kind = str(AnnotatorKind.…)`.
        # Confirm both spellings produce the same string (they differ for a
        # plain `enum.Enum` unless `__str__` is overridden).
        if hasattr(self, "_kind"):
            return self._kind
        return AnnotatorKind.CODE.value

    def __new__(cls, *args: Any, **kwargs: Any) -> Evaluator:
        # Guard against direct instantiation of this abstract base class;
        # concrete subclasses construct normally.
        if cls is Evaluator:
            raise TypeError(
                f"{cls.__name__} is an abstract class and should not be instantiated."
            )
        return object.__new__(cls)

    def evaluate(
        self,
        *,
        dataset_row: Mapping[str, JSONSerializable] | None = None,
        input: ExampleInput = MappingProxyType({}),
        output: TaskOutput | None = None,
        experiment_output: TaskOutput | None = None,
        dataset_output: ExampleOutput = MappingProxyType({}),
        metadata: ExampleMetadata = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """
        Evaluate the given inputs and produce an evaluation result.
        This method should be implemented by subclasses to perform the actual
        evaluation logic. It is recommended to implement both this synchronous
        method and the asynchronous `async_evaluate` method, but it is not required.

        Args:
            dataset_row (Optional[Mapping[str, JSONSerializable]]): A row from the dataset.
            input (ExampleInput): The input provided for evaluation.
            output (Optional[TaskOutput]): The output produced by the task.
            experiment_output (Optional[TaskOutput]): Output of the experiment
                task run (presumably a companion of `output` — confirm with callers).
            dataset_output (ExampleOutput): The expected output recorded in the
                dataset, for comparison.
            metadata (ExampleMetadata): Metadata associated with the example.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            EvaluationResult: The result of the evaluation.

        Raises:
            NotImplementedError: If the method is not implemented by the subclass.
        """
        # For subclassing, one should implement either this sync method or the
        # async version. Implementing both is recommended but not required.
        raise NotImplementedError

    async def async_evaluate(
        self,
        *,
        dataset_row: Mapping[str, JSONSerializable] | None = None,
        input: ExampleInput = MappingProxyType({}),
        output: TaskOutput | None = None,
        experiment_output: TaskOutput | None = None,
        dataset_output: ExampleOutput = MappingProxyType({}),
        metadata: ExampleMetadata = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """
        Asynchronously evaluate the given inputs and produce an evaluation result.
        This method should be implemented by subclasses to perform the actual
        evaluation logic. It is recommended to implement both this asynchronous
        method and the synchronous `evaluate` method, but it is not required.

        The default implementation delegates to the synchronous `evaluate`
        (which runs inline on the event loop — long-running sync evaluators
        should override this method).

        Args:
            dataset_row (Optional[Mapping[str, JSONSerializable]]): A row from the dataset.
            input (ExampleInput): The input provided for evaluation.
            output (Optional[TaskOutput]): The output produced by the task.
            experiment_output (Optional[TaskOutput]): Output of the experiment
                task run (presumably a companion of `output` — confirm with callers).
            dataset_output (ExampleOutput): The expected output recorded in the
                dataset, for comparison.
            metadata (ExampleMetadata): Metadata associated with the example.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            EvaluationResult: The result of the evaluation.

        Raises:
            NotImplementedError: If the method is not implemented by the subclass.
        """
        # For subclassing, one should implement either this async method or the
        # sync version. Implementing both is recommended but not required.
        return self.evaluate(
            dataset_row=dataset_row,
            input=input,
            output=output,
            experiment_output=experiment_output,
            dataset_output=dataset_output,
            metadata=metadata,
            **kwargs,
        )

    def __init_subclass__(
        cls, is_abstract: bool = False, **kwargs: Any
    ) -> None:
        # Validate at class-definition time that every *concrete* subclass
        # overrides `evaluate` or `async_evaluate` with a usable signature,
        # so a bad evaluator fails fast instead of mid-run.
        super().__init_subclass__(**kwargs)
        if is_abstract:
            # Intermediate abstract subclasses (e.g. CodeEvaluator) opt out.
            return
        evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
        # Walk the MRO: the closest user-defined override wins; stop once we
        # reach the framework base classes.
        for super_cls in inspect.getmro(cls):
            if super_cls in (LLMEvaluator, Evaluator):
                break
            if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
                if isinstance(evaluate, classmethod):
                    evaluate = evaluate.__func__
                assert callable(evaluate), (
                    "`evaluate()` method should be callable"
                )
                # need to remove the first param, i.e. `self`
                _validate_sig(functools.partial(evaluate, None), "evaluate")
                return
            if async_evaluate := super_cls.__dict__.get(
                Evaluator.async_evaluate.__name__
            ):
                if isinstance(async_evaluate, classmethod):
                    async_evaluate = async_evaluate.__func__
                assert callable(async_evaluate), (
                    "`async_evaluate()` method should be callable"
                )
                # need to remove the first param, i.e. `self`
                _validate_sig(
                    functools.partial(async_evaluate, None), "async_evaluate"
                )
                return
        raise ValueError(
            f"Evaluator must implement either "
            f"`def evaluate{evaluate_fn_signature}` or "
            f"`async def async_evaluate{evaluate_fn_signature}`"
        )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
    """Check that *fn* is a usable evaluator and accepts ``**kwargs``.

    Delegates the parameter-name rules to `validate_evaluator_signature`,
    then additionally requires a variadic keyword parameter so the runner
    can pass extra context without breaking the evaluator.

    Raises:
        ValueError: If the signature is invalid or lacks ``**kwargs``.
    """
    signature = inspect.signature(fn)
    validate_evaluator_signature(signature)
    accepts_var_kw = any(
        parameter.kind is inspect.Parameter.VAR_KEYWORD
        for parameter in signature.parameters.values()
    )
    if not accepts_var_kw:
        raise ValueError(
            f"`{fn_name}` should allow variadic keyword arguments `**kwargs`"
        )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def validate_evaluator_signature(sig: inspect.Signature) -> None:
    """Validate that *sig* is an acceptable evaluator-function signature.

    Rules enforced (to fail fast before any evaluations run):
      - The function must take at least one parameter.
      - A single-parameter function may use any parameter name (it receives
        the whole payload).
      - A multi-parameter function may only use the well-known names below;
        extra parameters are tolerated when they have a default or are the
        ``**kwargs`` catch-all.

    Args:
        sig (inspect.Signature): Signature of the candidate evaluator.

    Raises:
        ValueError: If the signature has no parameters, or uses invalid
            parameter names.
    """
    params = sig.parameters
    valid_named_params = {
        "dataset_row",
        "input",
        "output",
        "experiment_output",
        "dataset_output",
        "metadata",
    }
    if len(params) == 0:
        raise ValueError(
            "Evaluation function must have at least one parameter."
        )
    if len(params) == 1:
        return
    # BUG FIX: the previous version raised on the first offender found while
    # iterating a set (nondeterministic) and did `', '.join(not_found)` on a
    # single name, which joined its *characters* (e.g. "b, a, n, a, n, a").
    # Collect all invalid names and report them once, deterministically.
    invalid = sorted(
        name
        for name in set(params) - valid_named_params
        if params[name].kind is not inspect.Parameter.VAR_KEYWORD
        and params[name].default is inspect.Parameter.empty
    )
    if invalid:
        raise ValueError(
            f"Invalid parameter names in evaluation function: {', '.join(invalid)}. "
            "Parameter names for multi-argument functions must be "
            f"any of: {', '.join(sorted(valid_named_params))}."
        )
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class CodeEvaluator(Evaluator, ABC, is_abstract=True):
    """
    Convenience base class for code-based evaluators.

    Functionally identical to `Evaluator`, except that `_kind` defaults to
    the CODE annotator kind. Subclass this class; never instantiate it
    directly.
    """

    _kind = str(AnnotatorKind.CODE)

    def __new__(cls, *args: Any, **kwargs: Any) -> CodeEvaluator:
        # Concrete subclasses construct normally; only direct construction
        # of this abstract class is rejected.
        if cls is not CodeEvaluator:
            return object.__new__(cls)
        raise TypeError(
            f"{cls.__name__} is an abstract class and should not be instantiated."
        )
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class LLMEvaluator(Evaluator, ABC, is_abstract=True):
    """
    Convenience base class for LLM-based evaluators.

    Functionally identical to `Evaluator`, except that `_kind` defaults to
    the LLM annotator kind. Subclass this class; never instantiate it
    directly.
    """

    _kind = str(AnnotatorKind.LLM)

    def __new__(cls, *args: Any, **kwargs: Any) -> LLMEvaluator:
        # Concrete subclasses construct normally; only direct construction
        # of this abstract class is rejected.
        if cls is not LLMEvaluator:
            return object.__new__(cls)
        raise TypeError(
            f"{cls.__name__} is an abstract class and should not be instantiated."
        )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# A single evaluator: either an `Evaluator` instance or a bare callable
# (sync or async) producing an `EvaluatorOutput`. Built with runtime `|`
# unions, so it can serve both as a type annotation and in isinstance-style
# dispatch on typing objects.
ExperimentEvaluator = (
    Evaluator
    | Callable[..., EvaluatorOutput]
    | Callable[..., Awaitable[EvaluatorOutput]]
)


# Anything accepted wherever evaluators are configured: one evaluator, a
# sequence of evaluators, or a mapping from evaluator name to evaluator.
Evaluators = (
    ExperimentEvaluator
    | Sequence[ExperimentEvaluator]
    | Mapping[EvaluatorName, ExperimentEvaluator]
)
|