arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc5.dist-info}/METADATA +2 -2
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc5.dist-info}/RECORD +30 -28
- phoenix/datasets/evaluators/code_evaluators.py +25 -53
- phoenix/datasets/evaluators/llm_evaluators.py +63 -32
- phoenix/datasets/evaluators/utils.py +292 -0
- phoenix/datasets/experiments.py +147 -82
- phoenix/datasets/tracing.py +19 -0
- phoenix/datasets/types.py +18 -52
- phoenix/db/insertion/dataset.py +19 -16
- phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
- phoenix/db/models.py +8 -3
- phoenix/server/api/context.py +2 -0
- phoenix/server/api/dataloaders/__init__.py +2 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
- phoenix/server/api/helpers/dataset_helpers.py +8 -7
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/mutations/project_mutations.py +9 -4
- phoenix/server/api/routers/v1/datasets.py +146 -42
- phoenix/server/api/routers/v1/experiment_evaluations.py +1 -0
- phoenix/server/api/routers/v1/experiment_runs.py +2 -2
- phoenix/server/api/types/Experiment.py +5 -0
- phoenix/server/api/types/ExperimentRun.py +1 -1
- phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
- phoenix/server/app.py +2 -0
- phoenix/server/static/index.js +610 -564
- phoenix/session/client.py +124 -2
- phoenix/version.py +1 -1
- phoenix/datasets/evaluators/_utils.py +0 -13
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc5.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc5.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc5.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
from abc import ABC
|
|
4
|
+
from types import MappingProxyType
|
|
5
|
+
from typing import Any, Awaitable, Callable, Mapping, Optional, Union
|
|
6
|
+
|
|
7
|
+
from typing_extensions import TypeAlias
|
|
8
|
+
|
|
9
|
+
from phoenix.datasets.types import (
|
|
10
|
+
AnnotatorKind,
|
|
11
|
+
EvaluationResult,
|
|
12
|
+
JSONSerializable,
|
|
13
|
+
TaskOutput,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
|
|
18
|
+
if isinstance(obj, dict):
|
|
19
|
+
if len(obj) == 1:
|
|
20
|
+
key = next(iter(obj.keys()))
|
|
21
|
+
output = obj[key]
|
|
22
|
+
assert isinstance(
|
|
23
|
+
output, (dict, list, str, int, float, bool, type(None))
|
|
24
|
+
), "Output must be JSON serializable"
|
|
25
|
+
return output
|
|
26
|
+
return obj
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_signature(sig: inspect.Signature) -> None:
    """Check that a function signature is usable as an evaluator.

    The check runs before any evaluation so that a bad signature fails early.

    Rules:
      * The signature must have at least one parameter.
      * A single-parameter signature is always accepted, whatever the name.
      * With several parameters, each one must be named ``input``, ``output``,
        ``expected`` or ``metadata``, unless it is ``**kwargs`` or has a
        default value.

    Args:
        sig: signature of the function to validate.

    Raises:
        ValueError: if the signature has no parameters, or if a
            multi-parameter signature contains required parameters with
            unsupported names. All offending names are listed, in signature
            order.
    """
    params = sig.parameters
    valid_named_params = {"input", "output", "expected", "metadata"}
    if len(params) == 0:
        raise ValueError("Evaluation function must have at least one parameter.")
    if len(params) == 1:
        return
    # Collect every offending name (in declaration order) so the message is
    # complete and deterministic, instead of reporting one arbitrary name.
    invalid_names = [
        param_name
        for param_name, param in params.items()
        if param_name not in valid_named_params
        and param.kind is not inspect.Parameter.VAR_KEYWORD
        and param.default is inspect.Parameter.empty
    ]
    if invalid_names:
        raise ValueError(
            (
                f"Invalid parameter names in evaluation function: {', '.join(invalid_names)}. "
                "Parameters names for multi-argument functions must be "
                f"any of: {', '.join(sorted(valid_named_params))}."
            )
        )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
|
|
54
|
+
parameter_mapping = {
|
|
55
|
+
"input": kwargs.get("input"),
|
|
56
|
+
"output": kwargs.get("output"),
|
|
57
|
+
"expected": kwargs.get("expected"),
|
|
58
|
+
"metadata": kwargs.get("metadata"),
|
|
59
|
+
}
|
|
60
|
+
params = sig.parameters
|
|
61
|
+
if len(params) == 1:
|
|
62
|
+
parameter_name = next(iter(params))
|
|
63
|
+
if parameter_name in parameter_mapping:
|
|
64
|
+
return sig.bind(parameter_mapping[parameter_name])
|
|
65
|
+
else:
|
|
66
|
+
return sig.bind(parameter_mapping["output"])
|
|
67
|
+
return sig.bind_partial(
|
|
68
|
+
**{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def create_evaluator(
    kind: Union[str, AnnotatorKind] = AnnotatorKind.CODE,
    name: Optional[str] = None,
    scorer: Optional[Callable[[Any], EvaluationResult]] = None,
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """Return a decorator that converts a function into an ``Evaluator`` instance.

    Args:
        kind: annotator kind of the resulting evaluator. A string is
            upper-cased and converted to ``AnnotatorKind``.
        name: evaluator name. When empty, a name is derived from each wrapped
            function: the class name of ``__self__`` for bound methods, else
            ``__name__``, else ``str(func)``.
        scorer: converts the function's raw return value into an
            ``EvaluationResult``. Defaults to ``_default_eval_scorer``.

    Returns:
        A decorator. Applying it validates the function's signature with
        ``validate_signature`` and returns an evaluator built by the sync or
        coroutine wrapper, depending on ``inspect.iscoroutinefunction``.

    Raises:
        ValueError: when the decorator is applied to a function whose
            signature ``validate_signature`` rejects.
    """
    if scorer is None:
        scorer = _default_eval_scorer

    if isinstance(kind, str):
        kind = AnnotatorKind(kind.upper())

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        # Resolve the name in a local rather than writing it back to the
        # enclosing `name`: the returned decorator can be applied to several
        # functions, and each must get its own derived name.
        evaluator_name = name
        if not evaluator_name:
            if hasattr(func, "__self__"):
                evaluator_name = func.__self__.__class__.__name__
            elif hasattr(func, "__name__"):
                evaluator_name = func.__name__
            else:
                evaluator_name = str(func)
        assert evaluator_name is not None

        wrapped_signature = inspect.signature(func)
        validate_signature(wrapped_signature)

        if inspect.iscoroutinefunction(func):
            return _wrap_coroutine_evaluation_function(
                evaluator_name, kind, wrapped_signature, scorer
            )(func)
        else:
            return _wrap_sync_evaluation_function(
                evaluator_name, kind, wrapped_signature, scorer
            )(func)

    return wrapper
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _wrap_coroutine_evaluation_function(
    name: str,
    annotator_kind: AnnotatorKind,
    sig: inspect.Signature,
    convert_to_score: Callable[[Any], EvaluationResult],
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """Build a decorator that wraps a coroutine function in an ``Evaluator``.

    Args:
        name: value stored as the evaluator's ``_name``.
        annotator_kind: value stored as the evaluator's ``_kind``.
        sig: signature of the wrapped function, used to bind evaluation kwargs.
        convert_to_score: maps the awaited result to an ``EvaluationResult``.
    """

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        class AsyncEvaluator(Evaluator):
            def __init__(self) -> None:
                self._kind = annotator_kind
                self._name = name

            @functools.wraps(func)
            async def __call__(self, *args: Any, **kwargs: Any) -> Any:
                # Plain pass-through: calling the evaluator awaits the
                # wrapped coroutine with the caller's own arguments.
                return await func(*args, **kwargs)

            async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
                # Select the arguments the wrapped function declares, await
                # it, then normalise the raw result.
                bound = _bind_signature(sig, **kwargs)
                raw_result = await func(*bound.args, **bound.kwargs)
                return convert_to_score(raw_result)

        return AsyncEvaluator()

    return wrapper
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _wrap_sync_evaluation_function(
    name: str,
    annotator_kind: AnnotatorKind,
    sig: inspect.Signature,
    convert_to_score: Callable[[Any], EvaluationResult],
) -> Callable[[Callable[..., Any]], "Evaluator"]:
    """Build a decorator that wraps a synchronous function in an ``Evaluator``.

    Args:
        name: value stored as the evaluator's ``_name``.
        annotator_kind: value stored as the evaluator's ``_kind``.
        sig: signature of the wrapped function, used to bind evaluation kwargs.
        convert_to_score: maps the function's result to an ``EvaluationResult``.
    """

    def wrapper(func: Callable[..., Any]) -> "Evaluator":
        class SyncEvaluator(Evaluator):
            def __init__(self) -> None:
                self._kind = annotator_kind
                self._name = name

            @functools.wraps(func)
            def __call__(self, *args: Any, **kwargs: Any) -> Any:
                # Plain pass-through: calling the evaluator calls the wrapped
                # function with the caller's own arguments.
                return func(*args, **kwargs)

            def evaluate(self, **kwargs: Any) -> EvaluationResult:
                # Select the arguments the wrapped function declares, call
                # it, then normalise the raw result.
                bound = _bind_signature(sig, **kwargs)
                raw_result = func(*bound.args, **bound.kwargs)
                return convert_to_score(raw_result)

        return SyncEvaluator()

    return wrapper
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _default_eval_scorer(result: Any) -> EvaluationResult:
    """Convert a raw evaluator return value into an ``EvaluationResult``.

    * ``bool``: score is ``float(result)``, label is ``str(result)``.
    * ``int`` / ``float``: score is ``float(result)``.
    * ``EvaluationResult``: returned as is.

    Raises:
        ValueError: for any other type.
    """
    # bool is tested first because it is a subclass of int.
    if isinstance(result, bool):
        return EvaluationResult(score=float(result), label=str(result))
    if isinstance(result, (int, float)):
        return EvaluationResult(score=float(result))
    if isinstance(result, EvaluationResult):
        return result
    raise ValueError(f"Unsupported evaluation result type: {type(result)}")
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# String-keyed JSON mappings; these annotate the `expected`, `metadata` and
# `input` parameters of `Evaluator.evaluate` / `Evaluator.async_evaluate`.
ExampleOutput: TypeAlias = Mapping[str, JSONSerializable]
ExampleMetadata: TypeAlias = Mapping[str, JSONSerializable]
ExampleInput: TypeAlias = Mapping[str, JSONSerializable]

# Return types of the `Evaluator.name` and `Evaluator.kind` properties.
EvaluatorName: TypeAlias = str
EvaluatorKind: TypeAlias = str
# Return type of the plain callables admitted by `ExperimentEvaluator`.
EvaluatorOutput: TypeAlias = Union[EvaluationResult, bool, int, float, str]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class Evaluator(ABC):
    """
    Base class that guides the implementation of an `Evaluator` object.

    A concrete subclass has to define `evaluate` or `async_evaluate` (defining
    both is recommended but optional). This is checked when the subclass is
    created, see `__init_subclass__`.

    The class itself cannot be instantiated; only subclasses can.
    """

    # Optional overrides read by the `kind` and `name` properties.
    _kind: AnnotatorKind
    _name: EvaluatorName

    @functools.cached_property
    def name(self) -> EvaluatorName:
        """Return `_name` when it is set, otherwise the class name."""
        return self._name if hasattr(self, "_name") else self.__class__.__name__

    @functools.cached_property
    def kind(self) -> EvaluatorKind:
        """Return the value of `_kind` when it is set, otherwise the CODE kind's value."""
        return self._kind.value if hasattr(self, "_kind") else AnnotatorKind.CODE.value

    def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
        """Create an instance, refusing direct instantiation of `Evaluator`."""
        if cls is not Evaluator:
            return object.__new__(cls)
        raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")

    def evaluate(
        self,
        *,
        output: Optional[TaskOutput] = None,
        expected: Optional[ExampleOutput] = None,
        metadata: ExampleMetadata = MappingProxyType({}),
        input: ExampleInput = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """Synchronous evaluation hook; the base version is not implemented."""
        # Subclasses override this method or `async_evaluate`; overriding
        # both is recommended but not required.
        raise NotImplementedError

    async def async_evaluate(
        self,
        *,
        output: Optional[TaskOutput] = None,
        expected: Optional[ExampleOutput] = None,
        metadata: ExampleMetadata = MappingProxyType({}),
        input: ExampleInput = MappingProxyType({}),
        **kwargs: Any,
    ) -> EvaluationResult:
        """Asynchronous evaluation hook; the base version delegates to `evaluate`."""
        # Subclasses override this method or `evaluate`; overriding both is
        # recommended but not required.
        forwarded = dict(output=output, expected=expected, metadata=metadata, input=input)
        return self.evaluate(**forwarded, **kwargs)

    def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
        """Check, at class creation, that a concrete subclass is implemented.

        Classes declared with `is_abstract=True` are not checked. Otherwise the
        MRO is walked from the new class up to (but excluding) `LLMEvaluator`
        or `Evaluator`. The first class that defines `evaluate` or
        `async_evaluate` in its own namespace has that method's signature
        validated with `_validate_sig` (`evaluate` is looked at first).

        Raises:
            ValueError: if no class in that range defines either method, or
                if `_validate_sig` rejects the signature.
        """
        super().__init_subclass__(**kwargs)
        if is_abstract:
            return
        checks = (
            ("evaluate", "`evaluate()` method should be callable"),
            ("async_evaluate", "`async_evaluate()` method should be callable"),
        )
        for klass in cls.__mro__:
            if klass in (LLMEvaluator, Evaluator):
                break
            own_namespace = vars(klass)
            for method_name, not_callable_message in checks:
                method = own_namespace.get(method_name)
                if not method:
                    continue
                assert callable(method), not_callable_message
                # Pre-bind `self` so that it is dropped from the signature.
                _validate_sig(functools.partial(method, None), method_name)
                return
        expected_signature = inspect.signature(Evaluator.evaluate)
        raise ValueError(
            f"Evaluator must implement either "
            f"`def evaluate{expected_signature}` or "
            f"`async def async_evaluate{expected_signature}`"
        )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
    """Validate the signature of an evaluator method.

    The signature must pass `validate_signature` and must also declare a
    `**kwargs` parameter.

    Args:
        fn: callable whose signature is inspected.
        fn_name: name used in the error message.

    Raises:
        ValueError: if `validate_signature` rejects the signature, or if no
            variadic keyword parameter is declared.
    """
    signature = inspect.signature(fn)
    validate_signature(signature)
    accepts_var_keyword = any(
        parameter.kind is inspect.Parameter.VAR_KEYWORD
        for parameter in signature.parameters.values()
    )
    if not accepts_var_keyword:
        raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class LLMEvaluator(Evaluator, ABC, is_abstract=True):
    """
    Convenience base class whose `_kind` is set to LLM.

    The class itself cannot be instantiated; only subclasses can.
    """

    _kind = AnnotatorKind.LLM

    def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
        """Create an instance, refusing direct instantiation of `LLMEvaluator`."""
        if cls is not LLMEvaluator:
            return object.__new__(cls)
        raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# Union of the evaluator forms: an `Evaluator` instance, a plain callable
# returning an `EvaluatorOutput`, or a callable returning an awaitable of one.
ExperimentEvaluator: TypeAlias = Union[
    Evaluator,
    Callable[..., EvaluatorOutput],
    Callable[..., Awaitable[EvaluatorOutput]],
]
|