arize-phoenix 4.4.4rc5-py3-none-any.whl → 4.4.4rc6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix might be problematic.
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +11 -5
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +39 -36
- phoenix/config.py +21 -0
- phoenix/datetime_utils.py +4 -0
- phoenix/db/insertion/evaluation.py +4 -4
- phoenix/db/insertion/helpers.py +4 -12
- phoenix/db/insertion/span.py +3 -3
- phoenix/db/models.py +1 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +153 -0
- phoenix/{datasets → experiments}/evaluators/code_evaluators.py +7 -7
- phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +9 -9
- phoenix/{datasets → experiments}/evaluators/utils.py +38 -141
- phoenix/{datasets/experiments.py → experiments/functions.py} +248 -182
- phoenix/experiments/types.py +722 -0
- phoenix/experiments/utils.py +9 -0
- phoenix/server/api/context.py +2 -0
- phoenix/server/api/dataloaders/__init__.py +2 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/routers/v1/__init__.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +10 -10
- phoenix/server/api/routers/v1/datasets.py +6 -6
- phoenix/server/api/routers/v1/evaluations.py +4 -11
- phoenix/server/api/routers/v1/experiment_evaluations.py +22 -23
- phoenix/server/api/routers/v1/experiment_runs.py +4 -16
- phoenix/server/api/routers/v1/experiments.py +5 -5
- phoenix/server/api/routers/v1/spans.py +6 -4
- phoenix/server/api/types/Experiment.py +7 -0
- phoenix/server/app.py +2 -0
- phoenix/server/static/index.js +648 -570
- phoenix/session/client.py +256 -85
- phoenix/trace/fixtures.py +6 -6
- phoenix/utilities/json.py +8 -8
- phoenix/version.py +1 -1
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators/__init__.py +0 -18
- phoenix/datasets/types.py +0 -178
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → experiments}/tracing.py +0 -0
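The headline change is the rename of the `phoenix.datasets` package to `phoenix.experiments`: the evaluators move wholesale, `datasets/experiments.py` becomes `experiments/functions.py`, and the old `datasets/types.py` is superseded by a much larger `experiments/types.py`. For code importing the moved symbols, the migration sketched below follows directly from the renamed paths above (the old import lines are reconstructed from the file list, not shown verbatim in this diff):

```python
# Before (4.4.4rc5)
# from phoenix.datasets.types import AnnotatorKind, EvaluationResult
# from phoenix.datasets.evaluators.utils import create_evaluator

# After (4.4.4rc6)
from phoenix.experiments.types import AnnotatorKind, EvaluationResult
from phoenix.experiments.evaluators.utils import create_evaluator
```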
The largest diff shown is for `phoenix/{datasets → experiments}/evaluators/utils.py`. The rendered hunks reconstruct as follows (lines reduced to `…` were truncated by the diff viewer):

```diff
@@ -1,20 +1,19 @@
 import functools
 import inspect
-from …
-from …
-from typing import Any, Awaitable, Callable, Mapping, Optional, Union
+from itertools import chain, islice, repeat
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
-from …
-
-from phoenix.datasets.types import (
+from phoenix.experiments.types import (
     AnnotatorKind,
     EvaluationResult,
     JSONSerializable,
-    TaskOutput,
 )
 
+if TYPE_CHECKING:
+    from phoenix.experiments.evaluators.base import Evaluator
+
 
-def …
+def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
     if isinstance(obj, dict):
         if len(obj) == 1:
             key = next(iter(obj.keys()))
```
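The new `TYPE_CHECKING` block is the standard way to break an import cycle: this module needs `Evaluator` only for annotations, while `base.py` (where `Evaluator` now lives, per the file list) imports helpers from here. A small self-contained sketch of the pattern; the exact utils ↔ base cycle is my reading of the hunks, not stated in the diff:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so importing
    # this module does not re-enter base.py while base.py is still loading.
    from phoenix.experiments.evaluators.base import Evaluator


def describe(evaluator: "Evaluator") -> str:  # hypothetical helper, for illustration
    # When the class object is needed at runtime, the import is deferred to
    # call time -- the same trick the two _wrap_*_evaluation_function hunks use.
    from phoenix.experiments.evaluators.base import Evaluator

    if not isinstance(evaluator, Evaluator):
        raise TypeError("expected an Evaluator")
    return f"{evaluator.kind}: {evaluator.name}"  # .kind/.name per the removed class below
```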
In `create_evaluator`, the return annotation becomes a forward reference:

```diff
@@ -80,7 +79,7 @@ def create_evaluator(
     if isinstance(kind, str):
         kind = AnnotatorKind(kind.upper())
 
-    def wrapper(func: Callable[..., Any]) -> Evaluator:
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
         nonlocal name
         if not name:
             if hasattr(func, "__self__"):
```
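From the fragments above, `create_evaluator` is a decorator factory: it normalizes `kind` (string or `AnnotatorKind`), derives `name` from the wrapped function when not given, and wraps the function in an `Evaluator`. A plausible usage sketch; the keyword names and defaults are assumptions, not confirmed by this diff:

```python
from phoenix.experiments.evaluators.utils import create_evaluator


# "code" is upper-cased into AnnotatorKind.CODE by the hunk above
@create_evaluator(kind="code", name="exact_match")
def exact_match(output, expected):
    # a bool return is coerced by _default_eval_scorer (final hunk) into
    # EvaluationResult(score=float(...), label=str(...))
    return output == expected
```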
The async wrapper factory defers the `Evaluator` import to call time:

```diff
@@ -108,6 +107,8 @@ def _wrap_coroutine_evaluation_function(
     sig: inspect.Signature,
     convert_to_score: Callable[[Any], EvaluationResult],
 ) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
     def wrapper(func: Callable[..., Any]) -> "Evaluator":
         class AsyncEvaluator(Evaluator):
             def __init__(self) -> None:
```
The sync variant gets the same deferred import:

```diff
@@ -134,6 +135,8 @@ def _wrap_sync_evaluation_function(
     sig: inspect.Signature,
     convert_to_score: Callable[[Any], EvaluationResult],
 ) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
     def wrapper(func: Callable[..., Any]) -> "Evaluator":
         class SyncEvaluator(Evaluator):
             def __init__(self) -> None:
```
Finally, the `Evaluator`/`LLMEvaluator` hierarchy moves out of this module (to `phoenix/experiments/evaluators/base.py`, per the file list above), and `_default_eval_scorer` learns to coerce richer return values:

```diff
@@ -155,138 +158,32 @@ def _wrap_sync_evaluation_function(
 
 
 def _default_eval_scorer(result: Any) -> EvaluationResult:
+    if isinstance(result, EvaluationResult):
+        return result
     if isinstance(result, bool):
         return EvaluationResult(score=float(result), label=str(result))
-
+    if hasattr(result, "__float__"):
         return EvaluationResult(score=float(result))
-
-    return result
-
-
-… (start of the removed Evaluator class, including its docstring, truncated by the diff viewer)
-    """
-
-    _kind: AnnotatorKind
-    _name: EvaluatorName
-
-    @functools.cached_property
-    def name(self) -> EvaluatorName:
-        if hasattr(self, "_name"):
-            return self._name
-        return self.__class__.__name__
-
-    @functools.cached_property
-    def kind(self) -> EvaluatorKind:
-        if hasattr(self, "_kind"):
-            return self._kind.value
-        return AnnotatorKind.CODE.value
-
-    def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
-        if cls is Evaluator:
-            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
-        return object.__new__(cls)
-
-    def evaluate(
-        self,
-        *,
-        output: Optional[TaskOutput] = None,
-        expected: Optional[ExampleOutput] = None,
-        metadata: ExampleMetadata = MappingProxyType({}),
-        input: ExampleInput = MappingProxyType({}),
-        **kwargs: Any,
-    ) -> EvaluationResult:
-        # For subclassing, one should implement either this sync method or the
-        # async version. Implementing both is recommended but not required.
-        raise NotImplementedError
-
-    async def async_evaluate(
-        self,
-        *,
-        output: Optional[TaskOutput] = None,
-        expected: Optional[ExampleOutput] = None,
-        metadata: ExampleMetadata = MappingProxyType({}),
-        input: ExampleInput = MappingProxyType({}),
-        **kwargs: Any,
-    ) -> EvaluationResult:
-        # For subclassing, one should implement either this async method or the
-        # sync version. Implementing both is recommended but not required.
-        return self.evaluate(
-            output=output,
-            expected=expected,
-            metadata=metadata,
-            input=input,
-            **kwargs,
-        )
-
-    def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
-        super().__init_subclass__(**kwargs)
-        if is_abstract:
-            return
-        evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
-        for super_cls in inspect.getmro(cls):
-            if super_cls in (LLMEvaluator, Evaluator):
-                break
-            if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
-                assert callable(evaluate), "`evaluate()` method should be callable"
-                # need to remove the first param, i.e. `self`
-                _validate_sig(functools.partial(evaluate, None), "evaluate")
-                return
-            if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
-                assert callable(async_evaluate), "`async_evaluate()` method should be callable"
-                # need to remove the first param, i.e. `self`
-                _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
-                return
-        raise ValueError(
-            f"Evaluator must implement either "
-            f"`def evaluate{evaluate_fn_signature}` or "
-            f"`async def async_evaluate{evaluate_fn_signature}`"
-        )
-
-
-def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
-    sig = inspect.signature(fn)
-    validate_signature(sig)
-    for param in sig.parameters.values():
-        if param.kind is inspect.Parameter.VAR_KEYWORD:
-            return
-    else:
-        raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
-
-
-class LLMEvaluator(Evaluator, ABC, is_abstract=True):
-    """
-    A convenience super class for setting `kind` as LLM.
-
-    This Class is intended to be subclassed, and should not be instantiated directly.
-    """
-
-    _kind = AnnotatorKind.LLM
-
-    def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
-        if cls is LLMEvaluator:
-            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
-        return object.__new__(cls)
-
-
-ExperimentEvaluator: TypeAlias = Union[
-    Evaluator,
-    Callable[..., EvaluatorOutput],
-    Callable[..., Awaitable[EvaluatorOutput]],
-]
+    if isinstance(result, str):
+        return EvaluationResult(label=result)
+    if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
+        # Possible interpretations are:
+        # - 3-tuple: (Score, Label, Explanation)
+        # - 2-tuple: (Score, Explanation) or (Label, Explanation)
+        # - 1-tuple: (Score, ) or (Label, )
+        # Note that (Score, Label) conflicts with (Score, Explanation) and we
+        # pick the latter because it's probably more prevalent. To get
+        # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
+        a, b, c = islice(chain(result, repeat(None)), 3)
+        score, label, explanation = None, a, b
+        if hasattr(a, "__float__"):
+            try:
+                score = float(a)
+            except ValueError:
+                pass
+            else:
+                label, explanation = (None, b) if len(result) < 3 else (b, c)
+        return EvaluationResult(score=score, label=label, explanation=explanation)
+    if result is None:
+        return EvaluationResult(score=0)
+    raise ValueError(f"Unsupported evaluation result type: {type(result)}")
```
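Taken together, the rewritten `_default_eval_scorer` accepts plain scores, labels, tuples, and `None` instead of only booleans and float-likes. Illustrative coercions implied by the hunk above (`_default_eval_scorer` is private; calling it directly here is purely for exposition):

```python
from phoenix.experiments.evaluators.utils import _default_eval_scorer

_default_eval_scorer(True)                  # EvaluationResult(score=1.0, label="True")
_default_eval_scorer(0.75)                  # EvaluationResult(score=0.75)
_default_eval_scorer("fluent")              # EvaluationResult(label="fluent")
_default_eval_scorer((0.9, "too terse"))    # score=0.9, explanation="too terse"
_default_eval_scorer(("good", "matches"))   # label="good", explanation="matches"
_default_eval_scorer((0.9, "good", "ok"))   # score=0.9, label="good", explanation="ok"
_default_eval_scorer(None)                  # EvaluationResult(score=0)
```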