arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (42)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +11 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +39 -36
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/evaluation.py +4 -4
  6. phoenix/db/insertion/helpers.py +4 -12
  7. phoenix/db/insertion/span.py +3 -3
  8. phoenix/db/models.py +1 -1
  9. phoenix/experiments/__init__.py +6 -0
  10. phoenix/experiments/evaluators/__init__.py +29 -0
  11. phoenix/experiments/evaluators/base.py +153 -0
  12. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +7 -7
  13. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +9 -9
  14. phoenix/{datasets → experiments}/evaluators/utils.py +38 -141
  15. phoenix/{datasets/experiments.py → experiments/functions.py} +248 -182
  16. phoenix/experiments/types.py +722 -0
  17. phoenix/experiments/utils.py +9 -0
  18. phoenix/server/api/context.py +2 -0
  19. phoenix/server/api/dataloaders/__init__.py +2 -0
  20. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  21. phoenix/server/api/routers/v1/__init__.py +1 -1
  22. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  23. phoenix/server/api/routers/v1/datasets.py +6 -6
  24. phoenix/server/api/routers/v1/evaluations.py +4 -11
  25. phoenix/server/api/routers/v1/experiment_evaluations.py +22 -23
  26. phoenix/server/api/routers/v1/experiment_runs.py +4 -16
  27. phoenix/server/api/routers/v1/experiments.py +5 -5
  28. phoenix/server/api/routers/v1/spans.py +6 -4
  29. phoenix/server/api/types/Experiment.py +7 -0
  30. phoenix/server/app.py +2 -0
  31. phoenix/server/static/index.js +648 -570
  32. phoenix/session/client.py +256 -85
  33. phoenix/trace/fixtures.py +6 -6
  34. phoenix/utilities/json.py +8 -8
  35. phoenix/version.py +1 -1
  36. phoenix/datasets/__init__.py +0 -0
  37. phoenix/datasets/evaluators/__init__.py +0 -18
  38. phoenix/datasets/types.py +0 -178
  39. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  40. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  41. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
  42. phoenix/{datasets → experiments}/tracing.py +0 -0
@@ -1,20 +1,19 @@
1
1
  import functools
2
2
  import inspect
3
- from abc import ABC
4
- from types import MappingProxyType
5
- from typing import Any, Awaitable, Callable, Mapping, Optional, Union
3
+ from itertools import chain, islice, repeat
4
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
6
5
 
7
- from typing_extensions import TypeAlias
8
-
9
- from phoenix.datasets.types import (
6
+ from phoenix.experiments.types import (
10
7
  AnnotatorKind,
11
8
  EvaluationResult,
12
9
  JSONSerializable,
13
- TaskOutput,
14
10
  )
15
11
 
12
+ if TYPE_CHECKING:
13
+ from phoenix.experiments.evaluators.base import Evaluator
14
+
16
15
 
17
- def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
16
+ def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
18
17
  if isinstance(obj, dict):
19
18
  if len(obj) == 1:
20
19
  key = next(iter(obj.keys()))
@@ -80,7 +79,7 @@ def create_evaluator(
80
79
  if isinstance(kind, str):
81
80
  kind = AnnotatorKind(kind.upper())
82
81
 
83
- def wrapper(func: Callable[..., Any]) -> Evaluator:
82
+ def wrapper(func: Callable[..., Any]) -> "Evaluator":
84
83
  nonlocal name
85
84
  if not name:
86
85
  if hasattr(func, "__self__"):
@@ -108,6 +107,8 @@ def _wrap_coroutine_evaluation_function(
108
107
  sig: inspect.Signature,
109
108
  convert_to_score: Callable[[Any], EvaluationResult],
110
109
  ) -> Callable[[Callable[..., Any]], "Evaluator"]:
110
+ from phoenix.experiments.evaluators.base import Evaluator
111
+
111
112
  def wrapper(func: Callable[..., Any]) -> "Evaluator":
112
113
  class AsyncEvaluator(Evaluator):
113
114
  def __init__(self) -> None:
@@ -134,6 +135,8 @@ def _wrap_sync_evaluation_function(
134
135
  sig: inspect.Signature,
135
136
  convert_to_score: Callable[[Any], EvaluationResult],
136
137
  ) -> Callable[[Callable[..., Any]], "Evaluator"]:
138
+ from phoenix.experiments.evaluators.base import Evaluator
139
+
137
140
  def wrapper(func: Callable[..., Any]) -> "Evaluator":
138
141
  class SyncEvaluator(Evaluator):
139
142
  def __init__(self) -> None:
@@ -155,138 +158,32 @@ def _wrap_sync_evaluation_function(
155
158
 
156
159
 
157
160
  def _default_eval_scorer(result: Any) -> EvaluationResult:
161
+ if isinstance(result, EvaluationResult):
162
+ return result
158
163
  if isinstance(result, bool):
159
164
  return EvaluationResult(score=float(result), label=str(result))
160
- elif isinstance(result, (int, float)):
165
+ if hasattr(result, "__float__"):
161
166
  return EvaluationResult(score=float(result))
162
- elif isinstance(result, EvaluationResult):
163
- return result
164
- else:
165
- raise ValueError(f"Unsupported evaluation result type: {type(result)}")
166
-
167
-
168
- ExampleOutput: TypeAlias = Mapping[str, JSONSerializable]
169
- ExampleMetadata: TypeAlias = Mapping[str, JSONSerializable]
170
- ExampleInput: TypeAlias = Mapping[str, JSONSerializable]
171
-
172
- EvaluatorName: TypeAlias = str
173
- EvaluatorKind: TypeAlias = str
174
- EvaluatorOutput: TypeAlias = Union[EvaluationResult, bool, int, float, str]
175
-
176
-
177
- class Evaluator(ABC):
178
- """
179
- A helper super class to guide the implementation of an `Evaluator` object.
180
- Subclasses must implement either the `evaluate` or `async_evaluate` method.
181
- Implementing both methods is recommended, but not required.
182
-
183
- This Class is intended to be subclassed, and should not be instantiated directly.
184
- """
185
-
186
- _kind: AnnotatorKind
187
- _name: EvaluatorName
188
-
189
- @functools.cached_property
190
- def name(self) -> EvaluatorName:
191
- if hasattr(self, "_name"):
192
- return self._name
193
- return self.__class__.__name__
194
-
195
- @functools.cached_property
196
- def kind(self) -> EvaluatorKind:
197
- if hasattr(self, "_kind"):
198
- return self._kind.value
199
- return AnnotatorKind.CODE.value
200
-
201
- def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
202
- if cls is Evaluator:
203
- raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
204
- return object.__new__(cls)
205
-
206
- def evaluate(
207
- self,
208
- *,
209
- output: Optional[TaskOutput] = None,
210
- expected: Optional[ExampleOutput] = None,
211
- metadata: ExampleMetadata = MappingProxyType({}),
212
- input: ExampleInput = MappingProxyType({}),
213
- **kwargs: Any,
214
- ) -> EvaluationResult:
215
- # For subclassing, one should implement either this sync method or the
216
- # async version. Implementing both is recommended but not required.
217
- raise NotImplementedError
218
-
219
- async def async_evaluate(
220
- self,
221
- *,
222
- output: Optional[TaskOutput] = None,
223
- expected: Optional[ExampleOutput] = None,
224
- metadata: ExampleMetadata = MappingProxyType({}),
225
- input: ExampleInput = MappingProxyType({}),
226
- **kwargs: Any,
227
- ) -> EvaluationResult:
228
- # For subclassing, one should implement either this async method or the
229
- # sync version. Implementing both is recommended but not required.
230
- return self.evaluate(
231
- output=output,
232
- expected=expected,
233
- metadata=metadata,
234
- input=input,
235
- **kwargs,
236
- )
237
-
238
- def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
239
- super().__init_subclass__(**kwargs)
240
- if is_abstract:
241
- return
242
- evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
243
- for super_cls in inspect.getmro(cls):
244
- if super_cls in (LLMEvaluator, Evaluator):
245
- break
246
- if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
247
- assert callable(evaluate), "`evaluate()` method should be callable"
248
- # need to remove the first param, i.e. `self`
249
- _validate_sig(functools.partial(evaluate, None), "evaluate")
250
- return
251
- if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
252
- assert callable(async_evaluate), "`async_evaluate()` method should be callable"
253
- # need to remove the first param, i.e. `self`
254
- _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
255
- return
256
- raise ValueError(
257
- f"Evaluator must implement either "
258
- f"`def evaluate{evaluate_fn_signature}` or "
259
- f"`async def async_evaluate{evaluate_fn_signature}`"
260
- )
261
-
262
-
263
- def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
264
- sig = inspect.signature(fn)
265
- validate_signature(sig)
266
- for param in sig.parameters.values():
267
- if param.kind is inspect.Parameter.VAR_KEYWORD:
268
- return
269
- else:
270
- raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
271
-
272
-
273
- class LLMEvaluator(Evaluator, ABC, is_abstract=True):
274
- """
275
- A convenience super class for setting `kind` as LLM.
276
-
277
- This Class is intended to be subclassed, and should not be instantiated directly.
278
- """
279
-
280
- _kind = AnnotatorKind.LLM
281
-
282
- def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
283
- if cls is LLMEvaluator:
284
- raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
285
- return object.__new__(cls)
286
-
287
-
288
- ExperimentEvaluator: TypeAlias = Union[
289
- Evaluator,
290
- Callable[..., EvaluatorOutput],
291
- Callable[..., Awaitable[EvaluatorOutput]],
292
- ]
167
+ if isinstance(result, str):
168
+ return EvaluationResult(label=result)
169
+ if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
170
+ # Possible interpretations are:
171
+ # - 3-tuple: (Score, Label, Explanation)
172
+ # - 2-tuple: (Score, Explanation) or (Label, Explanation)
173
+ # - 1-tuple: (Score, ) or (Label, )
174
+ # Note that (Score, Label) conflicts with (Score, Explanation) and we
175
+ # pick the latter because it's probably more prevalent. To get
176
+ # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
177
+ a, b, c = islice(chain(result, repeat(None)), 3)
178
+ score, label, explanation = None, a, b
179
+ if hasattr(a, "__float__"):
180
+ try:
181
+ score = float(a)
182
+ except ValueError:
183
+ pass
184
+ else:
185
+ label, explanation = (None, b) if len(result) < 3 else (b, c)
186
+ return EvaluationResult(score=score, label=label, explanation=explanation)
187
+ if result is None:
188
+ return EvaluationResult(score=0)
189
+ raise ValueError(f"Unsupported evaluation result type: {type(result)}")