arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of arize-phoenix has been flagged as potentially problematic.

Files changed (52)
  1. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +12 -6
  2. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +47 -42
  3. phoenix/config.py +21 -0
  4. phoenix/datetime_utils.py +4 -0
  5. phoenix/db/insertion/dataset.py +19 -16
  6. phoenix/db/insertion/evaluation.py +4 -4
  7. phoenix/db/insertion/helpers.py +4 -12
  8. phoenix/db/insertion/span.py +3 -3
  9. phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
  10. phoenix/db/models.py +8 -3
  11. phoenix/experiments/__init__.py +6 -0
  12. phoenix/experiments/evaluators/__init__.py +29 -0
  13. phoenix/experiments/evaluators/base.py +153 -0
  14. phoenix/{datasets → experiments}/evaluators/code_evaluators.py +25 -53
  15. phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +62 -31
  16. phoenix/experiments/evaluators/utils.py +189 -0
  17. phoenix/experiments/functions.py +616 -0
  18. phoenix/{datasets → experiments}/tracing.py +19 -0
  19. phoenix/experiments/types.py +722 -0
  20. phoenix/experiments/utils.py +9 -0
  21. phoenix/server/api/context.py +4 -0
  22. phoenix/server/api/dataloaders/__init__.py +4 -0
  23. phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
  24. phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
  25. phoenix/server/api/helpers/dataset_helpers.py +8 -7
  26. phoenix/server/api/input_types/ClearProjectInput.py +15 -0
  27. phoenix/server/api/mutations/project_mutations.py +9 -4
  28. phoenix/server/api/routers/v1/__init__.py +1 -1
  29. phoenix/server/api/routers/v1/dataset_examples.py +10 -10
  30. phoenix/server/api/routers/v1/datasets.py +152 -48
  31. phoenix/server/api/routers/v1/evaluations.py +4 -11
  32. phoenix/server/api/routers/v1/experiment_evaluations.py +23 -23
  33. phoenix/server/api/routers/v1/experiment_runs.py +5 -17
  34. phoenix/server/api/routers/v1/experiments.py +5 -5
  35. phoenix/server/api/routers/v1/spans.py +6 -4
  36. phoenix/server/api/types/Experiment.py +12 -0
  37. phoenix/server/api/types/ExperimentRun.py +1 -1
  38. phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
  39. phoenix/server/app.py +4 -0
  40. phoenix/server/static/index.js +712 -588
  41. phoenix/session/client.py +321 -28
  42. phoenix/trace/fixtures.py +6 -6
  43. phoenix/utilities/json.py +8 -8
  44. phoenix/version.py +1 -1
  45. phoenix/datasets/__init__.py +0 -0
  46. phoenix/datasets/evaluators/__init__.py +0 -18
  47. phoenix/datasets/evaluators/_utils.py +0 -13
  48. phoenix/datasets/experiments.py +0 -485
  49. phoenix/datasets/types.py +0 -212
  50. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
  51. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
  52. {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
phoenix/experiments/evaluators/base.py (new file)

@@ -0,0 +1,153 @@
+import functools
+import inspect
+from abc import ABC
+from types import MappingProxyType
+from typing import Any, Awaitable, Callable, Optional, Union
+
+from typing_extensions import TypeAlias
+
+from phoenix.experiments.evaluators.utils import validate_signature
+from phoenix.experiments.types import (
+    AnnotatorKind,
+    EvaluationResult,
+    EvaluatorKind,
+    EvaluatorName,
+    EvaluatorOutput,
+    ExampleInput,
+    ExampleMetadata,
+    ExampleOutput,
+    TaskOutput,
+)
+
+
+class Evaluator(ABC):
+    """
+    A helper super class to guide the implementation of an `Evaluator` object.
+    Subclasses must implement either the `evaluate` or `async_evaluate` method.
+    Implementing both methods is recommended, but not required.
+
+    This Class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind: AnnotatorKind
+    _name: EvaluatorName
+
+    @functools.cached_property
+    def name(self) -> EvaluatorName:
+        if hasattr(self, "_name"):
+            return self._name
+        return self.__class__.__name__
+
+    @functools.cached_property
+    def kind(self) -> EvaluatorKind:
+        if hasattr(self, "_kind"):
+            return self._kind.value
+        return AnnotatorKind.CODE.value
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
+        if cls is Evaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        expected: Optional[ExampleOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **kwargs: Any,
+    ) -> EvaluationResult:
+        # For subclassing, one should implement either this sync method or the
+        # async version. Implementing both is recommended but not required.
+        raise NotImplementedError
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        expected: Optional[ExampleOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **kwargs: Any,
+    ) -> EvaluationResult:
+        # For subclassing, one should implement either this async method or the
+        # sync version. Implementing both is recommended but not required.
+        return self.evaluate(
+            output=output,
+            expected=expected,
+            metadata=metadata,
+            input=input,
+            **kwargs,
+        )
+
+    def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
+        super().__init_subclass__(**kwargs)
+        if is_abstract:
+            return
+        evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
+        for super_cls in inspect.getmro(cls):
+            if super_cls in (LLMEvaluator, Evaluator):
+                break
+            if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
+                assert callable(evaluate), "`evaluate()` method should be callable"
+                # need to remove the first param, i.e. `self`
+                _validate_sig(functools.partial(evaluate, None), "evaluate")
+                return
+            if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
+                assert callable(async_evaluate), "`async_evaluate()` method should be callable"
+                # need to remove the first param, i.e. `self`
+                _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
+                return
+        raise ValueError(
+            f"Evaluator must implement either "
+            f"`def evaluate{evaluate_fn_signature}` or "
+            f"`async def async_evaluate{evaluate_fn_signature}`"
+        )
+
+
+def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
+    sig = inspect.signature(fn)
+    validate_signature(sig)
+    for param in sig.parameters.values():
+        if param.kind is inspect.Parameter.VAR_KEYWORD:
+            return
+    else:
+        raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
+
+
+class CodeEvaluator(Evaluator, ABC, is_abstract=True):
+    """
+    A convenience super class for defining code evaluators.
+
+    This class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind = AnnotatorKind.CODE
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "CodeEvaluator":
+        if cls is CodeEvaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+
+class LLMEvaluator(Evaluator, ABC, is_abstract=True):
+    """
+    A convenience super class for defining LLM evaluators.
+
+    This class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind = AnnotatorKind.LLM
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
+        if cls is LLMEvaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+
+ExperimentEvaluator: TypeAlias = Union[
+    Evaluator,
+    Callable[..., EvaluatorOutput],
+    Callable[..., Awaitable[EvaluatorOutput]],
+]
phoenix/{datasets → experiments}/evaluators/code_evaluators.py

@@ -2,19 +2,14 @@ from __future__ import annotations
 
 import json
 import re
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import Any, List, Optional, Union
 
-from phoenix.datasets.evaluators._utils import _unwrap_json
-from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
+from phoenix.experiments.evaluators.base import CodeEvaluator
+from phoenix.experiments.types import EvaluationResult, TaskOutput
 
 
-class JSONParsable:
-    annotator_kind = "CODE"
-    name = "JSONParsable"
-
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        assert exp_run.output is not None
-        output = _unwrap_json(exp_run.output.result)
+class JSONParsable(CodeEvaluator):
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
         assert isinstance(output, str), "Experiment run output must be a string"
         try:
             json.loads(output)
@@ -26,18 +21,14 @@ class JSONParsable:
         )
 
 
-class ContainsKeyword:
-    annotator_kind = "CODE"
-
+class ContainsKeyword(CodeEvaluator):
     def __init__(self, keyword: str, name: Optional[str] = None) -> None:
         self.keyword = keyword
-        self.name = name or f"Contains({repr(keyword)})"
+        self._name = name or f"Contains({repr(keyword)})"
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        assert exp_run.output is not None
-        result = _unwrap_json(exp_run.output.result)
-        assert isinstance(result, str), "Experiment run output must be a string"
-        found = self.keyword in result
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        found = self.keyword in output
         return EvaluationResult(
             score=float(found),
             explanation=(
@@ -47,18 +38,14 @@ class ContainsKeyword:
         )
 
 
-class ContainsAnyKeyword:
-    annotator_kind = "CODE"
-
+class ContainsAnyKeyword(CodeEvaluator):
     def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
         self.keywords = keywords
-        self.name = name or f"ContainsAny({keywords})"
+        self._name = name or f"ContainsAny({keywords})"
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        assert exp_run.output is not None
-        result = _unwrap_json(exp_run.output.result)
-        assert isinstance(result, str), "Experiment run output must be a string"
-        found = [keyword for keyword in self.keywords if keyword in result]
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        found = [keyword for keyword in self.keywords if keyword in output]
         if found:
             explanation = f"the keywords {found} were found in the output"
         else:
@@ -69,18 +56,14 @@ class ContainsAnyKeyword:
         )
 
 
-class ContainsAllKeywords:
-    annotator_kind = "CODE"
-
+class ContainsAllKeywords(CodeEvaluator):
     def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
         self.keywords = keywords
-        self.name = name or f"ContainsAll({keywords})"
+        self._name = name or f"ContainsAll({keywords})"
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        assert exp_run.output is not None
-        result = _unwrap_json(exp_run.output.result)
-        assert isinstance(result, str), "Experiment run output must be a string"
-        not_found = [keyword for keyword in self.keywords if keyword not in result]
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        not_found = [keyword for keyword in self.keywords if keyword not in output]
         if not_found:
             contains_all = False
             explanation = f"the keywords {not_found} were not found in the output"
@@ -93,21 +76,17 @@ class ContainsAllKeywords:
         )
 
 
-class MatchesRegex:
-    annotator_kind = "CODE"
-
+class MatchesRegex(CodeEvaluator):
     def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
         if isinstance(pattern, str):
             pattern = re.compile(pattern)
         self.pattern = pattern
         assert isinstance(pattern, re.Pattern)
-        self.name = name or f"matches_({pattern})"
+        self._name = name or f"matches_({pattern})"
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        assert exp_run.output is not None
-        result = _unwrap_json(exp_run.output.result)
-        assert isinstance(result, str), "Experiment run output must be a string"
-        matches = self.pattern.findall(result)
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        matches = self.pattern.findall(output)
         if matches:
             explanation = (
                 f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
@@ -118,10 +97,3 @@ class MatchesRegex:
             score=float(bool(matches)),
             explanation=explanation,
         )
-
-
-# Someday we'll do typing checking in unit tests.
-if TYPE_CHECKING:
-    _: ExperimentEvaluator
-    _ = JSONParsable()
-    _ = ContainsKeyword("test")
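The migration replaces the old `(example, exp_run)` calling convention with keyword-only arguments, so the built-in code evaluators can now be invoked directly on a task output string. A small usage sketch, not from the diff (the `output` strings are made up):

    from phoenix.experiments.evaluators.code_evaluators import ContainsKeyword, MatchesRegex

    contains = ContainsKeyword("phoenix")
    print(contains.evaluate(output="phoenix rises").score)  # 1.0

    matches = MatchesRegex(r"\d{3}-\d{4}", name="has_phone_number")
    print(matches.evaluate(output="call 555-0100").score)  # 1.0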
phoenix/{datasets → experiments}/evaluators/llm_evaluators.py

@@ -1,14 +1,23 @@
 import re
-from typing import Callable, Optional, Type
+from types import MappingProxyType
+from typing import Any, Callable, Optional, Type
 
-from phoenix.datasets.evaluators._utils import _unwrap_json
-from phoenix.datasets.types import EvaluationResult, Example, ExperimentEvaluator, ExperimentRun
 from phoenix.evals.models.base import BaseModel as LLMBaseModel
 from phoenix.evals.utils import snap_to_rail
+from phoenix.experiments.evaluators.base import (
+    ExperimentEvaluator,
+    LLMEvaluator,
+)
+from phoenix.experiments.evaluators.utils import unwrap_json
+from phoenix.experiments.types import (
+    EvaluationResult,
+    ExampleInput,
+    ExampleMetadata,
+    TaskOutput,
+)
 
 
-class LLMCriteriaEvaluator:
-    annotator_kind = "LLM"
+class LLMCriteriaEvaluator(LLMEvaluator):
     _base_template = (
         "Determine if the following text is {criteria}. {description}"
         "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
@@ -37,21 +46,23 @@ class LLMCriteriaEvaluator:
         self.criteria = criteria
         self.description = description
         self.template = self._format_base_template(self.criteria, self.description)
-        self.name = name
+        self._name = name
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        formatted_template = self._format_eval_template(exp_run)
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
         unparsed_response = self.model._generate(formatted_template)
         return self._parse_eval_output(unparsed_response)
 
-    async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        formatted_template = self._format_eval_template(exp_run)
+    async def async_evaluate(
+        self, *, output: Optional[TaskOutput] = None, **_: Any
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
         unparsed_response = await self.model._async_generate(formatted_template)
         return self._parse_eval_output(unparsed_response)
 
-    def _format_eval_template(self, experiment_run: ExperimentRun) -> str:
-        assert experiment_run.output is not None
-        result = _unwrap_json(experiment_run.output.result)
+    def _format_eval_template(self, output: TaskOutput) -> str:
+        assert output is not None
+        result = unwrap_json(output)
         return self.template.format(text=str(result))
 
     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -137,8 +148,7 @@ def _parse_label_from_explanation(raw_string: str) -> str:
     return raw_string
 
 
-class RelevanceEvaluator:
-    annotator_kind = "LLM"
+class RelevanceEvaluator(LLMEvaluator):
     template = (
         "Determine if the following response is relevant to the query. In this context, "
         "'relevance' means that the response directly addresses the core question or topic of the "
@@ -162,19 +172,24 @@ class RelevanceEvaluator:
     def __init__(
         self,
         model: LLMBaseModel,
-        get_query: Optional[Callable[[Example, ExperimentRun], str]] = None,
-        get_response: Optional[Callable[[Example, ExperimentRun], str]] = None,
+        get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
+        get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
         name: str = "RelevanceEvaluator",
     ):
         self.model = model
-        self.name = name
+        self._name = name
         self.get_query = get_query or self._default_get_query
        self.get_response = get_response or self._default_get_response
 
-    def _format_eval_template(self, example: Example, experiment_run: ExperimentRun) -> str:
-        assert experiment_run.output is not None
-        query = self.get_query(example, experiment_run)
-        response = self.get_response(example, experiment_run)
+    def _format_eval_template(
+        self,
+        output: Optional[TaskOutput] = None,
+        input: ExampleInput = MappingProxyType({}),
+        metadata: ExampleMetadata = MappingProxyType({}),
+    ) -> str:
+        assert output is not None
+        query = self.get_query(input, metadata)
+        response = self.get_response(output, metadata)
         return self.template.format(query=query, response=response)
 
     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
@@ -195,19 +210,35 @@ class RelevanceEvaluator:
             metadata={},
         )
 
-    def _default_get_query(self, example: Example, experiment_run: ExperimentRun) -> str:
-        return str(example.input)
+    def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
+        return str(input)
 
-    def _default_get_response(self, example: Example, experiment_run: ExperimentRun) -> str:
-        assert experiment_run.output is not None
-        return str(_unwrap_json(experiment_run.output.result))
+    def _default_get_response(
+        self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
+    ) -> str:
+        assert output is not None
+        return str(unwrap_json(output))
 
-    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        formatted_template = self._format_eval_template(example, exp_run)
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
         unparsed_response = self.model._generate(formatted_template)
         return self._parse_eval_output(unparsed_response)
 
-    async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
-        formatted_template = self._format_eval_template(example, exp_run)
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
        unparsed_response = await self.model._async_generate(formatted_template)
         return self._parse_eval_output(unparsed_response)
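With the same keyword-only convention, `RelevanceEvaluator` now receives the example `input` and task `output` directly rather than an `ExperimentRun`. A usage sketch, not from the diff, assuming `phoenix.evals` exposes `OpenAIModel` (the exact constructor arguments are not shown in this diff and may differ):

    from phoenix.evals import OpenAIModel  # assumed import path for an LLM wrapper
    from phoenix.experiments.evaluators.llm_evaluators import RelevanceEvaluator

    evaluator = RelevanceEvaluator(model=OpenAIModel())  # default model settings assumed
    result = evaluator.evaluate(
        input={"question": "What is Arize Phoenix?"},
        output="Phoenix is an open-source observability library for LLM applications.",
    )
    print(result.score, result.explanation)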
phoenix/experiments/evaluators/utils.py (new file)

@@ -0,0 +1,189 @@
+import functools
+import inspect
+from itertools import chain, islice, repeat
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+from phoenix.experiments.types import (
+    AnnotatorKind,
+    EvaluationResult,
+    JSONSerializable,
+)
+
+if TYPE_CHECKING:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+
+def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
+    if isinstance(obj, dict):
+        if len(obj) == 1:
+            key = next(iter(obj.keys()))
+            output = obj[key]
+            assert isinstance(
+                output, (dict, list, str, int, float, bool, type(None))
+            ), "Output must be JSON serializable"
+            return output
+    return obj
+
+
+def validate_signature(sig: inspect.Signature) -> None:
+    # Check that the wrapped function has a valid signature for use as an evaluator
+    # If it does not, raise an error to exit early before running evaluations
+    params = sig.parameters
+    valid_named_params = {"input", "output", "expected", "metadata"}
+    if len(params) == 0:
+        raise ValueError("Evaluation function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                (
+                    f"Invalid parameter names in evaluation function: {', '.join(not_found)}. "
+                    "Parameters names for multi-argument functions must be "
+                    f"any of: {', '.join(valid_named_params)}."
+                )
+            )
+
+
+def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": kwargs.get("input"),
+        "output": kwargs.get("output"),
+        "expected": kwargs.get("expected"),
+        "metadata": kwargs.get("metadata"),
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["output"])
+    return sig.bind_partial(
+        **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+    )
+
+
+def create_evaluator(
+    kind: Union[str, AnnotatorKind] = AnnotatorKind.CODE,
+    name: Optional[str] = None,
+    scorer: Optional[Callable[[Any], EvaluationResult]] = None,
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    if scorer is None:
+        scorer = _default_eval_scorer
+
+    if isinstance(kind, str):
+        kind = AnnotatorKind(kind.upper())
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        nonlocal name
+        if not name:
+            if hasattr(func, "__self__"):
+                name = func.__self__.__class__.__name__
+            elif hasattr(func, "__name__"):
+                name = func.__name__
+            else:
+                name = str(func)
+        assert name is not None
+
+        wrapped_signature = inspect.signature(func)
+        validate_signature(wrapped_signature)
+
+        if inspect.iscoroutinefunction(func):
+            return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
+        else:
+            return _wrap_sync_evaluation_function(name, kind, wrapped_signature, scorer)(func)
+
+    return wrapper
+
+
+def _wrap_coroutine_evaluation_function(
+    name: str,
+    annotator_kind: AnnotatorKind,
+    sig: inspect.Signature,
+    convert_to_score: Callable[[Any], EvaluationResult],
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        class AsyncEvaluator(Evaluator):
+            def __init__(self) -> None:
+                self._name = name
+                self._kind = annotator_kind
+
+            @functools.wraps(func)
+            async def __call__(self, *args: Any, **kwargs: Any) -> Any:
+                return await func(*args, **kwargs)
+
+            async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
+                bound_signature = _bind_signature(sig, **kwargs)
+                result = await func(*bound_signature.args, **bound_signature.kwargs)
+                return convert_to_score(result)
+
+        return AsyncEvaluator()
+
+    return wrapper
+
+
+def _wrap_sync_evaluation_function(
+    name: str,
+    annotator_kind: AnnotatorKind,
+    sig: inspect.Signature,
+    convert_to_score: Callable[[Any], EvaluationResult],
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        class SyncEvaluator(Evaluator):
+            def __init__(self) -> None:
+                self._name = name
+                self._kind = annotator_kind
+
+            @functools.wraps(func)
+            def __call__(self, *args: Any, **kwargs: Any) -> Any:
+                return func(*args, **kwargs)
+
+            def evaluate(self, **kwargs: Any) -> EvaluationResult:
+                bound_signature = _bind_signature(sig, **kwargs)
+                result = func(*bound_signature.args, **bound_signature.kwargs)
+                return convert_to_score(result)
+
+        return SyncEvaluator()
+
+    return wrapper
+
+
+def _default_eval_scorer(result: Any) -> EvaluationResult:
+    if isinstance(result, EvaluationResult):
+        return result
+    if isinstance(result, bool):
+        return EvaluationResult(score=float(result), label=str(result))
+    if hasattr(result, "__float__"):
+        return EvaluationResult(score=float(result))
+    if isinstance(result, str):
+        return EvaluationResult(label=result)
+    if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
+        # Possible interpretations are:
+        # - 3-tuple: (Score, Label, Explanation)
+        # - 2-tuple: (Score, Explanation) or (Label, Explanation)
+        # - 1-tuple: (Score, ) or (Label, )
+        # Note that (Score, Label) conflicts with (Score, Explanation) and we
+        # pick the latter because it's probably more prevalent. To get
+        # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
+        a, b, c = islice(chain(result, repeat(None)), 3)
+        score, label, explanation = None, a, b
+        if hasattr(a, "__float__"):
+            try:
+                score = float(a)
+            except ValueError:
+                pass
+            else:
+                label, explanation = (None, b) if len(result) < 3 else (b, c)
+        return EvaluationResult(score=score, label=label, explanation=explanation)
+    if result is None:
+        return EvaluationResult(score=0)
+    raise ValueError(f"Unsupported evaluation result type: {type(result)}")
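`create_evaluator` turns a plain function into an `Evaluator`: the function's parameter names are matched against `input`, `output`, `expected`, and `metadata`, and its return value is coerced by `_default_eval_scorer` (bool becomes a score and label, a number becomes a score, a string becomes a label, tuples are split into score/label/explanation). A minimal sketch, not part of the diff:

    from phoenix.experiments.evaluators.utils import create_evaluator


    @create_evaluator(name="exact_match")  # kind defaults to AnnotatorKind.CODE
    def exact_match(output, expected):
        return output == expected  # bool is coerced to score=1.0/0.0 with a matching label


    result = exact_match.evaluate(output="42", expected="42")
    print(result.score, result.label)  # 1.0 True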