arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (118)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/METADATA +5 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/RECORD +56 -117
  3. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +27 -0
  5. phoenix/config.py +7 -21
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +62 -64
  8. phoenix/core/model_schema_adapter.py +25 -27
  9. phoenix/db/bulk_inserter.py +14 -54
  10. phoenix/db/insertion/evaluation.py +6 -6
  11. phoenix/db/insertion/helpers.py +2 -13
  12. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +28 -2
  13. phoenix/db/models.py +4 -236
  14. phoenix/inferences/fixtures.py +23 -23
  15. phoenix/inferences/inferences.py +7 -7
  16. phoenix/inferences/validation.py +1 -1
  17. phoenix/server/api/context.py +0 -18
  18. phoenix/server/api/dataloaders/__init__.py +0 -18
  19. phoenix/server/api/dataloaders/span_descendants.py +3 -2
  20. phoenix/server/api/routers/v1/__init__.py +2 -77
  21. phoenix/server/api/routers/v1/evaluations.py +2 -4
  22. phoenix/server/api/routers/v1/spans.py +1 -3
  23. phoenix/server/api/routers/v1/traces.py +4 -1
  24. phoenix/server/api/schema.py +303 -2
  25. phoenix/server/api/types/Cluster.py +19 -19
  26. phoenix/server/api/types/Dataset.py +63 -282
  27. phoenix/server/api/types/DatasetRole.py +23 -0
  28. phoenix/server/api/types/Dimension.py +29 -30
  29. phoenix/server/api/types/EmbeddingDimension.py +34 -40
  30. phoenix/server/api/types/Event.py +16 -16
  31. phoenix/server/api/{mutations/export_events_mutations.py → types/ExportEventsMutation.py} +14 -17
  32. phoenix/server/api/types/Model.py +42 -43
  33. phoenix/server/api/types/Project.py +12 -26
  34. phoenix/server/api/types/Span.py +2 -79
  35. phoenix/server/api/types/TimeSeries.py +6 -6
  36. phoenix/server/api/types/Trace.py +4 -15
  37. phoenix/server/api/types/UMAPPoints.py +1 -1
  38. phoenix/server/api/types/node.py +111 -5
  39. phoenix/server/api/types/pagination.py +52 -10
  40. phoenix/server/app.py +49 -101
  41. phoenix/server/main.py +27 -49
  42. phoenix/server/openapi/docs.py +0 -3
  43. phoenix/server/static/index.js +2595 -3523
  44. phoenix/server/templates/index.html +0 -1
  45. phoenix/services.py +15 -15
  46. phoenix/session/client.py +21 -438
  47. phoenix/session/session.py +37 -47
  48. phoenix/trace/exporter.py +9 -14
  49. phoenix/trace/fixtures.py +7 -133
  50. phoenix/trace/schemas.py +2 -1
  51. phoenix/trace/span_evaluations.py +3 -3
  52. phoenix/trace/trace_dataset.py +6 -6
  53. phoenix/version.py +1 -1
  54. phoenix/datasets/__init__.py +0 -0
  55. phoenix/datasets/evaluators/__init__.py +0 -18
  56. phoenix/datasets/evaluators/code_evaluators.py +0 -99
  57. phoenix/datasets/evaluators/llm_evaluators.py +0 -244
  58. phoenix/datasets/evaluators/utils.py +0 -292
  59. phoenix/datasets/experiments.py +0 -550
  60. phoenix/datasets/tracing.py +0 -85
  61. phoenix/datasets/types.py +0 -178
  62. phoenix/db/insertion/dataset.py +0 -237
  63. phoenix/db/migrations/types.py +0 -29
  64. phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -291
  65. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -100
  66. phoenix/server/api/dataloaders/dataset_example_spans.py +0 -43
  67. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -85
  68. phoenix/server/api/dataloaders/experiment_error_rates.py +0 -43
  69. phoenix/server/api/dataloaders/experiment_run_counts.py +0 -42
  70. phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -49
  71. phoenix/server/api/dataloaders/project_by_name.py +0 -31
  72. phoenix/server/api/dataloaders/span_projects.py +0 -33
  73. phoenix/server/api/dataloaders/trace_row_ids.py +0 -39
  74. phoenix/server/api/helpers/dataset_helpers.py +0 -179
  75. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -16
  76. phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -14
  77. phoenix/server/api/input_types/ClearProjectInput.py +0 -15
  78. phoenix/server/api/input_types/CreateDatasetInput.py +0 -12
  79. phoenix/server/api/input_types/DatasetExampleInput.py +0 -14
  80. phoenix/server/api/input_types/DatasetSort.py +0 -17
  81. phoenix/server/api/input_types/DatasetVersionSort.py +0 -16
  82. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -13
  83. phoenix/server/api/input_types/DeleteDatasetInput.py +0 -7
  84. phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -9
  85. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -35
  86. phoenix/server/api/input_types/PatchDatasetInput.py +0 -14
  87. phoenix/server/api/mutations/__init__.py +0 -13
  88. phoenix/server/api/mutations/auth.py +0 -11
  89. phoenix/server/api/mutations/dataset_mutations.py +0 -520
  90. phoenix/server/api/mutations/experiment_mutations.py +0 -65
  91. phoenix/server/api/mutations/project_mutations.py +0 -47
  92. phoenix/server/api/openapi/__init__.py +0 -0
  93. phoenix/server/api/openapi/main.py +0 -6
  94. phoenix/server/api/openapi/schema.py +0 -16
  95. phoenix/server/api/queries.py +0 -503
  96. phoenix/server/api/routers/v1/dataset_examples.py +0 -178
  97. phoenix/server/api/routers/v1/datasets.py +0 -965
  98. phoenix/server/api/routers/v1/experiment_evaluations.py +0 -66
  99. phoenix/server/api/routers/v1/experiment_runs.py +0 -108
  100. phoenix/server/api/routers/v1/experiments.py +0 -174
  101. phoenix/server/api/types/AnnotatorKind.py +0 -10
  102. phoenix/server/api/types/CreateDatasetPayload.py +0 -8
  103. phoenix/server/api/types/DatasetExample.py +0 -85
  104. phoenix/server/api/types/DatasetExampleRevision.py +0 -34
  105. phoenix/server/api/types/DatasetVersion.py +0 -14
  106. phoenix/server/api/types/ExampleRevisionInterface.py +0 -14
  107. phoenix/server/api/types/Experiment.py +0 -140
  108. phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -13
  109. phoenix/server/api/types/ExperimentComparison.py +0 -19
  110. phoenix/server/api/types/ExperimentRun.py +0 -91
  111. phoenix/server/api/types/ExperimentRunAnnotation.py +0 -57
  112. phoenix/server/api/types/Inferences.py +0 -80
  113. phoenix/server/api/types/InferencesRole.py +0 -23
  114. phoenix/utilities/json.py +0 -61
  115. phoenix/utilities/re.py +0 -50
  116. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/IP_NOTICE +0 -0
  117. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/LICENSE +0 -0
  118. /phoenix/server/api/{helpers/__init__.py → helpers.py} +0 -0
phoenix/datasets/evaluators/llm_evaluators.py
@@ -1,244 +0,0 @@
- import re
- from types import MappingProxyType
- from typing import Any, Callable, Optional, Type
-
- from phoenix.datasets.evaluators.utils import (
-     ExampleInput,
-     ExampleMetadata,
-     ExperimentEvaluator,
-     LLMEvaluator,
-     _unwrap_json,
- )
- from phoenix.datasets.types import (
-     EvaluationResult,
-     TaskOutput,
- )
- from phoenix.evals.models.base import BaseModel as LLMBaseModel
- from phoenix.evals.utils import snap_to_rail
-
-
- class LLMCriteriaEvaluator(LLMEvaluator):
-     _base_template = (
-         "Determine if the following text is {criteria}. {description}"
-         "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
-         "a single word label; 'true' if the text is {criteria} or 'false' if the text is not "
-         "{criteria}. Here is an example template for whether the text meets a criteria:\n\n"
-         "CRITERIA: the text is '{criteria}'\n"
-         "TEXT: *the provided text to evaluate*\n"
-         "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
-         "the criteria*\n"
-         "LABEL: *true or false*\n\n"
-         "Follow this template for the following example:\n\n"
-         "CRITERIA: the text is '{criteria}'\n"
-         "TEXT: {text}\n"
-         "EXPLANATION: "
-     )
-     _description = "In this context, '{criteria}' means the text '{description}'. "
-
-     def __init__(
-         self,
-         model: LLMBaseModel,
-         criteria: str,
-         description: str,
-         name: str,
-     ):
-         self.model = model
-         self.criteria = criteria
-         self.description = description
-         self.template = self._format_base_template(self.criteria, self.description)
-         self._name = name
-
-     def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
-         formatted_template = self._format_eval_template(output)
-         unparsed_response = self.model._generate(formatted_template)
-         return self._parse_eval_output(unparsed_response)
-
-     async def async_evaluate(
-         self, *, output: Optional[TaskOutput] = None, **_: Any
-     ) -> EvaluationResult:
-         formatted_template = self._format_eval_template(output)
-         unparsed_response = await self.model._async_generate(formatted_template)
-         return self._parse_eval_output(unparsed_response)
-
-     def _format_eval_template(self, output: TaskOutput) -> str:
-         assert output is not None
-         result = _unwrap_json(output)
-         return self.template.format(text=str(result))
-
-     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
-         raw_label, explanation = (
-             _parse_label_from_explanation(unparsed_response),
-             unparsed_response,
-         )
-         label = snap_to_rail(raw_label, ["true", "false"])
-         if label == "true":
-             score = 1.0
-         elif label == "false":
-             score = 0.0
-         else:
-             raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
-         return EvaluationResult(
-             score=score,
-             explanation=explanation,
-             metadata={},
-         )
-
-     @classmethod
-     def _format_base_template(cls, criteria: str, description: Optional[str] = None) -> str:
-         formatted_description = cls._description.format(criteria=criteria, description=description)
-         formatted_template = cls._base_template.format(
-             criteria=criteria,
-             description=formatted_description,
-             text="{text}",  # leave the text field as a placeholder
-         )
-         return formatted_template
-
-
- def criteria_evaluator_factory(
-     class_name: str, criteria: str, description: str, default_name: str
- ) -> Type[ExperimentEvaluator]:
-     def _init(self, model: LLMBaseModel, name: str = default_name) -> None:  # type: ignore
-         LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
-
-     return type(
-         class_name,
-         (LLMCriteriaEvaluator,),
-         {
-             "__init__": _init,
-             "__module__": __name__,
-             "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
-         },
-     )
-
-
- ConcisenessEvaluator = criteria_evaluator_factory(
-     class_name="ConcisenessEvaluator",
-     criteria="concise",
-     description="is just a few sentences and easy to follow",
-     default_name="Conciseness",
- )
-
-
- HelpfulnessEvaluator = criteria_evaluator_factory(
-     class_name="HelpfulnessEvaluator",
-     criteria="helpful",
-     description="provides useful information",
-     default_name="Helpfulness",
- )
-
-
- CoherenceEvaluator = criteria_evaluator_factory(
-     class_name="CoherenceEvaluator",
-     criteria="coherent",
-     description="is coherent, well-structured, and logically sound",
-     default_name="Coherence",
- )
-
-
- def _parse_label_from_explanation(raw_string: str) -> str:
-     label_delimiter = r"(\W*label\W*)"
-     parts = re.split(label_delimiter, raw_string, flags=re.IGNORECASE)
-     if len(parts) > 1:
-         # Find the last occurrence of the delimiter and take the part after it
-         last_index = len(parts) - 1
-         while last_index > 0:
-             if re.match(label_delimiter, parts[last_index - 1], flags=re.IGNORECASE):
-                 return parts[last_index].strip()
-             last_index -= 1
-     return raw_string
-
-
- class RelevanceEvaluator(LLMEvaluator):
-     template = (
-         "Determine if the following response is relevant to the query. In this context, "
-         "'relevance' means that the response directly addresses the core question or topic of the "
-         "query. First, explain step-by-step why you think the text is or is not relevant. "
-         "Then provide a single word label; 'true' if the text is relevant or 'false' if the text "
-         "is not relevant. "
-         "Here is an example template for your reponse:\n\n"
-         "CRITERIA: the response is 'relevant' to the query\n"
-         "QUERY: *text that contains a query*\n"
-         "RESPONSE: *a response that may or may not be relevant to the query*\n"
-         "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
-         "response is relevant to the query*\n"
-         "LABEL: *true or false*\n\n"
-         "Follow this template for the following example:\n\n"
-         "CRITERIA: the response is 'relevant' to the query\n"
-         "QUERY: {reference}\n"
-         "RESPONSE: {submission}\n"
-         "EXPLANATION: "
-     )
-
-     def __init__(
-         self,
-         model: LLMBaseModel,
-         get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
-         get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
-         name: str = "RelevanceEvaluator",
-     ):
-         self.model = model
-         self._name = name
-         self.get_query = get_query or self._default_get_query
-         self.get_response = get_response or self._default_get_response
-
-     def _format_eval_template(
-         self,
-         output: Optional[TaskOutput] = None,
-         input: ExampleInput = MappingProxyType({}),
-         metadata: ExampleMetadata = MappingProxyType({}),
-     ) -> str:
-         assert output is not None
-         query = self.get_query(input, metadata)
-         response = self.get_response(output, metadata)
-         return self.template.format(query=query, response=response)
-
-     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
-         raw_label, explanation = (
-             _parse_label_from_explanation(unparsed_response),
-             unparsed_response,
-         )
-         label = snap_to_rail(raw_label, ["true", "false"])
-         if label == "true":
-             score = 1.0
-         elif label == "false":
-             score = 0.0
-         else:
-             raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
-         return EvaluationResult(
-             score=score,
-             explanation=explanation,
-             metadata={},
-         )
-
-     def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
-         return str(input)
-
-     def _default_get_response(
-         self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
-     ) -> str:
-         assert output is not None
-         return str(_unwrap_json(output))
-
-     def evaluate(
-         self,
-         *,
-         output: Optional[TaskOutput] = None,
-         metadata: ExampleMetadata = MappingProxyType({}),
-         input: ExampleInput = MappingProxyType({}),
-         **_: Any,
-     ) -> EvaluationResult:
-         formatted_template = self._format_eval_template(output, input, metadata)
-         unparsed_response = self.model._generate(formatted_template)
-         return self._parse_eval_output(unparsed_response)
-
-     async def async_evaluate(
-         self,
-         *,
-         output: Optional[TaskOutput] = None,
-         metadata: ExampleMetadata = MappingProxyType({}),
-         input: ExampleInput = MappingProxyType({}),
-         **_: Any,
-     ) -> EvaluationResult:
-         formatted_template = self._format_eval_template(output, input, metadata)
-         unparsed_response = await self.model._async_generate(formatted_template)
-         return self._parse_eval_output(unparsed_response)
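
For reference, the criteria evaluators removed above were invoked through their evaluate()/async_evaluate() methods. A minimal sketch of pre-4.5.0 usage follows; the import path and the OpenAIModel wrapper from phoenix.evals are assumptions for illustration, not part of this diff:

from phoenix.datasets.evaluators import ConcisenessEvaluator  # assumed re-export; package removed in 4.5.0
from phoenix.evals import OpenAIModel  # assumed LLM wrapper satisfying the LLMBaseModel interface

# Build the evaluator around an LLM and score a single task output.
evaluator = ConcisenessEvaluator(model=OpenAIModel(model="gpt-4o"))
result = evaluator.evaluate(output="Phoenix collects traces and evaluations for LLM applications.")
# EvaluationResult carries score (1.0 for 'true', 0.0 for 'false'), explanation, and metadata.
print(result.score, result.explanation)
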
phoenix/datasets/evaluators/utils.py
@@ -1,292 +0,0 @@
- import functools
- import inspect
- from abc import ABC
- from types import MappingProxyType
- from typing import Any, Awaitable, Callable, Mapping, Optional, Union
-
- from typing_extensions import TypeAlias
-
- from phoenix.datasets.types import (
-     AnnotatorKind,
-     EvaluationResult,
-     JSONSerializable,
-     TaskOutput,
- )
-
-
- def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
-     if isinstance(obj, dict):
-         if len(obj) == 1:
-             key = next(iter(obj.keys()))
-             output = obj[key]
-             assert isinstance(
-                 output, (dict, list, str, int, float, bool, type(None))
-             ), "Output must be JSON serializable"
-             return output
-     return obj
-
-
- def validate_signature(sig: inspect.Signature) -> None:
-     # Check that the wrapped function has a valid signature for use as an evaluator
-     # If it does not, raise an error to exit early before running evaluations
-     params = sig.parameters
-     valid_named_params = {"input", "output", "expected", "metadata"}
-     if len(params) == 0:
-         raise ValueError("Evaluation function must have at least one parameter.")
-     if len(params) > 1:
-         for not_found in set(params) - valid_named_params:
-             param = params[not_found]
-             if (
-                 param.kind is inspect.Parameter.VAR_KEYWORD
-                 or param.default is not inspect.Parameter.empty
-             ):
-                 continue
-             raise ValueError(
-                 (
-                     f"Invalid parameter names in evaluation function: {', '.join(not_found)}. "
-                     "Parameters names for multi-argument functions must be "
-                     f"any of: {', '.join(valid_named_params)}."
-                 )
-             )
-
-
- def _bind_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
-     parameter_mapping = {
-         "input": kwargs.get("input"),
-         "output": kwargs.get("output"),
-         "expected": kwargs.get("expected"),
-         "metadata": kwargs.get("metadata"),
-     }
-     params = sig.parameters
-     if len(params) == 1:
-         parameter_name = next(iter(params))
-         if parameter_name in parameter_mapping:
-             return sig.bind(parameter_mapping[parameter_name])
-         else:
-             return sig.bind(parameter_mapping["output"])
-     return sig.bind_partial(
-         **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
-     )
-
-
- def create_evaluator(
-     kind: Union[str, AnnotatorKind] = AnnotatorKind.CODE,
-     name: Optional[str] = None,
-     scorer: Optional[Callable[[Any], EvaluationResult]] = None,
- ) -> Callable[[Callable[..., Any]], "Evaluator"]:
-     if scorer is None:
-         scorer = _default_eval_scorer
-
-     if isinstance(kind, str):
-         kind = AnnotatorKind(kind.upper())
-
-     def wrapper(func: Callable[..., Any]) -> Evaluator:
-         nonlocal name
-         if not name:
-             if hasattr(func, "__self__"):
-                 name = func.__self__.__class__.__name__
-             elif hasattr(func, "__name__"):
-                 name = func.__name__
-             else:
-                 name = str(func)
-         assert name is not None
-
-         wrapped_signature = inspect.signature(func)
-         validate_signature(wrapped_signature)
-
-         if inspect.iscoroutinefunction(func):
-             return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
-         else:
-             return _wrap_sync_evaluation_function(name, kind, wrapped_signature, scorer)(func)
-
-     return wrapper
-
-
- def _wrap_coroutine_evaluation_function(
-     name: str,
-     annotator_kind: AnnotatorKind,
-     sig: inspect.Signature,
-     convert_to_score: Callable[[Any], EvaluationResult],
- ) -> Callable[[Callable[..., Any]], "Evaluator"]:
-     def wrapper(func: Callable[..., Any]) -> "Evaluator":
-         class AsyncEvaluator(Evaluator):
-             def __init__(self) -> None:
-                 self._name = name
-                 self._kind = annotator_kind
-
-             @functools.wraps(func)
-             async def __call__(self, *args: Any, **kwargs: Any) -> Any:
-                 return await func(*args, **kwargs)
-
-             async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
-                 bound_signature = _bind_signature(sig, **kwargs)
-                 result = await func(*bound_signature.args, **bound_signature.kwargs)
-                 return convert_to_score(result)
-
-         return AsyncEvaluator()
-
-     return wrapper
-
-
- def _wrap_sync_evaluation_function(
-     name: str,
-     annotator_kind: AnnotatorKind,
-     sig: inspect.Signature,
-     convert_to_score: Callable[[Any], EvaluationResult],
- ) -> Callable[[Callable[..., Any]], "Evaluator"]:
-     def wrapper(func: Callable[..., Any]) -> "Evaluator":
-         class SyncEvaluator(Evaluator):
-             def __init__(self) -> None:
-                 self._name = name
-                 self._kind = annotator_kind
-
-             @functools.wraps(func)
-             def __call__(self, *args: Any, **kwargs: Any) -> Any:
-                 return func(*args, **kwargs)
-
-             def evaluate(self, **kwargs: Any) -> EvaluationResult:
-                 bound_signature = _bind_signature(sig, **kwargs)
-                 result = func(*bound_signature.args, **bound_signature.kwargs)
-                 return convert_to_score(result)
-
-         return SyncEvaluator()
-
-     return wrapper
-
-
- def _default_eval_scorer(result: Any) -> EvaluationResult:
-     if isinstance(result, bool):
-         return EvaluationResult(score=float(result), label=str(result))
-     elif isinstance(result, (int, float)):
-         return EvaluationResult(score=float(result))
-     elif isinstance(result, EvaluationResult):
-         return result
-     else:
-         raise ValueError(f"Unsupported evaluation result type: {type(result)}")
-
-
- ExampleOutput: TypeAlias = Mapping[str, JSONSerializable]
- ExampleMetadata: TypeAlias = Mapping[str, JSONSerializable]
- ExampleInput: TypeAlias = Mapping[str, JSONSerializable]
-
- EvaluatorName: TypeAlias = str
- EvaluatorKind: TypeAlias = str
- EvaluatorOutput: TypeAlias = Union[EvaluationResult, bool, int, float, str]
-
-
- class Evaluator(ABC):
-     """
-     A helper super class to guide the implementation of an `Evaluator` object.
-     Subclasses must implement either the `evaluate` or `async_evaluate` method.
-     Implementing both methods is recommended, but not required.
-
-     This Class is intended to be subclassed, and should not be instantiated directly.
-     """
-
-     _kind: AnnotatorKind
-     _name: EvaluatorName
-
-     @functools.cached_property
-     def name(self) -> EvaluatorName:
-         if hasattr(self, "_name"):
-             return self._name
-         return self.__class__.__name__
-
-     @functools.cached_property
-     def kind(self) -> EvaluatorKind:
-         if hasattr(self, "_kind"):
-             return self._kind.value
-         return AnnotatorKind.CODE.value
-
-     def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
-         if cls is Evaluator:
-             raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
-         return object.__new__(cls)
-
-     def evaluate(
-         self,
-         *,
-         output: Optional[TaskOutput] = None,
-         expected: Optional[ExampleOutput] = None,
-         metadata: ExampleMetadata = MappingProxyType({}),
-         input: ExampleInput = MappingProxyType({}),
-         **kwargs: Any,
-     ) -> EvaluationResult:
-         # For subclassing, one should implement either this sync method or the
-         # async version. Implementing both is recommended but not required.
-         raise NotImplementedError
-
-     async def async_evaluate(
-         self,
-         *,
-         output: Optional[TaskOutput] = None,
-         expected: Optional[ExampleOutput] = None,
-         metadata: ExampleMetadata = MappingProxyType({}),
-         input: ExampleInput = MappingProxyType({}),
-         **kwargs: Any,
-     ) -> EvaluationResult:
-         # For subclassing, one should implement either this async method or the
-         # sync version. Implementing both is recommended but not required.
-         return self.evaluate(
-             output=output,
-             expected=expected,
-             metadata=metadata,
-             input=input,
-             **kwargs,
-         )
-
-     def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
-         super().__init_subclass__(**kwargs)
-         if is_abstract:
-             return
-         evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
-         for super_cls in inspect.getmro(cls):
-             if super_cls in (LLMEvaluator, Evaluator):
-                 break
-             if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
-                 assert callable(evaluate), "`evaluate()` method should be callable"
-                 # need to remove the first param, i.e. `self`
-                 _validate_sig(functools.partial(evaluate, None), "evaluate")
-                 return
-             if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
-                 assert callable(async_evaluate), "`async_evaluate()` method should be callable"
-                 # need to remove the first param, i.e. `self`
-                 _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
-                 return
-         raise ValueError(
-             f"Evaluator must implement either "
-             f"`def evaluate{evaluate_fn_signature}` or "
-             f"`async def async_evaluate{evaluate_fn_signature}`"
-         )
-
-
- def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
-     sig = inspect.signature(fn)
-     validate_signature(sig)
-     for param in sig.parameters.values():
-         if param.kind is inspect.Parameter.VAR_KEYWORD:
-             return
-     else:
-         raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
-
-
- class LLMEvaluator(Evaluator, ABC, is_abstract=True):
-     """
-     A convenience super class for setting `kind` as LLM.
-
-     This Class is intended to be subclassed, and should not be instantiated directly.
-     """
-
-     _kind = AnnotatorKind.LLM
-
-     def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
-         if cls is LLMEvaluator:
-             raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
-         return object.__new__(cls)
-
-
- ExperimentEvaluator: TypeAlias = Union[
-     Evaluator,
-     Callable[..., EvaluatorOutput],
-     Callable[..., Awaitable[EvaluatorOutput]],
- ]
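
For reference, the create_evaluator decorator removed above turned a plain scoring function into an Evaluator, binding only the parameters the function declares out of input, output, expected, and metadata. A minimal sketch of pre-4.5.0 usage; the function name and values are illustrative:

from phoenix.datasets.evaluators.utils import create_evaluator  # module removed in 4.5.0

@create_evaluator(name="exact_match")  # kind defaults to AnnotatorKind.CODE
def exact_match(output, expected):
    # A bool return value is converted by _default_eval_scorer into a 0/1 score with a label.
    return output == expected

# The decorated object is a SyncEvaluator; evaluate() accepts the declared parameters by keyword.
result = exact_match.evaluate(output="42", expected="42")
print(result.score)  # 1.0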